Source Code Cross Referenced for HTMLParser.java in  » Web-Crawler » WebSPHINX » websphinx » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Sevlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » WebSPHINX » websphinx 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:/*
0002: * WebSphinx web-crawling toolkit
0003: *
0004: * Copyright (c) 1998-2002 Carnegie Mellon University.  All rights
0005: * reserved.
0006: *
0007: * Redistribution and use in source and binary forms, with or without
0008: * modification, are permitted provided that the following conditions
0009: * are met:
0010: *
0011: * 1. Redistributions of source code must retain the above copyright
0012: *    notice, this  list of conditions and the following disclaimer.
0013: *
0014: * 2. Redistributions in binary form must reproduce the above copyright
0015: *    notice, this  list of conditions and the following disclaimer in
0016: *    the documentation and/or other materials provided with the
0017: *    distribution.
0018: *
0019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
0020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
0021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
0022: * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
0023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
0026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
0027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
0029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0030: *
0031: */
0032:
0033:package websphinx;
0034:
0035:import java.io.InputStream;
0036:import java.io.IOException;
0037://#ifdef JDK1.1 
0038:import java.io.Reader;
0039:import java.io.InputStreamReader;
0040:import java.io.StringReader;
0041://#endif JDK1.1
0042:/*#ifdef JDK1.0
0043:import java.io.StringBufferInputStream;
0044:#endif JDK1.0*/
0045:import java.io.ByteArrayOutputStream;
0046:import java.util.Hashtable;
0047:import java.util.Enumeration;
0048:import java.util.Vector;
0049:import java.util.Stack;
0050:import java.net.URL;
0051:import java.net.MalformedURLException;
0052:
0053:/**
0054: * HTML parser.  Parses an input stream or String and
0055: * converts it to a sequence of Tags and a tree of Elements.
0056: * HTMLParser is used by Page to parse pages.
0057: */
0058:// FIX: make HTMLParser into an interface, and
0059:// split this implementation into Tokenizer and TreeBuilder
0060:public class HTMLParser {
0061:
0062:    // parameter for HTML type detection.  
0063:    // If the parser doesn't encounter at least one HTML tag
0064:    // in the first VALID_HTML_PREFIX bytes of the stream, then parser 
0065:    // concludes that the stream isn't HTML and stops parsing it.
0066:    static final int VALID_HTML_PREFIX = 10000;
0067:
0068:    int maxBytes = Integer.MAX_VALUE;
0069:
0070:    /**
0071:     * Make an HTMLParser.
0072:     */
0073:    public HTMLParser () {
0074:    }
0075:
0076:    /**
0077:     * Parse a page as HTML.
0078:     * @param page Page to parse
0079:     */
0080:    public void parse (Page page) throws IOException {
0081:        tokenize (page);
0082:        buildParseTree (page);
0083:    }
0084:
0085:
0086:    /*
0087:     *  HTML tokenizer state machine
0088:     */
0089:
0090:    // state takes on one of the following values:
0091:    private static final int START = 0;
0092:    private static final int INWORD = 1;
0093:    private static final int ENTITY = 2;
0094:    private static final int LT = 4;
0095:    private static final int BANG = 5;
0096:    private static final int BANG_DASH = 6;
0097:    private static final int CMT = 7;
0098:    private static final int CMT_DASH = 8;
0099:    private static final int CMT_DASHDASH = 9;
0100:    private static final int DIRECTIVE = 10;
0101:    private static final int STAG = 11;
0102:    private static final int ETAG = 12;
0103:    private static final int ATTR = 13;
0104:    private static final int ATTRNAME = 14;
0105:    private static final int EQ = 15;
0106:    private static final int AFTEREQ = 16;
0107:    private static final int ATTRVAL = 17;
0108:    private static final int ATTRVAL_SQ = 18;
0109:    private static final int ATTRVAL_DQ = 19;
0110:    private static final int DONE = 20;
0111:    private static final int ENTNUM = 21;
0112:    private static final int ENTREF = 22;
0113:
0114:    StringBuffer wordBuf = new StringBuffer ();
0115:    StringBuffer tagName = new StringBuffer ();
0116:    StringBuffer attrName = new StringBuffer ();
0117:    StringBuffer attrVal = new StringBuffer ();
0118:    Vector attrs = new Vector ();
0119:    StringBuffer entity = new StringBuffer ();
0120:
0121:    // FIX: should entities in attr names or values be expanded?
0122:    private void tokenize (Page page) throws IOException {
0123:        int state = START;
0124:
0125:        String content = page.getContent ();
0126:        int buflen = content.length ();
0127:        int bufptr = 0;
0128:        int bufbase = 0;
0129:
0130:        // token list
0131:        Vector tokens = new Vector();
0132:
0133:        int wordStart = 0;
0134:        int nWords = 0;
0135:
0136:        Tag tag = null;
0137:        int tagStart = 0;
0138:
0139:        int entnum = 0;
0140:        
0141:        StringBuffer entityTargetBuf = null;
0142:        int postEntityState = 0;
0143:
0144:        boolean isHTML = "text/html".equals (page.getContentType ());
0145:
0146:        while (bufptr < buflen) {
0147:            if (!isHTML && bufptr >= VALID_HTML_PREFIX)
0148:                // we didn't see any HTML tags in the first
0149:                // VALID_HTML_PREFIX bytes,
0150:                // so assume the document isn't HTML and stop parsing it.
0151:                return;
0152:
0153:            char c = content.charAt (bufptr);
0154:
0155:            //System.err.println ("%% state == " + state + ", ptr == " + (bufbase+bufptr) + ", c == " + c);
0156:
0157:            switch (state) {
0158:                case START:
0159:                    // after whitespace or tag
0160:                    switch (c) {
0161:                        case '<':
0162:                            ++bufptr;
0163:                            state = LT;
0164:                            break;
0165:                        case ' ':
0166:                        case '\t':
0167:                        case '\n':
0168:                        case '\r':
0169:                            ++bufptr;
0170:                            break;
0171:                        default:
0172:                            wordBuf.setLength (0);
0173:                            wordStart = bufbase+bufptr;
0174:                            state = INWORD;
0175:                            break;
0176:                    }
0177:                    break;
0178:
0179:                case INWORD:
0180:                    // Character data
0181:                    switch (c) {
0182:                        case '<':
0183:                            tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ()));
0184:                            ++nWords;
0185:                            state = START;
0186:                            break;
0187:                        case ' ':
0188:                        case '\t':
0189:                        case '\n':
0190:                        case '\r':
0191:                            tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ()));
0192:                            ++nWords;
0193:                            state = START;
0194:                            ++bufptr;
0195:                            break;
0196:                        case '&':
0197:                            ++bufptr;
0198:                            postEntityState = INWORD;
0199:                            entityTargetBuf = wordBuf;
0200:                            state = ENTITY;
0201:                            break;
0202:                        default:
0203:                            wordBuf.append ((char)c);
0204:                            ++bufptr;
0205:                            // state == INWORD;
0206:                            break;
0207:                    }
0208:                    break;
0209:
0210:                //  Entities
0211:                case ENTITY:
0212:                    if (c == '#') {
0213:                        ++bufptr;
0214:                        entnum = 0;
0215:                        state = ENTNUM;
0216:                    }
0217:                    else if ((c >= 'A' && c <= 'Z')
0218:                             || (c >= 'a' && c <= 'z')) {
0219:                        entity.setLength (0);
0220:                        state = ENTREF;
0221:                    }
0222:                    else {
0223:                        entityTargetBuf.append ('&');
0224:                        state = postEntityState;
0225:                    }
0226:                    break;
0227:
0228:                case ENTREF:
0229:                    if (!Character.isLetterOrDigit(c)) {
0230:                        Character ent = lookupEntityRef (entity.toString ());
0231:                        if (ent != null) {
0232:                            entityTargetBuf.append (ent.charValue());
0233:                            if (c == ';')
0234:                                ++bufptr;
0235:                        }
0236:                        else {
0237:                            // unrecognized entity -- leave
0238:                            // as-is
0239:                            entityTargetBuf.append ('&');
0240:                            entityTargetBuf.append (entity.toString ());
0241:                        }                            
0242:                        state = postEntityState;
0243:                    }
0244:                    else {
0245:                        ++bufptr;
0246:                        entity.append (c);
0247:                        // state == ENTREF;
0248:                    }
0249:                    break;
0250:
0251:                case ENTNUM:
0252:                    if (c==';' || !Character.isDigit(c)) {
0253:                        entityTargetBuf.append ((char) entnum);
0254:                        if (c == ';')
0255:                            ++bufptr;
0256:                        state = postEntityState;
0257:                    }
0258:                    else {
0259:                        entnum = 10*entnum + (c - '0');
0260:                        ++bufptr;
0261:                    }
0262:                    break;
0263:
0264:                case LT:
0265:                    tagStart = bufbase+bufptr-1;
0266:                    switch (c) {
0267:                        case '/':
0268:                            ++bufptr;
0269:                            tagName.setLength (0);
0270:                            state = ETAG;
0271:                            break;
0272:                        case '!':
0273:                            ++bufptr;
0274:                            state = BANG;
0275:                            break;
0276:                        default:
0277:                            if (Character.isLetter (c)) {
0278:                                tagName.setLength (0);
0279:                                state = STAG;
0280:                            }
0281:                            else {
0282:                                wordBuf.append ('<');
0283:                                state = INWORD;
0284:                            }
0285:                            break;
0286:                    }
0287:                    break;
0288:
0289:                // Comments and directives.
0290:                // Implements the (broken, but easy) Netscape rule:
0291:                // <!-- starts a comment, --> closes.
0292:                // All other directives <!foo> are also returned as comments.
0293:                case BANG:
0294:                    if (c == '-') {
0295:                        ++bufptr;
0296:                        state = BANG_DASH;
0297:                    }
0298:                    else {
0299:                        state = DIRECTIVE;
0300:                    }
0301:                    break;
0302:
0303:                case BANG_DASH:
0304:                    if (c == '-') {
0305:                        ++bufptr;
0306:                        state = CMT;
0307:                    }
0308:                    else {
0309:                        state = DIRECTIVE;
0310:                    }
0311:                    break;
0312:
0313:                case CMT:
0314:                    if (c == '-') {
0315:                        ++bufptr;
0316:                        state = CMT_DASH;
0317:                    }
0318:                    else {
0319:                        ++bufptr;
0320:                    }
0321:                    break;
0322:
0323:                case CMT_DASH:
0324:                    if (c == '-') {
0325:                        ++bufptr;
0326:                        state = CMT_DASHDASH;
0327:                    }
0328:                    else {
0329:                        ++bufptr;
0330:                        state = CMT;
0331:                    }
0332:                    break;
0333:
0334:                case CMT_DASHDASH:
0335:                    if (c == '>') {
0336:                        ++bufptr;
0337:                        tag = new Tag (page, tagStart, bufbase+bufptr, Tag.COMMENT, true);
0338:                        tokens.addElement (tag);
0339:                        state = START;
0340:                    }
0341:                    else if (c == '-') {
0342:                        ++bufptr;
0343:                        state = CMT_DASHDASH;
0344:                    }
0345:                    else {
0346:                        ++bufptr;
0347:                        state = CMT;
0348:                    }
0349:                    break;
0350:
0351:                case DIRECTIVE:
0352:                    if (c == '>') {
0353:                        ++bufptr;
0354:                        tag = new Tag (page, tagStart, bufbase+bufptr, Tag.COMMENT, true);
0355:                        tokens.addElement (tag);
0356:                        state = START;
0357:                    }
0358:                    else {
0359:                        ++bufptr;
0360:                    }
0361:                    break;
0362:
0363:                // Tags
0364:                case STAG:
0365:                    if (c == '>' || isWhitespace(c)) {
0366:                        tag = new Tag (page, tagStart, bufbase+bufptr, // tag doesn't really end here
0367:                                                                       // -- we'll fix it up when we actually see it
0368:                                       tagName.toString (), true);
0369:                        tokens.addElement (tag);
0370:                        attrs.setSize (0);
0371:                        state = ATTR;
0372:                        isHTML = true;
0373:                    }
0374:                    else {
0375:                        tagName.append (c);
0376:                        ++bufptr;
0377:                        // state == STAG;
0378:                    }
0379:                    break;
0380:
0381:                case ETAG:
0382:                    if (c == '>') {
0383:                        ++bufptr;
0384:                        tag = new Tag (page, tagStart, bufbase+bufptr, tagName.toString (), false);
0385:                        tokens.addElement (tag);
0386:                        state = START;
0387:                    }
0388:                    else {
0389:                        tagName.append (c);
0390:                        ++bufptr;
0391:                        // state == ETAG
0392:                    }
0393:                    break;
0394:
0395:                // Attributes
0396:                case ATTR:
0397:                    if (isWhitespace(c))
0398:                        ++bufptr;
0399:                    else if (c == '>') {
0400:                        ++bufptr;
0401:                        tag.end = bufbase+bufptr;
0402:                        if (attrs.size() > 0) {
0403:                            tag.htmlAttributes = new String[attrs.size()];
0404:                            attrs.copyInto (tag.htmlAttributes);
0405:                        }
0406:                        state = START;
0407:                    }
0408:                    else {
0409:                        attrName.setLength (0);
0410:                        state = ATTRNAME;
0411:                    }
0412:                    break;
0413:
0414:                case ATTRNAME:
0415:                    if (c == '>' || c == '=' || isWhitespace(c)) {
0416:                        state = EQ;
0417:                    }
0418:                    else {
0419:                        attrName.append (c);
0420:                        ++bufptr;
0421:                        // state == ATTRNAME;
0422:                    }
0423:                    break;
0424:
0425:                case EQ:
0426:                    if (isWhitespace(c))
0427:                        ++bufptr;
0428:                    else if (c == '=') {
0429:                        ++bufptr;
0430:                        state = AFTEREQ;
0431:                    }
0432:                    else {
0433:                        String name = Tag.toHTMLAttributeName (attrName.toString());
0434:                        tag.setLabel (name);
0435:                        attrs.addElement (name);
0436:                        state = ATTR;
0437:                    }
0438:                    break;
0439:
0440:                case AFTEREQ:
0441:                    if (isWhitespace (c))
0442:                        ++bufptr;
0443:                    else
0444:                        switch (c) {
0445:                            case '>':
0446:                            {
0447:                                String name = Tag.toHTMLAttributeName (attrName.toString());
0448:                                tag.setLabel (name);
0449:                                attrs.addElement (name);
0450:                                state = ATTR;
0451:                                break;
0452:                            }
0453:                            case '\'':
0454:                                ++bufptr;
0455:                                attrVal.setLength (0);
0456:                                state = ATTRVAL_SQ;
0457:                                break;
0458:                            case '"':
0459:                                ++bufptr;
0460:                                attrVal.setLength (0);
0461:                                state = ATTRVAL_DQ;
0462:                                break;
0463:                            default:
0464:                                attrVal.setLength (0);
0465:                                state = ATTRVAL;
0466:                                break;
0467:                        }
0468:                    break;
0469:
0470:                case ATTRVAL:
0471:                    if (c == '>' || isWhitespace(c)) {
0472:                        String name = Tag.toHTMLAttributeName (attrName.toString());
0473:                        tag.setLabel (name, attrVal.toString());
0474:                        attrs.addElement (name);
0475:                        state = ATTR;
0476:                    }
0477:                    else if (c == '&') {
0478:                        ++bufptr;
0479:                        postEntityState = ATTRVAL;
0480:                        entityTargetBuf = attrVal;
0481:                        state = ENTITY;
0482:                    }
0483:                    else {
0484:                        ++bufptr;
0485:                        attrVal.append (c);
0486:                        // state == ATTRVAL;
0487:                    }
0488:                    break;
0489:
0490:                case ATTRVAL_SQ:
0491:                    if (c=='\'') {
0492:                        ++bufptr;
0493:                        String name = Tag.toHTMLAttributeName (attrName.toString());
0494:                        tag.setLabel (name, attrVal.toString());
0495:                        attrs.addElement (name);
0496:                        state = ATTR;
0497:                    }
0498:                    else if (c == '&') {
0499:                        ++bufptr;
0500:                        postEntityState = ATTRVAL_SQ;
0501:                        entityTargetBuf = attrVal;
0502:                        state = ENTITY;
0503:                    }
0504:                    else {
0505:                        ++bufptr;
0506:                        attrVal.append (c);
0507:                        // state == ATTRVAL_SQ;
0508:                    }
0509:                    break;
0510:
0511:                case ATTRVAL_DQ:
0512:                    if (c=='"') {
0513:                        ++bufptr;
0514:                        String name = Tag.toHTMLAttributeName (attrName.toString());
0515:                        tag.setLabel (name, attrVal.toString());
0516:                        attrs.addElement (name);
0517:                        state = ATTR;
0518:                    }
0519:                    else if (c == '&') {
0520:                        ++bufptr;
0521:                        postEntityState = ATTRVAL_DQ;
0522:                        entityTargetBuf = attrVal;
0523:                        state = ENTITY;
0524:                    }
0525:                    else {
0526:                        ++bufptr;
0527:                        attrVal.append (c);
0528:                        // state == ATTRVAL_DQ;
0529:                    }
0530:                    break;
0531:
0532:                default:
0533:                    throw new RuntimeException ("HtmlTokenizer entered illegal state " + state);
0534:            }
0535:        }
0536:
0537:        // EOF
0538:        switch (state) {
0539:            case INWORD:
0540:                // EOF terminated some text -- save the text
0541:                tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ()));
0542:                ++nWords;
0543:                break;
0544:
0545:            default:
0546:                // EOF in the middle of tags is illegal
0547:                // don't try to recover
0548:                break;
0549:        }
0550:
0551:        int nTotal = tokens.size ();
0552:
0553:        page.tokens = new Region[nTotal];
0554:        tokens.copyInto (page.tokens);
0555:
0556:        page.words = new Text[nWords];
0557:        int textnum = 0;
0558:        page.tags = new Tag[nTotal - nWords];
0559:        int tagnum = 0;
0560:
0561:        for (int i=0; i < nTotal; ++i) {
0562:            if (page.tokens[i] instanceof  Tag)
0563:                page.tags[tagnum++] = (Tag)page.tokens[i];
0564:            else
0565:                page.words[textnum++] = (Text)page.tokens[i];
0566:        }
0567:    }
0568:
0569:    private static boolean isWhitespace (char c) {
0570://#ifdef JDK1.1 
0571:        return Character.isWhitespace (c);
0572://#endif JDK1.1
0573:/*#ifdef JDK1.0
0574:        return Character.isSpace (c);
0575:#endif JDK1.0*/
0576:    }
0577:        
0578:
0579:    private static Hashtable entities = new Hashtable2()
0580:          .add ("quot", new Character ((char)34))
0581:          .add ("amp", new Character ((char)38))
0582:          .add ("lt", new Character ((char)60))
0583:          .add ("gt", new Character ((char)62))
0584:          .add ("nbsp", new Character ((char)160))
0585:          .add ("iexcl", new Character ((char)161))
0586:          .add ("cent", new Character ((char)162))
0587:          .add ("pound", new Character ((char)163))
0588:          .add ("curren", new Character ((char)164))
0589:          .add ("yen", new Character ((char)165))
0590:          .add ("brvbar", new Character ((char)167))
0591:          .add ("sect", new Character ((char)167))
0592:          .add ("uml", new Character ((char)168))
0593:          .add ("copy", new Character ((char)169))
0594:          .add ("ordf", new Character ((char)170))
0595:          .add ("laquo", new Character ((char)171))
0596:          .add ("not", new Character ((char)172))
0597:          .add ("shy", new Character ((char)173))
0598:          .add ("reg", new Character ((char)174))
0599:          .add ("macr", new Character ((char)175))
0600:          .add ("deg", new Character ((char)176))
0601:          .add ("plusmn", new Character ((char)177))
0602:          .add ("sup2", new Character ((char)178))
0603:          .add ("sup3", new Character ((char)179))
0604:          .add ("acute", new Character ((char)180))
0605:          .add ("micro", new Character ((char)181))
0606:          .add ("para", new Character ((char)182))
0607:          .add ("middot", new Character ((char)183))
0608:          .add ("cedil", new Character ((char)184))
0609:          .add ("sup1", new Character ((char)185))
0610:          .add ("ordm", new Character ((char)186))
0611:          .add ("raquo", new Character ((char)187))
0612:          .add ("frac14", new Character ((char)188))
0613:          .add ("frac12", new Character ((char)189))
0614:          .add ("frac34", new Character ((char)190))
0615:          .add ("iquest", new Character ((char)191))
0616:          .add ("Agrave", new Character ((char)192))
0617:          .add ("Aacute", new Character ((char)193))
0618:          .add ("Acirc", new Character ((char)194))
0619:          .add ("Atilde", new Character ((char)195))
0620:          .add ("Auml", new Character ((char)196))
0621:          .add ("Aring", new Character ((char)197))
0622:          .add ("AElig", new Character ((char)198))
0623:          .add ("Ccedil", new Character ((char)199))
0624:          .add ("Egrave", new Character ((char)200))
0625:          .add ("Eacute", new Character ((char)201))
0626:          .add ("Ecirc", new Character ((char)202))
0627:          .add ("Euml", new Character ((char)203))
0628:          .add ("Igrave", new Character ((char)204))
0629:          .add ("Iacute", new Character ((char)205))
0630:          .add ("Icirc", new Character ((char)206))
0631:          .add ("Iuml", new Character ((char)207))
0632:          .add ("ETH", new Character ((char)208))
0633:          .add ("Ntilde", new Character ((char)209))
0634:          .add ("Ograve", new Character ((char)210))
0635:          .add ("Oacute", new Character ((char)211))
0636:          .add ("Ocirc", new Character ((char)212))
0637:          .add ("Otilde", new Character ((char)213))
0638:          .add ("Ouml", new Character ((char)214))
0639:          .add ("times", new Character ((char)215))
0640:          .add ("Oslash", new Character ((char)216))
0641:          .add ("Ugrave", new Character ((char)217))
0642:          .add ("Uacute", new Character ((char)218))
0643:          .add ("Ucirc", new Character ((char)219))
0644:          .add ("Uuml", new Character ((char)220))
0645:          .add ("Yacute", new Character ((char)221))
0646:          .add ("THORN", new Character ((char)222))
0647:          .add ("szlig", new Character ((char)223))
0648:          .add ("agrave", new Character ((char)224))
0649:          .add ("aacute", new Character ((char)225))
0650:          .add ("acirc", new Character ((char)226))
0651:          .add ("atilde", new Character ((char)227))
0652:          .add ("auml", new Character ((char)228))
0653:          .add ("aring", new Character ((char)229))
0654:          .add ("aelig", new Character ((char)230))
0655:          .add ("ccedil", new Character ((char)231))
0656:          .add ("egrave", new Character ((char)232))
0657:          .add ("eacute", new Character ((char)233))
0658:          .add ("ecirc", new Character ((char)234))
0659:          .add ("euml", new Character ((char)235))
0660:          .add ("igrave", new Character ((char)236))
0661:          .add ("iacute", new Character ((char)237))
0662:          .add ("icirc", new Character ((char)238))
0663:          .add ("iuml", new Character ((char)239))
0664:          .add ("eth", new Character ((char)240))
0665:          .add ("ntilde", new Character ((char)241))
0666:          .add ("ograve", new Character ((char)242))
0667:          .add ("oacute", new Character ((char)243))
0668:          .add ("ocirc", new Character ((char)244))
0669:          .add ("otilde", new Character ((char)245))
0670:          .add ("ouml", new Character ((char)246))
0671:          .add ("divide", new Character ((char)247))
0672:          .add ("oslash", new Character ((char)248))
0673:          .add ("ugrave", new Character ((char)249))
0674:          .add ("uacute", new Character ((char)250))
0675:          .add ("ucirc", new Character ((char)251))
0676:          .add ("uuml", new Character ((char)252))
0677:          .add ("yacute", new Character ((char)253))
0678:          .add ("thorn", new Character ((char)254))
0679:          .add ("yuml", new Character ((char)255))
0680:          ;
0681:
0682:    private static Character lookupEntityRef (String name) {
0683:        return (Character) entities.get (name);
0684:    }
0685:
0686:    /*
0687:     *  Parser (constructs a canonical tree of elements)
0688:     *
0689:     */
0690:
0691:    Vector vElements = new Vector ();
0692:    Vector vLinks = new Vector ();
0693:
0694:    StringBuffer text = new StringBuffer ();
0695:
0696:    // elements with no content: e.g., IMG, BR, HR.  End tags for these elements
0697:    // are simply ignored.
0698:    private static Hashtable empty = new Hashtable2 ()
0699:          .add (Tag.AREA)
0700:          .add (Tag.BASE)
0701:          .add (Tag.BASEFONT)
0702:          .add (Tag.BGSOUND)
0703:          .add (Tag.BR)
0704:          .add (Tag.COL)
0705:          .add (Tag.COLGROUP)
0706:          .add (Tag.COMMENT) // actually <!-- ... -->
0707:          .add (Tag.HR)
0708:          .add (Tag.IMG)
0709:          .add (Tag.INPUT)
0710:          .add (Tag.ISINDEX)
0711:          .add (Tag.LINK)
0712:          .add (Tag.META)
0713:          .add (Tag.NEXTID)
0714:          .add (Tag.PARAM)
0715:          .add (Tag.SPACER)
0716:          .add (Tag.WBR)
0717:          ;
0718:
0719:    // elements that close <P> (correspond to "%block" entity in HTML 3.2 DTD)
0720:    static Hashtable blocktag = new Hashtable2()
0721:          .add (Tag.P)
0722:          .add (Tag.UL)
0723:          .add (Tag.OL)
0724:          .add (Tag.DIR)
0725:          .add (Tag.MENU)
0726:          .add (Tag.PRE)
0727:          .add (Tag.XMP)
0728:          .add (Tag.LISTING)
0729:          .add (Tag.DL)
0730:          .add (Tag.DIV)
0731:          .add (Tag.CENTER)
0732:          .add (Tag.BLOCKQUOTE)
0733:          .add (Tag.FORM)
0734:          .add (Tag.ISINDEX)
0735:          .add (Tag.HR)
0736:          .add (Tag.TABLE)
0737:          .add (Tag.H1)
0738:          .add (Tag.H2)
0739:          .add (Tag.H3)
0740:          .add (Tag.H4)
0741:          .add (Tag.H5)
0742:          .add (Tag.H6)
0743:          .add (Tag.ADDRESS)
0744:          ;
0745:
0746:    // maps elements which force closure to the elements that they close, e.g.,
0747:    // LI maps to LI, DT maps to DD,DT, and all block-level tags map to P.
0748:    private static Hashtable forcesClosed = new Hashtable2 ()
0749:          .add (Tag.DD, new Hashtable2 () .add (Tag.DD) .add (Tag.DT))
0750:          .add (Tag.DT, new Hashtable2 () .add (Tag.DD) .add (Tag.DT))
0751:          .add (Tag.LI, new Hashtable2 () .add (Tag.LI))
0752:          .add (Tag.OPTION, new Hashtable2 () .add (Tag.OPTION))
0753:          .add (Tag.TR, new Hashtable2 () .add (Tag.TR))
0754:          .add (Tag.TD, new Hashtable2 () .add (Tag.TD) .add (Tag.TH))
0755:          .add (Tag.TH, new Hashtable2 () .add (Tag.TD) .add (Tag.TH))
0756:          ;
0757:    static {
0758:        Hashtable p = new Hashtable2 () .add (Tag.P);
0759:        Enumeration enum = blocktag.keys ();
0760:        while (enum.hasMoreElements ())
0761:            union (forcesClosed, enum.nextElement(), p);
0762:    }
0763:
0764:    // union of forcesClosed plus the tag's possible containers.  For instance,
0765:    // LI maps to LI, OL, UL, MENU, DIR.  When a forcesClosed tag like LI is
0766:    // encountered, the parser looks upward for the first context tag.
0767:    // Having the tag's container element included in the search ensures that
0768:    // LI in a nested list won't close its parent LI.
0769:    static Hashtable context = new Hashtable2 ()
0770:          .add (Tag.DD, new Hashtable2 () .add (Tag.DL))
0771:          .add (Tag.DT, new Hashtable2 () .add (Tag.DL))
0772:          .add (Tag.LI, new Hashtable2 () .add (Tag.OL) .add (Tag.UL) .add (Tag.MENU) .add (Tag.DIR))
0773:          .add (Tag.OPTION, new Hashtable2 () .add (Tag.SELECT))
0774:          .add (Tag.TR, new Hashtable2 () .add (Tag.TABLE))
0775:          .add (Tag.TD, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE))
0776:          .add (Tag.TH, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE))
0777:          ;
0778:    static {
0779:        Enumeration enum = forcesClosed.keys ();
0780:        while (enum.hasMoreElements ()) {
0781:            Object tagname = enum.nextElement();
0782:            union (context, tagname, (Hashtable)forcesClosed.get (tagname));
0783:        }
0784:    }
0785:
0786:    // NIY: handle literal and semi-literal elements (XMP, LISTING, TEXTAREA, OPTION)
0787:    // elements whose content should be treated as plain text
0788:    static Hashtable literal = new Hashtable2()
0789:            ;
0790:
0791:    // maps link elements to their URL attribute (e.g., A maps to HREF)
0792:    static Hashtable linktag = new Hashtable2 ()
0793:            .add (Tag.A, "href")
0794:            .add (Tag.AREA, "href")
0795:            .add (Tag.APPLET, "code")
0796:            .add (Tag.EMBED, "src")
0797:            .add (Tag.FRAME, "src")
0798:            .add (Tag.FORM, "action")
0799:            .add (Tag.IMG, "src")
0800:            .add (Tag.LINK, "href")
0801:            .add (Tag.SCRIPT, "src")
0802:            ;
0803:
0804:    // elements whose text contents are crucial to the crawler
0805:    static Hashtable savetext = new Hashtable2 ()
0806:            .add (Tag.A)
0807:            .add (Tag.TITLE);
0808:
0809:    // elements found in <HEAD>
0810:    static Hashtable headtag = new Hashtable2()
0811:          .add (Tag.META)
0812:          .add (Tag.TITLE)
0813:          .add (Tag.BASE)
0814:          .add (Tag.LINK)
0815:          .add (Tag.ISINDEX)
0816:          ;
0817:
0818:    private static void union (Hashtable map, Object tagname, Hashtable tagset) {
0819:        Hashtable2 currset = (Hashtable2)map.get (tagname);
0820:        if (currset == null)
0821:            map.put (tagname, tagset);
0822:        else
0823:            map.put (tagname, currset.union (tagset));
0824:    }
0825:
0826:    private void buildParseTree (Page page) {
0827:        boolean keepText = false;
0828:
0829:        elems.setSize (0);
0830:        openPtr = 0;
0831:
0832:        Region[] tokens = page.tokens;
0833:        for (int t=0; t<tokens.length; ++t) {
0834:            Region r = tokens[t];
0835:
0836:            if (r instanceof  Tag) {
0837:                Tag tag = (Tag)r;
0838:                String tagName = tag.getTagName();
0839:
0840:                if (tag.isStartTag()) {
0841:                    // start tag <X>
0842:
0843:                    // check if <X> forces closure of an open element
0844:                    if (forcesClosed.containsKey (tagName)) {
0845:                        Element e = findOpenElement ((Hashtable)context.get (tagName));
0846:                        if (e != null && ((Hashtable)forcesClosed.get (tagName)).containsKey (e.getTagName()))
0847:                            close (e, tag.start);
0848:                    }
0849:
0850:                    // create the element and push it on the elems stack
0851:                    Element e = makeElement (page.base, tag);
0852:                    open (e);
0853:
0854:                    if (empty.containsKey (tagName)) {
0855:                        // element has no content
0856:                        // close it off right now
0857:                        close (e, tag.end);
0858:                    }
0859:                    else if (savetext.containsKey (tagName)) {
0860:                        text.setLength (0);
0861:                        keepText = true;
0862:                    }
0863:
0864:                    if (tagName == Tag.BASE) {
0865:                        String href = tag.getHTMLAttribute ("href");
0866:                        if (href != null) {
0867:                            try {                         
0868:                                page.base = new URL (page.base, new String (href.toCharArray())); // make copy to avoid reference to page content
0869:                            } catch (MalformedURLException ex) {} // bad URL
0870:                              catch (NullPointerException ex) {} // base == null
0871:                        }
0872:                    }
0873:                }
0874:                else {
0875:                    // end tag </X>
0876:
0877:                    // find matching start tag <X>
0878:                    Element e = findOpenElement (tagName);
0879:                    if (e != null) {
0880:                        close (e, tag);
0881:
0882:                        if (savetext.containsKey (tagName)) {
0883:                            if (tagName == Tag.TITLE)
0884:                                page.title = text.toString();
0885:                            else if (e instanceof  Link)
0886:                                ((Link)e).setText (text.toString());
0887:                            keepText = false;
0888:                        }
0889:                    }
0890:                }
0891:
0892:            }
0893:            else { // r is a text token
0894:                if (keepText) {
0895:                    if (text.length() > 0)
0896:                        text.append (' ');
0897:                    text.append (r.toText());
0898:                }
0899:            }
0900:        }
0901:
0902:        // close any remaining open elements
0903:        closeAll (page.end);
0904:
0905:        // link together the top-level elements
0906:        if (!elems.empty()) {
0907:            int nElems = elems.size ();
0908:            Element c = (Element)elems.elementAt (0);
0909:            page.root = c;
0910:            for (int j=1; j<nElems; ++j) {
0911:                Element d = (Element)elems.elementAt (j);
0912:                c.sibling = d;
0913:                c = d;
0914:            }
0915:        }
0916:
0917:        page.elements = new Element[vElements.size()];
0918:        vElements.copyInto (page.elements);
0919:
0920:        page.links = new Link[vLinks.size()];
0921:        vLinks.copyInto (page.links);
0922:    }
0923:
0924:    private Element makeElement (URL base, Tag tag) {
0925:        Element e = null;
0926:        String tagName = tag.getTagName ();
0927:        String hrefAttr = (String)linktag.get (tagName);
0928:        String type;
0929:
0930:        try {
0931:            if (tagName == Tag.FORM) {
0932:                e = new Form (tag, null, base);
0933:                vLinks.addElement (e);
0934:            }
0935:            else if (tagName == Tag.INPUT 
0936:                     && (type = tag.getHTMLAttribute ("type")) != null
0937:                     && (type.equalsIgnoreCase ("submit") || type.equalsIgnoreCase ("image"))) {
0938:                e = new FormButton (tag, null, currentForm);
0939:                vLinks.addElement (e);
0940:            }
0941:            else if (hrefAttr != null && tag.hasHTMLAttribute (hrefAttr)) {
0942:                e = new Link (tag, null, base);
0943:                vLinks.addElement (e);
0944:            }
0945:        } catch (MalformedURLException f) {} // bad URL
0946:          catch (NullPointerException ex) {} // base == null
0947:
0948:        if (e == null)
0949:            // just make an ordinary element
0950:            e = new Element (tag, null);
0951:            
0952:        vElements.addElement (e);
0953:        tag.element = e;
0954:        return e;
0955:    }
0956:
0957:    // Stack management
0958:
0959:    Stack elems = new Stack();
0960:        // stack of Elements appearing before than the current element in
0961:        // a preorder traversal, except that completely-visited subtrees
0962:        // are represented by their root.
0963:    int[] openElems = new int[20];
0964:    int openPtr = 0;
0965:        // stack of indices of open elements in elems
0966:
0967:    Form currentForm;
0968:
0969:    private void open (Element e) {
0970:        if (openPtr > 0)
0971:            e.parent = (Element)elems.elementAt (openElems[openPtr-1]);
0972:        else
0973:            e.parent = null;
0974:
0975:        elems.push (e);
0976:        if (e instanceof  Form)
0977:            currentForm = (Form)e;
0978:
0979:        if (openPtr == openElems.length) {
0980:            int[] newarr = new int[openElems.length + 10];
0981:            System.arraycopy (openElems, 0, newarr, 0, openElems.length);
0982:            openElems = newarr;
0983:        }
0984:        openElems[openPtr] = elems.size()-1;
0985:        ++openPtr;
0986:    }
0987:
0988:    private Element findOpenElement (String tagname) {
0989:        for (int i=openPtr-1; i >= 0; --i) {
0990:            Element e = (Element)elems.elementAt (openElems[i]);
0991:            if (tagname == e.getTagName ())
0992:                return e;
0993:        }
0994:        return null;
0995:    }
0996:
0997:    private Element findOpenElement (Hashtable tags) {
0998:        for (int i=openPtr-1; i >= 0; --i) {
0999:            Element e = (Element)elems.elementAt (openElems[i]);
1000:            if (tags.containsKey (e.getTagName ()))
1001:                return e;
1002:        }
1003:        return null;
1004:    }
1005:
1006:    // NIY: stack up unclosed flow tags (like <B> and <A>) and reopen them
1007:    // when the next element is opened
1008:    private void close (Element elem, Tag tag) {
1009:        elem.endTag = tag;
1010:        tag.element = elem;
1011:        close (elem, tag.start);
1012:        elem.end = tag.end;
1013:    }
1014:
1015:    private void close (Element elem, int end) {
1016:        int v;
1017:        Element e;
1018:        do {
1019:            v = openElems[--openPtr];
1020:            e = (Element)elems.elementAt (v);
1021:
1022:            e.end = end;
1023:            if (e instanceof  Form)
1024:                currentForm = null;
1025:
1026:            int firstChild = v+1;
1027:            int nElems = elems.size();
1028:            if (firstChild < nElems) {
1029:                Element c = (Element)elems.elementAt (firstChild);
1030:                e.child = c;
1031:                for (int j=firstChild+1; j<nElems; ++j) {
1032:                    Element d = (Element)elems.elementAt (j);
1033:                    c.sibling = d;
1034:                    c = d;
1035:                }
1036:                elems.setSize (firstChild);
1037:            }
1038:            
1039:        } while (e != elem);
1040:    }
1041:
1042:    private void closeAll (int end) {
1043:        if (openPtr > 0)
1044:            close ((Element)elems.elementAt (openElems[0]), end);
1045:    }
1046:
1047:    /*
1048:     * Testing interface
1049:     *
1050:     */
1051:
1052:    public static void main (String[] args) throws Exception {
1053:        if (args.length < 1 || args.length > 2) {
1054:            System.err.println ("usage: HTMLParser <URL>");
1055:            System.exit(-1);
1056:        }
1057:
1058:        Page page;
1059:        if (args.length == 1)
1060:            page = new Page (new Link(args[0]), new DownloadParameters (), new HTMLParser ());
1061:        else
1062:            page = new Page (new URL(args[0]), args[1], new HTMLParser ());
1063:
1064:        /*
1065:        long tm = System.currentTimeMillis();     //??dk
1066:        HTMLParser tokenizer = new HTMLParser ();
1067:
1068:        tm = System.currentTimeMillis() - tm;       //??dk
1069:            System.err.println("[Parsed " + args[0] + " in " + tm + "ms]");
1070:        */
1071: 
1072:        System.out.println ("Tokens: ------------------------------------------");
1073:        Region[] tokens = page.tokens;
1074:        for (int i=0; i<tokens.length; ++i) {
1075:            System.out.println ("[" + tokens[i].getStart() + "," + tokens[i].getEnd() + "]" + tokens[i]);
1076:        }
1077:
1078:       System.out.println ("Tags: ------------------------------------------");
1079:        Tag[] tags = page.tags;
1080:        for (int i=0; i<tags.length; ++i) {
1081:            Tag t = tags[i];
1082:            System.out.print ((t.isStartTag() ? "start tag" : "end tag") + " " + t.getTagName ());
1083:
1084:            Enumeration attrs = t.enumerateHTMLAttributes();
1085:            String name, val;
1086:            while (attrs.hasMoreElements()) {
1087:                name = (String)attrs.nextElement();
1088:                val = t.getHTMLAttribute (name);
1089:                System.out.print (" " + name + "=\"" + val + "\"");
1090:            }
1091:            System.out.println ();
1092:            System.out.println ("    " + t);
1093:        }
1094:
1095:        System.out.println ("Words: ------------------------------------------");
1096:        Text[] words = page.words;
1097:        for (int i=0; i<words.length; ++i) {
1098:            System.out.println (words[i]);
1099:        }
1100:
1101:        System.out.println ("Elements: ------------------------------------------");
1102:        printout (page.root, 0);
1103:
1104:        System.out.println ("Links: ------------------------------------------");
1105:        printout (page.getLinks (), 0);
1106:
1107:    }
1108:
1109:    private static String indentation (int indent) {
1110:        StringBuffer s = new StringBuffer();
1111:        for (int i=0; i<indent; ++i)
1112:            s.append ("    ");
1113:        return s.toString();
1114:    }
1115:
1116:    private static void printout (Element element, int indent) {
1117:      for (Element e = element; e != null; e = e.getSibling ()) {
1118:          Element c = e.getChild();
1119:
1120:          System.out.println (indentation(indent) + e.getStartTag() + "[" + e.getStart() + "," + e.getEnd() + "]");
1121:          if (c != null)
1122:              printout (c, indent+1);
1123:          if (e.getEndTag() != null)
1124:              System.out.println (indentation(indent) + e.getEndTag());
1125:      }
1126:    }
1127:    private static void printout (Link[] elements, int indent) {
1128:        for (int i=0; i<elements.length; ++i) {
1129:            Link e = elements[i];
1130:            System.out.println (indentation(indent) + e.toDescription());
1131:        }
1132:    }
1133:}
1134:
1135:class Hashtable2 extends Hashtable {
1136:    public Hashtable2 () {
1137:    }
1138:
1139:    public Hashtable2 add (Object key) {
1140:        put (key, key);
1141:        return this ;
1142:    }
1143:
1144:    public Hashtable2 add (Object key, Object val) {
1145:        put (key, val);
1146:        return this ;
1147:    }
1148:
1149:    public Hashtable2 union (Hashtable map) {
1150:        Enumeration enum = map.keys ();
1151:        while (enum.hasMoreElements ()) {
1152:            Object key = enum.nextElement ();
1153:            put (key, map.get (key));
1154:        }
1155:
1156:        return this;
1157:    }
1158:}
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.