0001:/*
0002: * WebSphinx web-crawling toolkit
0003: *
0004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
0005: * reserved.
0006: *
0007: * Redistribution and use in source and binary forms, with or without
0008: * modification, are permitted provided that the following conditions
0009: * are met:
0010: *
0011: * 1. Redistributions of source code must retain the above copyright
0012: * notice, this list of conditions and the following disclaimer.
0013: *
0014: * 2. Redistributions in binary form must reproduce the above copyright
0015: * notice, this list of conditions and the following disclaimer in
0016: * the documentation and/or other materials provided with the
0017: * distribution.
0018: *
0019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
0020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
0021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
0022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
0023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
0026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
0027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
0029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0030: *
0031: */
0032:
0033:package websphinx;
0034:
0035:import java.io.InputStream;
0036:import java.io.IOException;
0037://#ifdef JDK1.1
0038:import java.io.Reader;
0039:import java.io.InputStreamReader;
0040:import java.io.StringReader;
0041://#endif JDK1.1
0042:/*#ifdef JDK1.0
0043:import java.io.StringBufferInputStream;
0044:#endif JDK1.0*/
0045:import java.io.ByteArrayOutputStream;
0046:import java.util.Hashtable;
0047:import java.util.Enumeration;
0048:import java.util.Vector;
0049:import java.util.Stack;
0050:import java.net.URL;
0051:import java.net.MalformedURLException;
0052:
0053:/**
0054: * HTML parser. Parses an input stream or String and
0055: * converts it to a sequence of Tags and a tree of Elements.
0056: * HTMLParser is used by Page to parse pages.
0057: */
0058:// FIX: make HTMLParser into an interface, and
0059:// split this implementation into Tokenizer and TreeBuilder
0060:public class HTMLParser {
0061:
0062: // parameter for HTML type detection.
0063: // If the parser doesn't encounter at least one HTML tag
0064: // in the first VALID_HTML_PREFIX bytes of the stream, then parser
0065: // concludes that the stream isn't HTML and stops parsing it.
0066: static final int VALID_HTML_PREFIX = 10000;
0067:
0068: int maxBytes = Integer.MAX_VALUE;
0069:
0070: /**
0071: * Make an HTMLParser.
0072: */
0073: public HTMLParser () {
0074: }
0075:
0076: /**
0077: * Parse a page as HTML.
0078: * @param page Page to parse
0079: */
0080: public void parse (Page page) throws IOException {
0081: tokenize (page);
0082: buildParseTree (page);
0083: }
0084:
0085:
0086: /*
0087: * HTML tokenizer state machine
0088: */
0089:
0090: // state takes on one of the following values:
0091: private static final int START = 0;
0092: private static final int INWORD = 1;
0093: private static final int ENTITY = 2;
0094: private static final int LT = 4;
0095: private static final int BANG = 5;
0096: private static final int BANG_DASH = 6;
0097: private static final int CMT = 7;
0098: private static final int CMT_DASH = 8;
0099: private static final int CMT_DASHDASH = 9;
0100: private static final int DIRECTIVE = 10;
0101: private static final int STAG = 11;
0102: private static final int ETAG = 12;
0103: private static final int ATTR = 13;
0104: private static final int ATTRNAME = 14;
0105: private static final int EQ = 15;
0106: private static final int AFTEREQ = 16;
0107: private static final int ATTRVAL = 17;
0108: private static final int ATTRVAL_SQ = 18;
0109: private static final int ATTRVAL_DQ = 19;
0110: private static final int DONE = 20;
0111: private static final int ENTNUM = 21;
0112: private static final int ENTREF = 22;
0113:
0114: StringBuffer wordBuf = new StringBuffer ();
0115: StringBuffer tagName = new StringBuffer ();
0116: StringBuffer attrName = new StringBuffer ();
0117: StringBuffer attrVal = new StringBuffer ();
0118: Vector attrs = new Vector ();
0119: StringBuffer entity = new StringBuffer ();
0120:
0121: // FIX: should entities in attr names or values be expanded?
0122: private void tokenize (Page page) throws IOException {
0123: int state = START;
0124:
0125: String content = page.getContent ();
0126: int buflen = content.length ();
0127: int bufptr = 0;
0128: int bufbase = 0;
0129:
0130: // token list
0131: Vector tokens = new Vector();
0132:
0133: int wordStart = 0;
0134: int nWords = 0;
0135:
0136: Tag tag = null;
0137: int tagStart = 0;
0138:
0139: int entnum = 0;
0140:
0141: StringBuffer entityTargetBuf = null;
0142: int postEntityState = 0;
0143:
0144: boolean isHTML = "text/html".equals (page.getContentType ());
0145:
0146: while (bufptr < buflen) {
0147: if (!isHTML && bufptr >= VALID_HTML_PREFIX)
0148: // we didn't see any HTML tags in the first
0149: // VALID_HTML_PREFIX bytes,
0150: // so assume the document isn't HTML and stop parsing it.
0151: return;
0152:
0153: char c = content.charAt (bufptr);
0154:
0155: //System.err.println ("%% state == " + state + ", ptr == " + (bufbase+bufptr) + ", c == " + c);
0156:
0157: switch (state) {
0158: case START:
0159: // after whitespace or tag
0160: switch (c) {
0161: case '<':
0162: ++bufptr;
0163: state = LT;
0164: break;
0165: case ' ':
0166: case '\t':
0167: case '\n':
0168: case '\r':
0169: ++bufptr;
0170: break;
0171: default:
0172: wordBuf.setLength (0);
0173: wordStart = bufbase+bufptr;
0174: state = INWORD;
0175: break;
0176: }
0177: break;
0178:
0179: case INWORD:
0180: // Character data
0181: switch (c) {
0182: case '<':
0183: tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ()));
0184: ++nWords;
0185: state = START;
0186: break;
0187: case ' ':
0188: case '\t':
0189: case '\n':
0190: case '\r':
0191: tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ()));
0192: ++nWords;
0193: state = START;
0194: ++bufptr;
0195: break;
0196: case '&':
0197: ++bufptr;
0198: postEntityState = INWORD;
0199: entityTargetBuf = wordBuf;
0200: state = ENTITY;
0201: break;
0202: default:
0203: wordBuf.append ((char)c);
0204: ++bufptr;
0205: // state == INWORD;
0206: break;
0207: }
0208: break;
0209:
0210: // Entities
0211: case ENTITY:
0212: if (c == '#') {
0213: ++bufptr;
0214: entnum = 0;
0215: state = ENTNUM;
0216: }
0217: else if ((c >= 'A' && c <= 'Z')
0218: || (c >= 'a' && c <= 'z')) {
0219: entity.setLength (0);
0220: state = ENTREF;
0221: }
0222: else {
0223: entityTargetBuf.append ('&');
0224: state = postEntityState;
0225: }
0226: break;
0227:
0228: case ENTREF:
0229: if (!Character.isLetterOrDigit(c)) {
0230: Character ent = lookupEntityRef (entity.toString ());
0231: if (ent != null) {
0232: entityTargetBuf.append (ent.charValue());
0233: if (c == ';')
0234: ++bufptr;
0235: }
0236: else {
0237: // unrecognized entity -- leave
0238: // as-is
0239: entityTargetBuf.append ('&');
0240: entityTargetBuf.append (entity.toString ());
0241: }
0242: state = postEntityState;
0243: }
0244: else {
0245: ++bufptr;
0246: entity.append (c);
0247: // state == ENTREF;
0248: }
0249: break;
0250:
0251: case ENTNUM:
0252: if (c==';' || !Character.isDigit(c)) {
0253: entityTargetBuf.append ((char) entnum);
0254: if (c == ';')
0255: ++bufptr;
0256: state = postEntityState;
0257: }
0258: else {
0259: entnum = 10*entnum + (c - '0');
0260: ++bufptr;
0261: }
0262: break;
0263:
0264: case LT:
0265: tagStart = bufbase+bufptr-1;
0266: switch (c) {
0267: case '/':
0268: ++bufptr;
0269: tagName.setLength (0);
0270: state = ETAG;
0271: break;
0272: case '!':
0273: ++bufptr;
0274: state = BANG;
0275: break;
0276: default:
0277: if (Character.isLetter (c)) {
0278: tagName.setLength (0);
0279: state = STAG;
0280: }
0281: else {
0282: wordBuf.append ('<');
0283: state = INWORD;
0284: }
0285: break;
0286: }
0287: break;
0288:
0289: // Comments and directives.
0290: // Implements the (broken, but easy) Netscape rule:
0291: // <!-- starts a comment, --> closes.
0292: // All other directives <!foo> are also returned as comments.
0293: case BANG:
0294: if (c == '-') {
0295: ++bufptr;
0296: state = BANG_DASH;
0297: }
0298: else {
0299: state = DIRECTIVE;
0300: }
0301: break;
0302:
0303: case BANG_DASH:
0304: if (c == '-') {
0305: ++bufptr;
0306: state = CMT;
0307: }
0308: else {
0309: state = DIRECTIVE;
0310: }
0311: break;
0312:
0313: case CMT:
0314: if (c == '-') {
0315: ++bufptr;
0316: state = CMT_DASH;
0317: }
0318: else {
0319: ++bufptr;
0320: }
0321: break;
0322:
0323: case CMT_DASH:
0324: if (c == '-') {
0325: ++bufptr;
0326: state = CMT_DASHDASH;
0327: }
0328: else {
0329: ++bufptr;
0330: state = CMT;
0331: }
0332: break;
0333:
0334: case CMT_DASHDASH:
0335: if (c == '>') {
0336: ++bufptr;
0337: tag = new Tag (page, tagStart, bufbase+bufptr, Tag.COMMENT, true);
0338: tokens.addElement (tag);
0339: state = START;
0340: }
0341: else if (c == '-') {
0342: ++bufptr;
0343: state = CMT_DASHDASH;
0344: }
0345: else {
0346: ++bufptr;
0347: state = CMT;
0348: }
0349: break;
0350:
0351: case DIRECTIVE:
0352: if (c == '>') {
0353: ++bufptr;
0354: tag = new Tag (page, tagStart, bufbase+bufptr, Tag.COMMENT, true);
0355: tokens.addElement (tag);
0356: state = START;
0357: }
0358: else {
0359: ++bufptr;
0360: }
0361: break;
0362:
0363: // Tags
0364: case STAG:
0365: if (c == '>' || isWhitespace(c)) {
0366: tag = new Tag (page, tagStart, bufbase+bufptr, // tag doesn't really end here
0367: // -- we'll fix it up when we actually see it
0368: tagName.toString (), true);
0369: tokens.addElement (tag);
0370: attrs.setSize (0);
0371: state = ATTR;
0372: isHTML = true;
0373: }
0374: else {
0375: tagName.append (c);
0376: ++bufptr;
0377: // state == STAG;
0378: }
0379: break;
0380:
0381: case ETAG:
0382: if (c == '>') {
0383: ++bufptr;
0384: tag = new Tag (page, tagStart, bufbase+bufptr, tagName.toString (), false);
0385: tokens.addElement (tag);
0386: state = START;
0387: }
0388: else {
0389: tagName.append (c);
0390: ++bufptr;
0391: // state == ETAG
0392: }
0393: break;
0394:
0395: // Attributes
0396: case ATTR:
0397: if (isWhitespace(c))
0398: ++bufptr;
0399: else if (c == '>') {
0400: ++bufptr;
0401: tag.end = bufbase+bufptr;
0402: if (attrs.size() > 0) {
0403: tag.htmlAttributes = new String[attrs.size()];
0404: attrs.copyInto (tag.htmlAttributes);
0405: }
0406: state = START;
0407: }
0408: else {
0409: attrName.setLength (0);
0410: state = ATTRNAME;
0411: }
0412: break;
0413:
0414: case ATTRNAME:
0415: if (c == '>' || c == '=' || isWhitespace(c)) {
0416: state = EQ;
0417: }
0418: else {
0419: attrName.append (c);
0420: ++bufptr;
0421: // state == ATTRNAME;
0422: }
0423: break;
0424:
0425: case EQ:
0426: if (isWhitespace(c))
0427: ++bufptr;
0428: else if (c == '=') {
0429: ++bufptr;
0430: state = AFTEREQ;
0431: }
0432: else {
0433: String name = Tag.toHTMLAttributeName (attrName.toString());
0434: tag.setLabel (name);
0435: attrs.addElement (name);
0436: state = ATTR;
0437: }
0438: break;
0439:
0440: case AFTEREQ:
0441: if (isWhitespace (c))
0442: ++bufptr;
0443: else
0444: switch (c) {
0445: case '>':
0446: {
0447: String name = Tag.toHTMLAttributeName (attrName.toString());
0448: tag.setLabel (name);
0449: attrs.addElement (name);
0450: state = ATTR;
0451: break;
0452: }
0453: case '\'':
0454: ++bufptr;
0455: attrVal.setLength (0);
0456: state = ATTRVAL_SQ;
0457: break;
0458: case '"':
0459: ++bufptr;
0460: attrVal.setLength (0);
0461: state = ATTRVAL_DQ;
0462: break;
0463: default:
0464: attrVal.setLength (0);
0465: state = ATTRVAL;
0466: break;
0467: }
0468: break;
0469:
0470: case ATTRVAL:
0471: if (c == '>' || isWhitespace(c)) {
0472: String name = Tag.toHTMLAttributeName (attrName.toString());
0473: tag.setLabel (name, attrVal.toString());
0474: attrs.addElement (name);
0475: state = ATTR;
0476: }
0477: else if (c == '&') {
0478: ++bufptr;
0479: postEntityState = ATTRVAL;
0480: entityTargetBuf = attrVal;
0481: state = ENTITY;
0482: }
0483: else {
0484: ++bufptr;
0485: attrVal.append (c);
0486: // state == ATTRVAL;
0487: }
0488: break;
0489:
0490: case ATTRVAL_SQ:
0491: if (c=='\'') {
0492: ++bufptr;
0493: String name = Tag.toHTMLAttributeName (attrName.toString());
0494: tag.setLabel (name, attrVal.toString());
0495: attrs.addElement (name);
0496: state = ATTR;
0497: }
0498: else if (c == '&') {
0499: ++bufptr;
0500: postEntityState = ATTRVAL_SQ;
0501: entityTargetBuf = attrVal;
0502: state = ENTITY;
0503: }
0504: else {
0505: ++bufptr;
0506: attrVal.append (c);
0507: // state == ATTRVAL_SQ;
0508: }
0509: break;
0510:
0511: case ATTRVAL_DQ:
0512: if (c=='"') {
0513: ++bufptr;
0514: String name = Tag.toHTMLAttributeName (attrName.toString());
0515: tag.setLabel (name, attrVal.toString());
0516: attrs.addElement (name);
0517: state = ATTR;
0518: }
0519: else if (c == '&') {
0520: ++bufptr;
0521: postEntityState = ATTRVAL_DQ;
0522: entityTargetBuf = attrVal;
0523: state = ENTITY;
0524: }
0525: else {
0526: ++bufptr;
0527: attrVal.append (c);
0528: // state == ATTRVAL_DQ;
0529: }
0530: break;
0531:
0532: default:
0533: throw new RuntimeException ("HtmlTokenizer entered illegal state " + state);
0534: }
0535: }
0536:
0537: // EOF
0538: switch (state) {
0539: case INWORD:
0540: // EOF terminated some text -- save the text
0541: tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ()));
0542: ++nWords;
0543: break;
0544:
0545: default:
0546: // EOF in the middle of tags is illegal
0547: // don't try to recover
0548: break;
0549: }
0550:
0551: int nTotal = tokens.size ();
0552:
0553: page.tokens = new Region[nTotal];
0554: tokens.copyInto (page.tokens);
0555:
0556: page.words = new Text[nWords];
0557: int textnum = 0;
0558: page.tags = new Tag[nTotal - nWords];
0559: int tagnum = 0;
0560:
0561: for (int i=0; i < nTotal; ++i) {
0562: if (page.tokens[i] instanceof Tag)
0563: page.tags[tagnum++] = (Tag)page.tokens[i];
0564: else
0565: page.words[textnum++] = (Text)page.tokens[i];
0566: }
0567: }
0568:
0569: private static boolean isWhitespace (char c) {
0570://#ifdef JDK1.1
0571: return Character.isWhitespace (c);
0572://#endif JDK1.1
0573:/*#ifdef JDK1.0
0574: return Character.isSpace (c);
0575:#endif JDK1.0*/
0576: }
0577:
0578:
0579: private static Hashtable entities = new Hashtable2()
0580: .add ("quot", new Character ((char)34))
0581: .add ("amp", new Character ((char)38))
0582: .add ("lt", new Character ((char)60))
0583: .add ("gt", new Character ((char)62))
0584: .add ("nbsp", new Character ((char)160))
0585: .add ("iexcl", new Character ((char)161))
0586: .add ("cent", new Character ((char)162))
0587: .add ("pound", new Character ((char)163))
0588: .add ("curren", new Character ((char)164))
0589: .add ("yen", new Character ((char)165))
0590: .add ("brvbar", new Character ((char)167))
0591: .add ("sect", new Character ((char)167))
0592: .add ("uml", new Character ((char)168))
0593: .add ("copy", new Character ((char)169))
0594: .add ("ordf", new Character ((char)170))
0595: .add ("laquo", new Character ((char)171))
0596: .add ("not", new Character ((char)172))
0597: .add ("shy", new Character ((char)173))
0598: .add ("reg", new Character ((char)174))
0599: .add ("macr", new Character ((char)175))
0600: .add ("deg", new Character ((char)176))
0601: .add ("plusmn", new Character ((char)177))
0602: .add ("sup2", new Character ((char)178))
0603: .add ("sup3", new Character ((char)179))
0604: .add ("acute", new Character ((char)180))
0605: .add ("micro", new Character ((char)181))
0606: .add ("para", new Character ((char)182))
0607: .add ("middot", new Character ((char)183))
0608: .add ("cedil", new Character ((char)184))
0609: .add ("sup1", new Character ((char)185))
0610: .add ("ordm", new Character ((char)186))
0611: .add ("raquo", new Character ((char)187))
0612: .add ("frac14", new Character ((char)188))
0613: .add ("frac12", new Character ((char)189))
0614: .add ("frac34", new Character ((char)190))
0615: .add ("iquest", new Character ((char)191))
0616: .add ("Agrave", new Character ((char)192))
0617: .add ("Aacute", new Character ((char)193))
0618: .add ("Acirc", new Character ((char)194))
0619: .add ("Atilde", new Character ((char)195))
0620: .add ("Auml", new Character ((char)196))
0621: .add ("Aring", new Character ((char)197))
0622: .add ("AElig", new Character ((char)198))
0623: .add ("Ccedil", new Character ((char)199))
0624: .add ("Egrave", new Character ((char)200))
0625: .add ("Eacute", new Character ((char)201))
0626: .add ("Ecirc", new Character ((char)202))
0627: .add ("Euml", new Character ((char)203))
0628: .add ("Igrave", new Character ((char)204))
0629: .add ("Iacute", new Character ((char)205))
0630: .add ("Icirc", new Character ((char)206))
0631: .add ("Iuml", new Character ((char)207))
0632: .add ("ETH", new Character ((char)208))
0633: .add ("Ntilde", new Character ((char)209))
0634: .add ("Ograve", new Character ((char)210))
0635: .add ("Oacute", new Character ((char)211))
0636: .add ("Ocirc", new Character ((char)212))
0637: .add ("Otilde", new Character ((char)213))
0638: .add ("Ouml", new Character ((char)214))
0639: .add ("times", new Character ((char)215))
0640: .add ("Oslash", new Character ((char)216))
0641: .add ("Ugrave", new Character ((char)217))
0642: .add ("Uacute", new Character ((char)218))
0643: .add ("Ucirc", new Character ((char)219))
0644: .add ("Uuml", new Character ((char)220))
0645: .add ("Yacute", new Character ((char)221))
0646: .add ("THORN", new Character ((char)222))
0647: .add ("szlig", new Character ((char)223))
0648: .add ("agrave", new Character ((char)224))
0649: .add ("aacute", new Character ((char)225))
0650: .add ("acirc", new Character ((char)226))
0651: .add ("atilde", new Character ((char)227))
0652: .add ("auml", new Character ((char)228))
0653: .add ("aring", new Character ((char)229))
0654: .add ("aelig", new Character ((char)230))
0655: .add ("ccedil", new Character ((char)231))
0656: .add ("egrave", new Character ((char)232))
0657: .add ("eacute", new Character ((char)233))
0658: .add ("ecirc", new Character ((char)234))
0659: .add ("euml", new Character ((char)235))
0660: .add ("igrave", new Character ((char)236))
0661: .add ("iacute", new Character ((char)237))
0662: .add ("icirc", new Character ((char)238))
0663: .add ("iuml", new Character ((char)239))
0664: .add ("eth", new Character ((char)240))
0665: .add ("ntilde", new Character ((char)241))
0666: .add ("ograve", new Character ((char)242))
0667: .add ("oacute", new Character ((char)243))
0668: .add ("ocirc", new Character ((char)244))
0669: .add ("otilde", new Character ((char)245))
0670: .add ("ouml", new Character ((char)246))
0671: .add ("divide", new Character ((char)247))
0672: .add ("oslash", new Character ((char)248))
0673: .add ("ugrave", new Character ((char)249))
0674: .add ("uacute", new Character ((char)250))
0675: .add ("ucirc", new Character ((char)251))
0676: .add ("uuml", new Character ((char)252))
0677: .add ("yacute", new Character ((char)253))
0678: .add ("thorn", new Character ((char)254))
0679: .add ("yuml", new Character ((char)255))
0680: ;
0681:
0682: private static Character lookupEntityRef (String name) {
0683: return (Character) entities.get (name);
0684: }
0685:
0686: /*
0687: * Parser (constructs a canonical tree of elements)
0688: *
0689: */
0690:
0691: Vector vElements = new Vector ();
0692: Vector vLinks = new Vector ();
0693:
0694: StringBuffer text = new StringBuffer ();
0695:
0696: // elements with no content: e.g., IMG, BR, HR. End tags for these elements
0697: // are simply ignored.
0698: private static Hashtable empty = new Hashtable2 ()
0699: .add (Tag.AREA)
0700: .add (Tag.BASE)
0701: .add (Tag.BASEFONT)
0702: .add (Tag.BGSOUND)
0703: .add (Tag.BR)
0704: .add (Tag.COL)
0705: .add (Tag.COLGROUP)
0706: .add (Tag.COMMENT) // actually <!-- ... -->
0707: .add (Tag.HR)
0708: .add (Tag.IMG)
0709: .add (Tag.INPUT)
0710: .add (Tag.ISINDEX)
0711: .add (Tag.LINK)
0712: .add (Tag.META)
0713: .add (Tag.NEXTID)
0714: .add (Tag.PARAM)
0715: .add (Tag.SPACER)
0716: .add (Tag.WBR)
0717: ;
0718:
0719: // elements that close <P> (correspond to "%block" entity in HTML 3.2 DTD)
0720: static Hashtable blocktag = new Hashtable2()
0721: .add (Tag.P)
0722: .add (Tag.UL)
0723: .add (Tag.OL)
0724: .add (Tag.DIR)
0725: .add (Tag.MENU)
0726: .add (Tag.PRE)
0727: .add (Tag.XMP)
0728: .add (Tag.LISTING)
0729: .add (Tag.DL)
0730: .add (Tag.DIV)
0731: .add (Tag.CENTER)
0732: .add (Tag.BLOCKQUOTE)
0733: .add (Tag.FORM)
0734: .add (Tag.ISINDEX)
0735: .add (Tag.HR)
0736: .add (Tag.TABLE)
0737: .add (Tag.H1)
0738: .add (Tag.H2)
0739: .add (Tag.H3)
0740: .add (Tag.H4)
0741: .add (Tag.H5)
0742: .add (Tag.H6)
0743: .add (Tag.ADDRESS)
0744: ;
0745:
0746: // maps elements which force closure to the elements that they close, e.g.,
0747: // LI maps to LI, DT maps to DD,DT, and all block-level tags map to P.
0748: private static Hashtable forcesClosed = new Hashtable2 ()
0749: .add (Tag.DD, new Hashtable2 () .add (Tag.DD) .add (Tag.DT))
0750: .add (Tag.DT, new Hashtable2 () .add (Tag.DD) .add (Tag.DT))
0751: .add (Tag.LI, new Hashtable2 () .add (Tag.LI))
0752: .add (Tag.OPTION, new Hashtable2 () .add (Tag.OPTION))
0753: .add (Tag.TR, new Hashtable2 () .add (Tag.TR))
0754: .add (Tag.TD, new Hashtable2 () .add (Tag.TD) .add (Tag.TH))
0755: .add (Tag.TH, new Hashtable2 () .add (Tag.TD) .add (Tag.TH))
0756: ;
0757: static {
0758: Hashtable p = new Hashtable2 () .add (Tag.P);
0759: Enumeration enum = blocktag.keys ();
0760: while (enum.hasMoreElements ())
0761: union (forcesClosed, enum.nextElement(), p);
0762: }
0763:
0764: // union of forcesClosed plus the tag's possible containers. For instance,
0765: // LI maps to LI, OL, UL, MENU, DIR. When a forcesClosed tag like LI is
0766: // encountered, the parser looks upward for the first context tag.
0767: // Having the tag's container element included in the search ensures that
0768: // LI in a nested list won't close its parent LI.
0769: static Hashtable context = new Hashtable2 ()
0770: .add (Tag.DD, new Hashtable2 () .add (Tag.DL))
0771: .add (Tag.DT, new Hashtable2 () .add (Tag.DL))
0772: .add (Tag.LI, new Hashtable2 () .add (Tag.OL) .add (Tag.UL) .add (Tag.MENU) .add (Tag.DIR))
0773: .add (Tag.OPTION, new Hashtable2 () .add (Tag.SELECT))
0774: .add (Tag.TR, new Hashtable2 () .add (Tag.TABLE))
0775: .add (Tag.TD, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE))
0776: .add (Tag.TH, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE))
0777: ;
0778: static {
0779: Enumeration enum = forcesClosed.keys ();
0780: while (enum.hasMoreElements ()) {
0781: Object tagname = enum.nextElement();
0782: union (context, tagname, (Hashtable)forcesClosed.get (tagname));
0783: }
0784: }
0785:
0786: // NIY: handle literal and semi-literal elements (XMP, LISTING, TEXTAREA, OPTION)
0787: // elements whose content should be treated as plain text
0788: static Hashtable literal = new Hashtable2()
0789: ;
0790:
0791: // maps link elements to their URL attribute (e.g., A maps to HREF)
0792: static Hashtable linktag = new Hashtable2 ()
0793: .add (Tag.A, "href")
0794: .add (Tag.AREA, "href")
0795: .add (Tag.APPLET, "code")
0796: .add (Tag.EMBED, "src")
0797: .add (Tag.FRAME, "src")
0798: .add (Tag.FORM, "action")
0799: .add (Tag.IMG, "src")
0800: .add (Tag.LINK, "href")
0801: .add (Tag.SCRIPT, "src")
0802: ;
0803:
0804: // elements whose text contents are crucial to the crawler
0805: static Hashtable savetext = new Hashtable2 ()
0806: .add (Tag.A)
0807: .add (Tag.TITLE);
0808:
0809: // elements found in <HEAD>
0810: static Hashtable headtag = new Hashtable2()
0811: .add (Tag.META)
0812: .add (Tag.TITLE)
0813: .add (Tag.BASE)
0814: .add (Tag.LINK)
0815: .add (Tag.ISINDEX)
0816: ;
0817:
0818: private static void union (Hashtable map, Object tagname, Hashtable tagset) {
0819: Hashtable2 currset = (Hashtable2)map.get (tagname);
0820: if (currset == null)
0821: map.put (tagname, tagset);
0822: else
0823: map.put (tagname, currset.union (tagset));
0824: }
0825:
0826: private void buildParseTree (Page page) {
0827: boolean keepText = false;
0828:
0829: elems.setSize (0);
0830: openPtr = 0;
0831:
0832: Region[] tokens = page.tokens;
0833: for (int t=0; t<tokens.length; ++t) {
0834: Region r = tokens[t];
0835:
0836: if (r instanceof Tag) {
0837: Tag tag = (Tag)r;
0838: String tagName = tag.getTagName();
0839:
0840: if (tag.isStartTag()) {
0841: // start tag <X>
0842:
0843: // check if <X> forces closure of an open element
0844: if (forcesClosed.containsKey (tagName)) {
0845: Element e = findOpenElement ((Hashtable)context.get (tagName));
0846: if (e != null && ((Hashtable)forcesClosed.get (tagName)).containsKey (e.getTagName()))
0847: close (e, tag.start);
0848: }
0849:
0850: // create the element and push it on the elems stack
0851: Element e = makeElement (page.base, tag);
0852: open (e);
0853:
0854: if (empty.containsKey (tagName)) {
0855: // element has no content
0856: // close it off right now
0857: close (e, tag.end);
0858: }
0859: else if (savetext.containsKey (tagName)) {
0860: text.setLength (0);
0861: keepText = true;
0862: }
0863:
0864: if (tagName == Tag.BASE) {
0865: String href = tag.getHTMLAttribute ("href");
0866: if (href != null) {
0867: try {
0868: page.base = new URL (page.base, new String (href.toCharArray())); // make copy to avoid reference to page content
0869: } catch (MalformedURLException ex) {} // bad URL
0870: catch (NullPointerException ex) {} // base == null
0871: }
0872: }
0873: }
0874: else {
0875: // end tag </X>
0876:
0877: // find matching start tag <X>
0878: Element e = findOpenElement (tagName);
0879: if (e != null) {
0880: close (e, tag);
0881:
0882: if (savetext.containsKey (tagName)) {
0883: if (tagName == Tag.TITLE)
0884: page.title = text.toString();
0885: else if (e instanceof Link)
0886: ((Link)e).setText (text.toString());
0887: keepText = false;
0888: }
0889: }
0890: }
0891:
0892: }
0893: else { // r is a text token
0894: if (keepText) {
0895: if (text.length() > 0)
0896: text.append (' ');
0897: text.append (r.toText());
0898: }
0899: }
0900: }
0901:
0902: // close any remaining open elements
0903: closeAll (page.end);
0904:
0905: // link together the top-level elements
0906: if (!elems.empty()) {
0907: int nElems = elems.size ();
0908: Element c = (Element)elems.elementAt (0);
0909: page.root = c;
0910: for (int j=1; j<nElems; ++j) {
0911: Element d = (Element)elems.elementAt (j);
0912: c.sibling = d;
0913: c = d;
0914: }
0915: }
0916:
0917: page.elements = new Element[vElements.size()];
0918: vElements.copyInto (page.elements);
0919:
0920: page.links = new Link[vLinks.size()];
0921: vLinks.copyInto (page.links);
0922: }
0923:
0924: private Element makeElement (URL base, Tag tag) {
0925: Element e = null;
0926: String tagName = tag.getTagName ();
0927: String hrefAttr = (String)linktag.get (tagName);
0928: String type;
0929:
0930: try {
0931: if (tagName == Tag.FORM) {
0932: e = new Form (tag, null, base);
0933: vLinks.addElement (e);
0934: }
0935: else if (tagName == Tag.INPUT
0936: && (type = tag.getHTMLAttribute ("type")) != null
0937: && (type.equalsIgnoreCase ("submit") || type.equalsIgnoreCase ("image"))) {
0938: e = new FormButton (tag, null, currentForm);
0939: vLinks.addElement (e);
0940: }
0941: else if (hrefAttr != null && tag.hasHTMLAttribute (hrefAttr)) {
0942: e = new Link (tag, null, base);
0943: vLinks.addElement (e);
0944: }
0945: } catch (MalformedURLException f) {} // bad URL
0946: catch (NullPointerException ex) {} // base == null
0947:
0948: if (e == null)
0949: // just make an ordinary element
0950: e = new Element (tag, null);
0951:
0952: vElements.addElement (e);
0953: tag.element = e;
0954: return e;
0955: }
0956:
0957: // Stack management
0958:
0959: Stack elems = new Stack();
0960: // stack of Elements appearing before than the current element in
0961: // a preorder traversal, except that completely-visited subtrees
0962: // are represented by their root.
0963: int[] openElems = new int[20];
0964: int openPtr = 0;
0965: // stack of indices of open elements in elems
0966:
0967: Form currentForm;
0968:
0969: private void open (Element e) {
0970: if (openPtr > 0)
0971: e.parent = (Element)elems.elementAt (openElems[openPtr-1]);
0972: else
0973: e.parent = null;
0974:
0975: elems.push (e);
0976: if (e instanceof Form)
0977: currentForm = (Form)e;
0978:
0979: if (openPtr == openElems.length) {
0980: int[] newarr = new int[openElems.length + 10];
0981: System.arraycopy (openElems, 0, newarr, 0, openElems.length);
0982: openElems = newarr;
0983: }
0984: openElems[openPtr] = elems.size()-1;
0985: ++openPtr;
0986: }
0987:
0988: private Element findOpenElement (String tagname) {
0989: for (int i=openPtr-1; i >= 0; --i) {
0990: Element e = (Element)elems.elementAt (openElems[i]);
0991: if (tagname == e.getTagName ())
0992: return e;
0993: }
0994: return null;
0995: }
0996:
0997: private Element findOpenElement (Hashtable tags) {
0998: for (int i=openPtr-1; i >= 0; --i) {
0999: Element e = (Element)elems.elementAt (openElems[i]);
1000: if (tags.containsKey (e.getTagName ()))
1001: return e;
1002: }
1003: return null;
1004: }
1005:
1006: // NIY: stack up unclosed flow tags (like <B> and <A>) and reopen them
1007: // when the next element is opened
1008: private void close (Element elem, Tag tag) {
1009: elem.endTag = tag;
1010: tag.element = elem;
1011: close (elem, tag.start);
1012: elem.end = tag.end;
1013: }
1014:
1015: private void close (Element elem, int end) {
1016: int v;
1017: Element e;
1018: do {
1019: v = openElems[--openPtr];
1020: e = (Element)elems.elementAt (v);
1021:
1022: e.end = end;
1023: if (e instanceof Form)
1024: currentForm = null;
1025:
1026: int firstChild = v+1;
1027: int nElems = elems.size();
1028: if (firstChild < nElems) {
1029: Element c = (Element)elems.elementAt (firstChild);
1030: e.child = c;
1031: for (int j=firstChild+1; j<nElems; ++j) {
1032: Element d = (Element)elems.elementAt (j);
1033: c.sibling = d;
1034: c = d;
1035: }
1036: elems.setSize (firstChild);
1037: }
1038:
1039: } while (e != elem);
1040: }
1041:
1042: private void closeAll (int end) {
1043: if (openPtr > 0)
1044: close ((Element)elems.elementAt (openElems[0]), end);
1045: }
1046:
1047: /*
1048: * Testing interface
1049: *
1050: */
1051:
1052: public static void main (String[] args) throws Exception {
1053: if (args.length < 1 || args.length > 2) {
1054: System.err.println ("usage: HTMLParser <URL>");
1055: System.exit(-1);
1056: }
1057:
1058: Page page;
1059: if (args.length == 1)
1060: page = new Page (new Link(args[0]), new DownloadParameters (), new HTMLParser ());
1061: else
1062: page = new Page (new URL(args[0]), args[1], new HTMLParser ());
1063:
1064: /*
1065: long tm = System.currentTimeMillis(); //??dk
1066: HTMLParser tokenizer = new HTMLParser ();
1067:
1068: tm = System.currentTimeMillis() - tm; //??dk
1069: System.err.println("[Parsed " + args[0] + " in " + tm + "ms]");
1070: */
1071:
1072: System.out.println ("Tokens: ------------------------------------------");
1073: Region[] tokens = page.tokens;
1074: for (int i=0; i<tokens.length; ++i) {
1075: System.out.println ("[" + tokens[i].getStart() + "," + tokens[i].getEnd() + "]" + tokens[i]);
1076: }
1077:
1078: System.out.println ("Tags: ------------------------------------------");
1079: Tag[] tags = page.tags;
1080: for (int i=0; i<tags.length; ++i) {
1081: Tag t = tags[i];
1082: System.out.print ((t.isStartTag() ? "start tag" : "end tag") + " " + t.getTagName ());
1083:
1084: Enumeration attrs = t.enumerateHTMLAttributes();
1085: String name, val;
1086: while (attrs.hasMoreElements()) {
1087: name = (String)attrs.nextElement();
1088: val = t.getHTMLAttribute (name);
1089: System.out.print (" " + name + "=\"" + val + "\"");
1090: }
1091: System.out.println ();
1092: System.out.println (" " + t);
1093: }
1094:
1095: System.out.println ("Words: ------------------------------------------");
1096: Text[] words = page.words;
1097: for (int i=0; i<words.length; ++i) {
1098: System.out.println (words[i]);
1099: }
1100:
1101: System.out.println ("Elements: ------------------------------------------");
1102: printout (page.root, 0);
1103:
1104: System.out.println ("Links: ------------------------------------------");
1105: printout (page.getLinks (), 0);
1106:
1107: }
1108:
1109: private static String indentation (int indent) {
1110: StringBuffer s = new StringBuffer();
1111: for (int i=0; i<indent; ++i)
1112: s.append (" ");
1113: return s.toString();
1114: }
1115:
1116: private static void printout (Element element, int indent) {
1117: for (Element e = element; e != null; e = e.getSibling ()) {
1118: Element c = e.getChild();
1119:
1120: System.out.println (indentation(indent) + e.getStartTag() + "[" + e.getStart() + "," + e.getEnd() + "]");
1121: if (c != null)
1122: printout (c, indent+1);
1123: if (e.getEndTag() != null)
1124: System.out.println (indentation(indent) + e.getEndTag());
1125: }
1126: }
1127: private static void printout (Link[] elements, int indent) {
1128: for (int i=0; i<elements.length; ++i) {
1129: Link e = elements[i];
1130: System.out.println (indentation(indent) + e.toDescription());
1131: }
1132: }
1133:}
1134:
1135:class Hashtable2 extends Hashtable {
1136: public Hashtable2 () {
1137: }
1138:
1139: public Hashtable2 add (Object key) {
1140: put (key, key);
1141: return this ;
1142: }
1143:
1144: public Hashtable2 add (Object key, Object val) {
1145: put (key, val);
1146: return this ;
1147: }
1148:
1149: public Hashtable2 union (Hashtable map) {
1150: Enumeration enum = map.keys ();
1151: while (enum.hasMoreElements ()) {
1152: Object key = enum.nextElement ();
1153: put (key, map.get (key));
1154: }
1155:
1156: return this;
1157: }
1158:}
|