0001: /*
0002: * Java HTML Tidy - JTidy
0003: * HTML parser and pretty printer
0004: *
0005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0006: * Institute of Technology, Institut National de Recherche en
0007: * Informatique et en Automatique, Keio University). All Rights
0008: * Reserved.
0009: *
0010: * Contributing Author(s):
0011: *
0012: * Dave Raggett <dsr@w3.org>
0013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0014: * Gary L Peskin <garyp@firstech.com> (Java development)
0015: * Sami Lempinen <sami@lempinen.net> (release management)
0016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
0017: *
0018: * The contributing author(s) would like to thank all those who
0019: * helped with testing, bug fixes, and patience. This wouldn't
0020: * have been possible without all of you.
0021: *
0022: * COPYRIGHT NOTICE:
0023: *
0024: * This software and documentation is provided "as is," and
0025: * the copyright holders and contributing author(s) make no
0026: * representations or warranties, express or implied, including
0027: * but not limited to, warranties of merchantability or fitness
0028: * for any particular purpose or that the use of the software or
0029: * documentation will not infringe any third party patents,
0030: * copyrights, trademarks or other rights.
0031: *
0032: * The copyright holders and contributing author(s) will not be
0033: * liable for any direct, indirect, special or consequential damages
0034: * arising out of any use of the software or documentation, even if
0035: * advised of the possibility of such damage.
0036: *
0037: * Permission is hereby granted to use, copy, modify, and distribute
0038: * this source code, or portions hereof, documentation and executables,
0039: * for any purpose, without fee, subject to the following restrictions:
0040: *
0041: * 1. The origin of this source code must not be misrepresented.
0042: * 2. Altered versions must be plainly marked as such and must
0043: * not be misrepresented as being the original source.
0044: * 3. This Copyright notice may not be removed or altered from any
0045: * source or altered source distribution.
0046: *
0047: * The copyright holders and contributing author(s) specifically
0048: * permit, without fee, and encourage the use of this source code
0049: * as a component for supporting the Hypertext Markup Language in
0050: * commercial products. If you use this source code in a product,
0051: * acknowledgment is not required but would be appreciated.
0052: *
0053: */
0054: package org.w3c.tidy;
0055:
0056: /**
0057: * HTML Parser implementation.
0058: * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
0059: * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
0060: * @author Fabrizio Giustina
0061: * @version $Revision: 1.53 $ ($Author: fgiust $)
0062: */
0063: public final class ParserImpl {
0064:
0065: /**
0066: * parser for html.
0067: */
0068: public static final Parser HTML = new ParseHTML();
0069:
0070: /**
0071: * parser for head.
0072: */
0073: public static final Parser HEAD = new ParseHead();
0074:
0075: /**
0076: * parser for title.
0077: */
0078: public static final Parser TITLE = new ParseTitle();
0079:
0080: /**
0081: * parser for script.
0082: */
0083: public static final Parser SCRIPT = new ParseScript();
0084:
0085: /**
0086: * parser for body.
0087: */
0088: public static final Parser BODY = new ParseBody();
0089:
0090: /**
0091: * parser for frameset.
0092: */
0093: public static final Parser FRAMESET = new ParseFrameSet();
0094:
0095: /**
0096: * parser for inline.
0097: */
0098: public static final Parser INLINE = new ParseInline();
0099:
0100: /**
0101: * parser for list.
0102: */
0103: public static final Parser LIST = new ParseList();
0104:
0105: /**
0106: * parser for definition lists.
0107: */
0108: public static final Parser DEFLIST = new ParseDefList();
0109:
0110: /**
0111: * parser for pre.
0112: */
0113: public static final Parser PRE = new ParsePre();
0114:
0115: /**
0116: * parser for block elements.
0117: */
0118: public static final Parser BLOCK = new ParseBlock();
0119:
0120: /**
0121: * parser for table.
0122: */
0123: public static final Parser TABLETAG = new ParseTableTag();
0124:
0125: /**
0126: * parser for colgroup.
0127: */
0128: public static final Parser COLGROUP = new ParseColGroup();
0129:
0130: /**
0131: * parser for rowgroup.
0132: */
0133: public static final Parser ROWGROUP = new ParseRowGroup();
0134:
0135: /**
0136: * parser for row.
0137: */
0138: public static final Parser ROW = new ParseRow();
0139:
0140: /**
0141: * parser for noframes.
0142: */
0143: public static final Parser NOFRAMES = new ParseNoFrames();
0144:
0145: /**
0146: * parser for select.
0147: */
0148: public static final Parser SELECT = new ParseSelect();
0149:
0150: /**
0151: * parser for text.
0152: */
0153: public static final Parser TEXT = new ParseText();
0154:
0155: /**
0156: * parser for empty elements.
0157: */
0158: public static final Parser EMPTY = new ParseEmpty();
0159:
0160: /**
0161: * parser for optgroup.
0162: */
0163: public static final Parser OPTGROUP = new ParseOptGroup();
0164:
/**
 * Hidden constructor: this class only exposes static members and is not meant to be instantiated.
 */
private ParserImpl() {
    // intentionally empty
}
0171:
0172: /**
0173: * @param lexer
0174: * @param node
0175: * @param mode
0176: */
0177: protected static void parseTag(Lexer lexer, Node node, short mode) {
0178: // Fix by GLP 2000-12-21. Need to reset insertspace if this
0179: // is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
0180: if ((node.tag.model & Dict.CM_EMPTY) != 0) {
0181: lexer.waswhite = false;
0182: } else if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
0183: lexer.insertspace = false;
0184: }
0185:
0186: if (node.tag.getParser() == null) {
0187: return;
0188: }
0189:
0190: if (node.type == Node.START_END_TAG) {
0191: Node.trimEmptyElement(lexer, node);
0192: return;
0193: }
0194:
0195: node.tag.getParser().parse(lexer, node, mode);
0196: }
0197:
0198: /**
0199: * Move node to the head, where element is used as starting point in hunt for head. Normally called during parsing.
0200: * @param lexer
0201: * @param element
0202: * @param node
0203: */
0204: protected static void moveToHead(Lexer lexer, Node element,
0205: Node node) {
0206: Node head;
0207: node.removeNode(); // make sure that node is isolated
0208:
0209: TagTable tt = lexer.configuration.tt;
0210:
0211: if (node.type == Node.START_TAG
0212: || node.type == Node.START_END_TAG) {
0213: lexer.report.warning(lexer, element, node,
0214: Report.TAG_NOT_ALLOWED_IN);
0215:
0216: while (element.tag != tt.tagHtml) {
0217: element = element.parent;
0218: }
0219:
0220: for (head = element.content; head != null; head = head.next) {
0221: if (head.tag == tt.tagHead) {
0222: head.insertNodeAtEnd(node);
0223: break;
0224: }
0225: }
0226:
0227: if (node.tag.getParser() != null) {
0228: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
0229: }
0230: } else {
0231: lexer.report.warning(lexer, element, node,
0232: Report.DISCARDING_UNEXPECTED);
0233: }
0234: }
0235:
0236: /**
0237: * moves given node to end of body element.
0238: * @param lexer Lexer
0239: * @param node Node to insert
0240: */
0241: static void moveNodeToBody(Lexer lexer, Node node) {
0242: node.removeNode();
0243: Node body = lexer.root.findBody(lexer.configuration.tt);
0244: body.insertNodeAtEnd(node);
0245: }
0246:
0247: /**
0248: * Parser for HTML.
0249: */
0250: public static class ParseHTML implements Parser {
0251:
0252: /**
0253: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
0254: */
0255: public void parse(Lexer lexer, Node html, short mode) {
0256: Node node, head;
0257: Node frameset = null;
0258: Node noframes = null;
0259:
0260: lexer.configuration.xmlTags = false;
0261: lexer.seenEndBody = false;
0262: TagTable tt = lexer.configuration.tt;
0263:
0264: while (true) {
0265: node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
0266:
0267: if (node == null) {
0268: node = lexer.inferredTag("head");
0269: break;
0270: }
0271:
0272: if (node.tag == tt.tagHead) {
0273: break;
0274: }
0275:
0276: if (node.tag == html.tag && node.type == Node.END_TAG) {
0277: lexer.report.warning(lexer, html, node,
0278: Report.DISCARDING_UNEXPECTED);
0279: continue;
0280: }
0281:
0282: // deal with comments etc.
0283: if (Node.insertMisc(html, node)) {
0284: continue;
0285: }
0286:
0287: lexer.ungetToken();
0288: node = lexer.inferredTag("head");
0289: break;
0290: }
0291:
0292: head = node;
0293: html.insertNodeAtEnd(head);
0294: HEAD.parse(lexer, head, mode);
0295:
0296: while (true) {
0297: node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
0298:
0299: if (node == null) {
0300: if (frameset == null) {
0301: // implied body
0302: node = lexer.inferredTag("body");
0303: html.insertNodeAtEnd(node);
0304: BODY.parse(lexer, node, mode);
0305: }
0306:
0307: return;
0308: }
0309:
0310: // robustly handle html tags
0311: if (node.tag == html.tag) {
0312: if (node.type != Node.START_TAG && frameset == null) {
0313: lexer.report.warning(lexer, html, node,
0314: Report.DISCARDING_UNEXPECTED);
0315: } else if (node.type == Node.END_TAG) {
0316: lexer.seenEndHtml = true;
0317: }
0318:
0319: continue;
0320: }
0321:
0322: // deal with comments etc.
0323: if (Node.insertMisc(html, node)) {
0324: continue;
0325: }
0326:
0327: // if frameset document coerce <body> to <noframes>
0328: if (node.tag == tt.tagBody) {
0329: if (node.type != Node.START_TAG) {
0330: lexer.report.warning(lexer, html, node,
0331: Report.DISCARDING_UNEXPECTED);
0332: continue;
0333: }
0334:
0335: if (frameset != null) {
0336: lexer.ungetToken();
0337:
0338: if (noframes == null) {
0339: noframes = lexer.inferredTag("noframes");
0340: frameset.insertNodeAtEnd(noframes);
0341: lexer.report.warning(lexer, html, noframes,
0342: Report.INSERTING_TAG);
0343: }
0344:
0345: parseTag(lexer, noframes, mode);
0346: continue;
0347: }
0348:
0349: lexer.constrainVersion(~Dict.VERS_FRAMESET);
0350: break; // to parse body
0351: }
0352:
0353: // flag an error if we see more than one frameset
0354: if (node.tag == tt.tagFrameset) {
0355: if (node.type != Node.START_TAG) {
0356: lexer.report.warning(lexer, html, node,
0357: Report.DISCARDING_UNEXPECTED);
0358: continue;
0359: }
0360:
0361: if (frameset != null) {
0362: lexer.report.error(lexer, html, node,
0363: Report.DUPLICATE_FRAMESET);
0364: } else {
0365: frameset = node;
0366: }
0367:
0368: html.insertNodeAtEnd(node);
0369: parseTag(lexer, node, mode);
0370:
0371: // see if it includes a noframes element so that we can merge subsequent noframes elements
0372:
0373: for (node = frameset.content; node != null; node = node.next) {
0374: if (node.tag == tt.tagNoframes) {
0375: noframes = node;
0376: }
0377: }
0378: continue;
0379: }
0380:
0381: // if not a frameset document coerce <noframes> to <body>
0382: if (node.tag == tt.tagNoframes) {
0383: if (node.type != Node.START_TAG) {
0384: lexer.report.warning(lexer, html, node,
0385: Report.DISCARDING_UNEXPECTED);
0386: continue;
0387: }
0388:
0389: if (frameset == null) {
0390: lexer.report.warning(lexer, html, node,
0391: Report.DISCARDING_UNEXPECTED);
0392: node = lexer.inferredTag("body");
0393: break;
0394: }
0395:
0396: if (noframes == null) {
0397: noframes = node;
0398: frameset.insertNodeAtEnd(noframes);
0399: }
0400:
0401: parseTag(lexer, noframes, mode);
0402: continue;
0403: }
0404:
0405: if (node.type == Node.START_TAG
0406: || node.type == Node.START_END_TAG) {
0407: if (node.tag != null
0408: && (node.tag.model & Dict.CM_HEAD) != 0) {
0409: moveToHead(lexer, html, node);
0410: continue;
0411: }
0412:
0413: // #427675 - discard illegal frame element following a frameset - fix by Randy Waki 11 Oct 00
0414: if (frameset != null && node.tag == tt.tagFrame) {
0415: lexer.report.warning(lexer, html, node,
0416: Report.DISCARDING_UNEXPECTED);
0417: continue;
0418: }
0419: }
0420:
0421: lexer.ungetToken();
0422:
0423: // insert other content into noframes element
0424: if (frameset != null) {
0425: if (noframes == null) {
0426: noframes = lexer.inferredTag("noframes");
0427: frameset.insertNodeAtEnd(noframes);
0428: } else {
0429: lexer.report.warning(lexer, html, node,
0430: Report.NOFRAMES_CONTENT);
0431: }
0432:
0433: lexer.constrainVersion(Dict.VERS_FRAMESET);
0434: parseTag(lexer, noframes, mode);
0435: continue;
0436: }
0437:
0438: node = lexer.inferredTag("body");
0439: lexer.constrainVersion(~Dict.VERS_FRAMESET);
0440: break;
0441: }
0442:
0443: // node must be body
0444: html.insertNodeAtEnd(node);
0445: parseTag(lexer, node, mode);
0446: lexer.seenEndHtml = true;
0447: }
0448:
0449: }
0450:
0451: /**
0452: * Parser for HEAD.
0453: */
0454: public static class ParseHead implements Parser {
0455:
0456: /**
0457: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
0458: */
0459: public void parse(Lexer lexer, Node head, short mode) {
0460: Node node;
0461: int hasTitle = 0;
0462: int hasBase = 0;
0463: TagTable tt = lexer.configuration.tt;
0464:
0465: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
0466: if (node.tag == head.tag && node.type == Node.END_TAG) {
0467: head.closed = true;
0468: break;
0469: }
0470:
0471: if (node.type == Node.TEXT_NODE) {
0472: lexer.report.warning(lexer, head, node,
0473: Report.TAG_NOT_ALLOWED_IN);
0474: lexer.ungetToken();
0475: break;
0476: }
0477:
0478: // deal with comments etc.
0479: if (Node.insertMisc(head, node)) {
0480: continue;
0481: }
0482:
0483: if (node.type == Node.DOCTYPE_TAG) {
0484: Node.insertDocType(lexer, head, node);
0485: continue;
0486: }
0487:
0488: // discard unknown tags
0489: if (node.tag == null) {
0490: lexer.report.warning(lexer, head, node,
0491: Report.DISCARDING_UNEXPECTED);
0492: continue;
0493: }
0494:
0495: if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_HEAD)) {
0496: // #545067 Implicit closing of head broken - warn only for XHTML input
0497: if (lexer.isvoyager) {
0498: lexer.report.warning(lexer, head, node,
0499: Report.TAG_NOT_ALLOWED_IN);
0500: }
0501: lexer.ungetToken();
0502: break;
0503: }
0504:
0505: if (node.type == Node.START_TAG
0506: || node.type == Node.START_END_TAG) {
0507: if (node.tag == tt.tagTitle) {
0508: ++hasTitle;
0509:
0510: if (hasTitle > 1) {
0511: lexer.report.warning(lexer, head, node,
0512: Report.TOO_MANY_ELEMENTS);
0513: }
0514: } else if (node.tag == tt.tagBase) {
0515: ++hasBase;
0516:
0517: if (hasBase > 1) {
0518: lexer.report.warning(lexer, head, node,
0519: Report.TOO_MANY_ELEMENTS);
0520: }
0521: } else if (node.tag == tt.tagNoscript) {
0522: lexer.report.warning(lexer, head, node,
0523: Report.TAG_NOT_ALLOWED_IN);
0524: }
0525:
0526: head.insertNodeAtEnd(node);
0527: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
0528: continue;
0529: }
0530:
0531: // discard unexpected text nodes and end tags
0532: lexer.report.warning(lexer, head, node,
0533: Report.DISCARDING_UNEXPECTED);
0534: }
0535:
0536: if (hasTitle == 0) {
0537: if (!lexer.configuration.bodyOnly) {
0538: lexer.report.warning(lexer, head, null,
0539: Report.MISSING_TITLE_ELEMENT);
0540: }
0541: head.insertNodeAtEnd(lexer.inferredTag("title"));
0542: }
0543: }
0544:
0545: }
0546:
0547: /**
0548: * Parser for TITLE.
0549: */
0550: public static class ParseTitle implements Parser {
0551:
0552: /**
0553: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
0554: */
0555: public void parse(Lexer lexer, Node title, short mode) {
0556: Node node;
0557:
0558: while ((node = lexer.getToken(Lexer.MIXED_CONTENT)) != null) {
0559: // [438658] : Missing / in title endtag makes 2 titles
0560: if (node.tag == title.tag
0561: && node.type == Node.START_TAG) {
0562: lexer.report.warning(lexer, title, node,
0563: Report.COERCE_TO_ENDTAG);
0564: node.type = Node.END_TAG;
0565: continue;
0566: } else if (node.tag == title.tag
0567: && node.type == Node.END_TAG) {
0568: title.closed = true;
0569: Node.trimSpaces(lexer, title);
0570: return;
0571: }
0572:
0573: if (node.type == Node.TEXT_NODE) {
0574: // only called for 1st child
0575: if (title.content == null) {
0576: Node.trimInitialSpace(lexer, title, node);
0577: }
0578:
0579: if (node.start >= node.end) {
0580: continue;
0581: }
0582:
0583: title.insertNodeAtEnd(node);
0584: continue;
0585: }
0586:
0587: // deal with comments etc.
0588: if (Node.insertMisc(title, node)) {
0589: continue;
0590: }
0591:
0592: // discard unknown tags
0593: if (node.tag == null) {
0594: lexer.report.warning(lexer, title, node,
0595: Report.DISCARDING_UNEXPECTED);
0596: continue;
0597: }
0598:
0599: // pushback unexpected tokens
0600: lexer.report.warning(lexer, title, node,
0601: Report.MISSING_ENDTAG_BEFORE);
0602: lexer.ungetToken();
0603: Node.trimSpaces(lexer, title);
0604: return;
0605: }
0606:
0607: lexer.report.warning(lexer, title, node,
0608: Report.MISSING_ENDTAG_FOR);
0609: }
0610:
0611: }
0612:
0613: /**
0614: * Parser for SCRIPT.
0615: */
0616: public static class ParseScript implements Parser {
0617:
0618: /**
0619: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
0620: */
0621: public void parse(Lexer lexer, Node script, short mode) {
0622: // This isn't quite right for CDATA content as it recognises tags within the content and parses them
0623: // accordingly. This will unfortunately screw up scripts which include < + letter, < + !, < + ? or < + / +
0624: // letter
0625:
0626: Node node = lexer.getCDATA(script);
0627:
0628: if (node != null) {
0629: script.insertNodeAtEnd(node);
0630: }
0631: }
0632:
0633: }
0634:
0635: /**
0636: * Parser for BODY.
0637: */
0638: public static class ParseBody implements Parser {
0639:
0640: /**
0641: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
0642: */
0643: public void parse(Lexer lexer, Node body, short mode) {
0644: Node node;
0645: boolean checkstack, iswhitenode;
0646:
0647: mode = Lexer.IGNORE_WHITESPACE;
0648: checkstack = true;
0649: TagTable tt = lexer.configuration.tt;
0650:
0651: Clean.bumpObject(lexer, body.parent);
0652:
0653: while ((node = lexer.getToken(mode)) != null) {
0654:
0655: // #538536 Extra endtags not detected
0656: if (node.tag == tt.tagHtml) {
0657: if (node.type == Node.START_TAG
0658: || node.type == Node.START_END_TAG
0659: || lexer.seenEndHtml) {
0660: lexer.report.warning(lexer, body, node,
0661: Report.DISCARDING_UNEXPECTED);
0662: } else {
0663: lexer.seenEndHtml = true;
0664: }
0665:
0666: continue;
0667: }
0668:
0669: if (lexer.seenEndBody
0670: && (node.type == Node.START_TAG
0671: || node.type == Node.END_TAG || node.type == Node.START_END_TAG)) {
0672: lexer.report.warning(lexer, body, node,
0673: Report.CONTENT_AFTER_BODY);
0674: }
0675:
0676: if (node.tag == body.tag && node.type == Node.END_TAG) {
0677: body.closed = true;
0678: Node.trimSpaces(lexer, body);
0679: lexer.seenEndBody = true;
0680: mode = Lexer.IGNORE_WHITESPACE;
0681:
0682: if (body.parent.tag == tt.tagNoframes) {
0683: break;
0684: }
0685:
0686: continue;
0687: }
0688:
0689: if (node.tag == tt.tagNoframes) {
0690: if (node.type == Node.START_TAG) {
0691: body.insertNodeAtEnd(node);
0692: BLOCK.parse(lexer, node, mode);
0693: continue;
0694: }
0695:
0696: if (node.type == Node.END_TAG
0697: && body.parent.tag == tt.tagNoframes) {
0698: Node.trimSpaces(lexer, body);
0699: lexer.ungetToken();
0700: break;
0701: }
0702: }
0703:
0704: if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset)
0705: && body.parent.tag == tt.tagNoframes) {
0706: Node.trimSpaces(lexer, body);
0707: lexer.ungetToken();
0708: break;
0709: }
0710:
0711: iswhitenode = false;
0712:
0713: if (node.type == Node.TEXT_NODE
0714: && node.end <= node.start + 1
0715: && node.textarray[node.start] == (byte) ' ') {
0716: iswhitenode = true;
0717: }
0718:
0719: // deal with comments etc.
0720: if (Node.insertMisc(body, node)) {
0721: continue;
0722: }
0723:
0724: // #538536 Extra endtags not detected
0725: // if (lexer.seenEndBody && !iswhitenode)
0726: // {
0727: // lexer.seenEndBody = true;
0728: // lexer.report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
0729: // }
0730:
0731: // mixed content model permits text
0732: if (node.type == Node.TEXT_NODE) {
0733: if (iswhitenode && mode == Lexer.IGNORE_WHITESPACE) {
0734: continue;
0735: }
0736:
0737: if (lexer.configuration.encloseBodyText
0738: && !iswhitenode) {
0739: Node para;
0740:
0741: lexer.ungetToken();
0742: para = lexer.inferredTag("p");
0743: body.insertNodeAtEnd(para);
0744: parseTag(lexer, para, mode);
0745: mode = Lexer.MIXED_CONTENT;
0746: continue;
0747: }
0748:
0749: // HTML2 and HTML4 strict doesn't allow text here
0750: lexer
0751: .constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
0752:
0753: if (checkstack) {
0754: checkstack = false;
0755:
0756: if (lexer.inlineDup(node) > 0) {
0757: continue;
0758: }
0759: }
0760:
0761: body.insertNodeAtEnd(node);
0762: mode = Lexer.MIXED_CONTENT;
0763: continue;
0764: }
0765:
0766: if (node.type == Node.DOCTYPE_TAG) {
0767: Node.insertDocType(lexer, body, node);
0768: continue;
0769: }
0770: // discard unknown and PARAM tags
0771: if (node.tag == null || node.tag == tt.tagParam) {
0772: lexer.report.warning(lexer, body, node,
0773: Report.DISCARDING_UNEXPECTED);
0774: continue;
0775: }
0776:
0777: // Netscape allows LI and DD directly in BODY We infer UL or DL respectively and use this boolean to
0778: // exclude block-level elements so as to match Netscape's observed behaviour.
0779:
0780: lexer.excludeBlocks = false;
0781:
0782: if ((!((node.tag.model & Dict.CM_BLOCK) != 0) && !((node.tag.model & Dict.CM_INLINE) != 0))
0783: || node.tag == tt.tagInput) {
0784: // avoid this error message being issued twice
0785: if (!((node.tag.model & Dict.CM_HEAD) != 0)) {
0786: lexer.report.warning(lexer, body, node,
0787: Report.TAG_NOT_ALLOWED_IN);
0788: }
0789:
0790: if ((node.tag.model & Dict.CM_HTML) != 0) {
0791: // copy body attributes if current body was inferred
0792: if (node.tag == tt.tagBody && body.implicit
0793: && body.attributes == null) {
0794: body.attributes = node.attributes;
0795: node.attributes = null;
0796: }
0797:
0798: continue;
0799: }
0800:
0801: if ((node.tag.model & Dict.CM_HEAD) != 0) {
0802: moveToHead(lexer, body, node);
0803: continue;
0804: }
0805:
0806: if ((node.tag.model & Dict.CM_LIST) != 0) {
0807: lexer.ungetToken();
0808: node = lexer.inferredTag("ul");
0809: node.addClass("noindent");
0810: lexer.excludeBlocks = true;
0811: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
0812: lexer.ungetToken();
0813: node = lexer.inferredTag("dl");
0814: lexer.excludeBlocks = true;
0815: } else if ((node.tag.model & (Dict.CM_TABLE
0816: | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0) {
0817: lexer.ungetToken();
0818: node = lexer.inferredTag("table");
0819: lexer.excludeBlocks = true;
0820: } else if (node.tag == tt.tagInput) {
0821: lexer.ungetToken();
0822: node = lexer.inferredTag("form");
0823: lexer.excludeBlocks = true;
0824: } else {
0825: if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0)) {
0826: lexer.ungetToken();
0827: return;
0828: }
0829:
0830: // ignore </td></th> <option> etc.
0831: continue;
0832: }
0833: }
0834:
0835: if (node.type == Node.END_TAG) {
0836: if (node.tag == tt.tagBr) {
0837: node.type = Node.START_TAG;
0838: } else if (node.tag == tt.tagP) {
0839: Node.coerceNode(lexer, node, tt.tagBr);
0840: body.insertNodeAtEnd(node);
0841: node = lexer.inferredTag("br");
0842: } else if ((node.tag.model & Dict.CM_INLINE) != 0) {
0843: lexer.popInline(node);
0844: }
0845: }
0846:
0847: if (node.type == Node.START_TAG
0848: || node.type == Node.START_END_TAG) {
0849: if (((node.tag.model & Dict.CM_INLINE) != 0)
0850: && !((node.tag.model & Dict.CM_MIXED) != 0)) {
0851: // HTML4 strict doesn't allow inline content here
0852: // but HTML2 does allow img elements as children of body
0853: if (node.tag == tt.tagImg) {
0854: lexer
0855: .constrainVersion(~Dict.VERS_HTML40_STRICT);
0856: } else {
0857: lexer
0858: .constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
0859: }
0860:
0861: if (checkstack && !node.implicit) {
0862: checkstack = false;
0863:
0864: if (lexer.inlineDup(node) > 0) {
0865: continue;
0866: }
0867: }
0868:
0869: mode = Lexer.MIXED_CONTENT;
0870: } else {
0871: checkstack = true;
0872: mode = Lexer.IGNORE_WHITESPACE;
0873: }
0874:
0875: if (node.implicit) {
0876: lexer.report.warning(lexer, body, node,
0877: Report.INSERTING_TAG);
0878: }
0879:
0880: body.insertNodeAtEnd(node);
0881: parseTag(lexer, node, mode);
0882: continue;
0883: }
0884:
0885: // discard unexpected tags
0886: lexer.report.warning(lexer, body, node,
0887: Report.DISCARDING_UNEXPECTED);
0888: }
0889: }
0890:
0891: }
0892:
0893: /**
0894: * Parser for FRAMESET.
0895: */
0896: public static class ParseFrameSet implements Parser {
0897:
0898: /**
0899: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
0900: */
0901: public void parse(Lexer lexer, Node frameset, short mode) {
0902: Node node;
0903: TagTable tt = lexer.configuration.tt;
0904:
0905: lexer.badAccess |= Report.USING_FRAMES;
0906:
0907: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
0908: if (node.tag == frameset.tag
0909: && node.type == Node.END_TAG) {
0910: frameset.closed = true;
0911: Node.trimSpaces(lexer, frameset);
0912: return;
0913: }
0914:
0915: // deal with comments etc.
0916: if (Node.insertMisc(frameset, node)) {
0917: continue;
0918: }
0919:
0920: if (node.tag == null) {
0921: lexer.report.warning(lexer, frameset, node,
0922: Report.DISCARDING_UNEXPECTED);
0923: continue;
0924: }
0925:
0926: if (node.type == Node.START_TAG
0927: || node.type == Node.START_END_TAG) {
0928: if (node.tag != null
0929: && (node.tag.model & Dict.CM_HEAD) != 0) {
0930: moveToHead(lexer, frameset, node);
0931: continue;
0932: }
0933: }
0934:
0935: if (node.tag == tt.tagBody) {
0936: lexer.ungetToken();
0937: node = lexer.inferredTag("noframes");
0938: lexer.report.warning(lexer, frameset, node,
0939: Report.INSERTING_TAG);
0940: }
0941:
0942: if (node.type == Node.START_TAG
0943: && (node.tag.model & Dict.CM_FRAMES) != 0) {
0944: frameset.insertNodeAtEnd(node);
0945: lexer.excludeBlocks = false;
0946: parseTag(lexer, node, Lexer.MIXED_CONTENT);
0947: continue;
0948: } else if (node.type == Node.START_END_TAG
0949: && (node.tag.model & Dict.CM_FRAMES) != 0) {
0950: frameset.insertNodeAtEnd(node);
0951: continue;
0952: }
0953:
0954: // discard unexpected tags
0955: lexer.report.warning(lexer, frameset, node,
0956: Report.DISCARDING_UNEXPECTED);
0957: }
0958:
0959: lexer.report.warning(lexer, frameset, node,
0960: Report.MISSING_ENDTAG_FOR);
0961: }
0962:
0963: }
0964:
0965: /**
0966: * Parser for INLINE.
0967: */
0968: public static class ParseInline implements Parser {
0969:
0970: /**
0971: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
0972: */
0973: public void parse(Lexer lexer, Node element, short mode) {
0974: Node node, parent;
0975: TagTable tt = lexer.configuration.tt;
0976:
0977: if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY)) {
0978: return;
0979: }
0980:
0981: // ParseInline is used for some block level elements like H1 to H6 For such elements we need to insert
0982: // inline emphasis tags currently on the inline stack. For Inline elements, we normally push them onto the
0983: // inline stack provided they aren't implicit or OBJECT/APPLET. This test is carried out in PushInline and
0984: // PopInline, see istack.c We don't push SPAN to replicate current browser behavior
0985:
0986: if (TidyUtils.toBoolean(element.tag.model & Dict.CM_BLOCK)
0987: || (element.tag == tt.tagDt)) {
0988: lexer.inlineDup(null);
0989: } else if (TidyUtils.toBoolean(element.tag.model
0990: & Dict.CM_INLINE)) {
0991: // && element.tag != tt.tagSpan #540571 Inconsistent behaviour with span inline element
0992: lexer.pushInline(element);
0993: }
0994:
0995: if (element.tag == tt.tagNobr) {
0996: lexer.badLayout |= Report.USING_NOBR;
0997: } else if (element.tag == tt.tagFont) {
0998: lexer.badLayout |= Report.USING_FONT;
0999: }
1000:
1001: // Inline elements may or may not be within a preformatted element
1002: if (mode != Lexer.PREFORMATTED) {
1003: mode = Lexer.MIXED_CONTENT;
1004: }
1005:
1006: while ((node = lexer.getToken(mode)) != null) {
1007: // end tag for current element
1008: if (node.tag == element.tag
1009: && node.type == Node.END_TAG) {
1010: if (TidyUtils.toBoolean(element.tag.model
1011: & Dict.CM_INLINE)) {
1012: lexer.popInline(node);
1013: }
1014:
1015: if (!TidyUtils.toBoolean(mode & Lexer.PREFORMATTED)) {
1016: Node.trimSpaces(lexer, element);
1017: }
1018:
1019: // if a font element wraps an anchor and nothing else then move the font element inside the anchor
1020: // since otherwise it won't alter the anchor text color
1021:
1022: if (element.tag == tt.tagFont
1023: && element.content != null
1024: && element.content == element.last) {
1025: Node child = element.content;
1026:
1027: if (child.tag == tt.tagA) {
1028: child.parent = element.parent;
1029: child.next = element.next;
1030: child.prev = element.prev;
1031:
1032: if (child.prev != null) {
1033: child.prev.next = child;
1034: } else {
1035: child.parent.content = child;
1036: }
1037:
1038: if (child.next != null) {
1039: child.next.prev = child;
1040: } else {
1041: child.parent.last = child;
1042: }
1043:
1044: element.next = null;
1045: element.prev = null;
1046: element.parent = child;
1047: element.content = child.content;
1048: element.last = child.last;
1049: child.content = element;
1050: child.last = element;
1051: for (child = element.content; child != null; child = child.next) {
1052: child.parent = element;
1053: }
1054: }
1055: }
1056: element.closed = true;
1057: Node.trimSpaces(lexer, element);
1058: Node.trimEmptyElement(lexer, element);
1059: return;
1060: }
1061:
1062: // <u> ... <u> map 2nd <u> to </u> if 1st is explicit
1063: // otherwise emphasis nesting is probably unintentional
1064: // big and small have cumulative effect to leave them alone
1065: if (node.type == Node.START_TAG
1066: && node.tag == element.tag
1067: && lexer.isPushed(node) && !node.implicit
1068: && !element.implicit && node.tag != null
1069: && ((node.tag.model & Dict.CM_INLINE) != 0)
1070: && node.tag != tt.tagA
1071: && node.tag != tt.tagFont
1072: && node.tag != tt.tagBig
1073: && node.tag != tt.tagSmall
1074: && node.tag != tt.tagQ) {
1075: if (element.content != null
1076: && node.attributes == null) {
1077: lexer.report.warning(lexer, element, node,
1078: Report.COERCE_TO_ENDTAG);
1079: node.type = Node.END_TAG;
1080: lexer.ungetToken();
1081: continue;
1082: }
1083:
1084: lexer.report.warning(lexer, element, node,
1085: Report.NESTED_EMPHASIS);
1086: } else if (lexer.isPushed(node)
1087: && node.type == Node.START_TAG
1088: && node.tag == tt.tagQ) {
1089: lexer.report.warning(lexer, element, node,
1090: Report.NESTED_QUOTATION);
1091: }
1092:
1093: if (node.type == Node.TEXT_NODE) {
1094: // only called for 1st child
1095: if (element.content == null
1096: && !TidyUtils.toBoolean(mode
1097: & Lexer.PREFORMATTED)) {
1098: Node.trimSpaces(lexer, element);
1099: }
1100:
1101: if (node.start >= node.end) {
1102: continue;
1103: }
1104:
1105: element.insertNodeAtEnd(node);
1106: continue;
1107: }
1108:
1109: // mixed content model so allow text
1110: if (Node.insertMisc(element, node)) {
1111: continue;
1112: }
1113:
1114: // deal with HTML tags
1115: if (node.tag == tt.tagHtml) {
1116: if (node.type == Node.START_TAG
1117: || node.type == Node.START_END_TAG) {
1118: lexer.report.warning(lexer, element, node,
1119: Report.DISCARDING_UNEXPECTED);
1120: continue;
1121: }
1122:
1123: // otherwise infer end of inline element
1124: lexer.ungetToken();
1125: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1126: Node.trimSpaces(lexer, element);
1127: }
1128: Node.trimEmptyElement(lexer, element);
1129: return;
1130: }
1131:
1132: // within <dt> or <pre> map <p> to <br>
1133: if (node.tag == tt.tagP
1134: && node.type == Node.START_TAG
1135: && ((mode & Lexer.PREFORMATTED) != 0
1136: || element.tag == tt.tagDt || element
1137: .isDescendantOf(tt.tagDt))) {
1138: node.tag = tt.tagBr;
1139: node.element = "br";
1140: Node.trimSpaces(lexer, element);
1141: element.insertNodeAtEnd(node);
1142: continue;
1143: }
1144:
1145: // ignore unknown and PARAM tags
1146: if (node.tag == null || node.tag == tt.tagParam) {
1147: lexer.report.warning(lexer, element, node,
1148: Report.DISCARDING_UNEXPECTED);
1149: continue;
1150: }
1151:
1152: if (node.tag == tt.tagBr && node.type == Node.END_TAG) {
1153: node.type = Node.START_TAG;
1154: }
1155:
1156: if (node.type == Node.END_TAG) {
1157: // coerce </br> to <br>
1158: if (node.tag == tt.tagBr) {
1159: node.type = Node.START_TAG;
1160: } else if (node.tag == tt.tagP) {
1161: // coerce unmatched </p> to <br><br>
1162: if (!element.isDescendantOf(tt.tagP)) {
1163: Node.coerceNode(lexer, node, tt.tagBr);
1164: Node.trimSpaces(lexer, element);
1165: element.insertNodeAtEnd(node);
1166: node = lexer.inferredTag("br");
1167: continue;
1168: }
1169: } else if ((node.tag.model & Dict.CM_INLINE) != 0
1170: && node.tag != tt.tagA
1171: && !((node.tag.model & Dict.CM_OBJECT) != 0)
1172: && (element.tag.model & Dict.CM_INLINE) != 0) {
1173: // allow any inline end tag to end current element
1174: lexer.popInline(element);
1175:
1176: if (element.tag != tt.tagA) {
1177: if (node.tag == tt.tagA
1178: && node.tag != element.tag) {
1179: lexer.report.warning(lexer, element,
1180: node,
1181: Report.MISSING_ENDTAG_BEFORE);
1182: lexer.ungetToken();
1183: } else {
1184: lexer.report.warning(lexer, element,
1185: node,
1186: Report.NON_MATCHING_ENDTAG);
1187: }
1188:
1189: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1190: Node.trimSpaces(lexer, element);
1191: }
1192: Node.trimEmptyElement(lexer, element);
1193: return;
1194: }
1195:
1196: // if parent is <a> then discard unexpected inline end tag
1197: lexer.report.warning(lexer, element, node,
1198: Report.DISCARDING_UNEXPECTED);
1199: continue;
1200: } // special case </tr> etc. for stuff moved in front of table
1201: else if (lexer.exiled && node.tag.model != 0
1202: && (node.tag.model & Dict.CM_TABLE) != 0) {
1203: lexer.ungetToken();
1204: Node.trimSpaces(lexer, element);
1205: Node.trimEmptyElement(lexer, element);
1206: return;
1207: }
1208: }
1209:
1210: // allow any header tag to end current header
1211: if ((node.tag.model & Dict.CM_HEADING) != 0
1212: && (element.tag.model & Dict.CM_HEADING) != 0) {
1213: if (node.tag == element.tag) {
1214: lexer.report.warning(lexer, element, node,
1215: Report.NON_MATCHING_ENDTAG);
1216: } else {
1217: lexer.report.warning(lexer, element, node,
1218: Report.MISSING_ENDTAG_BEFORE);
1219: lexer.ungetToken();
1220: }
1221: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1222: Node.trimSpaces(lexer, element);
1223: }
1224: Node.trimEmptyElement(lexer, element);
1225: return;
1226: }
1227:
1228: // an <A> tag to ends any open <A> element but <A href=...> is mapped to </A><A href=...>
1229:
1230: // #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00
1231: // if (node.tag == tt.tagA && !node.implicit && lexer.isPushed(node))
1232: if (node.tag == tt.tagA
1233: && !node.implicit
1234: && (element.tag == tt.tagA || element
1235: .isDescendantOf(tt.tagA))) {
1236: // coerce <a> to </a> unless it has some attributes
1237: // #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00
1238: // other fixes by Dave Raggett
1239: // if (node.attributes == null)
1240: if (node.type != Node.END_TAG
1241: && node.attributes == null) {
1242: node.type = Node.END_TAG;
1243: lexer.report.warning(lexer, element, node,
1244: Report.COERCE_TO_ENDTAG);
1245: // lexer.popInline(node);
1246: lexer.ungetToken();
1247: continue;
1248: }
1249:
1250: lexer.ungetToken();
1251: lexer.report.warning(lexer, element, node,
1252: Report.MISSING_ENDTAG_BEFORE);
1253: // lexer.popInline(element);
1254: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1255: Node.trimSpaces(lexer, element);
1256: }
1257: Node.trimEmptyElement(lexer, element);
1258: return;
1259: }
1260:
1261: if ((element.tag.model & Dict.CM_HEADING) != 0) {
1262: if (node.tag == tt.tagCenter
1263: || node.tag == tt.tagDiv) {
1264: if (node.type != Node.START_TAG
1265: && node.type != Node.START_END_TAG) {
1266: lexer.report.warning(lexer, element, node,
1267: Report.DISCARDING_UNEXPECTED);
1268: continue;
1269: }
1270:
1271: lexer.report.warning(lexer, element, node,
1272: Report.TAG_NOT_ALLOWED_IN);
1273:
1274: // insert center as parent if heading is empty
1275: if (element.content == null) {
1276: Node.insertNodeAsParent(element, node);
1277: continue;
1278: }
1279:
1280: // split heading and make center parent of 2nd part
1281: element.insertNodeAfterElement(node);
1282:
1283: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1284: Node.trimSpaces(lexer, element);
1285: }
1286:
1287: element = lexer.cloneNode(element);
1288: element.start = lexer.lexsize;
1289: element.end = lexer.lexsize;
1290: node.insertNodeAtEnd(element);
1291: continue;
1292: }
1293:
1294: if (node.tag == tt.tagHr) {
1295: if (node.type != Node.START_TAG
1296: && node.type != Node.START_END_TAG) {
1297: lexer.report.warning(lexer, element, node,
1298: Report.DISCARDING_UNEXPECTED);
1299: continue;
1300: }
1301:
1302: lexer.report.warning(lexer, element, node,
1303: Report.TAG_NOT_ALLOWED_IN);
1304:
1305: // insert hr before heading if heading is empty
1306: if (element.content == null) {
1307: Node.insertNodeBeforeElement(element, node);
1308: continue;
1309: }
1310:
1311: // split heading and insert hr before 2nd part
1312: element.insertNodeAfterElement(node);
1313:
1314: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1315: Node.trimSpaces(lexer, element);
1316: }
1317:
1318: element = lexer.cloneNode(element);
1319: element.start = lexer.lexsize;
1320: element.end = lexer.lexsize;
1321: node.insertNodeAfterElement(element);
1322: continue;
1323: }
1324: }
1325:
1326: if (element.tag == tt.tagDt) {
1327: if (node.tag == tt.tagHr) {
1328: Node dd;
1329:
1330: if (node.type != Node.START_TAG
1331: && node.type != Node.START_END_TAG) {
1332: lexer.report.warning(lexer, element, node,
1333: Report.DISCARDING_UNEXPECTED);
1334: continue;
1335: }
1336:
1337: lexer.report.warning(lexer, element, node,
1338: Report.TAG_NOT_ALLOWED_IN);
1339: dd = lexer.inferredTag("dd");
1340:
1341: // insert hr within dd before dt if dt is empty
1342: if (element.content == null) {
1343: Node.insertNodeBeforeElement(element, dd);
1344: dd.insertNodeAtEnd(node);
1345: continue;
1346: }
1347:
1348: // split dt and insert hr within dd before 2nd part
1349: element.insertNodeAfterElement(dd);
1350: dd.insertNodeAtEnd(node);
1351:
1352: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1353: Node.trimSpaces(lexer, element);
1354: }
1355:
1356: element = lexer.cloneNode(element);
1357: element.start = lexer.lexsize;
1358: element.end = lexer.lexsize;
1359: dd.insertNodeAfterElement(element);
1360: continue;
1361: }
1362: }
1363:
1364: // if this is the end tag for an ancestor element then infer end tag for this element
1365:
1366: if (node.type == Node.END_TAG) {
1367: for (parent = element.parent; parent != null; parent = parent.parent) {
1368: if (node.tag == parent.tag) {
1369: if (!((element.tag.model & Dict.CM_OPT) != 0)
1370: && !element.implicit) {
1371: lexer.report.warning(lexer, element,
1372: node,
1373: Report.MISSING_ENDTAG_BEFORE);
1374: }
1375:
1376: if (element.tag == tt.tagA) {
1377: lexer.popInline(element);
1378: }
1379:
1380: lexer.ungetToken();
1381:
1382: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1383: Node.trimSpaces(lexer, element);
1384: }
1385:
1386: Node.trimEmptyElement(lexer, element);
1387: return;
1388: }
1389: }
1390: }
1391:
1392: // block level tags end this element
1393: if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
1394: if (node.type != Node.START_TAG) {
1395: lexer.report.warning(lexer, element, node,
1396: Report.DISCARDING_UNEXPECTED);
1397: continue;
1398: }
1399:
1400: if (!((element.tag.model & Dict.CM_OPT) != 0)) {
1401: lexer.report.warning(lexer, element, node,
1402: Report.MISSING_ENDTAG_BEFORE);
1403: }
1404:
1405: if ((node.tag.model & Dict.CM_HEAD) != 0
1406: && !((node.tag.model & Dict.CM_BLOCK) != 0)) {
1407: moveToHead(lexer, element, node);
1408: continue;
1409: }
1410:
1411: // prevent anchors from propagating into block tags except for headings h1 to h6
1412:
1413: if (element.tag == tt.tagA) {
1414: if (node.tag != null
1415: && !((node.tag.model & Dict.CM_HEADING) != 0)) {
1416: lexer.popInline(element);
1417: } else if (!(element.content != null)) {
1418: Node.discardElement(element);
1419: lexer.ungetToken();
1420: return;
1421: }
1422: }
1423:
1424: lexer.ungetToken();
1425:
1426: if (!((mode & Lexer.PREFORMATTED) != 0)) {
1427: Node.trimSpaces(lexer, element);
1428: }
1429:
1430: Node.trimEmptyElement(lexer, element);
1431: return;
1432: }
1433:
1434: // parse inline element
1435: if (node.type == Node.START_TAG
1436: || node.type == Node.START_END_TAG) {
1437: if (node.implicit) {
1438: lexer.report.warning(lexer, element, node,
1439: Report.INSERTING_TAG);
1440: }
1441:
1442: // trim white space before <br>
1443: if (node.tag == tt.tagBr) {
1444: Node.trimSpaces(lexer, element);
1445: }
1446:
1447: element.insertNodeAtEnd(node);
1448: parseTag(lexer, node, mode);
1449: continue;
1450: }
1451:
1452: // discard unexpected tags
1453: lexer.report.warning(lexer, element, node,
1454: Report.DISCARDING_UNEXPECTED);
1455: continue;
1456: }
1457:
1458: if (!((element.tag.model & Dict.CM_OPT) != 0)) {
1459: lexer.report.warning(lexer, element, node,
1460: Report.MISSING_ENDTAG_FOR);
1461: }
1462:
1463: Node.trimEmptyElement(lexer, element);
1464: }
1465: }
1466:
1467: /**
1468: * Parser for LIST.
1469: */
1470: public static class ParseList implements Parser {
1471:
1472: public void parse(Lexer lexer, Node list, short mode) {
1473: Node node;
1474: Node parent;
1475: TagTable tt = lexer.configuration.tt;
1476:
1477: if ((list.tag.model & Dict.CM_EMPTY) != 0) {
1478: return;
1479: }
1480:
1481: lexer.insert = -1; // defer implicit inline start tags
1482:
1483: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
1484: if (node.tag == list.tag && node.type == Node.END_TAG) {
1485: if ((list.tag.model & Dict.CM_OBSOLETE) != 0) {
1486: Node.coerceNode(lexer, list, tt.tagUl);
1487: }
1488:
1489: list.closed = true;
1490: Node.trimEmptyElement(lexer, list);
1491: return;
1492: }
1493:
1494: // deal with comments etc.
1495: if (Node.insertMisc(list, node)) {
1496: continue;
1497: }
1498:
1499: if (node.type != Node.TEXT_NODE && node.tag == null) {
1500: lexer.report.warning(lexer, list, node,
1501: Report.DISCARDING_UNEXPECTED);
1502: continue;
1503: }
1504:
1505: // if this is the end tag for an ancestor element then infer end tag for this element
1506:
1507: if (node.type == Node.END_TAG) {
1508: if (node.tag == tt.tagForm) {
1509: badForm(lexer);
1510: lexer.report.warning(lexer, list, node,
1511: Report.DISCARDING_UNEXPECTED);
1512: continue;
1513: }
1514:
1515: if (node.tag != null
1516: && (node.tag.model & Dict.CM_INLINE) != 0) {
1517: lexer.report.warning(lexer, list, node,
1518: Report.DISCARDING_UNEXPECTED);
1519: lexer.popInline(node);
1520: continue;
1521: }
1522:
1523: for (parent = list.parent; parent != null; parent = parent.parent) {
1524: if (node.tag == parent.tag) {
1525: lexer.report.warning(lexer, list, node,
1526: Report.MISSING_ENDTAG_BEFORE);
1527: lexer.ungetToken();
1528:
1529: if ((list.tag.model & Dict.CM_OBSOLETE) != 0) {
1530: Node.coerceNode(lexer, list, tt.tagUl);
1531: }
1532:
1533: Node.trimEmptyElement(lexer, list);
1534: return;
1535: }
1536: }
1537:
1538: lexer.report.warning(lexer, list, node,
1539: Report.DISCARDING_UNEXPECTED);
1540: continue;
1541: }
1542:
1543: if (node.tag != tt.tagLi) {
1544: lexer.ungetToken();
1545:
1546: if (node.tag != null
1547: && (node.tag.model & Dict.CM_BLOCK) != 0
1548: && lexer.excludeBlocks) {
1549: lexer.report.warning(lexer, list, node,
1550: Report.MISSING_ENDTAG_BEFORE);
1551: Node.trimEmptyElement(lexer, list);
1552: return;
1553: }
1554:
1555: node = lexer.inferredTag("li");
1556: node.addAttribute("style", "list-style: none");
1557: lexer.report.warning(lexer, list, node,
1558: Report.MISSING_STARTTAG);
1559: }
1560:
1561: // node should be <LI>
1562: list.insertNodeAtEnd(node);
1563: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
1564: }
1565:
1566: if ((list.tag.model & Dict.CM_OBSOLETE) != 0) {
1567: Node.coerceNode(lexer, list, tt.tagUl);
1568: }
1569:
1570: lexer.report.warning(lexer, list, node,
1571: Report.MISSING_ENDTAG_FOR);
1572: Node.trimEmptyElement(lexer, list);
1573: }
1574:
1575: }
1576:
1577: /**
1578: * Parser for empty elements.
1579: */
1580: public static class ParseEmpty implements Parser {
1581:
1582: /**
1583: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1584: */
1585: public void parse(Lexer lexer, Node element, short mode) {
1586: if (lexer.isvoyager) {
1587: Node node = lexer.getToken(mode);
1588: if (node != null
1589: && !(node.type == Node.END_TAG && node.tag == element.tag)) {
1590: lexer.report.warning(lexer, element, node,
1591: Report.ELEMENT_NOT_EMPTY);
1592: lexer.ungetToken();
1593: }
1594: }
1595: }
1596: }
1597:
1598: /**
1599: * Parser for DEFLIST.
1600: */
1601: public static class ParseDefList implements Parser {
1602:
1603: /**
1604: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1605: */
1606: public void parse(Lexer lexer, Node list, short mode) {
1607: Node node, parent;
1608: TagTable tt = lexer.configuration.tt;
1609:
1610: if ((list.tag.model & Dict.CM_EMPTY) != 0) {
1611: return;
1612: }
1613:
1614: lexer.insert = -1; // defer implicit inline start tags
1615:
1616: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
1617: if (node.tag == list.tag && node.type == Node.END_TAG) {
1618: list.closed = true;
1619: Node.trimEmptyElement(lexer, list);
1620: return;
1621: }
1622:
1623: // deal with comments etc.
1624: if (Node.insertMisc(list, node)) {
1625: continue;
1626: }
1627:
1628: if (node.type == Node.TEXT_NODE) {
1629: lexer.ungetToken();
1630: node = lexer.inferredTag("dt");
1631: lexer.report.warning(lexer, list, node,
1632: Report.MISSING_STARTTAG);
1633: }
1634:
1635: if (node.tag == null) {
1636: lexer.report.warning(lexer, list, node,
1637: Report.DISCARDING_UNEXPECTED);
1638: continue;
1639: }
1640:
1641: // if this is the end tag for an ancestor element then infer end tag for this element
1642:
1643: if (node.type == Node.END_TAG) {
1644: if (node.tag == tt.tagForm) {
1645: badForm(lexer);
1646: lexer.report.warning(lexer, list, node,
1647: Report.DISCARDING_UNEXPECTED);
1648: continue;
1649: }
1650:
1651: for (parent = list.parent; parent != null; parent = parent.parent) {
1652: if (node.tag == parent.tag) {
1653: lexer.report.warning(lexer, list, node,
1654: Report.MISSING_ENDTAG_BEFORE);
1655:
1656: lexer.ungetToken();
1657: Node.trimEmptyElement(lexer, list);
1658: return;
1659: }
1660: }
1661: }
1662:
1663: // center in a dt or a dl breaks the dl list in two
1664: if (node.tag == tt.tagCenter) {
1665: if (list.content != null) {
1666: list.insertNodeAfterElement(node);
1667: } else {
1668: // trim empty dl list
1669: Node.insertNodeBeforeElement(list, node);
1670:
1671: // #540296 tidy dumps with empty definition list
1672: Node.discardElement(list);
1673: }
1674:
1675: // and parse contents of center
1676: parseTag(lexer, node, mode);
1677:
1678: // now create a new dl element
1679: list = lexer.inferredTag("dl");
1680: node.insertNodeAfterElement(list);
1681: continue;
1682: }
1683:
1684: if (!(node.tag == tt.tagDt || node.tag == tt.tagDd)) {
1685: lexer.ungetToken();
1686:
1687: if (!((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) {
1688: lexer.report.warning(lexer, list, node,
1689: Report.TAG_NOT_ALLOWED_IN);
1690: Node.trimEmptyElement(lexer, list);
1691: return;
1692: }
1693:
1694: // if DD appeared directly in BODY then exclude blocks
1695: if (!((node.tag.model & Dict.CM_INLINE) != 0)
1696: && lexer.excludeBlocks) {
1697: Node.trimEmptyElement(lexer, list);
1698: return;
1699: }
1700:
1701: node = lexer.inferredTag("dd");
1702: lexer.report.warning(lexer, list, node,
1703: Report.MISSING_STARTTAG);
1704: }
1705:
1706: if (node.type == Node.END_TAG) {
1707: lexer.report.warning(lexer, list, node,
1708: Report.DISCARDING_UNEXPECTED);
1709: continue;
1710: }
1711:
1712: // node should be <DT> or <DD>
1713: list.insertNodeAtEnd(node);
1714: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
1715: }
1716:
1717: lexer.report.warning(lexer, list, node,
1718: Report.MISSING_ENDTAG_FOR);
1719: Node.trimEmptyElement(lexer, list);
1720: }
1721:
1722: }
1723:
1724: /**
1725: * Parser for PRE.
1726: */
1727: public static class ParsePre implements Parser {
1728:
1729: /**
1730: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1731: */
1732: public void parse(Lexer lexer, Node pre, short mode) {
1733: Node node;
1734: TagTable tt = lexer.configuration.tt;
1735:
1736: if ((pre.tag.model & Dict.CM_EMPTY) != 0) {
1737: return;
1738: }
1739:
1740: if ((pre.tag.model & Dict.CM_OBSOLETE) != 0) {
1741: Node.coerceNode(lexer, pre, tt.tagPre);
1742: }
1743:
1744: lexer.inlineDup(null); // tell lexer to insert inlines if needed
1745:
1746: while ((node = lexer.getToken(Lexer.PREFORMATTED)) != null) {
1747: if (node.tag == pre.tag && node.type == Node.END_TAG) {
1748: Node.trimSpaces(lexer, pre);
1749: pre.closed = true;
1750: Node.trimEmptyElement(lexer, pre);
1751: return;
1752: }
1753:
1754: if (node.tag == tt.tagHtml) {
1755: if (node.type == Node.START_TAG
1756: || node.type == Node.START_END_TAG) {
1757: lexer.report.warning(lexer, pre, node,
1758: Report.DISCARDING_UNEXPECTED);
1759: }
1760:
1761: continue;
1762: }
1763:
1764: if (node.type == Node.TEXT_NODE) {
1765: // if first check for inital newline
1766: if (pre.content == null) {
1767: if (node.textarray[node.start] == (byte) '\n') {
1768: ++node.start;
1769: }
1770:
1771: if (node.start >= node.end) {
1772: continue;
1773: }
1774: }
1775:
1776: pre.insertNodeAtEnd(node);
1777: continue;
1778: }
1779:
1780: // deal with comments etc.
1781: if (Node.insertMisc(pre, node)) {
1782: continue;
1783: }
1784:
1785: // strip unexpected tags
1786: if (!lexer.preContent(node)) {
1787: Node newnode;
1788:
1789: lexer.report.warning(lexer, pre, node,
1790: Report.UNESCAPED_ELEMENT);
1791: newnode = Node.escapeTag(lexer, node);
1792: pre.insertNodeAtEnd(newnode);
1793: continue;
1794: }
1795:
1796: if (node.tag == tt.tagP) {
1797: if (node.type == Node.START_TAG) {
1798: lexer.report.warning(lexer, pre, node,
1799: Report.USING_BR_INPLACE_OF);
1800:
1801: // trim white space before <p> in <pre>
1802: Node.trimSpaces(lexer, pre);
1803:
1804: // coerce both <p> and </p> to <br>
1805: Node.coerceNode(lexer, node, tt.tagBr);
1806: pre.insertNodeAtEnd(node);
1807: } else {
1808: lexer.report.warning(lexer, pre, node,
1809: Report.DISCARDING_UNEXPECTED);
1810: }
1811: continue;
1812: }
1813:
1814: if (node.type == Node.START_TAG
1815: || node.type == Node.START_END_TAG) {
1816: // trim white space before <br>
1817: if (node.tag == tt.tagBr) {
1818: Node.trimSpaces(lexer, pre);
1819: }
1820:
1821: pre.insertNodeAtEnd(node);
1822: parseTag(lexer, node, Lexer.PREFORMATTED);
1823: continue;
1824: }
1825:
1826: // discard unexpected tags
1827: lexer.report.warning(lexer, pre, node,
1828: Report.DISCARDING_UNEXPECTED);
1829: }
1830:
1831: lexer.report.warning(lexer, pre, node,
1832: Report.MISSING_ENDTAG_FOR);
1833: Node.trimEmptyElement(lexer, pre);
1834: }
1835:
1836: }
1837:
/**
 * Parser for block elements. Reads the tokens that follow a block-level start tag and appends them to the
 * element until its end tag is seen or has to be inferred.
 */
public static class ParseBlock implements Parser {

    /**
     * Parses the content of a block element.
     * @param lexer token source; also supplies the report sink, the inline stack and the tag table
     * @param element block element that receives the parsed children
     * @param mode incoming value is not used: it is overwritten with <code>Lexer.IGNORE_WHITESPACE</code>
     * before the first token is read and afterwards tracks whether the last content was inline/text
     * (<code>Lexer.MIXED_CONTENT</code>) or block (<code>Lexer.IGNORE_WHITESPACE</code>)
     * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
     */
    public void parse(Lexer lexer, Node element, short mode) {
        // element is node created by the lexer upon seeing the start tag, or by the parser when the start tag is
        // inferred.
        Node node, parent;
        // true while the next text/inline token should first be offered to lexer.inlineDup()
        boolean checkstack;
        // saved value of lexer.istackbase, only meaningful for CM_OBJECT elements
        int istackbase = 0;
        TagTable tt = lexer.configuration.tt;

        checkstack = true;

        if ((element.tag.model & Dict.CM_EMPTY) != 0) {
            return;
        }

        // a form nested inside another form is only reported, parsing goes on
        if (element.tag == tt.tagForm
                && element.isDescendantOf(tt.tagForm)) {
            lexer.report.warning(lexer, element, null,
                    Report.ILLEGAL_NESTING);
        }

        // InlineDup() asks the lexer to insert inline emphasis tags currently pushed on the istack, but take care
        // to avoid propagating inline emphasis inside OBJECT or APPLET. For these elements a fresh inline stack
        // context is created and disposed of upon reaching the end of the element. They thus behave like table
        // cells in this respect.

        if ((element.tag.model & Dict.CM_OBJECT) != 0) {
            istackbase = lexer.istackbase;
            lexer.istackbase = lexer.istack.size();
        }

        if (!((element.tag.model & Dict.CM_MIXED) != 0)) {
            lexer.inlineDup(null);
        }

        mode = Lexer.IGNORE_WHITESPACE;

        while ((node = lexer.getToken(mode)) != null) {
            // end tag for this element (element.was is also accepted as a match)
            if (node.type == Node.END_TAG
                    && node.tag != null
                    && (node.tag == element.tag || element.was == node.tag)) {

                if ((element.tag.model & Dict.CM_OBJECT) != 0) {
                    // pop inline stack
                    while (lexer.istack.size() > lexer.istackbase) {
                        lexer.popInline(null);
                    }
                    lexer.istackbase = istackbase;
                }

                element.closed = true;
                Node.trimSpaces(lexer, element);
                Node.trimEmptyElement(lexer, element);
                return;
            }

            // html/head/body tags are dropped here; only their start tags are reported
            if (node.tag == tt.tagHtml || node.tag == tt.tagHead
                    || node.tag == tt.tagBody) {
                if (node.type == Node.START_TAG
                        || node.type == Node.START_END_TAG) {
                    lexer.report.warning(lexer, element, node,
                            Report.DISCARDING_UNEXPECTED);
                }

                continue;
            }

            if (node.type == Node.END_TAG) {
                if (node.tag == null) {
                    // unknown end tag
                    lexer.report.warning(lexer, element, node,
                            Report.DISCARDING_UNEXPECTED);

                    continue;
                } else if (node.tag == tt.tagBr) {
                    // treat </br> as <br>
                    node.type = Node.START_TAG;
                } else if (node.tag == tt.tagP) {
                    // unmatched </p>: coerce it to <br>, insert it, and carry on with a second inferred <br>
                    Node.coerceNode(lexer, node, tt.tagBr);
                    element.insertNodeAtEnd(node);
                    node = lexer.inferredTag("br");
                } else {
                    // if this is the end tag for an ancestor element then infer end tag for this element

                    for (parent = element.parent; parent != null; parent = parent.parent) {
                        if (node.tag == parent.tag) {
                            if (!((element.tag.model & Dict.CM_OPT) != 0)) {
                                lexer.report
                                        .warning(
                                                lexer,
                                                element,
                                                node,
                                                Report.MISSING_ENDTAG_BEFORE);
                            }

                            // leave the end tag for the ancestor's parser
                            lexer.ungetToken();

                            if ((element.tag.model & Dict.CM_OBJECT) != 0) {
                                // pop inline stack
                                while (lexer.istack.size() > lexer.istackbase) {
                                    lexer.popInline(null);
                                }
                                lexer.istackbase = istackbase;
                            }

                            Node.trimSpaces(lexer, element);
                            Node.trimEmptyElement(lexer, element);
                            return;
                        }
                    }
                    // special case </tr> etc. for stuff moved in front of table
                    // NOTE(review): this exit does not restore lexer.istackbase for CM_OBJECT elements,
                    // unlike the ancestor case above - confirm whether that is intended
                    if (lexer.exiled
                            && node.tag.model != 0
                            && (node.tag.model & Dict.CM_TABLE) != 0) {
                        lexer.ungetToken();
                        Node.trimSpaces(lexer, element);
                        Node.trimEmptyElement(lexer, element);
                        return;
                    }
                }
            }

            // mixed content model permits text
            if (node.type == Node.TEXT_NODE) {
                boolean iswhitenode = false;

                // a text token of at most one character that is a blank (the type test repeats the
                // enclosing condition)
                if (node.type == Node.TEXT_NODE
                        && node.end <= node.start + 1
                        && lexer.lexbuf[node.start] == (byte) ' ') {
                    iswhitenode = true;
                }

                // with encloseBlockText set, non-blank text is pushed back and re-read inside an inferred <p>
                if (lexer.configuration.encloseBlockText
                        && !iswhitenode) {
                    lexer.ungetToken();
                    node = lexer.inferredTag("p");
                    element.insertNodeAtEnd(node);
                    parseTag(lexer, node, Lexer.MIXED_CONTENT);
                    continue;
                }

                if (checkstack) {
                    checkstack = false;

                    if (!((element.tag.model & Dict.CM_MIXED) != 0)) {
                        // a positive result from inlineDup() means this token is not inserted in this pass
                        if (lexer.inlineDup(node) > 0) {
                            continue;
                        }
                    }
                }

                element.insertNodeAtEnd(node);
                mode = Lexer.MIXED_CONTENT;

                // HTML4 strict doesn't allow mixed content for elements with %block; as their content model
                // But only body, map, blockquote, form and noscript have content model %block;
                if (element.tag == tt.tagBody
                        || element.tag == tt.tagMap
                        || element.tag == tt.tagBlockquote
                        || element.tag == tt.tagForm
                        || element.tag == tt.tagNoscript) {
                    lexer
                            .constrainVersion(~Dict.VERS_HTML40_STRICT);
                }
                continue;
            }

            // deal with comments etc.
            if (Node.insertMisc(element, node)) {
                continue;
            }

            // allow PARAM elements?
            if (node.tag == tt.tagParam) {
                // accepted only as a start tag inside an element whose model has CM_PARAM
                if (((element.tag.model & Dict.CM_PARAM) != 0)
                        && (node.type == Node.START_TAG || node.type == Node.START_END_TAG)) {
                    element.insertNodeAtEnd(node);
                    continue;
                }

                // otherwise discard it
                lexer.report.warning(lexer, element, node,
                        Report.DISCARDING_UNEXPECTED);
                continue;
            }

            // allow AREA elements?
            if (node.tag == tt.tagArea) {
                // accepted only as a start tag directly inside MAP
                if ((element.tag == tt.tagMap)
                        && (node.type == Node.START_TAG || node.type == Node.START_END_TAG)) {
                    element.insertNodeAtEnd(node);
                    continue;
                }

                // otherwise discard it
                lexer.report.warning(lexer, element, node,
                        Report.DISCARDING_UNEXPECTED);
                continue;
            }

            // ignore unknown start/end tags
            if (node.tag == null) {
                lexer.report.warning(lexer, element, node,
                        Report.DISCARDING_UNEXPECTED);
                continue;
            }

            // Allow Dict.CM_INLINE elements here. Allow Dict.CM_BLOCK elements here unless lexer.excludeBlocks is
            // yes. LI and DD are special cased. Otherwise infer end tag for this element.

            if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
                // a non-inline tag that is not a start tag cannot be used here
                if (node.type != Node.START_TAG
                        && node.type != Node.START_END_TAG) {
                    if (node.tag == tt.tagForm) {
                        badForm(lexer);
                    }
                    lexer.report.warning(lexer, element, node,
                            Report.DISCARDING_UNEXPECTED);
                    continue;
                }

                // #427671 - Fix by Randy Waki - 10 Aug 00
                // If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION start tag, discard the start
                // tag and let the subsequent content get parsed as content of the enclosing LI. This seems to
                // mimic IE and Netscape, and avoids an infinite loop: without this check, ParseBlock (which is
                // parsing the LI's content) and ParseList (which is parsing the LI's parent's content) repeatedly
                // defer to each other to parse the illegal start tag, each time inferring a missing </li> or <li>
                // respectively. NOTE: This check is a bit fragile. It specifically checks for the four tags that
                // happen to weave their way through the current series of tests performed by ParseBlock and
                // ParseList to trigger the infinite loop.

                if (element.tag == tt.tagLi) {
                    if (node.tag == tt.tagFrame
                            || node.tag == tt.tagFrameset
                            || node.tag == tt.tagOptgroup
                            || node.tag == tt.tagOption) {
                        lexer.report.warning(lexer, element, node,
                                Report.DISCARDING_UNEXPECTED);
                        continue;
                    }
                }

                if (element.tag == tt.tagTd
                        || element.tag == tt.tagTh) {
                    // if parent is a table cell, avoid inferring the end of the cell

                    if ((node.tag.model & Dict.CM_HEAD) != 0) {
                        moveToHead(lexer, element, node);
                        continue;
                    }

                    // list items / definition items found directly in a cell: push the token back and
                    // substitute an inferred list container, which is then parsed as a child below
                    if ((node.tag.model & Dict.CM_LIST) != 0) {
                        lexer.ungetToken();
                        node = lexer.inferredTag("ul");
                        node.addClass("noindent");
                        lexer.excludeBlocks = true;
                    } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
                        lexer.ungetToken();
                        node = lexer.inferredTag("dl");
                        lexer.excludeBlocks = true;
                    }

                    // infer end of current table cell
                    if (!((node.tag.model & Dict.CM_BLOCK) != 0)) {
                        lexer.ungetToken();
                        Node.trimSpaces(lexer, element);
                        Node.trimEmptyElement(lexer, element);
                        return;
                    }
                } else if ((node.tag.model & Dict.CM_BLOCK) != 0) {
                    // block start tag: accepted as a child below unless blocks are currently excluded
                    if (lexer.excludeBlocks) {
                        if (!((element.tag.model & Dict.CM_OPT) != 0)) {
                            lexer.report.warning(lexer, element,
                                    node,
                                    Report.MISSING_ENDTAG_BEFORE);
                        }

                        lexer.ungetToken();

                        // NOTE(review): istackbase is restored here without first popping the inline
                        // stack, unlike the other CM_OBJECT exits in this method - confirm this is intended
                        if ((element.tag.model & Dict.CM_OBJECT) != 0) {
                            lexer.istackbase = istackbase;
                        }

                        Node.trimSpaces(lexer, element);
                        Node.trimEmptyElement(lexer, element);
                        return;
                    }
                } else {
                    // things like list items

                    if ((node.tag.model & Dict.CM_HEAD) != 0) {
                        moveToHead(lexer, element, node);
                        continue;
                    }

                    // special case where a form start tag occurs in a tr and is followed by td or th
                    // NOTE(review): element.parent is dereferenced without a null check here, while the
                    // CM_LIST case below does check it - presumably element always has a parent; confirm
                    if (element.tag == tt.tagForm
                            && element.parent.tag == tt.tagTd
                            && element.parent.implicit) {
                        if (node.tag == tt.tagTd) {
                            lexer.report.warning(lexer, element,
                                    node,
                                    Report.DISCARDING_UNEXPECTED);
                            continue;
                        }

                        if (node.tag == tt.tagTh) {
                            lexer.report.warning(lexer, element,
                                    node,
                                    Report.DISCARDING_UNEXPECTED);
                            // retag the implicit parent cell as <th> instead of inserting the token
                            node = element.parent;
                            node.element = "th";
                            node.tag = tt.tagTh;
                            continue;
                        }
                    }

                    if (!((element.tag.model & Dict.CM_OPT) != 0)
                            && !element.implicit) {
                        lexer.report.warning(lexer, element, node,
                                Report.MISSING_ENDTAG_BEFORE);
                    }

                    // the token is pushed back in every remaining case: either this element ends here
                    // (return) or an inferred container takes its place and re-reads it
                    lexer.ungetToken();

                    if ((node.tag.model & Dict.CM_LIST) != 0) {
                        // already inside a list: end this element and let the list parser take the item
                        if (element.parent != null
                                && element.parent.tag != null
                                && element.parent.tag.getParser() == LIST) {
                            Node.trimSpaces(lexer, element);
                            Node.trimEmptyElement(lexer, element);
                            return;
                        }

                        node = lexer.inferredTag("ul");
                        node.addClass("noindent");
                    } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
                        // NOTE(review): element.parent is not null-checked here either - confirm
                        if (element.parent.tag == tt.tagDl) {
                            Node.trimSpaces(lexer, element);
                            Node.trimEmptyElement(lexer, element);
                            return;
                        }

                        node = lexer.inferredTag("dl");
                    } else if ((node.tag.model & Dict.CM_TABLE) != 0
                            || (node.tag.model & Dict.CM_ROW) != 0) {
                        node = lexer.inferredTag("table");
                    } else if ((element.tag.model & Dict.CM_OBJECT) != 0) {
                        // pop inline stack
                        while (lexer.istack.size() > lexer.istackbase) {
                            lexer.popInline(null);
                        }
                        lexer.istackbase = istackbase;
                        Node.trimSpaces(lexer, element);
                        Node.trimEmptyElement(lexer, element);
                        return;

                    } else {
                        Node.trimSpaces(lexer, element);
                        Node.trimEmptyElement(lexer, element);
                        return;
                    }
                }
            }

            // parse known element
            if (node.type == Node.START_TAG
                    || node.type == Node.START_END_TAG) {
                if (TidyUtils.toBoolean(node.tag.model
                        & Dict.CM_INLINE)) {
                    // DSR - 27Apr02 ensure we wrap anchors and other inline content
                    if (lexer.configuration.encloseBlockText) {
                        lexer.ungetToken();
                        node = lexer.inferredTag("p");
                        element.insertNodeAtEnd(node);
                        parseTag(lexer, node, Lexer.MIXED_CONTENT);
                        continue;
                    }

                    if (checkstack && !node.implicit) {
                        checkstack = false;

                        // #431731 - fix by Randy Waki 25 Dec 00
                        if (!TidyUtils.toBoolean(element.tag.model
                                & Dict.CM_MIXED)) {
                            // a positive result from inlineDup() means this token is not inserted in this pass
                            if (lexer.inlineDup(node) > 0) {
                                continue;
                            }
                        }
                    }

                    mode = Lexer.MIXED_CONTENT;
                } else {
                    // after a non-inline child, re-arm the inline stack check and ignore white space again
                    checkstack = true;
                    mode = Lexer.IGNORE_WHITESPACE;
                }

                // trim white space before <br>
                if (node.tag == tt.tagBr) {
                    Node.trimSpaces(lexer, element);
                }

                element.insertNodeAtEnd(node);

                if (node.implicit) {
                    lexer.report.warning(lexer, element, node,
                            Report.INSERTING_TAG);
                }

                parseTag(lexer, node, Lexer.IGNORE_WHITESPACE // Lexer.MixedContent
                );
                continue;
            }

            // discard unexpected tags
            if (node.type == Node.END_TAG) {
                lexer.popInline(node); // if inline end tag
            }

            lexer.report.warning(lexer, element, node,
                    Report.DISCARDING_UNEXPECTED);
            continue;
        }

        // tokens exhausted without an end tag for this element; node is null here
        if (!((element.tag.model & Dict.CM_OPT) != 0)) {
            lexer.report.warning(lexer, element, node,
                    Report.MISSING_ENDTAG_FOR);
        }

        if ((element.tag.model & Dict.CM_OBJECT) != 0) {
            // pop inline stack
            while (lexer.istack.size() > lexer.istackbase) {
                lexer.popInline(null);
            }
            lexer.istackbase = istackbase;
        }

        Node.trimSpaces(lexer, element);
        Node.trimEmptyElement(lexer, element);
    }

}
2285:
2286: /**
2287: * Parser for TABLE.
2288: */
2289: public static class ParseTableTag implements Parser {
2290:
2291: /**
2292: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2293: */
2294: public void parse(Lexer lexer, Node table, short mode) {
2295: Node node, parent;
2296: int istackbase;
2297: TagTable tt = lexer.configuration.tt;
2298:
2299: lexer.deferDup();
2300: istackbase = lexer.istackbase;
2301: lexer.istackbase = lexer.istack.size();
2302:
2303: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
2304: if (node.tag == table.tag && node.type == Node.END_TAG) {
2305: lexer.istackbase = istackbase;
2306: table.closed = true;
2307: Node.trimEmptyElement(lexer, table);
2308: return;
2309: }
2310:
2311: // deal with comments etc.
2312: if (Node.insertMisc(table, node)) {
2313: continue;
2314: }
2315:
2316: // discard unknown tags
2317: if (node.tag == null && node.type != Node.TEXT_NODE) {
2318: lexer.report.warning(lexer, table, node,
2319: Report.DISCARDING_UNEXPECTED);
2320: continue;
2321: }
2322:
2323: // if TD or TH or text or inline or block then infer <TR>
2324:
2325: if (node.type != Node.END_TAG) {
2326: if (node.tag == tt.tagTd || node.tag == tt.tagTh
2327: || node.tag == tt.tagTable) {
2328: lexer.ungetToken();
2329: node = lexer.inferredTag("tr");
2330: lexer.report.warning(lexer, table, node,
2331: Report.MISSING_STARTTAG);
2332: } else if (node.type == Node.TEXT_NODE
2333: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2334: Node.insertNodeBeforeElement(table, node);
2335: lexer.report.warning(lexer, table, node,
2336: Report.TAG_NOT_ALLOWED_IN);
2337: lexer.exiled = true;
2338:
2339: if (!(node.type == Node.TEXT_NODE)) // #427662 - was (!node.type == TextNode) - fix by Young
2340: {
2341: parseTag(lexer, node,
2342: Lexer.IGNORE_WHITESPACE);
2343: }
2344:
2345: lexer.exiled = false;
2346: continue;
2347: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2348: moveToHead(lexer, table, node);
2349: continue;
2350: }
2351: }
2352:
2353: // if this is the end tag for an ancestor element then infer end tag for this element
2354:
2355: if (node.type == Node.END_TAG) {
2356: if (node.tag == tt.tagForm
2357: || (node.tag != null && ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))) {
2358: badForm(lexer);
2359: lexer.report.warning(lexer, table, node,
2360: Report.DISCARDING_UNEXPECTED);
2361: continue;
2362: }
2363:
2364: if ((node.tag != null && (node.tag.model & (Dict.CM_TABLE | Dict.CM_ROW)) != 0)
2365: || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) {
2366: lexer.report.warning(lexer, table, node,
2367: Report.DISCARDING_UNEXPECTED);
2368: continue;
2369: }
2370:
2371: for (parent = table.parent; parent != null; parent = parent.parent) {
2372: if (node.tag == parent.tag) {
2373: lexer.report.warning(lexer, table, node,
2374: Report.MISSING_ENDTAG_BEFORE);
2375: lexer.ungetToken();
2376: lexer.istackbase = istackbase;
2377: Node.trimEmptyElement(lexer, table);
2378: return;
2379: }
2380: }
2381: }
2382:
2383: if (!((node.tag.model & Dict.CM_TABLE) != 0)) {
2384: lexer.ungetToken();
2385: lexer.report.warning(lexer, table, node,
2386: Report.TAG_NOT_ALLOWED_IN);
2387: lexer.istackbase = istackbase;
2388: Node.trimEmptyElement(lexer, table);
2389: return;
2390: }
2391:
2392: if (node.type == Node.START_TAG
2393: || node.type == Node.START_END_TAG) {
2394: table.insertNodeAtEnd(node);
2395:
2396: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2397: continue;
2398: }
2399:
2400: // discard unexpected text nodes and end tags
2401: lexer.report.warning(lexer, table, node,
2402: Report.DISCARDING_UNEXPECTED);
2403: }
2404:
2405: lexer.report.warning(lexer, table, node,
2406: Report.MISSING_ENDTAG_FOR);
2407: Node.trimEmptyElement(lexer, table);
2408: lexer.istackbase = istackbase;
2409: }
2410:
2411: }
2412:
2413: /**
2414: * Parser for COLGROUP.
2415: */
2416: public static class ParseColGroup implements Parser {
2417:
2418: /**
2419: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2420: */
2421: public void parse(Lexer lexer, Node colgroup, short mode) {
2422: Node node, parent;
2423: TagTable tt = lexer.configuration.tt;
2424:
2425: if ((colgroup.tag.model & Dict.CM_EMPTY) != 0) {
2426: return;
2427: }
2428:
2429: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
2430: if (node.tag == colgroup.tag
2431: && node.type == Node.END_TAG) {
2432: colgroup.closed = true;
2433: return;
2434: }
2435:
2436: // if this is the end tag for an ancestor element then infer end tag for this element
2437:
2438: if (node.type == Node.END_TAG) {
2439: if (node.tag == tt.tagForm) {
2440: badForm(lexer);
2441: lexer.report.warning(lexer, colgroup, node,
2442: Report.DISCARDING_UNEXPECTED);
2443: continue;
2444: }
2445:
2446: for (parent = colgroup.parent; parent != null; parent = parent.parent) {
2447:
2448: if (node.tag == parent.tag) {
2449: lexer.ungetToken();
2450: return;
2451: }
2452: }
2453: }
2454:
2455: if (node.type == Node.TEXT_NODE) {
2456: lexer.ungetToken();
2457: return;
2458: }
2459:
2460: // deal with comments etc.
2461: if (Node.insertMisc(colgroup, node)) {
2462: continue;
2463: }
2464:
2465: // discard unknown tags
2466: if (node.tag == null) {
2467: lexer.report.warning(lexer, colgroup, node,
2468: Report.DISCARDING_UNEXPECTED);
2469: continue;
2470: }
2471:
2472: if (node.tag != tt.tagCol) {
2473: lexer.ungetToken();
2474: return;
2475: }
2476:
2477: if (node.type == Node.END_TAG) {
2478: lexer.report.warning(lexer, colgroup, node,
2479: Report.DISCARDING_UNEXPECTED);
2480: continue;
2481: }
2482:
2483: // node should be <COL>
2484: colgroup.insertNodeAtEnd(node);
2485: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2486: }
2487: }
2488:
2489: }
2490:
2491: /**
2492: * Parser for ROWGROUP.
2493: */
2494: public static class ParseRowGroup implements Parser {
2495:
2496: /**
2497: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2498: */
2499: public void parse(Lexer lexer, Node rowgroup, short mode) {
2500: Node node, parent;
2501: TagTable tt = lexer.configuration.tt;
2502:
2503: if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0) {
2504: return;
2505: }
2506:
2507: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
2508: if (node.tag == rowgroup.tag) {
2509: if (node.type == Node.END_TAG) {
2510: rowgroup.closed = true;
2511: Node.trimEmptyElement(lexer, rowgroup);
2512: return;
2513: }
2514:
2515: lexer.ungetToken();
2516: return;
2517: }
2518:
2519: // if </table> infer end tag
2520: if (node.tag == tt.tagTable
2521: && node.type == Node.END_TAG) {
2522: lexer.ungetToken();
2523: Node.trimEmptyElement(lexer, rowgroup);
2524: return;
2525: }
2526:
2527: // deal with comments etc.
2528: if (Node.insertMisc(rowgroup, node)) {
2529: continue;
2530: }
2531:
2532: // discard unknown tags
2533: if (node.tag == null && node.type != Node.TEXT_NODE) {
2534: lexer.report.warning(lexer, rowgroup, node,
2535: Report.DISCARDING_UNEXPECTED);
2536: continue;
2537: }
2538:
2539: // if TD or TH then infer <TR> if text or inline or block move before table if head content move to
2540: // head
2541:
2542: if (node.type != Node.END_TAG) {
2543: if (node.tag == tt.tagTd || node.tag == tt.tagTh) {
2544: lexer.ungetToken();
2545: node = lexer.inferredTag("tr");
2546: lexer.report.warning(lexer, rowgroup, node,
2547: Report.MISSING_STARTTAG);
2548: } else if (node.type == Node.TEXT_NODE
2549: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2550: Node.moveBeforeTable(rowgroup, node, tt);
2551: lexer.report.warning(lexer, rowgroup, node,
2552: Report.TAG_NOT_ALLOWED_IN);
2553: lexer.exiled = true;
2554:
2555: // #427662 was (!node.type == TextNode) fix by Young 04 Aug 00
2556: if (node.type != Node.TEXT_NODE) {
2557: parseTag(lexer, node,
2558: Lexer.IGNORE_WHITESPACE);
2559: }
2560:
2561: lexer.exiled = false;
2562: continue;
2563: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2564: lexer.report.warning(lexer, rowgroup, node,
2565: Report.TAG_NOT_ALLOWED_IN);
2566: moveToHead(lexer, rowgroup, node);
2567: continue;
2568: }
2569: }
2570:
2571: // if this is the end tag for ancestor element then infer end tag for this element
2572:
2573: if (node.type == Node.END_TAG) {
2574:
2575: if (node.tag == tt.tagForm
2576: || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) {
2577: if (node.tag == tt.tagForm) {
2578: badForm(lexer);
2579: }
2580: lexer.report.warning(lexer, rowgroup, node,
2581: Report.DISCARDING_UNEXPECTED);
2582: continue;
2583: }
2584:
2585: if (node.tag == tt.tagTr || node.tag == tt.tagTd
2586: || node.tag == tt.tagTh) {
2587: lexer.report.warning(lexer, rowgroup, node,
2588: Report.DISCARDING_UNEXPECTED);
2589: continue;
2590: }
2591:
2592: for (parent = rowgroup.parent; parent != null; parent = parent.parent) {
2593: if (node.tag == parent.tag) {
2594: lexer.ungetToken();
2595: Node.trimEmptyElement(lexer, rowgroup);
2596: return;
2597: }
2598: }
2599:
2600: }
2601:
2602: // if THEAD, TFOOT or TBODY then implied end tag
2603:
2604: if ((node.tag.model & Dict.CM_ROWGRP) != 0) {
2605: if (node.type != Node.END_TAG) {
2606: lexer.ungetToken();
2607: }
2608:
2609: Node.trimEmptyElement(lexer, rowgroup);
2610: return;
2611: }
2612:
2613: if (node.type == Node.END_TAG) {
2614: lexer.report.warning(lexer, rowgroup, node,
2615: Report.DISCARDING_UNEXPECTED);
2616: continue;
2617: }
2618:
2619: if (!(node.tag == tt.tagTr)) {
2620: node = lexer.inferredTag("tr");
2621: lexer.report.warning(lexer, rowgroup, node,
2622: Report.MISSING_STARTTAG);
2623: lexer.ungetToken();
2624: }
2625:
2626: // node should be <TR>
2627: rowgroup.insertNodeAtEnd(node);
2628: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2629: }
2630: Node.trimEmptyElement(lexer, rowgroup);
2631: }
2632: }
2633:
2634: /**
2635: * Parser for ROW.
2636: */
2637: public static class ParseRow implements Parser {
2638:
2639: /**
2640: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2641: */
2642: public void parse(Lexer lexer, Node row, short mode) {
2643: Node node, parent;
2644: boolean excludeState;
2645: TagTable tt = lexer.configuration.tt;
2646:
2647: if ((row.tag.model & Dict.CM_EMPTY) != 0) {
2648: return;
2649: }
2650:
2651: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
2652: if (node.tag == row.tag) {
2653: if (node.type == Node.END_TAG) {
2654: row.closed = true;
2655: Node.fixEmptyRow(lexer, row);
2656: return;
2657: }
2658:
2659: lexer.ungetToken();
2660: Node.fixEmptyRow(lexer, row);
2661: return;
2662: }
2663:
2664: // if this is the end tag for an ancestor element then infer end tag for this element
2665: if (node.type == Node.END_TAG) {
2666: if (node.tag == tt.tagForm
2667: || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) {
2668: if (node.tag == tt.tagForm) {
2669: badForm(lexer);
2670: }
2671: lexer.report.warning(lexer, row, node,
2672: Report.DISCARDING_UNEXPECTED);
2673: continue;
2674: }
2675:
2676: if (node.tag == tt.tagTd || node.tag == tt.tagTh) {
2677: lexer.report.warning(lexer, row, node,
2678: Report.DISCARDING_UNEXPECTED);
2679: continue;
2680: }
2681:
2682: for (parent = row.parent; parent != null; parent = parent.parent) {
2683: if (node.tag == parent.tag) {
2684: lexer.ungetToken();
2685: Node.trimEmptyElement(lexer, row);
2686: return;
2687: }
2688: }
2689: }
2690:
2691: // deal with comments etc.
2692: if (Node.insertMisc(row, node)) {
2693: continue;
2694: }
2695:
2696: // discard unknown tags
2697: if (node.tag == null && node.type != Node.TEXT_NODE) {
2698: lexer.report.warning(lexer, row, node,
2699: Report.DISCARDING_UNEXPECTED);
2700: continue;
2701: }
2702:
2703: // discard unexpected <table> element
2704: if (node.tag == tt.tagTable) {
2705: lexer.report.warning(lexer, row, node,
2706: Report.DISCARDING_UNEXPECTED);
2707: continue;
2708: }
2709:
2710: // THEAD, TFOOT or TBODY
2711: if (node.tag != null
2712: && (node.tag.model & Dict.CM_ROWGRP) != 0) {
2713: lexer.ungetToken();
2714: Node.trimEmptyElement(lexer, row);
2715: return;
2716: }
2717:
2718: if (node.type == Node.END_TAG) {
2719: lexer.report.warning(lexer, row, node,
2720: Report.DISCARDING_UNEXPECTED);
2721: continue;
2722: }
2723:
2724: // if text or inline or block move before table if head content move to head
2725:
2726: if (node.type != Node.END_TAG) {
2727: if (node.tag == tt.tagForm) {
2728: lexer.ungetToken();
2729: node = lexer.inferredTag("td");
2730: lexer.report.warning(lexer, row, node,
2731: Report.MISSING_STARTTAG);
2732: } else if (node.type == Node.TEXT_NODE
2733: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2734: Node.moveBeforeTable(row, node, tt);
2735: lexer.report.warning(lexer, row, node,
2736: Report.TAG_NOT_ALLOWED_IN);
2737: lexer.exiled = true;
2738:
2739: if (node.type != Node.TEXT_NODE) {
2740: parseTag(lexer, node,
2741: Lexer.IGNORE_WHITESPACE);
2742: }
2743:
2744: lexer.exiled = false;
2745: continue;
2746: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2747: lexer.report.warning(lexer, row, node,
2748: Report.TAG_NOT_ALLOWED_IN);
2749: moveToHead(lexer, row, node);
2750: continue;
2751: }
2752: }
2753:
2754: if (!(node.tag == tt.tagTd || node.tag == tt.tagTh)) {
2755: lexer.report.warning(lexer, row, node,
2756: Report.TAG_NOT_ALLOWED_IN);
2757: continue;
2758: }
2759:
2760: // node should be <TD> or <TH>
2761: row.insertNodeAtEnd(node);
2762: excludeState = lexer.excludeBlocks;
2763: lexer.excludeBlocks = false;
2764: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2765: lexer.excludeBlocks = excludeState;
2766:
2767: // pop inline stack
2768:
2769: while (lexer.istack.size() > lexer.istackbase) {
2770: lexer.popInline(null);
2771: }
2772: }
2773:
2774: Node.trimEmptyElement(lexer, row);
2775: }
2776:
2777: }
2778:
2779: /**
2780: * Parser for NOFRAMES.
2781: */
2782: public static class ParseNoFrames implements Parser {
2783:
2784: /**
2785: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2786: */
2787: public void parse(Lexer lexer, Node noframes, short mode) {
2788: Node node;
2789: TagTable tt = lexer.configuration.tt;
2790:
2791: lexer.badAccess |= Report.USING_NOFRAMES;
2792: mode = Lexer.IGNORE_WHITESPACE;
2793:
2794: while ((node = lexer.getToken(mode)) != null) {
2795: if (node.tag == noframes.tag
2796: && node.type == Node.END_TAG) {
2797: noframes.closed = true;
2798: Node.trimSpaces(lexer, noframes);
2799: return;
2800: }
2801:
2802: if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset)) {
2803:
2804: Node.trimSpaces(lexer, noframes);
2805:
2806: // fix for [539369]
2807: if (node.type == Node.END_TAG) {
2808: lexer.report.warning(lexer, noframes, node,
2809: Report.DISCARDING_UNEXPECTED);
2810: // Throw it away
2811: } else {
2812: lexer.report.warning(lexer, noframes, node,
2813: Report.MISSING_ENDTAG_BEFORE);
2814:
2815: lexer.ungetToken();
2816: }
2817: return;
2818: }
2819:
2820: if (node.tag == tt.tagHtml) {
2821: if (node.type == Node.START_TAG
2822: || node.type == Node.START_END_TAG) {
2823: lexer.report.warning(lexer, noframes, node,
2824: Report.DISCARDING_UNEXPECTED);
2825: }
2826:
2827: continue;
2828: }
2829:
2830: // deal with comments etc.
2831: if (Node.insertMisc(noframes, node)) {
2832: continue;
2833: }
2834:
2835: if (node.tag == tt.tagBody
2836: && node.type == Node.START_TAG) {
2837: boolean seenbody = lexer.seenEndBody;
2838: noframes.insertNodeAtEnd(node);
2839: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); // MixedContent
2840:
2841: if (seenbody) {
2842: Node.coerceNode(lexer, node, tt.tagDiv);
2843: moveNodeToBody(lexer, node);
2844: }
2845: continue;
2846: }
2847:
2848: // implicit body element inferred
2849: if (node.type == Node.TEXT_NODE
2850: || (node.tag != null && node.type != Node.END_TAG)) {
2851: if (lexer.seenEndBody) {
2852: Node body = lexer.root.findBody(tt);
2853:
2854: if (node.type == Node.TEXT_NODE) {
2855: lexer.ungetToken();
2856: node = lexer.inferredTag("p");
2857: lexer.report.warning(lexer, noframes, node,
2858: Report.CONTENT_AFTER_BODY);
2859: }
2860:
2861: body.insertNodeAtEnd(node);
2862: } else {
2863: lexer.ungetToken();
2864: node = lexer.inferredTag("body");
2865: if (lexer.configuration.xmlOut) {
2866: lexer.report.warning(lexer, noframes, node,
2867: Report.INSERTING_TAG);
2868: }
2869: noframes.insertNodeAtEnd(node);
2870: }
2871: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2872: // MixedContent
2873: continue;
2874: }
2875: // discard unexpected end tags
2876: lexer.report.warning(lexer, noframes, node,
2877: Report.DISCARDING_UNEXPECTED);
2878: }
2879:
2880: lexer.report.warning(lexer, noframes, node,
2881: Report.MISSING_ENDTAG_FOR);
2882: }
2883:
2884: }
2885:
2886: /**
2887: * Parser for SELECT.
2888: */
2889: public static class ParseSelect implements Parser {
2890:
2891: /**
2892: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2893: */
2894: public void parse(Lexer lexer, Node field, short mode) {
2895: Node node;
2896: TagTable tt = lexer.configuration.tt;
2897:
2898: lexer.insert = -1; // defer implicit inline start tags
2899:
2900: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
2901: if (node.tag == field.tag && node.type == Node.END_TAG) {
2902: field.closed = true;
2903: Node.trimSpaces(lexer, field);
2904: return;
2905: }
2906:
2907: // deal with comments etc.
2908: if (Node.insertMisc(field, node)) {
2909: continue;
2910: }
2911:
2912: if (node.type == Node.START_TAG
2913: && (node.tag == tt.tagOption
2914: || node.tag == tt.tagOptgroup || node.tag == tt.tagScript)) {
2915: field.insertNodeAtEnd(node);
2916: parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2917: continue;
2918: }
2919:
2920: // discard unexpected tags
2921: lexer.report.warning(lexer, field, node,
2922: Report.DISCARDING_UNEXPECTED);
2923: }
2924:
2925: lexer.report.warning(lexer, field, node,
2926: Report.MISSING_ENDTAG_FOR);
2927: }
2928:
2929: }
2930:
2931: /**
2932: * Parser for text nodes.
2933: */
2934: public static class ParseText implements Parser {
2935:
2936: /**
2937: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2938: */
2939: public void parse(Lexer lexer, Node field, short mode) {
2940: Node node;
2941: TagTable tt = lexer.configuration.tt;
2942:
2943: lexer.insert = -1; // defer implicit inline start tags
2944:
2945: if (field.tag == tt.tagTextarea) {
2946: mode = Lexer.PREFORMATTED;
2947: } else {
2948: mode = Lexer.MIXED_CONTENT; // kludge for font tags
2949: }
2950:
2951: while ((node = lexer.getToken(mode)) != null) {
2952: if (node.tag == field.tag && node.type == Node.END_TAG) {
2953: field.closed = true;
2954: Node.trimSpaces(lexer, field);
2955: return;
2956: }
2957:
2958: // deal with comments etc.
2959: if (Node.insertMisc(field, node)) {
2960: continue;
2961: }
2962:
2963: if (node.type == Node.TEXT_NODE) {
2964: // only called for 1st child
2965: if (field.content == null
2966: && !((mode & Lexer.PREFORMATTED) != 0)) {
2967: Node.trimSpaces(lexer, field);
2968: }
2969:
2970: if (node.start >= node.end) {
2971: continue;
2972: }
2973:
2974: field.insertNodeAtEnd(node);
2975: continue;
2976: }
2977:
2978: // for textarea should all cases of < and & be escaped?
2979: // discard inline tags e.g. font
2980: if (node.tag != null
2981: && ((node.tag.model & Dict.CM_INLINE) != 0)
2982: && (node.tag.model & Dict.CM_FIELD) == 0) // #487283 - fix by Lee Passey 25 Jan 02
2983: {
2984: lexer.report.warning(lexer, field, node,
2985: Report.DISCARDING_UNEXPECTED);
2986: continue;
2987: }
2988:
2989: // terminate element on other tags
2990: if (!((field.tag.model & Dict.CM_OPT) != 0)) {
2991: lexer.report.warning(lexer, field, node,
2992: Report.MISSING_ENDTAG_BEFORE);
2993: }
2994:
2995: lexer.ungetToken();
2996: Node.trimSpaces(lexer, field);
2997: return;
2998: }
2999:
3000: if (!((field.tag.model & Dict.CM_OPT) != 0)) {
3001: lexer.report.warning(lexer, field, node,
3002: Report.MISSING_ENDTAG_FOR);
3003: }
3004: }
3005:
3006: }
3007:
3008: /**
3009: * Parser for OPTGROUP.
3010: */
3011: public static class ParseOptGroup implements Parser {
3012:
3013: /**
3014: * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3015: */
3016: public void parse(Lexer lexer, Node field, short mode) {
3017: Node node;
3018: TagTable tt = lexer.configuration.tt;
3019:
3020: lexer.insert = -1; // defer implicit inline start tags
3021:
3022: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
3023: if (node.tag == field.tag && node.type == Node.END_TAG) {
3024: field.closed = true;
3025: Node.trimSpaces(lexer, field);
3026: return;
3027: }
3028:
3029: // deal with comments etc.
3030: if (Node.insertMisc(field, node)) {
3031: continue;
3032: }
3033:
3034: if (node.type == Node.START_TAG
3035: && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup)) {
3036: if (node.tag == tt.tagOptgroup) {
3037: lexer.report.warning(lexer, field, node,
3038: Report.CANT_BE_NESTED);
3039: }
3040:
3041: field.insertNodeAtEnd(node);
3042: parseTag(lexer, node, Lexer.MIXED_CONTENT);
3043: continue;
3044: }
3045:
3046: // discard unexpected tags
3047: lexer.report.warning(lexer, field, node,
3048: Report.DISCARDING_UNEXPECTED);
3049: }
3050: }
3051:
3052: }
3053:
3054: /**
3055: * HTML is the top level element.
3056: */
3057: public static Node parseDocument(Lexer lexer) {
3058: Node node, document, html;
3059: Node doctype = null;
3060: TagTable tt = lexer.configuration.tt;
3061:
3062: document = lexer.newNode();
3063: document.type = Node.ROOT_NODE;
3064:
3065: lexer.root = document;
3066:
3067: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
3068: // deal with comments etc.
3069: if (Node.insertMisc(document, node)) {
3070: continue;
3071: }
3072:
3073: if (node.type == Node.DOCTYPE_TAG) {
3074: if (doctype == null) {
3075: document.insertNodeAtEnd(node);
3076: doctype = node;
3077: } else {
3078: lexer.report.warning(lexer, document, node,
3079: Report.DISCARDING_UNEXPECTED);
3080: }
3081: continue;
3082: }
3083:
3084: if (node.type == Node.END_TAG) {
3085: lexer.report.warning(lexer, document, node,
3086: Report.DISCARDING_UNEXPECTED); //TODO?
3087: continue;
3088: }
3089:
3090: if (node.type != Node.START_TAG || node.tag != tt.tagHtml) {
3091: lexer.ungetToken();
3092: html = lexer.inferredTag("html");
3093: } else {
3094: html = node;
3095: }
3096:
3097: if (document.findDocType() == null
3098: && !lexer.configuration.bodyOnly) {
3099: lexer.report.warning(lexer, null, null,
3100: Report.MISSING_DOCTYPE);
3101: }
3102:
3103: document.insertNodeAtEnd(html);
3104: HTML.parse(lexer, html, (short) 0); // TODO?
3105: break;
3106: }
3107:
3108: return document;
3109: }
3110:
3111: /**
3112: * Indicates whether or not whitespace should be preserved for this element. If an <code>xml:space</code>
3113: * attribute is found, then if the attribute value is <code>preserve</code>, returns <code>true</code>. For
3114: * any other value, returns <code>false</code>. If an <code>xml:space</code> attribute was <em>not</em>
3115: * found, then the following element names result in a return value of <code>true:
3116: * pre, script, style,</code> and
3117: * <code>xsl:text</code>. Finally, if a <code>TagTable</code> was passed in and the element appears as the
3118: * "pre" element in the <code>TagTable</code>, then <code>true</code> will be returned. Otherwise,
3119: * <code>false</code> is returned.
3120: * @param element The <code>Node</code> to test to see if whitespace should be preserved.
3121: * @param tt The <code>TagTable</code> to test for the <code>getNodePre()</code> function. This may be
3122: * <code>null</code>, in which case this test is bypassed.
3123: * @return <code>true</code> or <code>false</code>, as explained above.
3124: */
3125: public static boolean XMLPreserveWhiteSpace(Node element,
3126: TagTable tt) {
3127: AttVal attribute;
3128:
3129: // search attributes for xml:space
3130: for (attribute = element.attributes; attribute != null; attribute = attribute.next) {
3131: if (attribute.attribute.equals("xml:space")) {
3132: if (attribute.value.equals("preserve")) {
3133: return true;
3134: }
3135:
3136: return false;
3137: }
3138: }
3139:
3140: if (element.element == null) // Debian Bug #137124. Fix based on suggestion by Cesar Eduardo Barros 06 Mar 02
3141: {
3142: return false;
3143: }
3144:
3145: // kludge for html docs without explicit xml:space attribute
3146: if ("pre".equalsIgnoreCase(element.element)
3147: || "script".equalsIgnoreCase(element.element)
3148: || "style".equalsIgnoreCase(element.element)) {
3149: return true;
3150: }
3151:
3152: if ((tt != null) && (tt.findParser(element) == PRE)) {
3153: return true;
3154: }
3155:
3156: // kludge for XSL docs
3157: if ("xsl:text".equalsIgnoreCase(element.element)) {
3158: return true;
3159: }
3160:
3161: return false;
3162: }
3163:
3164: /**
3165: * XML documents.
3166: */
3167: public static void parseXMLElement(Lexer lexer, Node element,
3168: short mode) {
3169: Node node;
3170:
3171: // if node is pre or has xml:space="preserve" then do so
3172:
3173: if (XMLPreserveWhiteSpace(element, lexer.configuration.tt)) {
3174: mode = Lexer.PREFORMATTED;
3175: }
3176:
3177: while ((node = lexer.getToken(mode)) != null) {
3178: if (node.type == Node.END_TAG
3179: && node.element.equals(element.element)) {
3180: element.closed = true;
3181: break;
3182: }
3183:
3184: // discard unexpected end tags
3185: if (node.type == Node.END_TAG) {
3186: lexer.report.error(lexer, element, node,
3187: Report.UNEXPECTED_ENDTAG);
3188: continue;
3189: }
3190:
3191: // parse content on seeing start tag
3192: if (node.type == Node.START_TAG) {
3193: parseXMLElement(lexer, node, mode);
3194: }
3195:
3196: element.insertNodeAtEnd(node);
3197: }
3198:
3199: // if first child is text then trim initial space and delete text node if it is empty.
3200:
3201: node = element.content;
3202:
3203: if (node != null && node.type == Node.TEXT_NODE
3204: && mode != Lexer.PREFORMATTED) {
3205: if (node.textarray[node.start] == (byte) ' ') {
3206: node.start++;
3207:
3208: if (node.start >= node.end) {
3209: Node.discardElement(node);
3210: }
3211: }
3212: }
3213:
3214: // if last child is text then trim final space and delete the text node if it is empty
3215:
3216: node = element.last;
3217:
3218: if (node != null && node.type == Node.TEXT_NODE
3219: && mode != Lexer.PREFORMATTED) {
3220: if (node.textarray[node.end - 1] == (byte) ' ') {
3221: node.end--;
3222:
3223: if (node.start >= node.end) {
3224: Node.discardElement(node);
3225: }
3226: }
3227: }
3228: }
3229:
3230: public static Node parseXMLDocument(Lexer lexer) {
3231: Node node, document, doctype;
3232:
3233: document = lexer.newNode();
3234: document.type = Node.ROOT_NODE;
3235: doctype = null;
3236: lexer.configuration.xmlTags = true;
3237:
3238: while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
3239: // discard unexpected end tags
3240: if (node.type == Node.END_TAG) {
3241: lexer.report.warning(lexer, null, node,
3242: Report.UNEXPECTED_ENDTAG);
3243: continue;
3244: }
3245:
3246: // deal with comments etc.
3247: if (Node.insertMisc(document, node)) {
3248: continue;
3249: }
3250:
3251: if (node.type == Node.DOCTYPE_TAG) {
3252: if (doctype == null) {
3253: document.insertNodeAtEnd(node);
3254: doctype = node;
3255: } else {
3256: lexer.report.warning(lexer, document, node,
3257: Report.DISCARDING_UNEXPECTED); // TODO
3258: }
3259: continue;
3260: }
3261:
3262: if (node.type == Node.START_END_TAG) {
3263: document.insertNodeAtEnd(node);
3264: continue;
3265: }
3266:
3267: // if start tag then parse element's content
3268: if (node.type == Node.START_TAG) {
3269: document.insertNodeAtEnd(node);
3270: parseXMLElement(lexer, node, Lexer.IGNORE_WHITESPACE);
3271: }
3272:
3273: }
3274:
3275: if (doctype != null && !lexer.checkDocTypeKeyWords(doctype)) {
3276: lexer.report.warning(lexer, doctype, null,
3277: Report.DTYPE_NOT_UPPER_CASE);
3278: }
3279:
3280: // ensure presence of initial <?XML version="1.0"?>
3281: if (lexer.configuration.xmlPi) {
3282: lexer.fixXmlDecl(document);
3283: }
3284:
3285: return document;
3286: }
3287:
3288: /**
3289: * errors in positioning of form start or end tags generally require human intervention to fix.
3290: */
3291: static void badForm(Lexer lexer) {
3292: lexer.badForm = 1;
3293: lexer.errors++;
3294: }
3295:
3296: }
|