0001: /*
0002: * @(#)ParserImpl.java 1.11 2000/08/16
0003: *
0004: */
0005:
0006: package org.w3c.tidy;
0007:
0008: /**
0009: *
0010: * HTML Parser implementation
0011: *
0012: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
0013: * See Tidy.java for the copyright notice.
0014: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
0015: * HTML Tidy Release 4 Aug 2000</a>
0016: *
0017: * @author Dave Raggett <dsr@w3.org>
0018: * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0019: * @version 1.0, 1999/05/22
0020: * @version 1.0.1, 1999/05/29
0021: * @version 1.1, 1999/06/18 Java Bean
0022: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
0023: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
0024: * @version 1.4, 1999/09/04 DOM support
0025: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
0026: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
0027: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
0028: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
0029: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
0030: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
0031: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
0032: */
0033:
0034: public class ParserImpl {
0035:
0036: //private static int SeenBodyEndTag; /* AQ: moved into lexer structure */
0037:
0038: private static void parseTag(Lexer lexer, Node node, short mode) {
0039: // Local fix by GLP 2000-12-21. Need to reset insertspace if this
0040: // is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
0041: // Remove this code once the fix is made in Tidy.
0042:
0043: /****** (Original code follows)
0044: if ((node.tag.model & Dict.CM_EMPTY) != 0)
0045: {
0046: lexer.waswhite = false;
0047: return;
0048: }
0049: else if (!((node.tag.model & Dict.CM_INLINE) != 0))
0050: lexer.insertspace = false;
0051: *******/
0052:
0053: if (!((node.tag.model & Dict.CM_INLINE) != 0))
0054: lexer.insertspace = false;
0055:
0056: if ((node.tag.model & Dict.CM_EMPTY) != 0) {
0057: lexer.waswhite = false;
0058: return;
0059: }
0060:
0061: if (node.tag.parser == null || node.type == Node.StartEndTag)
0062: return;
0063:
0064: node.tag.parser.parse(lexer, node, mode);
0065: }
0066:
0067: private static void moveToHead(Lexer lexer, Node element, Node node) {
0068: Node head;
0069: TagTable tt = lexer.configuration.tt;
0070:
0071: if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
0072: Report.warning(lexer, element, node,
0073: Report.TAG_NOT_ALLOWED_IN);
0074:
0075: while (element.tag != tt.tagHtml)
0076: element = element.parent;
0077:
0078: for (head = element.content; head != null; head = head.next) {
0079: if (head.tag == tt.tagHead) {
0080: Node.insertNodeAtEnd(head, node);
0081: break;
0082: }
0083: }
0084:
0085: if (node.tag.parser != null)
0086: parseTag(lexer, node, Lexer.IgnoreWhitespace);
0087: } else {
0088: Report.warning(lexer, element, node,
0089: Report.DISCARDING_UNEXPECTED);
0090: }
0091: }
0092:
0093: public static class ParseHTML implements Parser {
0094:
0095: public void parse(Lexer lexer, Node html, short mode) {
0096: Node node, head;
0097: Node frameset = null;
0098: Node noframes = null;
0099:
0100: lexer.configuration.XmlTags = false;
0101: lexer.seenBodyEndTag = 0;
0102: TagTable tt = lexer.configuration.tt;
0103:
0104: for (;;) {
0105: node = lexer.getToken(Lexer.IgnoreWhitespace);
0106:
0107: if (node == null) {
0108: node = lexer.inferredTag("head");
0109: break;
0110: }
0111:
0112: if (node.tag == tt.tagHead)
0113: break;
0114:
0115: if (node.tag == html.tag && node.type == Node.EndTag) {
0116: Report.warning(lexer, html, node,
0117: Report.DISCARDING_UNEXPECTED);
0118: continue;
0119: }
0120:
0121: /* deal with comments etc. */
0122: if (Node.insertMisc(html, node))
0123: continue;
0124:
0125: lexer.ungetToken();
0126: node = lexer.inferredTag("head");
0127: break;
0128: }
0129:
0130: head = node;
0131: Node.insertNodeAtEnd(html, head);
0132: getParseHead().parse(lexer, head, mode);
0133:
0134: for (;;) {
0135: node = lexer.getToken(Lexer.IgnoreWhitespace);
0136:
0137: if (node == null) {
0138: if (frameset == null) /* create an empty body */
0139: node = lexer.inferredTag("body");
0140:
0141: return;
0142: }
0143:
0144: /* robustly handle html tags */
0145: if (node.tag == html.tag) {
0146: if (node.type != Node.StartTag && frameset == null)
0147: Report.warning(lexer, html, node,
0148: Report.DISCARDING_UNEXPECTED);
0149:
0150: continue;
0151: }
0152:
0153: /* deal with comments etc. */
0154: if (Node.insertMisc(html, node))
0155: continue;
0156:
0157: /* if frameset document coerce <body> to <noframes> */
0158: if (node.tag == tt.tagBody) {
0159: if (node.type != Node.StartTag) {
0160: Report.warning(lexer, html, node,
0161: Report.DISCARDING_UNEXPECTED);
0162: continue;
0163: }
0164:
0165: if (frameset != null) {
0166: lexer.ungetToken();
0167:
0168: if (noframes == null) {
0169: noframes = lexer.inferredTag("noframes");
0170: Node.insertNodeAtEnd(frameset, noframes);
0171: Report.warning(lexer, html, noframes,
0172: Report.INSERTING_TAG);
0173: }
0174:
0175: parseTag(lexer, noframes, mode);
0176: continue;
0177: }
0178:
0179: break; /* to parse body */
0180: }
0181:
0182: /* flag an error if we see more than one frameset */
0183: if (node.tag == tt.tagFrameset) {
0184: if (node.type != Node.StartTag) {
0185: Report.warning(lexer, html, node,
0186: Report.DISCARDING_UNEXPECTED);
0187: continue;
0188: }
0189:
0190: if (frameset != null)
0191: Report.error(lexer, html, node,
0192: Report.DUPLICATE_FRAMESET);
0193: else
0194: frameset = node;
0195:
0196: Node.insertNodeAtEnd(html, node);
0197: parseTag(lexer, node, mode);
0198:
0199: /*
0200: see if it includes a noframes element so
0201: that we can merge subsequent noframes elements
0202: */
0203:
0204: for (node = frameset.content; node != null; node = node.next) {
0205: if (node.tag == tt.tagNoframes)
0206: noframes = node;
0207: }
0208: continue;
0209: }
0210:
0211: /* if not a frameset document coerce <noframes> to <body> */
0212: if (node.tag == tt.tagNoframes) {
0213: if (node.type != Node.StartTag) {
0214: Report.warning(lexer, html, node,
0215: Report.DISCARDING_UNEXPECTED);
0216: continue;
0217: }
0218:
0219: if (frameset == null) {
0220: Report.warning(lexer, html, node,
0221: Report.DISCARDING_UNEXPECTED);
0222: node = lexer.inferredTag("body");
0223: break;
0224: }
0225:
0226: if (noframes == null) {
0227: noframes = node;
0228: Node.insertNodeAtEnd(frameset, noframes);
0229: }
0230:
0231: parseTag(lexer, noframes, mode);
0232: continue;
0233: }
0234:
0235: if (node.type == Node.StartTag
0236: || node.type == Node.StartEndTag) {
0237: if (node.tag != null
0238: && (node.tag.model & Dict.CM_HEAD) != 0) {
0239: moveToHead(lexer, html, node);
0240: continue;
0241: }
0242: }
0243:
0244: lexer.ungetToken();
0245:
0246: /* insert other content into noframes element */
0247:
0248: if (frameset != null) {
0249: if (noframes == null) {
0250: noframes = lexer.inferredTag("noframes");
0251: Node.insertNodeAtEnd(frameset, noframes);
0252: } else
0253: Report.warning(lexer, html, node,
0254: Report.NOFRAMES_CONTENT);
0255:
0256: parseTag(lexer, noframes, mode);
0257: continue;
0258: }
0259:
0260: node = lexer.inferredTag("body");
0261: break;
0262: }
0263:
0264: /* node must be body */
0265:
0266: Node.insertNodeAtEnd(html, node);
0267: parseTag(lexer, node, mode);
0268: }
0269:
0270: };
0271:
0272: public static class ParseHead implements Parser {
0273:
0274: public void parse(Lexer lexer, Node head, short mode) {
0275: Node node;
0276: int HasTitle = 0;
0277: int HasBase = 0;
0278: TagTable tt = lexer.configuration.tt;
0279:
0280: while (true) {
0281: node = lexer.getToken(Lexer.IgnoreWhitespace);
0282: if (node == null)
0283: break;
0284: if (node.tag == head.tag && node.type == Node.EndTag) {
0285: head.closed = true;
0286: break;
0287: }
0288:
0289: if (node.type == Node.TextNode) {
0290: lexer.ungetToken();
0291: break;
0292: }
0293:
0294: /* deal with comments etc. */
0295: if (Node.insertMisc(head, node))
0296: continue;
0297:
0298: if (node.type == Node.DocTypeTag) {
0299: Node.insertDocType(lexer, head, node);
0300: continue;
0301: }
0302:
0303: /* discard unknown tags */
0304: if (node.tag == null) {
0305: Report.warning(lexer, head, node,
0306: Report.DISCARDING_UNEXPECTED);
0307: continue;
0308: }
0309:
0310: if (!((node.tag.model & Dict.CM_HEAD) != 0)) {
0311: lexer.ungetToken();
0312: break;
0313: }
0314:
0315: if (node.type == Node.StartTag
0316: || node.type == Node.StartEndTag) {
0317: if (node.tag == tt.tagTitle) {
0318: ++HasTitle;
0319:
0320: if (HasTitle > 1)
0321: Report.warning(lexer, head, node,
0322: Report.TOO_MANY_ELEMENTS);
0323: } else if (node.tag == tt.tagBase) {
0324: ++HasBase;
0325:
0326: if (HasBase > 1)
0327: Report.warning(lexer, head, node,
0328: Report.TOO_MANY_ELEMENTS);
0329: } else if (node.tag == tt.tagNoscript)
0330: Report.warning(lexer, head, node,
0331: Report.TAG_NOT_ALLOWED_IN);
0332:
0333: Node.insertNodeAtEnd(head, node);
0334: parseTag(lexer, node, Lexer.IgnoreWhitespace);
0335: continue;
0336: }
0337:
0338: /* discard unexpected text nodes and end tags */
0339: Report.warning(lexer, head, node,
0340: Report.DISCARDING_UNEXPECTED);
0341: }
0342:
0343: if (HasTitle == 0) {
0344: Report.warning(lexer, head, null,
0345: Report.MISSING_TITLE_ELEMENT);
0346: Node.insertNodeAtEnd(head, lexer.inferredTag("title"));
0347: }
0348: }
0349:
0350: };
0351:
0352: public static class ParseTitle implements Parser {
0353:
0354: public void parse(Lexer lexer, Node title, short mode) {
0355: Node node;
0356:
0357: while (true) {
0358: node = lexer.getToken(Lexer.MixedContent);
0359: if (node == null)
0360: break;
0361: if (node.tag == title.tag && node.type == Node.EndTag) {
0362: title.closed = true;
0363: Node.trimSpaces(lexer, title);
0364: return;
0365: }
0366:
0367: if (node.type == Node.TextNode) {
0368: /* only called for 1st child */
0369: if (title.content == null)
0370: Node.trimInitialSpace(lexer, title, node);
0371:
0372: if (node.start >= node.end) {
0373: continue;
0374: }
0375:
0376: Node.insertNodeAtEnd(title, node);
0377: continue;
0378: }
0379:
0380: /* deal with comments etc. */
0381: if (Node.insertMisc(title, node))
0382: continue;
0383:
0384: /* discard unknown tags */
0385: if (node.tag == null) {
0386: Report.warning(lexer, title, node,
0387: Report.DISCARDING_UNEXPECTED);
0388: continue;
0389: }
0390:
0391: /* pushback unexpected tokens */
0392: Report.warning(lexer, title, node,
0393: Report.MISSING_ENDTAG_BEFORE);
0394: lexer.ungetToken();
0395: Node.trimSpaces(lexer, title);
0396: return;
0397: }
0398:
0399: Report.warning(lexer, title, node,
0400: Report.MISSING_ENDTAG_FOR);
0401: }
0402:
0403: };
0404:
0405: public static class ParseScript implements Parser {
0406:
0407: public void parse(Lexer lexer, Node script, short mode) {
0408: /*
0409: This isn't quite right for CDATA content as it recognises
0410: tags within the content and parses them accordingly.
0411: This will unfortunately screw up scripts which include
0412: < + letter, < + !, < + ? or < + / + letter
0413: */
0414:
0415: Node node;
0416:
0417: node = lexer.getCDATA(script);
0418:
0419: if (node != null)
0420: Node.insertNodeAtEnd(script, node);
0421: }
0422:
0423: };
0424:
0425: public static class ParseBody implements Parser {
0426:
0427: public void parse(Lexer lexer, Node body, short mode) {
0428: Node node;
0429: boolean checkstack, iswhitenode;
0430:
0431: mode = Lexer.IgnoreWhitespace;
0432: checkstack = true;
0433: TagTable tt = lexer.configuration.tt;
0434:
0435: while (true) {
0436: node = lexer.getToken(mode);
0437: if (node == null)
0438: break;
0439: if (node.tag == body.tag && node.type == Node.EndTag) {
0440: body.closed = true;
0441: Node.trimSpaces(lexer, body);
0442: lexer.seenBodyEndTag = 1;
0443: mode = Lexer.IgnoreWhitespace;
0444:
0445: if (body.parent.tag == tt.tagNoframes)
0446: break;
0447:
0448: continue;
0449: }
0450:
0451: if (node.tag == tt.tagNoframes) {
0452: if (node.type == Node.StartTag) {
0453: Node.insertNodeAtEnd(body, node);
0454: getParseBlock().parse(lexer, node, mode);
0455: continue;
0456: }
0457:
0458: if (node.type == Node.EndTag
0459: && body.parent.tag == tt.tagNoframes) {
0460: Node.trimSpaces(lexer, body);
0461: lexer.ungetToken();
0462: break;
0463: }
0464: }
0465:
0466: if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset)
0467: && body.parent.tag == tt.tagNoframes) {
0468: Node.trimSpaces(lexer, body);
0469: lexer.ungetToken();
0470: break;
0471: }
0472:
0473: if (node.tag == tt.tagHtml) {
0474: if (node.type == Node.StartTag
0475: || node.type == Node.StartEndTag)
0476: Report.warning(lexer, body, node,
0477: Report.DISCARDING_UNEXPECTED);
0478:
0479: continue;
0480: }
0481:
0482: iswhitenode = false;
0483:
0484: if (node.type == Node.TextNode
0485: && node.end <= node.start + 1
0486: && node.textarray[node.start] == (byte) ' ')
0487: iswhitenode = true;
0488:
0489: /* deal with comments etc. */
0490: if (Node.insertMisc(body, node))
0491: continue;
0492:
0493: if (lexer.seenBodyEndTag == 1 && !iswhitenode) {
0494: ++lexer.seenBodyEndTag;
0495: Report.warning(lexer, body, node,
0496: Report.CONTENT_AFTER_BODY);
0497: }
0498:
0499: /* mixed content model permits text */
0500: if (node.type == Node.TextNode) {
0501: if (iswhitenode && mode == Lexer.IgnoreWhitespace) {
0502: continue;
0503: }
0504:
0505: if (lexer.configuration.EncloseBodyText
0506: && !iswhitenode) {
0507: Node para;
0508:
0509: lexer.ungetToken();
0510: para = lexer.inferredTag("p");
0511: Node.insertNodeAtEnd(body, para);
0512: parseTag(lexer, para, mode);
0513: mode = Lexer.MixedContent;
0514: continue;
0515: } else
0516: /* strict doesn't allow text here */
0517: lexer.versions &= ~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20);
0518:
0519: if (checkstack) {
0520: checkstack = false;
0521:
0522: if (lexer.inlineDup(node) > 0)
0523: continue;
0524: }
0525:
0526: Node.insertNodeAtEnd(body, node);
0527: mode = Lexer.MixedContent;
0528: continue;
0529: }
0530:
0531: if (node.type == Node.DocTypeTag) {
0532: Node.insertDocType(lexer, body, node);
0533: continue;
0534: }
0535: /* discard unknown and PARAM tags */
0536: if (node.tag == null || node.tag == tt.tagParam) {
0537: Report.warning(lexer, body, node,
0538: Report.DISCARDING_UNEXPECTED);
0539: continue;
0540: }
0541:
0542: /*
0543: Netscape allows LI and DD directly in BODY
0544: We infer UL or DL respectively and use this
0545: boolean to exclude block-level elements so as
0546: to match Netscape's observed behaviour.
0547: */
0548: lexer.excludeBlocks = false;
0549:
0550: if (!((node.tag.model & Dict.CM_BLOCK) != 0)
0551: && !((node.tag.model & Dict.CM_INLINE) != 0)) {
0552: /* avoid this error message being issued twice */
0553: if (!((node.tag.model & Dict.CM_HEAD) != 0))
0554: Report.warning(lexer, body, node,
0555: Report.TAG_NOT_ALLOWED_IN);
0556:
0557: if ((node.tag.model & Dict.CM_HTML) != 0) {
0558: /* copy body attributes if current body was inferred */
0559: if (node.tag == tt.tagBody && body.implicit
0560: && body.attributes == null) {
0561: body.attributes = node.attributes;
0562: node.attributes = null;
0563: }
0564:
0565: continue;
0566: }
0567:
0568: if ((node.tag.model & Dict.CM_HEAD) != 0) {
0569: moveToHead(lexer, body, node);
0570: continue;
0571: }
0572:
0573: if ((node.tag.model & Dict.CM_LIST) != 0) {
0574: lexer.ungetToken();
0575: node = lexer.inferredTag("ul");
0576: Node.addClass(node, "noindent");
0577: lexer.excludeBlocks = true;
0578: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
0579: lexer.ungetToken();
0580: node = lexer.inferredTag("dl");
0581: lexer.excludeBlocks = true;
0582: } else if ((node.tag.model & (Dict.CM_TABLE
0583: | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0) {
0584: lexer.ungetToken();
0585: node = lexer.inferredTag("table");
0586: lexer.excludeBlocks = true;
0587: } else {
0588: /* AQ: The following line is from the official C
0589: version of tidy. It doesn't make sense to me
0590: because the '!' operator has higher precedence
0591: than the '&' operator. It seems to me that the
0592: expression always evaluates to 0.
0593:
0594: if (!node->tag->model & (CM_ROW | CM_FIELD))
0595:
0596: AQ: 13Jan2000 fixed in C tidy
0597: */
0598: if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0)) {
0599: lexer.ungetToken();
0600: return;
0601: }
0602:
0603: /* ignore </td> </th> <option> etc. */
0604: continue;
0605: }
0606: }
0607:
0608: if (node.type == Node.EndTag) {
0609: if (node.tag == tt.tagBr)
0610: node.type = Node.StartTag;
0611: else if (node.tag == tt.tagP) {
0612: Node.coerceNode(lexer, node, tt.tagBr);
0613: Node.insertNodeAtEnd(body, node);
0614: node = lexer.inferredTag("br");
0615: } else if ((node.tag.model & Dict.CM_INLINE) != 0)
0616: lexer.popInline(node);
0617: }
0618:
0619: if (node.type == Node.StartTag
0620: || node.type == Node.StartEndTag) {
0621: if (((node.tag.model & Dict.CM_INLINE) != 0)
0622: && !((node.tag.model & Dict.CM_MIXED) != 0)) {
0623: /* HTML4 strict doesn't allow inline content here */
0624: /* but HTML2 does allow img elements as children of body */
0625: if (node.tag == tt.tagImg)
0626: lexer.versions &= ~Dict.VERS_HTML40_STRICT;
0627: else
0628: lexer.versions &= ~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20);
0629:
0630: if (checkstack && !node.implicit) {
0631: checkstack = false;
0632:
0633: if (lexer.inlineDup(node) > 0)
0634: continue;
0635: }
0636:
0637: mode = Lexer.MixedContent;
0638: } else {
0639: checkstack = true;
0640: mode = Lexer.IgnoreWhitespace;
0641: }
0642:
0643: if (node.implicit)
0644: Report.warning(lexer, body, node,
0645: Report.INSERTING_TAG);
0646:
0647: Node.insertNodeAtEnd(body, node);
0648: parseTag(lexer, node, mode);
0649: continue;
0650: }
0651:
0652: /* discard unexpected tags */
0653: Report.warning(lexer, body, node,
0654: Report.DISCARDING_UNEXPECTED);
0655: }
0656: }
0657:
0658: };
0659:
0660: public static class ParseFrameSet implements Parser {
0661:
0662: public void parse(Lexer lexer, Node frameset, short mode) {
0663: Node node;
0664: TagTable tt = lexer.configuration.tt;
0665:
0666: lexer.badAccess |= Report.USING_FRAMES;
0667:
0668: while (true) {
0669: node = lexer.getToken(Lexer.IgnoreWhitespace);
0670: if (node == null)
0671: break;
0672: if (node.tag == frameset.tag
0673: && node.type == Node.EndTag) {
0674: frameset.closed = true;
0675: Node.trimSpaces(lexer, frameset);
0676: return;
0677: }
0678:
0679: /* deal with comments etc. */
0680: if (Node.insertMisc(frameset, node))
0681: continue;
0682:
0683: if (node.tag == null) {
0684: Report.warning(lexer, frameset, node,
0685: Report.DISCARDING_UNEXPECTED);
0686: continue;
0687: }
0688:
0689: if (node.type == Node.StartTag
0690: || node.type == Node.StartEndTag) {
0691: if (node.tag != null
0692: && (node.tag.model & Dict.CM_HEAD) != 0) {
0693: moveToHead(lexer, frameset, node);
0694: continue;
0695: }
0696: }
0697:
0698: if (node.tag == tt.tagBody) {
0699: lexer.ungetToken();
0700: node = lexer.inferredTag("noframes");
0701: Report.warning(lexer, frameset, node,
0702: Report.INSERTING_TAG);
0703: }
0704:
0705: if (node.type == Node.StartTag
0706: && (node.tag.model & Dict.CM_FRAMES) != 0) {
0707: Node.insertNodeAtEnd(frameset, node);
0708: lexer.excludeBlocks = false;
0709: parseTag(lexer, node, Lexer.MixedContent);
0710: continue;
0711: } else if (node.type == Node.StartEndTag
0712: && (node.tag.model & Dict.CM_FRAMES) != 0) {
0713: Node.insertNodeAtEnd(frameset, node);
0714: continue;
0715: }
0716:
0717: /* discard unexpected tags */
0718: Report.warning(lexer, frameset, node,
0719: Report.DISCARDING_UNEXPECTED);
0720: }
0721:
0722: Report.warning(lexer, frameset, node,
0723: Report.MISSING_ENDTAG_FOR);
0724: }
0725:
0726: };
0727:
0728: public static class ParseInline implements Parser {
0729:
0730: public void parse(Lexer lexer, Node element, short mode) {
0731: Node node, parent;
0732: TagTable tt = lexer.configuration.tt;
0733:
0734: if ((element.tag.model & Dict.CM_EMPTY) != 0)
0735: return;
0736:
0737: if (element.tag == tt.tagA) {
0738: if (element.attributes == null) {
0739: Report.warning(lexer, element.parent, element,
0740: Report.DISCARDING_UNEXPECTED);
0741: Node.discardElement(element);
0742: return;
0743: }
0744: }
0745:
0746: /*
0747: ParseInline is used for some block level elements like H1 to H6
0748: For such elements we need to insert inline emphasis tags currently
0749: on the inline stack. For Inline elements, we normally push them
0750: onto the inline stack provided they aren't implicit or OBJECT/APPLET.
0751: This test is carried out in PushInline and PopInline, see istack.c
0752: We don't push A or SPAN to replicate current browser behavior
0753: */
0754: if (((element.tag.model & Dict.CM_BLOCK) != 0)
0755: || (element.tag == tt.tagDt))
0756: lexer.inlineDup(null);
0757: else if ((element.tag.model & Dict.CM_INLINE) != 0
0758: && element.tag != tt.tagA
0759: && element.tag != tt.tagSpan)
0760: lexer.pushInline(element);
0761:
0762: if (element.tag == tt.tagNobr)
0763: lexer.badLayout |= Report.USING_NOBR;
0764: else if (element.tag == tt.tagFont)
0765: lexer.badLayout |= Report.USING_FONT;
0766:
0767: /* Inline elements may or may not be within a preformatted element */
0768: if (mode != Lexer.Preformatted)
0769: mode = Lexer.MixedContent;
0770:
0771: while (true) {
0772: node = lexer.getToken(mode);
0773: if (node == null)
0774: break;
0775: /* end tag for current element */
0776: if (node.tag == element.tag && node.type == Node.EndTag) {
0777: if ((element.tag.model & Dict.CM_INLINE) != 0
0778: && element.tag != tt.tagA)
0779: lexer.popInline(node);
0780:
0781: if (!((mode & Lexer.Preformatted) != 0))
0782: Node.trimSpaces(lexer, element);
0783: /*
0784: if a font element wraps an anchor and nothing else
0785: then move the font element inside the anchor since
0786: otherwise it won't alter the anchor text color
0787: */
0788: if (element.tag == tt.tagFont
0789: && element.content != null
0790: && element.content == element.last) {
0791: Node child = element.content;
0792:
0793: if (child.tag == tt.tagA) {
0794: child.parent = element.parent;
0795: child.next = element.next;
0796: child.prev = element.prev;
0797:
0798: if (child.prev != null)
0799: child.prev.next = child;
0800: else
0801: child.parent.content = child;
0802:
0803: if (child.next != null)
0804: child.next.prev = child;
0805: else
0806: child.parent.last = child;
0807:
0808: element.next = null;
0809: element.prev = null;
0810: element.parent = child;
0811: element.content = child.content;
0812: element.last = child.last;
0813: child.content = element;
0814: child.last = element;
0815: for (child = element.content; child != null; child = child.next)
0816: child.parent = element;
0817: }
0818: }
0819: element.closed = true;
0820: Node.trimSpaces(lexer, element);
0821: Node.trimEmptyElement(lexer, element);
0822: return;
0823: }
0824:
0825: /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */
0826: /* otherwise emphasis nesting is probably unintentional */
0827: /* big and small have cumulative effect to leave them alone */
0828: if (node.type == Node.StartTag
0829: && node.tag == element.tag
0830: && lexer.isPushed(node) && !node.implicit
0831: && !element.implicit && node.tag != null
0832: && ((node.tag.model & Dict.CM_INLINE) != 0)
0833: && node.tag != tt.tagA
0834: && node.tag != tt.tagFont
0835: && node.tag != tt.tagBig
0836: && node.tag != tt.tagSmall) {
0837: if (element.content != null
0838: && node.attributes == null) {
0839: Report.warning(lexer, element, node,
0840: Report.COERCE_TO_ENDTAG);
0841: node.type = Node.EndTag;
0842: lexer.ungetToken();
0843: continue;
0844: }
0845:
0846: Report.warning(lexer, element, node,
0847: Report.NESTED_EMPHASIS);
0848: }
0849:
0850: if (node.type == Node.TextNode) {
0851: /* only called for 1st child */
0852: if (element.content == null
0853: && !((mode & Lexer.Preformatted) != 0))
0854: Node.trimSpaces(lexer, element);
0855:
0856: if (node.start >= node.end) {
0857: continue;
0858: }
0859:
0860: Node.insertNodeAtEnd(element, node);
0861: continue;
0862: }
0863:
0864: /* mixed content model so allow text */
0865: if (Node.insertMisc(element, node))
0866: continue;
0867:
0868: /* deal with HTML tags */
0869: if (node.tag == tt.tagHtml) {
0870: if (node.type == Node.StartTag
0871: || node.type == Node.StartEndTag) {
0872: Report.warning(lexer, element, node,
0873: Report.DISCARDING_UNEXPECTED);
0874: continue;
0875: }
0876:
0877: /* otherwise infer end of inline element */
0878: lexer.ungetToken();
0879: if (!((mode & Lexer.Preformatted) != 0))
0880: Node.trimSpaces(lexer, element);
0881: Node.trimEmptyElement(lexer, element);
0882: return;
0883: }
0884:
0885: /* within <dt> or <pre> map <p> to <br> */
0886: if (node.tag == tt.tagP
0887: && node.type == Node.StartTag
0888: && ((mode & Lexer.Preformatted) != 0
0889: || element.tag == tt.tagDt || element
0890: .isDescendantOf(tt.tagDt))) {
0891: node.tag = tt.tagBr;
0892: node.element = "br";
0893: Node.trimSpaces(lexer, element);
0894: Node.insertNodeAtEnd(element, node);
0895: continue;
0896: }
0897:
0898: /* ignore unknown and PARAM tags */
0899: if (node.tag == null || node.tag == tt.tagParam) {
0900: Report.warning(lexer, element, node,
0901: Report.DISCARDING_UNEXPECTED);
0902: continue;
0903: }
0904:
0905: if (node.tag == tt.tagBr && node.type == Node.EndTag)
0906: node.type = Node.StartTag;
0907:
0908: if (node.type == Node.EndTag) {
0909: /* coerce </br> to <br> */
0910: if (node.tag == tt.tagBr)
0911: node.type = Node.StartTag;
0912: else if (node.tag == tt.tagP) {
0913: /* coerce unmatched </p> to <br><br> */
0914: if (!element.isDescendantOf(tt.tagP)) {
0915: Node.coerceNode(lexer, node, tt.tagBr);
0916: Node.trimSpaces(lexer, element);
0917: Node.insertNodeAtEnd(element, node);
0918: node = lexer.inferredTag("br");
0919: continue;
0920: }
0921: } else if ((node.tag.model & Dict.CM_INLINE) != 0
0922: && node.tag != tt.tagA
0923: && !((node.tag.model & Dict.CM_OBJECT) != 0)
0924: && (element.tag.model & Dict.CM_INLINE) != 0) {
0925: /* allow any inline end tag to end current element */
0926: lexer.popInline(element);
0927:
0928: if (element.tag != tt.tagA) {
0929: if (node.tag == tt.tagA
0930: && node.tag != element.tag) {
0931: Report.warning(lexer, element, node,
0932: Report.MISSING_ENDTAG_BEFORE);
0933: lexer.ungetToken();
0934: } else {
0935: Report.warning(lexer, element, node,
0936: Report.NON_MATCHING_ENDTAG);
0937: }
0938:
0939: if (!((mode & Lexer.Preformatted) != 0))
0940: Node.trimSpaces(lexer, element);
0941: Node.trimEmptyElement(lexer, element);
0942: return;
0943: }
0944:
0945: /* if parent is <a> then discard unexpected inline end tag */
0946: Report.warning(lexer, element, node,
0947: Report.DISCARDING_UNEXPECTED);
0948: continue;
0949: } /* special case </tr> etc. for stuff moved in front of table */
0950: else if (lexer.exiled && node.tag.model != 0
0951: && (node.tag.model & Dict.CM_TABLE) != 0) {
0952: lexer.ungetToken();
0953: Node.trimSpaces(lexer, element);
0954: Node.trimEmptyElement(lexer, element);
0955: return;
0956: }
0957: }
0958:
0959: /* allow any header tag to end current header */
0960: if ((node.tag.model & Dict.CM_HEADING) != 0
0961: && (element.tag.model & Dict.CM_HEADING) != 0) {
0962: if (node.tag == element.tag) {
0963: Report.warning(lexer, element, node,
0964: Report.NON_MATCHING_ENDTAG);
0965: } else {
0966: Report.warning(lexer, element, node,
0967: Report.MISSING_ENDTAG_BEFORE);
0968: lexer.ungetToken();
0969: }
0970: if (!((mode & Lexer.Preformatted) != 0))
0971: Node.trimSpaces(lexer, element);
0972: Node.trimEmptyElement(lexer, element);
0973: return;
0974: }
0975:
0976: /*
0977: an <A> tag to ends any open <A> element
0978: but <A href=...> is mapped to </A><A href=...>
0979: */
0980: if (node.tag == tt.tagA && !node.implicit
0981: && lexer.isPushed(node)) {
0982: /* coerce <a> to </a> unless it has some attributes */
0983: if (node.attributes == null) {
0984: node.type = Node.EndTag;
0985: Report.warning(lexer, element, node,
0986: Report.COERCE_TO_ENDTAG);
0987: lexer.popInline(node);
0988: lexer.ungetToken();
0989: continue;
0990: }
0991:
0992: lexer.ungetToken();
0993: Report.warning(lexer, element, node,
0994: Report.MISSING_ENDTAG_BEFORE);
0995: lexer.popInline(element);
0996: if (!((mode & Lexer.Preformatted) != 0))
0997: Node.trimSpaces(lexer, element);
0998: Node.trimEmptyElement(lexer, element);
0999: return;
1000: }
1001:
1002: if ((element.tag.model & Dict.CM_HEADING) != 0) {
1003: if (node.tag == tt.tagCenter
1004: || node.tag == tt.tagDiv) {
1005: if (node.type != Node.StartTag
1006: && node.type != Node.StartEndTag) {
1007: Report.warning(lexer, element, node,
1008: Report.DISCARDING_UNEXPECTED);
1009: continue;
1010: }
1011:
1012: Report.warning(lexer, element, node,
1013: Report.TAG_NOT_ALLOWED_IN);
1014:
1015: /* insert center as parent if heading is empty */
1016: if (element.content == null) {
1017: Node.insertNodeAsParent(element, node);
1018: continue;
1019: }
1020:
1021: /* split heading and make center parent of 2nd part */
1022: Node.insertNodeAfterElement(element, node);
1023:
1024: if (!((mode & Lexer.Preformatted) != 0))
1025: Node.trimSpaces(lexer, element);
1026:
1027: element = lexer.cloneNode(element);
1028: element.start = lexer.lexsize;
1029: element.end = lexer.lexsize;
1030: Node.insertNodeAtEnd(node, element);
1031: continue;
1032: }
1033:
1034: if (node.tag == tt.tagHr) {
1035: if (node.type != Node.StartTag
1036: && node.type != Node.StartEndTag) {
1037: Report.warning(lexer, element, node,
1038: Report.DISCARDING_UNEXPECTED);
1039: continue;
1040: }
1041:
1042: Report.warning(lexer, element, node,
1043: Report.TAG_NOT_ALLOWED_IN);
1044:
1045: /* insert hr before heading if heading is empty */
1046: if (element.content == null) {
1047: Node.insertNodeBeforeElement(element, node);
1048: continue;
1049: }
1050:
1051: /* split heading and insert hr before 2nd part */
1052: Node.insertNodeAfterElement(element, node);
1053:
1054: if (!((mode & Lexer.Preformatted) != 0))
1055: Node.trimSpaces(lexer, element);
1056:
1057: element = lexer.cloneNode(element);
1058: element.start = lexer.lexsize;
1059: element.end = lexer.lexsize;
1060: Node.insertNodeAfterElement(node, element);
1061: continue;
1062: }
1063: }
1064:
1065: if (element.tag == tt.tagDt) {
1066: if (node.tag == tt.tagHr) {
1067: Node dd;
1068:
1069: if (node.type != Node.StartTag
1070: && node.type != Node.StartEndTag) {
1071: Report.warning(lexer, element, node,
1072: Report.DISCARDING_UNEXPECTED);
1073: continue;
1074: }
1075:
1076: Report.warning(lexer, element, node,
1077: Report.TAG_NOT_ALLOWED_IN);
1078: dd = lexer.inferredTag("dd");
1079:
1080: /* insert hr within dd before dt if dt is empty */
1081: if (element.content == null) {
1082: Node.insertNodeBeforeElement(element, dd);
1083: Node.insertNodeAtEnd(dd, node);
1084: continue;
1085: }
1086:
1087: /* split dt and insert hr within dd before 2nd part */
1088: Node.insertNodeAfterElement(element, dd);
1089: Node.insertNodeAtEnd(dd, node);
1090:
1091: if (!((mode & Lexer.Preformatted) != 0))
1092: Node.trimSpaces(lexer, element);
1093:
1094: element = lexer.cloneNode(element);
1095: element.start = lexer.lexsize;
1096: element.end = lexer.lexsize;
1097: Node.insertNodeAfterElement(dd, element);
1098: continue;
1099: }
1100: }
1101:
1102: /*
1103: if this is the end tag for an ancestor element
1104: then infer end tag for this element
1105: */
1106: if (node.type == Node.EndTag) {
1107: for (parent = element.parent; parent != null; parent = parent.parent) {
1108: if (node.tag == parent.tag) {
1109: if (!((element.tag.model & Dict.CM_OPT) != 0)
1110: && !element.implicit)
1111: Report.warning(lexer, element, node,
1112: Report.MISSING_ENDTAG_BEFORE);
1113:
1114: if (element.tag == tt.tagA)
1115: lexer.popInline(element);
1116:
1117: lexer.ungetToken();
1118:
1119: if (!((mode & Lexer.Preformatted) != 0))
1120: Node.trimSpaces(lexer, element);
1121:
1122: Node.trimEmptyElement(lexer, element);
1123: return;
1124: }
1125: }
1126: }
1127:
1128: /* block level tags end this element */
1129: if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
1130: if (node.type != Node.StartTag) {
1131: Report.warning(lexer, element, node,
1132: Report.DISCARDING_UNEXPECTED);
1133: continue;
1134: }
1135:
1136: if (!((element.tag.model & Dict.CM_OPT) != 0))
1137: Report.warning(lexer, element, node,
1138: Report.MISSING_ENDTAG_BEFORE);
1139:
1140: if ((node.tag.model & Dict.CM_HEAD) != 0
1141: && !((node.tag.model & Dict.CM_BLOCK) != 0)) {
1142: moveToHead(lexer, element, node);
1143: continue;
1144: }
1145:
1146: /*
1147: prevent anchors from propagating into block tags
1148: except for headings h1 to h6
1149: */
1150: if (element.tag == tt.tagA) {
1151: if (node.tag != null
1152: && !((node.tag.model & Dict.CM_HEADING) != 0))
1153: lexer.popInline(element);
1154: else if (!(element.content != null)) {
1155: Node.discardElement(element);
1156: lexer.ungetToken();
1157: return;
1158: }
1159: }
1160:
1161: lexer.ungetToken();
1162:
1163: if (!((mode & Lexer.Preformatted) != 0))
1164: Node.trimSpaces(lexer, element);
1165:
1166: Node.trimEmptyElement(lexer, element);
1167: return;
1168: }
1169:
1170: /* parse inline element */
1171: if (node.type == Node.StartTag
1172: || node.type == Node.StartEndTag) {
1173: if (node.implicit)
1174: Report.warning(lexer, element, node,
1175: Report.INSERTING_TAG);
1176:
1177: /* trim white space before <br> */
1178: if (node.tag == tt.tagBr)
1179: Node.trimSpaces(lexer, element);
1180:
1181: Node.insertNodeAtEnd(element, node);
1182: parseTag(lexer, node, mode);
1183: continue;
1184: }
1185:
1186: /* discard unexpected tags */
1187: Report.warning(lexer, element, node,
1188: Report.DISCARDING_UNEXPECTED);
1189: }
1190:
1191: if (!((element.tag.model & Dict.CM_OPT) != 0))
1192: Report.warning(lexer, element, node,
1193: Report.MISSING_ENDTAG_FOR);
1194:
1195: Node.trimEmptyElement(lexer, element);
1196: }
1197: };
1198:
1199: public static class ParseList implements Parser {
1200:
1201: public void parse(Lexer lexer, Node list, short mode) {
1202: Node node;
1203: Node parent;
1204: TagTable tt = lexer.configuration.tt;
1205:
1206: if ((list.tag.model & Dict.CM_EMPTY) != 0)
1207: return;
1208:
1209: lexer.insert = -1; /* defer implicit inline start tags */
1210:
1211: while (true) {
1212: node = lexer.getToken(Lexer.IgnoreWhitespace);
1213: if (node == null)
1214: break;
1215:
1216: if (node.tag == list.tag && node.type == Node.EndTag) {
1217: if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1218: Node.coerceNode(lexer, list, tt.tagUl);
1219:
1220: list.closed = true;
1221: Node.trimEmptyElement(lexer, list);
1222: return;
1223: }
1224:
1225: /* deal with comments etc. */
1226: if (Node.insertMisc(list, node))
1227: continue;
1228:
1229: if (node.type != Node.TextNode && node.tag == null) {
1230: Report.warning(lexer, list, node,
1231: Report.DISCARDING_UNEXPECTED);
1232: continue;
1233: }
1234:
1235: /*
1236: if this is the end tag for an ancestor element
1237: then infer end tag for this element
1238: */
1239: if (node.type == Node.EndTag) {
1240: if (node.tag == tt.tagForm) {
1241: lexer.badForm = 1;
1242: Report.warning(lexer, list, node,
1243: Report.DISCARDING_UNEXPECTED);
1244: continue;
1245: }
1246:
1247: if (node.tag != null
1248: && (node.tag.model & Dict.CM_INLINE) != 0) {
1249: Report.warning(lexer, list, node,
1250: Report.DISCARDING_UNEXPECTED);
1251: lexer.popInline(node);
1252: continue;
1253: }
1254:
1255: for (parent = list.parent; parent != null; parent = parent.parent) {
1256: if (node.tag == parent.tag) {
1257: Report.warning(lexer, list, node,
1258: Report.MISSING_ENDTAG_BEFORE);
1259: lexer.ungetToken();
1260:
1261: if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1262: Node.coerceNode(lexer, list, tt.tagUl);
1263:
1264: Node.trimEmptyElement(lexer, list);
1265: return;
1266: }
1267: }
1268:
1269: Report.warning(lexer, list, node,
1270: Report.DISCARDING_UNEXPECTED);
1271: continue;
1272: }
1273:
1274: if (node.tag != tt.tagLi) {
1275: lexer.ungetToken();
1276:
1277: if (node.tag != null
1278: && (node.tag.model & Dict.CM_BLOCK) != 0
1279: && lexer.excludeBlocks) {
1280: Report.warning(lexer, list, node,
1281: Report.MISSING_ENDTAG_BEFORE);
1282: Node.trimEmptyElement(lexer, list);
1283: return;
1284: }
1285:
1286: node = lexer.inferredTag("li");
1287: node.addAttribute("style", "list-style: none");
1288: Report.warning(lexer, list, node,
1289: Report.MISSING_STARTTAG);
1290: }
1291:
1292: /* node should be <LI> */
1293: Node.insertNodeAtEnd(list, node);
1294: parseTag(lexer, node, Lexer.IgnoreWhitespace);
1295: }
1296:
1297: if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1298: Node.coerceNode(lexer, list, tt.tagUl);
1299:
1300: Report
1301: .warning(lexer, list, node,
1302: Report.MISSING_ENDTAG_FOR);
1303: Node.trimEmptyElement(lexer, list);
1304: }
1305:
1306: };
1307:
1308: public static class ParseDefList implements Parser {
1309:
1310: public void parse(Lexer lexer, Node list, short mode) {
1311: Node node, parent;
1312: TagTable tt = lexer.configuration.tt;
1313:
1314: if ((list.tag.model & Dict.CM_EMPTY) != 0)
1315: return;
1316:
1317: lexer.insert = -1; /* defer implicit inline start tags */
1318:
1319: while (true) {
1320: node = lexer.getToken(Lexer.IgnoreWhitespace);
1321: if (node == null)
1322: break;
1323: if (node.tag == list.tag && node.type == Node.EndTag) {
1324: list.closed = true;
1325: Node.trimEmptyElement(lexer, list);
1326: return;
1327: }
1328:
1329: /* deal with comments etc. */
1330: if (Node.insertMisc(list, node))
1331: continue;
1332:
1333: if (node.type == Node.TextNode) {
1334: lexer.ungetToken();
1335: node = lexer.inferredTag("dt");
1336: Report.warning(lexer, list, node,
1337: Report.MISSING_STARTTAG);
1338: }
1339:
1340: if (node.tag == null) {
1341: Report.warning(lexer, list, node,
1342: Report.DISCARDING_UNEXPECTED);
1343: continue;
1344: }
1345:
1346: /*
1347: if this is the end tag for an ancestor element
1348: then infer end tag for this element
1349: */
1350: if (node.type == Node.EndTag) {
1351: if (node.tag == tt.tagForm) {
1352: lexer.badForm = 1;
1353: Report.warning(lexer, list, node,
1354: Report.DISCARDING_UNEXPECTED);
1355: continue;
1356: }
1357:
1358: for (parent = list.parent; parent != null; parent = parent.parent) {
1359: if (node.tag == parent.tag) {
1360: Report.warning(lexer, list, node,
1361: Report.MISSING_ENDTAG_BEFORE);
1362:
1363: lexer.ungetToken();
1364: Node.trimEmptyElement(lexer, list);
1365: return;
1366: }
1367: }
1368: }
1369:
1370: /* center in a dt or a dl breaks the dl list in two */
1371: if (node.tag == tt.tagCenter) {
1372: if (list.content != null)
1373: Node.insertNodeAfterElement(list, node);
1374: else /* trim empty dl list */
1375: {
1376: Node.insertNodeBeforeElement(list, node);
1377: Node.discardElement(list);
1378: }
1379:
1380: /* and parse contents of center */
1381: parseTag(lexer, node, mode);
1382:
1383: /* now create a new dl element */
1384: list = lexer.inferredTag("dl");
1385: Node.insertNodeAfterElement(node, list);
1386: continue;
1387: }
1388:
1389: if (!(node.tag == tt.tagDt || node.tag == tt.tagDd)) {
1390: lexer.ungetToken();
1391:
1392: if (!((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) {
1393: Report.warning(lexer, list, node,
1394: Report.TAG_NOT_ALLOWED_IN);
1395: Node.trimEmptyElement(lexer, list);
1396: return;
1397: }
1398:
1399: /* if DD appeared directly in BODY then exclude blocks */
1400: if (!((node.tag.model & Dict.CM_INLINE) != 0)
1401: && lexer.excludeBlocks) {
1402: Node.trimEmptyElement(lexer, list);
1403: return;
1404: }
1405:
1406: node = lexer.inferredTag("dd");
1407: Report.warning(lexer, list, node,
1408: Report.MISSING_STARTTAG);
1409: }
1410:
1411: if (node.type == Node.EndTag) {
1412: Report.warning(lexer, list, node,
1413: Report.DISCARDING_UNEXPECTED);
1414: continue;
1415: }
1416:
1417: /* node should be <DT> or <DD>*/
1418: Node.insertNodeAtEnd(list, node);
1419: parseTag(lexer, node, Lexer.IgnoreWhitespace);
1420: }
1421:
1422: Report
1423: .warning(lexer, list, node,
1424: Report.MISSING_ENDTAG_FOR);
1425: Node.trimEmptyElement(lexer, list);
1426: }
1427:
1428: };
1429:
1430: public static class ParsePre implements Parser {
1431:
1432: public void parse(Lexer lexer, Node pre, short mode) {
1433: Node node, parent;
1434: TagTable tt = lexer.configuration.tt;
1435:
1436: if ((pre.tag.model & Dict.CM_EMPTY) != 0)
1437: return;
1438:
1439: if ((pre.tag.model & Dict.CM_OBSOLETE) != 0)
1440: Node.coerceNode(lexer, pre, tt.tagPre);
1441:
1442: lexer.inlineDup(null); /* tell lexer to insert inlines if needed */
1443:
1444: while (true) {
1445: node = lexer.getToken(Lexer.Preformatted);
1446: if (node == null)
1447: break;
1448: if (node.tag == pre.tag && node.type == Node.EndTag) {
1449: Node.trimSpaces(lexer, pre);
1450: pre.closed = true;
1451: Node.trimEmptyElement(lexer, pre);
1452: return;
1453: }
1454:
1455: if (node.tag == tt.tagHtml) {
1456: if (node.type == Node.StartTag
1457: || node.type == Node.StartEndTag)
1458: Report.warning(lexer, pre, node,
1459: Report.DISCARDING_UNEXPECTED);
1460:
1461: continue;
1462: }
1463:
1464: if (node.type == Node.TextNode) {
1465: /* if first check for inital newline */
1466: if (pre.content == null) {
1467: if (node.textarray[node.start] == (byte) '\n')
1468: ++node.start;
1469:
1470: if (node.start >= node.end) {
1471: continue;
1472: }
1473: }
1474:
1475: Node.insertNodeAtEnd(pre, node);
1476: continue;
1477: }
1478:
1479: /* deal with comments etc. */
1480: if (Node.insertMisc(pre, node))
1481: continue;
1482:
1483: /* discard unknown and PARAM tags */
1484: if (node.tag == null || node.tag == tt.tagParam) {
1485: Report.warning(lexer, pre, node,
1486: Report.DISCARDING_UNEXPECTED);
1487: continue;
1488: }
1489:
1490: if (node.tag == tt.tagP) {
1491: if (node.type == Node.StartTag) {
1492: Report.warning(lexer, pre, node,
1493: Report.USING_BR_INPLACE_OF);
1494:
1495: /* trim white space before <p> in <pre>*/
1496: Node.trimSpaces(lexer, pre);
1497:
1498: /* coerce both <p> and </p> to <br> */
1499: Node.coerceNode(lexer, node, tt.tagBr);
1500: Node.insertNodeAtEnd(pre, node);
1501: } else {
1502: Report.warning(lexer, pre, node,
1503: Report.DISCARDING_UNEXPECTED);
1504: }
1505: continue;
1506: }
1507:
1508: if ((node.tag.model & Dict.CM_HEAD) != 0
1509: && !((node.tag.model & Dict.CM_BLOCK) != 0)) {
1510: moveToHead(lexer, pre, node);
1511: continue;
1512: }
1513:
1514: /*
1515: if this is the end tag for an ancestor element
1516: then infer end tag for this element
1517: */
1518: if (node.type == Node.EndTag) {
1519: if (node.tag == tt.tagForm) {
1520: lexer.badForm = 1;
1521: Report.warning(lexer, pre, node,
1522: Report.DISCARDING_UNEXPECTED);
1523: continue;
1524: }
1525:
1526: for (parent = pre.parent; parent != null; parent = parent.parent) {
1527: if (node.tag == parent.tag) {
1528: Report.warning(lexer, pre, node,
1529: Report.MISSING_ENDTAG_BEFORE);
1530:
1531: lexer.ungetToken();
1532: Node.trimSpaces(lexer, pre);
1533: Node.trimEmptyElement(lexer, pre);
1534: return;
1535: }
1536: }
1537: }
1538:
1539: /* what about head content, HEAD, BODY tags etc? */
1540: if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
1541: if (node.type != Node.StartTag) {
1542: Report.warning(lexer, pre, node,
1543: Report.DISCARDING_UNEXPECTED);
1544: continue;
1545: }
1546:
1547: Report.warning(lexer, pre, node,
1548: Report.MISSING_ENDTAG_BEFORE);
1549: lexer.excludeBlocks = true;
1550:
1551: /* check if we need to infer a container */
1552: if ((node.tag.model & Dict.CM_LIST) != 0) {
1553: lexer.ungetToken();
1554: node = lexer.inferredTag("ul");
1555: Node.addClass(node, "noindent");
1556: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
1557: lexer.ungetToken();
1558: node = lexer.inferredTag("dl");
1559: } else if ((node.tag.model & Dict.CM_TABLE) != 0) {
1560: lexer.ungetToken();
1561: node = lexer.inferredTag("table");
1562: }
1563:
1564: Node.insertNodeAfterElement(pre, node);
1565: pre = lexer.inferredTag("pre");
1566: Node.insertNodeAfterElement(node, pre);
1567: parseTag(lexer, node, Lexer.IgnoreWhitespace);
1568: lexer.excludeBlocks = false;
1569: continue;
1570: }
1571: /*
1572: if (!((node.tag.model & Dict.CM_INLINE) != 0))
1573: {
1574: Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_BEFORE);
1575: lexer.ungetToken();
1576: return;
1577: }
1578: */
1579: if (node.type == Node.StartTag
1580: || node.type == Node.StartEndTag) {
1581: /* trim white space before <br> */
1582: if (node.tag == tt.tagBr)
1583: Node.trimSpaces(lexer, pre);
1584:
1585: Node.insertNodeAtEnd(pre, node);
1586: parseTag(lexer, node, Lexer.Preformatted);
1587: continue;
1588: }
1589:
1590: /* discard unexpected tags */
1591: Report.warning(lexer, pre, node,
1592: Report.DISCARDING_UNEXPECTED);
1593: }
1594:
1595: Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_FOR);
1596: Node.trimEmptyElement(lexer, pre);
1597: }
1598:
1599: };
1600:
1601: public static class ParseBlock implements Parser {
1602:
1603: public void parse(Lexer lexer, Node element, short mode)
1604: /*
1605: element is node created by the lexer
1606: upon seeing the start tag, or by the
1607: parser when the start tag is inferred
1608: */
1609: {
1610: Node node, parent;
1611: boolean checkstack;
1612: int istackbase = 0;
1613: TagTable tt = lexer.configuration.tt;
1614:
1615: checkstack = true;
1616:
1617: if ((element.tag.model & Dict.CM_EMPTY) != 0)
1618: return;
1619:
1620: if (element.tag == tt.tagForm
1621: && element.isDescendantOf(tt.tagForm))
1622: Report.warning(lexer, element, null,
1623: Report.ILLEGAL_NESTING);
1624:
1625: /*
1626: InlineDup() asks the lexer to insert inline emphasis tags
1627: currently pushed on the istack, but take care to avoid
1628: propagating inline emphasis inside OBJECT or APPLET.
1629: For these elements a fresh inline stack context is created
1630: and disposed of upon reaching the end of the element.
1631: They thus behave like table cells in this respect.
1632: */
1633: if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1634: istackbase = lexer.istackbase;
1635: lexer.istackbase = lexer.istack.size();
1636: }
1637:
1638: if (!((element.tag.model & Dict.CM_MIXED) != 0))
1639: lexer.inlineDup(null);
1640:
1641: mode = Lexer.IgnoreWhitespace;
1642:
1643: while (true) {
1644: node = lexer.getToken(mode /*Lexer.MixedContent*/);
1645: if (node == null)
1646: break;
1647: /* end tag for this element */
1648: if (node.type == Node.EndTag
1649: && node.tag != null
1650: && (node.tag == element.tag || element.was == node.tag)) {
1651:
1652: if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1653: /* pop inline stack */
1654: while (lexer.istack.size() > lexer.istackbase)
1655: lexer.popInline(null);
1656: lexer.istackbase = istackbase;
1657: }
1658:
1659: element.closed = true;
1660: Node.trimSpaces(lexer, element);
1661: Node.trimEmptyElement(lexer, element);
1662: return;
1663: }
1664:
1665: if (node.tag == tt.tagHtml || node.tag == tt.tagHead
1666: || node.tag == tt.tagBody) {
1667: if (node.type == Node.StartTag
1668: || node.type == Node.StartEndTag)
1669: Report.warning(lexer, element, node,
1670: Report.DISCARDING_UNEXPECTED);
1671:
1672: continue;
1673: }
1674:
1675: if (node.type == Node.EndTag) {
1676: if (node.tag == null) {
1677: Report.warning(lexer, element, node,
1678: Report.DISCARDING_UNEXPECTED);
1679:
1680: continue;
1681: } else if (node.tag == tt.tagBr)
1682: node.type = Node.StartTag;
1683: else if (node.tag == tt.tagP) {
1684: Node.coerceNode(lexer, node, tt.tagBr);
1685: Node.insertNodeAtEnd(element, node);
1686: node = lexer.inferredTag("br");
1687: } else {
1688: /*
1689: if this is the end tag for an ancestor element
1690: then infer end tag for this element
1691: */
1692: for (parent = element.parent; parent != null; parent = parent.parent) {
1693: if (node.tag == parent.tag) {
1694: if (!((element.tag.model & Dict.CM_OPT) != 0))
1695: Report
1696: .warning(
1697: lexer,
1698: element,
1699: node,
1700: Report.MISSING_ENDTAG_BEFORE);
1701:
1702: lexer.ungetToken();
1703:
1704: if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1705: /* pop inline stack */
1706: while (lexer.istack.size() > lexer.istackbase)
1707: lexer.popInline(null);
1708: lexer.istackbase = istackbase;
1709: }
1710:
1711: Node.trimSpaces(lexer, element);
1712: Node.trimEmptyElement(lexer, element);
1713: return;
1714: }
1715: }
1716: /* special case </tr> etc. for stuff moved in front of table */
1717: if (lexer.exiled
1718: && node.tag.model != 0
1719: && (node.tag.model & Dict.CM_TABLE) != 0) {
1720: lexer.ungetToken();
1721: Node.trimSpaces(lexer, element);
1722: Node.trimEmptyElement(lexer, element);
1723: return;
1724: }
1725: }
1726: }
1727:
1728: /* mixed content model permits text */
1729: if (node.type == Node.TextNode) {
1730: boolean iswhitenode = false;
1731:
1732: if (node.type == Node.TextNode
1733: && node.end <= node.start + 1
1734: && lexer.lexbuf[node.start] == (byte) ' ')
1735: iswhitenode = true;
1736:
1737: if (lexer.configuration.EncloseBlockText
1738: && !iswhitenode) {
1739: lexer.ungetToken();
1740: node = lexer.inferredTag("p");
1741: Node.insertNodeAtEnd(element, node);
1742: parseTag(lexer, node, Lexer.MixedContent);
1743: continue;
1744: }
1745:
1746: if (checkstack) {
1747: checkstack = false;
1748:
1749: if (!((element.tag.model & Dict.CM_MIXED) != 0)) {
1750: if (lexer.inlineDup(node) > 0)
1751: continue;
1752: }
1753: }
1754:
1755: Node.insertNodeAtEnd(element, node);
1756: mode = Lexer.MixedContent;
1757: /*
1758: HTML4 strict doesn't allow mixed content for
1759: elements with %block; as their content model
1760: */
1761: lexer.versions &= ~Dict.VERS_HTML40_STRICT;
1762: continue;
1763: }
1764:
1765: if (Node.insertMisc(element, node))
1766: continue;
1767:
1768: /* allow PARAM elements? */
1769: if (node.tag == tt.tagParam) {
1770: if (((element.tag.model & Dict.CM_PARAM) != 0)
1771: && (node.type == Node.StartTag || node.type == Node.StartEndTag)) {
1772: Node.insertNodeAtEnd(element, node);
1773: continue;
1774: }
1775:
1776: /* otherwise discard it */
1777: Report.warning(lexer, element, node,
1778: Report.DISCARDING_UNEXPECTED);
1779: continue;
1780: }
1781:
1782: /* allow AREA elements? */
1783: if (node.tag == tt.tagArea) {
1784: if ((element.tag == tt.tagMap)
1785: && (node.type == Node.StartTag || node.type == Node.StartEndTag)) {
1786: Node.insertNodeAtEnd(element, node);
1787: continue;
1788: }
1789:
1790: /* otherwise discard it */
1791: Report.warning(lexer, element, node,
1792: Report.DISCARDING_UNEXPECTED);
1793: continue;
1794: }
1795:
1796: /* ignore unknown start/end tags */
1797: if (node.tag == null) {
1798: Report.warning(lexer, element, node,
1799: Report.DISCARDING_UNEXPECTED);
1800: continue;
1801: }
1802:
1803: /*
1804: Allow Dict.CM_INLINE elements here.
1805:
1806: Allow Dict.CM_BLOCK elements here unless
1807: lexer.excludeBlocks is yes.
1808:
1809: LI and DD are special cased.
1810:
1811: Otherwise infer end tag for this element.
1812: */
1813:
1814: if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
1815: if (node.type != Node.StartTag
1816: && node.type != Node.StartEndTag) {
1817: Report.warning(lexer, element, node,
1818: Report.DISCARDING_UNEXPECTED);
1819: continue;
1820: }
1821:
1822: if (element.tag == tt.tagTd
1823: || element.tag == tt.tagTh) {
1824: /* if parent is a table cell, avoid inferring the end of the cell */
1825:
1826: if ((node.tag.model & Dict.CM_HEAD) != 0) {
1827: moveToHead(lexer, element, node);
1828: continue;
1829: }
1830:
1831: if ((node.tag.model & Dict.CM_LIST) != 0) {
1832: lexer.ungetToken();
1833: node = lexer.inferredTag("ul");
1834: Node.addClass(node, "noindent");
1835: lexer.excludeBlocks = true;
1836: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
1837: lexer.ungetToken();
1838: node = lexer.inferredTag("dl");
1839: lexer.excludeBlocks = true;
1840: }
1841:
1842: /* infer end of current table cell */
1843: if (!((node.tag.model & Dict.CM_BLOCK) != 0)) {
1844: lexer.ungetToken();
1845: Node.trimSpaces(lexer, element);
1846: Node.trimEmptyElement(lexer, element);
1847: return;
1848: }
1849: } else if ((node.tag.model & Dict.CM_BLOCK) != 0) {
1850: if (lexer.excludeBlocks) {
1851: if (!((element.tag.model & Dict.CM_OPT) != 0))
1852: Report.warning(lexer, element, node,
1853: Report.MISSING_ENDTAG_BEFORE);
1854:
1855: lexer.ungetToken();
1856:
1857: if ((element.tag.model & Dict.CM_OBJECT) != 0)
1858: lexer.istackbase = istackbase;
1859:
1860: Node.trimSpaces(lexer, element);
1861: Node.trimEmptyElement(lexer, element);
1862: return;
1863: }
1864: } else /* things like list items */
1865: {
1866: if (!((element.tag.model & Dict.CM_OPT) != 0)
1867: && !element.implicit)
1868: Report.warning(lexer, element, node,
1869: Report.MISSING_ENDTAG_BEFORE);
1870:
1871: if ((node.tag.model & Dict.CM_HEAD) != 0) {
1872: moveToHead(lexer, element, node);
1873: continue;
1874: }
1875:
1876: lexer.ungetToken();
1877:
1878: if ((node.tag.model & Dict.CM_LIST) != 0) {
1879: if (element.parent != null
1880: && element.parent.tag != null
1881: && element.parent.tag.parser == getParseList()) {
1882: Node.trimSpaces(lexer, element);
1883: Node.trimEmptyElement(lexer, element);
1884: return;
1885: }
1886:
1887: node = lexer.inferredTag("ul");
1888: Node.addClass(node, "noindent");
1889: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
1890: if (element.parent.tag == tt.tagDl) {
1891: Node.trimSpaces(lexer, element);
1892: Node.trimEmptyElement(lexer, element);
1893: return;
1894: }
1895:
1896: node = lexer.inferredTag("dl");
1897: } else if ((node.tag.model & Dict.CM_TABLE) != 0
1898: || (node.tag.model & Dict.CM_ROW) != 0) {
1899: node = lexer.inferredTag("table");
1900: } else if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1901: /* pop inline stack */
1902: while (lexer.istack.size() > lexer.istackbase)
1903: lexer.popInline(null);
1904: lexer.istackbase = istackbase;
1905: Node.trimSpaces(lexer, element);
1906: Node.trimEmptyElement(lexer, element);
1907: return;
1908:
1909: } else {
1910: Node.trimSpaces(lexer, element);
1911: Node.trimEmptyElement(lexer, element);
1912: return;
1913: }
1914: }
1915: }
1916:
1917: /* parse known element */
1918: if (node.type == Node.StartTag
1919: || node.type == Node.StartEndTag) {
1920: if ((node.tag.model & Dict.CM_INLINE) != 0) {
1921: if (checkstack && !node.implicit) {
1922: checkstack = false;
1923:
1924: if (lexer.inlineDup(node) > 0)
1925: continue;
1926: }
1927:
1928: mode = Lexer.MixedContent;
1929: } else {
1930: checkstack = true;
1931: mode = Lexer.IgnoreWhitespace;
1932: }
1933:
1934: /* trim white space before <br> */
1935: if (node.tag == tt.tagBr)
1936: Node.trimSpaces(lexer, element);
1937:
1938: Node.insertNodeAtEnd(element, node);
1939:
1940: if (node.implicit)
1941: Report.warning(lexer, element, node,
1942: Report.INSERTING_TAG);
1943:
1944: parseTag(lexer, node, Lexer.IgnoreWhitespace /*Lexer.MixedContent*/);
1945: continue;
1946: }
1947:
1948: /* discard unexpected tags */
1949: if (node.type == Node.EndTag)
1950: lexer.popInline(node); /* if inline end tag */
1951:
1952: Report.warning(lexer, element, node,
1953: Report.DISCARDING_UNEXPECTED);
1954: }
1955:
1956: if (!((element.tag.model & Dict.CM_OPT) != 0))
1957: Report.warning(lexer, element, node,
1958: Report.MISSING_ENDTAG_FOR);
1959:
1960: if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1961: /* pop inline stack */
1962: while (lexer.istack.size() > lexer.istackbase)
1963: lexer.popInline(null);
1964: lexer.istackbase = istackbase;
1965: }
1966:
1967: Node.trimSpaces(lexer, element);
1968: Node.trimEmptyElement(lexer, element);
1969: }
1970:
1971: };
1972:
1973: public static class ParseTableTag implements Parser {
1974:
1975: public void parse(Lexer lexer, Node table, short mode) {
1976: Node node, parent;
1977: int istackbase;
1978: TagTable tt = lexer.configuration.tt;
1979:
1980: lexer.deferDup();
1981: istackbase = lexer.istackbase;
1982: lexer.istackbase = lexer.istack.size();
1983:
1984: while (true) {
1985: node = lexer.getToken(Lexer.IgnoreWhitespace);
1986: if (node == null)
1987: break;
1988: if (node.tag == table.tag && node.type == Node.EndTag) {
1989: lexer.istackbase = istackbase;
1990: table.closed = true;
1991: Node.trimEmptyElement(lexer, table);
1992: return;
1993: }
1994:
1995: /* deal with comments etc. */
1996: if (Node.insertMisc(table, node))
1997: continue;
1998:
1999: /* discard unknown tags */
2000: if (node.tag == null && node.type != Node.TextNode) {
2001: Report.warning(lexer, table, node,
2002: Report.DISCARDING_UNEXPECTED);
2003: continue;
2004: }
2005:
2006: /* if TD or TH or text or inline or block then infer <TR> */
2007:
2008: if (node.type != Node.EndTag) {
2009: if (node.tag == tt.tagTd || node.tag == tt.tagTh
2010: || node.tag == tt.tagTable) {
2011: lexer.ungetToken();
2012: node = lexer.inferredTag("tr");
2013: Report.warning(lexer, table, node,
2014: Report.MISSING_STARTTAG);
2015: } else if (node.type == Node.TextNode
2016: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2017: Node.insertNodeBeforeElement(table, node);
2018: Report.warning(lexer, table, node,
2019: Report.TAG_NOT_ALLOWED_IN);
2020: lexer.exiled = true;
2021:
2022: /* AQ: TODO
2023: Line 2040 of parser.c (13 Jan 2000) reads as follows:
2024: if (!node->type == TextNode)
2025: This will always evaluate to false.
2026: This has been reported to Dave Raggett <dsr@w3.org>
2027: */
2028: //Should be?: if (!(node.type == Node.TextNode))
2029: if (false)
2030: parseTag(lexer, node,
2031: Lexer.IgnoreWhitespace);
2032:
2033: lexer.exiled = false;
2034: continue;
2035: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2036: moveToHead(lexer, table, node);
2037: continue;
2038: }
2039: }
2040:
2041: /*
2042: if this is the end tag for an ancestor element
2043: then infer end tag for this element
2044: */
2045: if (node.type == Node.EndTag) {
2046: if (node.tag == tt.tagForm) {
2047: lexer.badForm = 1;
2048: Report.warning(lexer, table, node,
2049: Report.DISCARDING_UNEXPECTED);
2050: continue;
2051: }
2052:
2053: if (node.tag != null
2054: && (node.tag.model & (Dict.CM_TABLE | Dict.CM_ROW)) != 0) {
2055: Report.warning(lexer, table, node,
2056: Report.DISCARDING_UNEXPECTED);
2057: continue;
2058: }
2059:
2060: for (parent = table.parent; parent != null; parent = parent.parent) {
2061: if (node.tag == parent.tag) {
2062: Report.warning(lexer, table, node,
2063: Report.MISSING_ENDTAG_BEFORE);
2064: lexer.ungetToken();
2065: lexer.istackbase = istackbase;
2066: Node.trimEmptyElement(lexer, table);
2067: return;
2068: }
2069: }
2070: }
2071:
2072: if (!((node.tag.model & Dict.CM_TABLE) != 0)) {
2073: lexer.ungetToken();
2074: Report.warning(lexer, table, node,
2075: Report.TAG_NOT_ALLOWED_IN);
2076: lexer.istackbase = istackbase;
2077: Node.trimEmptyElement(lexer, table);
2078: return;
2079: }
2080:
2081: if (node.type == Node.StartTag
2082: || node.type == Node.StartEndTag) {
2083: Node.insertNodeAtEnd(table, node);
2084: ;
2085: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2086: continue;
2087: }
2088:
2089: /* discard unexpected text nodes and end tags */
2090: Report.warning(lexer, table, node,
2091: Report.DISCARDING_UNEXPECTED);
2092: }
2093:
2094: Report.warning(lexer, table, node,
2095: Report.MISSING_ENDTAG_FOR);
2096: Node.trimEmptyElement(lexer, table);
2097: lexer.istackbase = istackbase;
2098: }
2099:
2100: };
2101:
2102: public static class ParseColGroup implements Parser {
2103:
2104: public void parse(Lexer lexer, Node colgroup, short mode) {
2105: Node node, parent;
2106: TagTable tt = lexer.configuration.tt;
2107:
2108: if ((colgroup.tag.model & Dict.CM_EMPTY) != 0)
2109: return;
2110:
2111: while (true) {
2112: node = lexer.getToken(Lexer.IgnoreWhitespace);
2113: if (node == null)
2114: break;
2115: if (node.tag == colgroup.tag
2116: && node.type == Node.EndTag) {
2117: colgroup.closed = true;
2118: return;
2119: }
2120:
2121: /*
2122: if this is the end tag for an ancestor element
2123: then infer end tag for this element
2124: */
2125: if (node.type == Node.EndTag) {
2126: if (node.tag == tt.tagForm) {
2127: lexer.badForm = 1;
2128: Report.warning(lexer, colgroup, node,
2129: Report.DISCARDING_UNEXPECTED);
2130: continue;
2131: }
2132:
2133: for (parent = colgroup.parent; parent != null; parent = parent.parent) {
2134:
2135: if (node.tag == parent.tag) {
2136: lexer.ungetToken();
2137: return;
2138: }
2139: }
2140: }
2141:
2142: if (node.type == Node.TextNode) {
2143: lexer.ungetToken();
2144: return;
2145: }
2146:
2147: /* deal with comments etc. */
2148: if (Node.insertMisc(colgroup, node))
2149: continue;
2150:
2151: /* discard unknown tags */
2152: if (node.tag == null) {
2153: Report.warning(lexer, colgroup, node,
2154: Report.DISCARDING_UNEXPECTED);
2155: continue;
2156: }
2157:
2158: if (node.tag != tt.tagCol) {
2159: lexer.ungetToken();
2160: return;
2161: }
2162:
2163: if (node.type == Node.EndTag) {
2164: Report.warning(lexer, colgroup, node,
2165: Report.DISCARDING_UNEXPECTED);
2166: continue;
2167: }
2168:
2169: /* node should be <COL> */
2170: Node.insertNodeAtEnd(colgroup, node);
2171: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2172: }
2173: }
2174:
2175: };
2176:
2177: public static class ParseRowGroup implements Parser {
2178:
2179: public void parse(Lexer lexer, Node rowgroup, short mode) {
2180: Node node, parent;
2181: TagTable tt = lexer.configuration.tt;
2182:
2183: if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0)
2184: return;
2185:
2186: while (true) {
2187: node = lexer.getToken(Lexer.IgnoreWhitespace);
2188: if (node == null)
2189: break;
2190: if (node.tag == rowgroup.tag) {
2191: if (node.type == Node.EndTag) {
2192: rowgroup.closed = true;
2193: Node.trimEmptyElement(lexer, rowgroup);
2194: return;
2195: }
2196:
2197: lexer.ungetToken();
2198: return;
2199: }
2200:
2201: /* if </table> infer end tag */
2202: if (node.tag == tt.tagTable && node.type == Node.EndTag) {
2203: lexer.ungetToken();
2204: Node.trimEmptyElement(lexer, rowgroup);
2205: return;
2206: }
2207:
2208: /* deal with comments etc. */
2209: if (Node.insertMisc(rowgroup, node))
2210: continue;
2211:
2212: /* discard unknown tags */
2213: if (node.tag == null && node.type != Node.TextNode) {
2214: Report.warning(lexer, rowgroup, node,
2215: Report.DISCARDING_UNEXPECTED);
2216: continue;
2217: }
2218:
2219: /*
2220: if TD or TH then infer <TR>
2221: if text or inline or block move before table
2222: if head content move to head
2223: */
2224:
2225: if (node.type != Node.EndTag) {
2226: if (node.tag == tt.tagTd || node.tag == tt.tagTh) {
2227: lexer.ungetToken();
2228: node = lexer.inferredTag("tr");
2229: Report.warning(lexer, rowgroup, node,
2230: Report.MISSING_STARTTAG);
2231: } else if (node.type == Node.TextNode
2232: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2233: Node.moveBeforeTable(rowgroup, node, tt);
2234: Report.warning(lexer, rowgroup, node,
2235: Report.TAG_NOT_ALLOWED_IN);
2236: lexer.exiled = true;
2237:
2238: if (node.type != Node.TextNode)
2239: parseTag(lexer, node,
2240: Lexer.IgnoreWhitespace);
2241:
2242: lexer.exiled = false;
2243: continue;
2244: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2245: Report.warning(lexer, rowgroup, node,
2246: Report.TAG_NOT_ALLOWED_IN);
2247: moveToHead(lexer, rowgroup, node);
2248: continue;
2249: }
2250: }
2251:
2252: /*
2253: if this is the end tag for ancestor element
2254: then infer end tag for this element
2255: */
2256: if (node.type == Node.EndTag) {
2257: if (node.tag == tt.tagForm) {
2258: lexer.badForm = 1;
2259: Report.warning(lexer, rowgroup, node,
2260: Report.DISCARDING_UNEXPECTED);
2261: continue;
2262: }
2263:
2264: if (node.tag == tt.tagTr || node.tag == tt.tagTd
2265: || node.tag == tt.tagTh) {
2266: Report.warning(lexer, rowgroup, node,
2267: Report.DISCARDING_UNEXPECTED);
2268: continue;
2269: }
2270:
2271: for (parent = rowgroup.parent; parent != null; parent = parent.parent) {
2272: if (node.tag == parent.tag) {
2273: lexer.ungetToken();
2274: Node.trimEmptyElement(lexer, rowgroup);
2275: return;
2276: }
2277: }
2278: }
2279:
2280: /*
2281: if THEAD, TFOOT or TBODY then implied end tag
2282:
2283: */
2284: if ((node.tag.model & Dict.CM_ROWGRP) != 0) {
2285: if (node.type != Node.EndTag)
2286: lexer.ungetToken();
2287:
2288: Node.trimEmptyElement(lexer, rowgroup);
2289: return;
2290: }
2291:
2292: if (node.type == Node.EndTag) {
2293: Report.warning(lexer, rowgroup, node,
2294: Report.DISCARDING_UNEXPECTED);
2295: continue;
2296: }
2297:
2298: if (!(node.tag == tt.tagTr)) {
2299: node = lexer.inferredTag("tr");
2300: Report.warning(lexer, rowgroup, node,
2301: Report.MISSING_STARTTAG);
2302: lexer.ungetToken();
2303: }
2304:
2305: /* node should be <TR> */
2306: Node.insertNodeAtEnd(rowgroup, node);
2307: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2308: }
2309:
2310: Node.trimEmptyElement(lexer, rowgroup);
2311: }
2312:
2313: };
2314:
2315: public static class ParseRow implements Parser {
2316:
2317: public void parse(Lexer lexer, Node row, short mode) {
2318: Node node, parent;
2319: boolean exclude_state;
2320: TagTable tt = lexer.configuration.tt;
2321:
2322: if ((row.tag.model & Dict.CM_EMPTY) != 0)
2323: return;
2324:
2325: while (true) {
2326: node = lexer.getToken(Lexer.IgnoreWhitespace);
2327: if (node == null)
2328: break;
2329: if (node.tag == row.tag) {
2330: if (node.type == Node.EndTag) {
2331: row.closed = true;
2332: Node.fixEmptyRow(lexer, row);
2333: return;
2334: }
2335:
2336: lexer.ungetToken();
2337: Node.fixEmptyRow(lexer, row);
2338: return;
2339: }
2340:
2341: /*
2342: if this is the end tag for an ancestor element
2343: then infer end tag for this element
2344: */
2345: if (node.type == Node.EndTag) {
2346: if (node.tag == tt.tagForm) {
2347: lexer.badForm = 1;
2348: Report.warning(lexer, row, node,
2349: Report.DISCARDING_UNEXPECTED);
2350: continue;
2351: }
2352:
2353: if (node.tag == tt.tagTd || node.tag == tt.tagTh) {
2354: Report.warning(lexer, row, node,
2355: Report.DISCARDING_UNEXPECTED);
2356: continue;
2357: }
2358:
2359: for (parent = row.parent; parent != null; parent = parent.parent) {
2360: if (node.tag == parent.tag) {
2361: lexer.ungetToken();
2362: Node.trimEmptyElement(lexer, row);
2363: return;
2364: }
2365: }
2366: }
2367:
2368: /* deal with comments etc. */
2369: if (Node.insertMisc(row, node))
2370: continue;
2371:
2372: /* discard unknown tags */
2373: if (node.tag == null && node.type != Node.TextNode) {
2374: Report.warning(lexer, row, node,
2375: Report.DISCARDING_UNEXPECTED);
2376: continue;
2377: }
2378:
2379: /* discard unexpected <table> element */
2380: if (node.tag == tt.tagTable) {
2381: Report.warning(lexer, row, node,
2382: Report.DISCARDING_UNEXPECTED);
2383: continue;
2384: }
2385:
2386: /* THEAD, TFOOT or TBODY */
2387: if (node.tag != null
2388: && (node.tag.model & Dict.CM_ROWGRP) != 0) {
2389: lexer.ungetToken();
2390: Node.trimEmptyElement(lexer, row);
2391: return;
2392: }
2393:
2394: if (node.type == Node.EndTag) {
2395: Report.warning(lexer, row, node,
2396: Report.DISCARDING_UNEXPECTED);
2397: continue;
2398: }
2399:
2400: /*
2401: if text or inline or block move before table
2402: if head content move to head
2403: */
2404:
2405: if (node.type != Node.EndTag) {
2406: if (node.tag == tt.tagForm) {
2407: lexer.ungetToken();
2408: node = lexer.inferredTag("td");
2409: Report.warning(lexer, row, node,
2410: Report.MISSING_STARTTAG);
2411: } else if (node.type == Node.TextNode
2412: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2413: Node.moveBeforeTable(row, node, tt);
2414: Report.warning(lexer, row, node,
2415: Report.TAG_NOT_ALLOWED_IN);
2416: lexer.exiled = true;
2417:
2418: if (node.type != Node.TextNode)
2419: parseTag(lexer, node,
2420: Lexer.IgnoreWhitespace);
2421:
2422: lexer.exiled = false;
2423: continue;
2424: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2425: Report.warning(lexer, row, node,
2426: Report.TAG_NOT_ALLOWED_IN);
2427: moveToHead(lexer, row, node);
2428: continue;
2429: }
2430: }
2431:
2432: if (!(node.tag == tt.tagTd || node.tag == tt.tagTh)) {
2433: Report.warning(lexer, row, node,
2434: Report.TAG_NOT_ALLOWED_IN);
2435: continue;
2436: }
2437:
2438: /* node should be <TD> or <TH> */
2439: Node.insertNodeAtEnd(row, node);
2440: exclude_state = lexer.excludeBlocks;
2441: lexer.excludeBlocks = false;
2442: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2443: lexer.excludeBlocks = exclude_state;
2444:
2445: /* pop inline stack */
2446:
2447: while (lexer.istack.size() > lexer.istackbase)
2448: lexer.popInline(null);
2449: }
2450:
2451: Node.trimEmptyElement(lexer, row);
2452: }
2453:
2454: };
2455:
2456: public static class ParseNoFrames implements Parser {
2457:
2458: public void parse(Lexer lexer, Node noframes, short mode) {
2459: Node node;
2460: boolean checkstack;
2461: TagTable tt = lexer.configuration.tt;
2462:
2463: lexer.badAccess |= Report.USING_NOFRAMES;
2464: mode = Lexer.IgnoreWhitespace;
2465: checkstack = true;
2466:
2467: while (true) {
2468: node = lexer.getToken(mode);
2469: if (node == null)
2470: break;
2471: if (node.tag == noframes.tag
2472: && node.type == Node.EndTag) {
2473: noframes.closed = true;
2474: Node.trimSpaces(lexer, noframes);
2475: return;
2476: }
2477:
2478: if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset)) {
2479: Report.warning(lexer, noframes, node,
2480: Report.MISSING_ENDTAG_BEFORE);
2481: Node.trimSpaces(lexer, noframes);
2482: lexer.ungetToken();
2483: return;
2484: }
2485:
2486: if (node.tag == tt.tagHtml) {
2487: if (node.type == Node.StartTag
2488: || node.type == Node.StartEndTag)
2489: Report.warning(lexer, noframes, node,
2490: Report.DISCARDING_UNEXPECTED);
2491:
2492: continue;
2493: }
2494:
2495: /* deal with comments etc. */
2496: if (Node.insertMisc(noframes, node))
2497: continue;
2498:
2499: if (node.tag == tt.tagBody
2500: && node.type == Node.StartTag) {
2501: Node.insertNodeAtEnd(noframes, node);
2502: parseTag(lexer, node, Lexer.IgnoreWhitespace /*MixedContent*/);
2503: continue;
2504: }
2505:
2506: /* implicit body element inferred */
2507: if (node.type == Node.TextNode || node.tag != null) {
2508: lexer.ungetToken();
2509: node = lexer.inferredTag("body");
2510: if (lexer.configuration.XmlOut)
2511: Report.warning(lexer, noframes, node,
2512: Report.INSERTING_TAG);
2513: Node.insertNodeAtEnd(noframes, node);
2514: parseTag(lexer, node, Lexer.IgnoreWhitespace /*MixedContent*/);
2515: continue;
2516: }
2517: /* discard unexpected end tags */
2518: Report.warning(lexer, noframes, node,
2519: Report.DISCARDING_UNEXPECTED);
2520: }
2521:
2522: Report.warning(lexer, noframes, node,
2523: Report.MISSING_ENDTAG_FOR);
2524: }
2525:
2526: };
2527:
2528: public static class ParseSelect implements Parser {
2529:
2530: public void parse(Lexer lexer, Node field, short mode) {
2531: Node node;
2532: TagTable tt = lexer.configuration.tt;
2533:
2534: lexer.insert = -1; /* defer implicit inline start tags */
2535:
2536: while (true) {
2537: node = lexer.getToken(Lexer.IgnoreWhitespace);
2538: if (node == null)
2539: break;
2540: if (node.tag == field.tag && node.type == Node.EndTag) {
2541: field.closed = true;
2542: Node.trimSpaces(lexer, field);
2543: return;
2544: }
2545:
2546: /* deal with comments etc. */
2547: if (Node.insertMisc(field, node))
2548: continue;
2549:
2550: if (node.type == Node.StartTag
2551: && (node.tag == tt.tagOption
2552: || node.tag == tt.tagOptgroup || node.tag == tt.tagScript)) {
2553: Node.insertNodeAtEnd(field, node);
2554: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2555: continue;
2556: }
2557:
2558: /* discard unexpected tags */
2559: Report.warning(lexer, field, node,
2560: Report.DISCARDING_UNEXPECTED);
2561: }
2562:
2563: Report.warning(lexer, field, node,
2564: Report.MISSING_ENDTAG_FOR);
2565: }
2566:
2567: };
2568:
2569: public static class ParseText implements Parser {
2570:
2571: public void parse(Lexer lexer, Node field, short mode) {
2572: Node node;
2573: TagTable tt = lexer.configuration.tt;
2574:
2575: lexer.insert = -1; /* defer implicit inline start tags */
2576:
2577: if (field.tag == tt.tagTextarea)
2578: mode = Lexer.Preformatted;
2579:
2580: while (true) {
2581: node = lexer.getToken(mode);
2582: if (node == null)
2583: break;
2584: if (node.tag == field.tag && node.type == Node.EndTag) {
2585: field.closed = true;
2586: Node.trimSpaces(lexer, field);
2587: return;
2588: }
2589:
2590: /* deal with comments etc. */
2591: if (Node.insertMisc(field, node))
2592: continue;
2593:
2594: if (node.type == Node.TextNode) {
2595: /* only called for 1st child */
2596: if (field.content == null
2597: && !((mode & Lexer.Preformatted) != 0))
2598: Node.trimSpaces(lexer, field);
2599:
2600: if (node.start >= node.end) {
2601: continue;
2602: }
2603:
2604: Node.insertNodeAtEnd(field, node);
2605: continue;
2606: }
2607:
2608: if (node.tag == tt.tagFont) {
2609: Report.warning(lexer, field, node,
2610: Report.DISCARDING_UNEXPECTED);
2611: continue;
2612: }
2613:
2614: /* terminate element on other tags */
2615: if (!((field.tag.model & Dict.CM_OPT) != 0))
2616: Report.warning(lexer, field, node,
2617: Report.MISSING_ENDTAG_BEFORE);
2618:
2619: lexer.ungetToken();
2620: Node.trimSpaces(lexer, field);
2621: return;
2622: }
2623:
2624: if (!((field.tag.model & Dict.CM_OPT) != 0))
2625: Report.warning(lexer, field, node,
2626: Report.MISSING_ENDTAG_FOR);
2627: }
2628:
2629: };
2630:
2631: public static class ParseOptGroup implements Parser {
2632:
2633: public void parse(Lexer lexer, Node field, short mode) {
2634: Node node;
2635: TagTable tt = lexer.configuration.tt;
2636:
2637: lexer.insert = -1; /* defer implicit inline start tags */
2638:
2639: while (true) {
2640: node = lexer.getToken(Lexer.IgnoreWhitespace);
2641: if (node == null)
2642: break;
2643: if (node.tag == field.tag && node.type == Node.EndTag) {
2644: field.closed = true;
2645: Node.trimSpaces(lexer, field);
2646: return;
2647: }
2648:
2649: /* deal with comments etc. */
2650: if (Node.insertMisc(field, node))
2651: continue;
2652:
2653: if (node.type == Node.StartTag
2654: && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup)) {
2655: if (node.tag == tt.tagOptgroup)
2656: Report.warning(lexer, field, node,
2657: Report.CANT_BE_NESTED);
2658:
2659: Node.insertNodeAtEnd(field, node);
2660: parseTag(lexer, node, Lexer.MixedContent);
2661: continue;
2662: }
2663:
2664: /* discard unexpected tags */
2665: Report.warning(lexer, field, node,
2666: Report.DISCARDING_UNEXPECTED);
2667: }
2668: }
2669:
2670: };
2671:
2672: public static Parser getParseHTML() {
2673: return _parseHTML;
2674: }
2675:
2676: public static Parser getParseHead() {
2677: return _parseHead;
2678: }
2679:
2680: public static Parser getParseTitle() {
2681: return _parseTitle;
2682: }
2683:
2684: public static Parser getParseScript() {
2685: return _parseScript;
2686: }
2687:
2688: public static Parser getParseBody() {
2689: return _parseBody;
2690: }
2691:
2692: public static Parser getParseFrameSet() {
2693: return _parseFrameSet;
2694: }
2695:
2696: public static Parser getParseInline() {
2697: return _parseInline;
2698: }
2699:
2700: public static Parser getParseList() {
2701: return _parseList;
2702: }
2703:
2704: public static Parser getParseDefList() {
2705: return _parseDefList;
2706: }
2707:
2708: public static Parser getParsePre() {
2709: return _parsePre;
2710: }
2711:
2712: public static Parser getParseBlock() {
2713: return _parseBlock;
2714: }
2715:
2716: public static Parser getParseTableTag() {
2717: return _parseTableTag;
2718: }
2719:
2720: public static Parser getParseColGroup() {
2721: return _parseColGroup;
2722: }
2723:
2724: public static Parser getParseRowGroup() {
2725: return _parseRowGroup;
2726: }
2727:
2728: public static Parser getParseRow() {
2729: return _parseRow;
2730: }
2731:
2732: public static Parser getParseNoFrames() {
2733: return _parseNoFrames;
2734: }
2735:
2736: public static Parser getParseSelect() {
2737: return _parseSelect;
2738: }
2739:
2740: public static Parser getParseText() {
2741: return _parseText;
2742: }
2743:
2744: public static Parser getParseOptGroup() {
2745: return _parseOptGroup;
2746: }
2747:
2748: private static Parser _parseHTML = new ParseHTML();
2749: private static Parser _parseHead = new ParseHead();
2750: private static Parser _parseTitle = new ParseTitle();
2751: private static Parser _parseScript = new ParseScript();
2752: private static Parser _parseBody = new ParseBody();
2753: private static Parser _parseFrameSet = new ParseFrameSet();
2754: private static Parser _parseInline = new ParseInline();
2755: private static Parser _parseList = new ParseList();
2756: private static Parser _parseDefList = new ParseDefList();
2757: private static Parser _parsePre = new ParsePre();
2758: private static Parser _parseBlock = new ParseBlock();
2759: private static Parser _parseTableTag = new ParseTableTag();
2760: private static Parser _parseColGroup = new ParseColGroup();
2761: private static Parser _parseRowGroup = new ParseRowGroup();
2762: private static Parser _parseRow = new ParseRow();
2763: private static Parser _parseNoFrames = new ParseNoFrames();
2764: private static Parser _parseSelect = new ParseSelect();
2765: private static Parser _parseText = new ParseText();
2766: private static Parser _parseOptGroup = new ParseOptGroup();
2767:
2768: /*
2769: HTML is the top level element
2770: */
2771: public static Node parseDocument(Lexer lexer) {
2772: Node node, document, html;
2773: Node doctype = null;
2774: TagTable tt = lexer.configuration.tt;
2775:
2776: document = lexer.newNode();
2777: document.type = Node.RootNode;
2778:
2779: while (true) {
2780: node = lexer.getToken(Lexer.IgnoreWhitespace);
2781: if (node == null)
2782: break;
2783:
2784: /* deal with comments etc. */
2785: if (Node.insertMisc(document, node))
2786: continue;
2787:
2788: if (node.type == Node.DocTypeTag) {
2789: if (doctype == null) {
2790: Node.insertNodeAtEnd(document, node);
2791: doctype = node;
2792: } else
2793: Report.warning(lexer, document, node,
2794: Report.DISCARDING_UNEXPECTED);
2795: continue;
2796: }
2797:
2798: if (node.type == Node.EndTag) {
2799: Report.warning(lexer, document, node,
2800: Report.DISCARDING_UNEXPECTED); //TODO?
2801: continue;
2802: }
2803:
2804: if (node.type != Node.StartTag || node.tag != tt.tagHtml) {
2805: lexer.ungetToken();
2806: html = lexer.inferredTag("html");
2807: } else
2808: html = node;
2809:
2810: Node.insertNodeAtEnd(document, html);
2811: getParseHTML().parse(lexer, html, (short) 0); // TODO?
2812: break;
2813: }
2814:
2815: return document;
2816: }
2817:
2818: /**
2819: * Indicates whether or not whitespace should be preserved for this element.
2820: * If an <code>xml:space</code> attribute is found, then if the attribute value is
2821: * <code>preserve</code>, returns <code>true</code>. For any other value, returns
2822: * <code>false</code>. If an <code>xml:space</code> attribute was <em>not</em>
2823: * found, then the following element names result in a return value of <code>true:
2824: * pre, script, style,</code> and <code>xsl:text</code>. Finally, if a
2825: * <code>TagTable</code> was passed in and the element appears as the "pre" element
2826: * in the <code>TagTable</code>, then <code>true</code> will be returned.
2827: * Otherwise, <code>false</code> is returned.
2828: * @param element The <code>Node</code> to test to see if whitespace should be
2829: * preserved.
2830: * @param tt The <code>TagTable</code> to test for the <code>getNodePre()</code>
2831: * function. This may be <code>null</code>, in which case this test
2832: * is bypassed.
2833: * @return <code>true</code> or <code>false</code>, as explained above.
2834: */
2835:
2836: public static boolean XMLPreserveWhiteSpace(Node element,
2837: TagTable tt) {
2838: AttVal attribute;
2839:
2840: /* search attributes for xml:space */
2841: for (attribute = element.attributes; attribute != null; attribute = attribute.next) {
2842: if (attribute.attribute.equals("xml:space")) {
2843: if (attribute.value.equals("preserve"))
2844: return true;
2845:
2846: return false;
2847: }
2848: }
2849:
2850: /* kludge for html docs without explicit xml:space attribute */
2851: if (Lexer.wstrcasecmp(element.element, "pre") == 0
2852: || Lexer.wstrcasecmp(element.element, "script") == 0
2853: || Lexer.wstrcasecmp(element.element, "style") == 0)
2854: return true;
2855:
2856: if ((tt != null) && (tt.findParser(element) == getParsePre()))
2857: return true;
2858:
2859: /* kludge for XSL docs */
2860: if (Lexer.wstrcasecmp(element.element, "xsl:text") == 0)
2861: return true;
2862:
2863: return false;
2864: }
2865:
2866: /*
2867: XML documents
2868: */
2869: public static void parseXMLElement(Lexer lexer, Node element,
2870: short mode) {
2871: Node node;
2872:
2873: /* Jeff Young's kludge for XSL docs */
2874:
2875: if (Lexer.wstrcasecmp(element.element, "xsl:text") == 0)
2876: return;
2877:
2878: /* if node is pre or has xml:space="preserve" then do so */
2879:
2880: if (XMLPreserveWhiteSpace(element, lexer.configuration.tt))
2881: mode = Lexer.Preformatted;
2882:
2883: while (true) {
2884: node = lexer.getToken(mode);
2885: if (node == null)
2886: break;
2887: if (node.type == Node.EndTag
2888: && node.element.equals(element.element)) {
2889: element.closed = true;
2890: break;
2891: }
2892:
2893: /* discard unexpected end tags */
2894: if (node.type == Node.EndTag) {
2895: Report.error(lexer, element, node,
2896: Report.UNEXPECTED_ENDTAG);
2897: continue;
2898: }
2899:
2900: /* parse content on seeing start tag */
2901: if (node.type == Node.StartTag)
2902: parseXMLElement(lexer, node, mode);
2903:
2904: Node.insertNodeAtEnd(element, node);
2905: }
2906:
2907: /*
2908: if first child is text then trim initial space and
2909: delete text node if it is empty.
2910: */
2911:
2912: node = element.content;
2913:
2914: if (node != null && node.type == Node.TextNode
2915: && mode != Lexer.Preformatted) {
2916: if (node.textarray[node.start] == (byte) ' ') {
2917: node.start++;
2918:
2919: if (node.start >= node.end)
2920: Node.discardElement(node);
2921: }
2922: }
2923:
2924: /*
2925: if last child is text then trim final space and
2926: delete the text node if it is empty
2927: */
2928:
2929: node = element.last;
2930:
2931: if (node != null && node.type == Node.TextNode
2932: && mode != Lexer.Preformatted) {
2933: if (node.textarray[node.end - 1] == (byte) ' ') {
2934: node.end--;
2935:
2936: if (node.start >= node.end)
2937: Node.discardElement(node);
2938: }
2939: }
2940: }
2941:
2942: public static Node parseXMLDocument(Lexer lexer) {
2943: Node node, document, doctype;
2944:
2945: document = lexer.newNode();
2946: document.type = Node.RootNode;
2947: doctype = null;
2948: lexer.configuration.XmlTags = true;
2949:
2950: while (true) {
2951: node = lexer.getToken(Lexer.IgnoreWhitespace);
2952: if (node == null)
2953: break;
2954: /* discard unexpected end tags */
2955: if (node.type == Node.EndTag) {
2956: Report.warning(lexer, null, node,
2957: Report.UNEXPECTED_ENDTAG);
2958: continue;
2959: }
2960:
2961: /* deal with comments etc. */
2962: if (Node.insertMisc(document, node))
2963: continue;
2964:
2965: if (node.type == Node.DocTypeTag) {
2966: if (doctype == null) {
2967: Node.insertNodeAtEnd(document, node);
2968: doctype = node;
2969: } else
2970: Report.warning(lexer, document, node,
2971: Report.DISCARDING_UNEXPECTED); // TODO
2972: continue;
2973: }
2974:
2975: /* if start tag then parse element's content */
2976: if (node.type == Node.StartTag) {
2977: Node.insertNodeAtEnd(document, node);
2978: parseXMLElement(lexer, node, Lexer.IgnoreWhitespace);
2979: }
2980:
2981: }
2982:
2983: if (false) { //#if 0
2984: /* discard the document type */
2985: node = document.findDocType();
2986:
2987: if (node != null)
2988: Node.discardElement(node);
2989: } // #endif
2990:
2991: if (doctype != null && !lexer.checkDocTypeKeyWords(doctype))
2992: Report.warning(lexer, doctype, null,
2993: Report.DTYPE_NOT_UPPER_CASE);
2994:
2995: /* ensure presence of initial <?XML version="1.0"?> */
2996: if (lexer.configuration.XmlPi)
2997: lexer.fixXMLPI(document);
2998:
2999: return document;
3000: }
3001:
3002: public static boolean isJavaScript(Node node) {
3003: boolean result = false;
3004: AttVal attr;
3005:
3006: if (node.attributes == null)
3007: return true;
3008:
3009: for (attr = node.attributes; attr != null; attr = attr.next) {
3010: if ((Lexer.wstrcasecmp(attr.attribute, "language") == 0 || Lexer
3011: .wstrcasecmp(attr.attribute, "type") == 0)
3012: && Lexer.wsubstr(attr.value, "javascript"))
3013: result = true;
3014: }
3015:
3016: return result;
3017: }
3018:
3019: }
|