0001: /*
0002: * @(#)ParserImpl.java 1.11 2000/08/16
0003: *
0004: */
0005:
0006: package org.w3c.tidy;
0007:
0008: /**
0009: *
0010: * HTML Parser implementation
0011: *
0012: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
0013: * See Tidy.java for the copyright notice.
0014: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
0015: * HTML Tidy Release 4 Aug 2000</a>
0016: *
0017: * @author Dave Raggett <dsr@w3.org>
0018: * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0019: * @version 1.0, 1999/05/22
0020: * @version 1.0.1, 1999/05/29
0021: * @version 1.1, 1999/06/18 Java Bean
0022: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
0023: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
0024: * @version 1.4, 1999/09/04 DOM support
0025: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
0026: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
0027: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
0028: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
0029: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
0030: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
0031: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
0032: */
0033:
0034: public class ParserImpl {
0035:
0036: //private static int SeenBodyEndTag; /* AQ: moved into lexer structure */
0037:
0038: private static void parseTag(Lexer lexer, Node node, short mode) {
0039: // Local fix by GLP 2000-12-21. Need to reset insertspace if this
0040: // is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
0041: // Remove this code once the fix is made in Tidy.
0042:
0043: /****** (Original code follows)
0044: if ((node.tag.model & Dict.CM_EMPTY) != 0)
0045: {
0046: lexer.waswhite = false;
0047: return;
0048: }
0049: else if (!((node.tag.model & Dict.CM_INLINE) != 0))
0050: lexer.insertspace = false;
0051: *******/
0052:
0053: if (!((node.tag.model & Dict.CM_INLINE) != 0))
0054: lexer.insertspace = false;
0055:
0056: if ((node.tag.model & Dict.CM_EMPTY) != 0) {
0057: lexer.waswhite = false;
0058: return;
0059: }
0060:
0061: if (node.tag.parser == null || node.type == Node.StartEndTag)
0062: return;
0063:
0064: node.tag.parser.parse(lexer, node, mode);
0065: }
0066:
0067: private static void moveToHead(Lexer lexer, Node element, Node node) {
0068: Node head;
0069: TagTable tt = lexer.configuration.tt;
0070:
0071: if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
0072: Report.warning(lexer, element, node,
0073: Report.TAG_NOT_ALLOWED_IN);
0074:
0075: while (element.tag != tt.tagHtml)
0076: element = element.parent;
0077:
0078: for (head = element.content; head != null; head = head.next) {
0079: if (head.tag == tt.tagHead) {
0080: Node.insertNodeAtEnd(head, node);
0081: break;
0082: }
0083: }
0084:
0085: if (node.tag.parser != null)
0086: parseTag(lexer, node, Lexer.IgnoreWhitespace);
0087: } else {
0088: Report.warning(lexer, element, node,
0089: Report.DISCARDING_UNEXPECTED);
0090: }
0091: }
0092:
0093: public static class ParseHTML implements Parser {
0094:
0095: public void parse(Lexer lexer, Node html, short mode) {
0096: Node node, head;
0097: Node frameset = null;
0098: Node noframes = null;
0099:
0100: lexer.configuration.XmlTags = false;
0101: lexer.seenBodyEndTag = 0;
0102: TagTable tt = lexer.configuration.tt;
0103:
0104: for (;;) {
0105: node = lexer.getToken(Lexer.IgnoreWhitespace);
0106:
0107: if (node == null) {
0108: node = lexer.inferredTag("head");
0109: break;
0110: }
0111:
0112: if (node.tag == tt.tagHead)
0113: break;
0114:
0115: if (node.tag == html.tag && node.type == Node.EndTag) {
0116: Report.warning(lexer, html, node,
0117: Report.DISCARDING_UNEXPECTED);
0118: continue;
0119: }
0120:
0121: /* deal with comments etc. */
0122: if (Node.insertMisc(html, node))
0123: continue;
0124:
0125: lexer.ungetToken();
0126: node = lexer.inferredTag("head");
0127: break;
0128: }
0129:
0130: head = node;
0131: Node.insertNodeAtEnd(html, head);
0132: getParseHead().parse(lexer, head, mode);
0133:
0134: for (;;) {
0135: node = lexer.getToken(Lexer.IgnoreWhitespace);
0136:
0137: if (node == null) {
0138: if (frameset == null) /* create an empty body */
0139: node = lexer.inferredTag("body");
0140:
0141: return;
0142: }
0143:
0144: /* robustly handle html tags */
0145: if (node.tag == html.tag) {
0146: if (node.type != Node.StartTag && frameset == null)
0147: Report.warning(lexer, html, node,
0148: Report.DISCARDING_UNEXPECTED);
0149:
0150: continue;
0151: }
0152:
0153: /* deal with comments etc. */
0154: if (Node.insertMisc(html, node))
0155: continue;
0156:
0157: /* if frameset document coerce <body> to <noframes> */
0158: if (node.tag == tt.tagBody) {
0159: if (node.type != Node.StartTag) {
0160: Report.warning(lexer, html, node,
0161: Report.DISCARDING_UNEXPECTED);
0162: continue;
0163: }
0164:
0165: if (frameset != null) {
0166: lexer.ungetToken();
0167:
0168: if (noframes == null) {
0169: noframes = lexer.inferredTag("noframes");
0170: Node.insertNodeAtEnd(frameset, noframes);
0171: Report.warning(lexer, html, noframes,
0172: Report.INSERTING_TAG);
0173: }
0174:
0175: parseTag(lexer, noframes, mode);
0176: continue;
0177: }
0178:
0179: break; /* to parse body */
0180: }
0181:
0182: /* flag an error if we see more than one frameset */
0183: if (node.tag == tt.tagFrameset) {
0184: if (node.type != Node.StartTag) {
0185: Report.warning(lexer, html, node,
0186: Report.DISCARDING_UNEXPECTED);
0187: continue;
0188: }
0189:
0190: if (frameset != null)
0191: Report.error(lexer, html, node,
0192: Report.DUPLICATE_FRAMESET);
0193: else
0194: frameset = node;
0195:
0196: Node.insertNodeAtEnd(html, node);
0197: parseTag(lexer, node, mode);
0198:
0199: /*
0200: see if it includes a noframes element so
0201: that we can merge subsequent noframes elements
0202: */
0203:
0204: for (node = frameset.content; node != null; node = node.next) {
0205: if (node.tag == tt.tagNoframes)
0206: noframes = node;
0207: }
0208: continue;
0209: }
0210:
0211: /* if not a frameset document coerce <noframes> to <body> */
0212: if (node.tag == tt.tagNoframes) {
0213: if (node.type != Node.StartTag) {
0214: Report.warning(lexer, html, node,
0215: Report.DISCARDING_UNEXPECTED);
0216: continue;
0217: }
0218:
0219: if (frameset == null) {
0220: Report.warning(lexer, html, node,
0221: Report.DISCARDING_UNEXPECTED);
0222: node = lexer.inferredTag("body");
0223: break;
0224: }
0225:
0226: if (noframes == null) {
0227: noframes = node;
0228: Node.insertNodeAtEnd(frameset, noframes);
0229: }
0230:
0231: parseTag(lexer, noframes, mode);
0232: continue;
0233: }
0234:
0235: if (node.type == Node.StartTag
0236: || node.type == Node.StartEndTag) {
0237: if (node.tag != null
0238: && (node.tag.model & Dict.CM_HEAD) != 0) {
0239: moveToHead(lexer, html, node);
0240: continue;
0241: }
0242: }
0243:
0244: lexer.ungetToken();
0245:
0246: /* insert other content into noframes element */
0247:
0248: if (frameset != null) {
0249: if (noframes == null) {
0250: noframes = lexer.inferredTag("noframes");
0251: Node.insertNodeAtEnd(frameset, noframes);
0252: } else
0253: Report.warning(lexer, html, node,
0254: Report.NOFRAMES_CONTENT);
0255:
0256: parseTag(lexer, noframes, mode);
0257: continue;
0258: }
0259:
0260: node = lexer.inferredTag("body");
0261: break;
0262: }
0263:
0264: /* node must be body */
0265:
0266: Node.insertNodeAtEnd(html, node);
0267: parseTag(lexer, node, mode);
0268: }
0269:
0270: };
0271:
0272: public static class ParseHead implements Parser {
0273:
0274: public void parse(Lexer lexer, Node head, short mode) {
0275: Node node;
0276: int HasTitle = 0;
0277: int HasBase = 0;
0278: TagTable tt = lexer.configuration.tt;
0279:
0280: while (true) {
0281: node = lexer.getToken(Lexer.IgnoreWhitespace);
0282: if (node == null)
0283: break;
0284: if (node.tag == head.tag && node.type == Node.EndTag) {
0285: head.closed = true;
0286: break;
0287: }
0288:
0289: if (node.type == Node.TextNode) {
0290: lexer.ungetToken();
0291: break;
0292: }
0293:
0294: /* deal with comments etc. */
0295: if (Node.insertMisc(head, node))
0296: continue;
0297:
0298: if (node.type == Node.DocTypeTag) {
0299: Node.insertDocType(lexer, head, node);
0300: continue;
0301: }
0302:
0303: /* discard unknown tags */
0304: if (node.tag == null) {
0305: Report.warning(lexer, head, node,
0306: Report.DISCARDING_UNEXPECTED);
0307: continue;
0308: }
0309:
0310: if (!((node.tag.model & Dict.CM_HEAD) != 0)) {
0311: lexer.ungetToken();
0312: break;
0313: }
0314:
0315: if (node.type == Node.StartTag
0316: || node.type == Node.StartEndTag) {
0317: if (node.tag == tt.tagTitle) {
0318: ++HasTitle;
0319:
0320: if (HasTitle > 1)
0321: Report.warning(lexer, head, node,
0322: Report.TOO_MANY_ELEMENTS);
0323: } else if (node.tag == tt.tagBase) {
0324: ++HasBase;
0325:
0326: if (HasBase > 1)
0327: Report.warning(lexer, head, node,
0328: Report.TOO_MANY_ELEMENTS);
0329: } else if (node.tag == tt.tagNoscript)
0330: Report.warning(lexer, head, node,
0331: Report.TAG_NOT_ALLOWED_IN);
0332:
0333: Node.insertNodeAtEnd(head, node);
0334: parseTag(lexer, node, Lexer.IgnoreWhitespace);
0335: continue;
0336: }
0337:
0338: /* discard unexpected text nodes and end tags */
0339: Report.warning(lexer, head, node,
0340: Report.DISCARDING_UNEXPECTED);
0341: }
0342:
0343: if (HasTitle == 0) {
0344: Report.warning(lexer, head, null,
0345: Report.MISSING_TITLE_ELEMENT);
0346: Node.insertNodeAtEnd(head, lexer.inferredTag("title"));
0347: }
0348: }
0349:
0350: };
0351:
0352: public static class ParseTitle implements Parser {
0353:
0354: public void parse(Lexer lexer, Node title, short mode) {
0355: Node node;
0356:
0357: while (true) {
0358: node = lexer.getToken(Lexer.MixedContent);
0359: if (node == null)
0360: break;
0361: if (node.tag == title.tag && node.type == Node.EndTag) {
0362: title.closed = true;
0363: Node.trimSpaces(lexer, title);
0364: return;
0365: }
0366:
0367: if (node.type == Node.TextNode) {
0368: /* only called for 1st child */
0369: if (title.content == null)
0370: Node.trimInitialSpace(lexer, title, node);
0371:
0372: if (node.start >= node.end) {
0373: continue;
0374: }
0375:
0376: Node.insertNodeAtEnd(title, node);
0377: continue;
0378: }
0379:
0380: /* deal with comments etc. */
0381: if (Node.insertMisc(title, node))
0382: continue;
0383:
0384: /* discard unknown tags */
0385: if (node.tag == null) {
0386: Report.warning(lexer, title, node,
0387: Report.DISCARDING_UNEXPECTED);
0388: continue;
0389: }
0390:
0391: /* pushback unexpected tokens */
0392: Report.warning(lexer, title, node,
0393: Report.MISSING_ENDTAG_BEFORE);
0394: lexer.ungetToken();
0395: Node.trimSpaces(lexer, title);
0396: return;
0397: }
0398:
0399: Report.warning(lexer, title, node,
0400: Report.MISSING_ENDTAG_FOR);
0401: }
0402:
0403: };
0404:
0405: public static class ParseScript implements Parser {
0406:
0407: public void parse(Lexer lexer, Node script, short mode) {
0408: /*
0409: This isn't quite right for CDATA content as it recognises
0410: tags within the content and parses them accordingly.
0411: This will unfortunately screw up scripts which include
0412: < + letter, < + !, < + ? or < + / + letter
0413: */
0414:
0415: Node node;
0416:
0417: node = lexer.getCDATA(script);
0418:
0419: if (node != null)
0420: Node.insertNodeAtEnd(script, node);
0421: }
0422:
0423: };
0424:
0425: public static class ParseBody implements Parser {
0426:
0427: public void parse(Lexer lexer, Node body, short mode) {
0428: Node node;
0429: boolean checkstack, iswhitenode;
0430:
0431: mode = Lexer.IgnoreWhitespace;
0432: checkstack = true;
0433: TagTable tt = lexer.configuration.tt;
0434:
0435: while (true) {
0436: node = lexer.getToken(mode);
0437: if (node == null)
0438: break;
0439: if (node.tag == body.tag && node.type == Node.EndTag) {
0440: body.closed = true;
0441: Node.trimSpaces(lexer, body);
0442: lexer.seenBodyEndTag = 1;
0443: mode = Lexer.IgnoreWhitespace;
0444:
0445: if (body.parent.tag == tt.tagNoframes)
0446: break;
0447:
0448: continue;
0449: }
0450:
0451: if (node.tag == tt.tagNoframes) {
0452: if (node.type == Node.StartTag) {
0453: Node.insertNodeAtEnd(body, node);
0454: getParseBlock().parse(lexer, node, mode);
0455: continue;
0456: }
0457:
0458: if (node.type == Node.EndTag
0459: && body.parent.tag == tt.tagNoframes) {
0460: Node.trimSpaces(lexer, body);
0461: lexer.ungetToken();
0462: break;
0463: }
0464: }
0465:
0466: if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset)
0467: && body.parent.tag == tt.tagNoframes) {
0468: Node.trimSpaces(lexer, body);
0469: lexer.ungetToken();
0470: break;
0471: }
0472:
0473: if (node.tag == tt.tagHtml) {
0474: if (node.type == Node.StartTag
0475: || node.type == Node.StartEndTag)
0476: Report.warning(lexer, body, node,
0477: Report.DISCARDING_UNEXPECTED);
0478:
0479: continue;
0480: }
0481:
0482: iswhitenode = false;
0483:
0484: if (node.type == Node.TextNode
0485: && node.end <= node.start + 1
0486: && node.textarray[node.start] == (byte) ' ')
0487: iswhitenode = true;
0488:
0489: /* deal with comments etc. */
0490: if (Node.insertMisc(body, node))
0491: continue;
0492:
0493: if (lexer.seenBodyEndTag == 1 && !iswhitenode) {
0494: ++lexer.seenBodyEndTag;
0495: Report.warning(lexer, body, node,
0496: Report.CONTENT_AFTER_BODY);
0497: }
0498:
0499: /* mixed content model permits text */
0500: if (node.type == Node.TextNode) {
0501: if (iswhitenode && mode == Lexer.IgnoreWhitespace) {
0502: continue;
0503: }
0504:
0505: if (lexer.configuration.EncloseBodyText
0506: && !iswhitenode) {
0507: Node para;
0508:
0509: lexer.ungetToken();
0510: para = lexer.inferredTag("p");
0511: Node.insertNodeAtEnd(body, para);
0512: parseTag(lexer, para, mode);
0513: mode = Lexer.MixedContent;
0514: continue;
0515: } else
0516: /* strict doesn't allow text here */
0517: lexer.versions &= ~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20);
0518:
0519: if (checkstack) {
0520: checkstack = false;
0521:
0522: if (lexer.inlineDup(node) > 0)
0523: continue;
0524: }
0525:
0526: Node.insertNodeAtEnd(body, node);
0527: mode = Lexer.MixedContent;
0528: continue;
0529: }
0530:
0531: if (node.type == Node.DocTypeTag) {
0532: Node.insertDocType(lexer, body, node);
0533: continue;
0534: }
0535: /* discard unknown and PARAM tags */
0536: if (node.tag == null || node.tag == tt.tagParam) {
0537: Report.warning(lexer, body, node,
0538: Report.DISCARDING_UNEXPECTED);
0539: continue;
0540: }
0541:
0542: /*
0543: Netscape allows LI and DD directly in BODY
0544: We infer UL or DL respectively and use this
0545: boolean to exclude block-level elements so as
0546: to match Netscape's observed behaviour.
0547: */
0548: lexer.excludeBlocks = false;
0549:
0550: if (!((node.tag.model & Dict.CM_BLOCK) != 0)
0551: && !((node.tag.model & Dict.CM_INLINE) != 0)) {
0552: /* avoid this error message being issued twice */
0553: if (!((node.tag.model & Dict.CM_HEAD) != 0))
0554: Report.warning(lexer, body, node,
0555: Report.TAG_NOT_ALLOWED_IN);
0556:
0557: if ((node.tag.model & Dict.CM_HTML) != 0) {
0558: /* copy body attributes if current body was inferred */
0559: if (node.tag == tt.tagBody && body.implicit
0560: && body.attributes == null) {
0561: body.attributes = node.attributes;
0562: node.attributes = null;
0563: }
0564:
0565: continue;
0566: }
0567:
0568: if ((node.tag.model & Dict.CM_HEAD) != 0) {
0569: moveToHead(lexer, body, node);
0570: continue;
0571: }
0572:
0573: if ((node.tag.model & Dict.CM_LIST) != 0) {
0574: lexer.ungetToken();
0575: node = lexer.inferredTag("ul");
0576: Node.addClass(node, "noindent");
0577: lexer.excludeBlocks = true;
0578: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
0579: lexer.ungetToken();
0580: node = lexer.inferredTag("dl");
0581: lexer.excludeBlocks = true;
0582: } else if ((node.tag.model & (Dict.CM_TABLE
0583: | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0) {
0584: lexer.ungetToken();
0585: node = lexer.inferredTag("table");
0586: lexer.excludeBlocks = true;
0587: } else {
0588: /* AQ: The following line is from the official C
0589: version of tidy. It doesn't make sense to me
0590: because the '!' operator has higher precedence
0591: than the '&' operator. It seems to me that the
0592: expression always evaluates to 0.
0593:
0594: if (!node->tag->model & (CM_ROW | CM_FIELD))
0595:
0596: AQ: 13Jan2000 fixed in C tidy
0597: */
0598: if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0)) {
0599: lexer.ungetToken();
0600: return;
0601: }
0602:
0603: /* ignore </td> </th> <option> etc. */
0604: continue;
0605: }
0606: }
0607:
0608: if (node.type == Node.EndTag) {
0609: if (node.tag == tt.tagBr)
0610: node.type = Node.StartTag;
0611: else if (node.tag == tt.tagP) {
0612: Node.coerceNode(lexer, node, tt.tagBr);
0613: Node.insertNodeAtEnd(body, node);
0614: node = lexer.inferredTag("br");
0615: } else if ((node.tag.model & Dict.CM_INLINE) != 0)
0616: lexer.popInline(node);
0617: }
0618:
0619: if (node.type == Node.StartTag
0620: || node.type == Node.StartEndTag) {
0621: if (((node.tag.model & Dict.CM_INLINE) != 0)
0622: && !((node.tag.model & Dict.CM_MIXED) != 0)) {
0623: /* HTML4 strict doesn't allow inline content here */
0624: /* but HTML2 does allow img elements as children of body */
0625: if (node.tag == tt.tagImg)
0626: lexer.versions &= ~Dict.VERS_HTML40_STRICT;
0627: else
0628: lexer.versions &= ~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20);
0629:
0630: if (checkstack && !node.implicit) {
0631: checkstack = false;
0632:
0633: if (lexer.inlineDup(node) > 0)
0634: continue;
0635: }
0636:
0637: mode = Lexer.MixedContent;
0638: } else {
0639: checkstack = true;
0640: mode = Lexer.IgnoreWhitespace;
0641: }
0642:
0643: if (node.implicit)
0644: Report.warning(lexer, body, node,
0645: Report.INSERTING_TAG);
0646:
0647: Node.insertNodeAtEnd(body, node);
0648: parseTag(lexer, node, mode);
0649: continue;
0650: }
0651:
0652: /* discard unexpected tags */
0653: Report.warning(lexer, body, node,
0654: Report.DISCARDING_UNEXPECTED);
0655: }
0656: }
0657:
0658: };
0659:
0660: public static class ParseFrameSet implements Parser {
0661:
0662: public void parse(Lexer lexer, Node frameset, short mode) {
0663: Node node;
0664: TagTable tt = lexer.configuration.tt;
0665:
0666: lexer.badAccess |= Report.USING_FRAMES;
0667:
0668: while (true) {
0669: node = lexer.getToken(Lexer.IgnoreWhitespace);
0670: if (node == null)
0671: break;
0672: if (node.tag == frameset.tag
0673: && node.type == Node.EndTag) {
0674: frameset.closed = true;
0675: Node.trimSpaces(lexer, frameset);
0676: return;
0677: }
0678:
0679: /* deal with comments etc. */
0680: if (Node.insertMisc(frameset, node))
0681: continue;
0682:
0683: if (node.tag == null) {
0684: Report.warning(lexer, frameset, node,
0685: Report.DISCARDING_UNEXPECTED);
0686: continue;
0687: }
0688:
0689: if (node.type == Node.StartTag
0690: || node.type == Node.StartEndTag) {
0691: if (node.tag != null
0692: && (node.tag.model & Dict.CM_HEAD) != 0) {
0693: moveToHead(lexer, frameset, node);
0694: continue;
0695: }
0696: }
0697:
0698: if (node.tag == tt.tagBody) {
0699: lexer.ungetToken();
0700: node = lexer.inferredTag("noframes");
0701: Report.warning(lexer, frameset, node,
0702: Report.INSERTING_TAG);
0703: }
0704:
0705: if (node.type == Node.StartTag
0706: && (node.tag.model & Dict.CM_FRAMES) != 0) {
0707: Node.insertNodeAtEnd(frameset, node);
0708: lexer.excludeBlocks = false;
0709: parseTag(lexer, node, Lexer.MixedContent);
0710: continue;
0711: } else if (node.type == Node.StartEndTag
0712: && (node.tag.model & Dict.CM_FRAMES) != 0) {
0713: Node.insertNodeAtEnd(frameset, node);
0714: continue;
0715: }
0716:
0717: /* discard unexpected tags */
0718: Report.warning(lexer, frameset, node,
0719: Report.DISCARDING_UNEXPECTED);
0720: }
0721:
0722: Report.warning(lexer, frameset, node,
0723: Report.MISSING_ENDTAG_FOR);
0724: }
0725:
0726: };
0727:
0728: public static class ParseInline implements Parser {
0729:
0730: public void parse(Lexer lexer, Node element, short mode) {
0731: Node node, parent;
0732: TagTable tt = lexer.configuration.tt;
0733:
0734: if ((element.tag.model & Dict.CM_EMPTY) != 0)
0735: return;
0736:
0737: if (element.tag == tt.tagA) {
0738: if (element.attributes == null) {
0739: Report.warning(lexer, element.parent, element,
0740: Report.DISCARDING_UNEXPECTED);
0741: Node.discardElement(element);
0742: return;
0743: }
0744: }
0745:
0746: /*
0747: ParseInline is used for some block level elements like H1 to H6
0748: For such elements we need to insert inline emphasis tags currently
0749: on the inline stack. For Inline elements, we normally push them
0750: onto the inline stack provided they aren't implicit or OBJECT/APPLET.
0751: This test is carried out in PushInline and PopInline, see istack.c
0752: We don't push A or SPAN to replicate current browser behavior
0753: */
0754: if (((element.tag.model & Dict.CM_BLOCK) != 0)
0755: || (element.tag == tt.tagDt))
0756: lexer.inlineDup(null);
0757: else if ((element.tag.model & Dict.CM_INLINE) != 0
0758: && element.tag != tt.tagA
0759: && element.tag != tt.tagSpan)
0760: lexer.pushInline(element);
0761:
0762: if (element.tag == tt.tagNobr)
0763: lexer.badLayout |= Report.USING_NOBR;
0764: else if (element.tag == tt.tagFont)
0765: lexer.badLayout |= Report.USING_FONT;
0766:
0767: /* Inline elements may or may not be within a preformatted element */
0768: if (mode != Lexer.Preformatted)
0769: mode = Lexer.MixedContent;
0770:
0771: while (true) {
0772: node = lexer.getToken(mode);
0773: if (node == null)
0774: break;
0775: /* end tag for current element */
0776: if (node.tag == element.tag && node.type == Node.EndTag) {
0777: if ((element.tag.model & Dict.CM_INLINE) != 0
0778: && element.tag != tt.tagA)
0779: lexer.popInline(node);
0780:
0781: if (!((mode & Lexer.Preformatted) != 0))
0782: Node.trimSpaces(lexer, element);
0783: /*
0784: if a font element wraps an anchor and nothing else
0785: then move the font element inside the anchor since
0786: otherwise it won't alter the anchor text color
0787: */
0788: if (element.tag == tt.tagFont
0789: && element.content != null
0790: && element.content == element.last) {
0791: Node child = element.content;
0792:
0793: if (child.tag == tt.tagA) {
0794: child.parent = element.parent;
0795: child.next = element.next;
0796: child.prev = element.prev;
0797:
0798: if (child.prev != null)
0799: child.prev.next = child;
0800: else
0801: child.parent.content = child;
0802:
0803: if (child.next != null)
0804: child.next.prev = child;
0805: else
0806: child.parent.last = child;
0807:
0808: element.next = null;
0809: element.prev = null;
0810: element.parent = child;
0811: element.content = child.content;
0812: element.last = child.last;
0813: child.content = element;
0814: child.last = element;
0815: for (child = element.content; child != null; child = child.next)
0816: child.parent = element;
0817: }
0818: }
0819: element.closed = true;
0820: Node.trimSpaces(lexer, element);
0821: Node.trimEmptyElement(lexer, element);
0822: return;
0823: }
0824:
0825: /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */
0826: /* otherwise emphasis nesting is probably unintentional */
0827: /* big and small have cumulative effect to leave them alone */
0828: if (node.type == Node.StartTag
0829: && node.tag == element.tag
0830: && lexer.isPushed(node) && !node.implicit
0831: && !element.implicit && node.tag != null
0832: && ((node.tag.model & Dict.CM_INLINE) != 0)
0833: && node.tag != tt.tagA
0834: && node.tag != tt.tagFont
0835: && node.tag != tt.tagBig
0836: && node.tag != tt.tagSmall) {
0837: if (element.content != null
0838: && node.attributes == null) {
0839: Report.warning(lexer, element, node,
0840: Report.COERCE_TO_ENDTAG);
0841: node.type = Node.EndTag;
0842: lexer.ungetToken();
0843: continue;
0844: }
0845:
0846: Report.warning(lexer, element, node,
0847: Report.NESTED_EMPHASIS);
0848: }
0849:
0850: if (node.type == Node.TextNode) {
0851: /* only called for 1st child */
0852: if (element.content == null
0853: && !((mode & Lexer.Preformatted) != 0))
0854: Node.trimSpaces(lexer, element);
0855:
0856: if (node.start >= node.end) {
0857: continue;
0858: }
0859:
0860: Node.insertNodeAtEnd(element, node);
0861: continue;
0862: }
0863:
0864: /* mixed content model so allow text */
0865: if (Node.insertMisc(element, node))
0866: continue;
0867:
0868: /* deal with HTML tags */
0869: if (node.tag == tt.tagHtml) {
0870: if (node.type == Node.StartTag
0871: || node.type == Node.StartEndTag) {
0872: Report.warning(lexer, element, node,
0873: Report.DISCARDING_UNEXPECTED);
0874: continue;
0875: }
0876:
0877: /* otherwise infer end of inline element */
0878: lexer.ungetToken();
0879: if (!((mode & Lexer.Preformatted) != 0))
0880: Node.trimSpaces(lexer, element);
0881: Node.trimEmptyElement(lexer, element);
0882: return;
0883: }
0884:
0885: /* within <dt> or <pre> map <p> to <br> */
0886: if (node.tag == tt.tagP
0887: && node.type == Node.StartTag
0888: && ((mode & Lexer.Preformatted) != 0
0889: || element.tag == tt.tagDt || element
0890: .isDescendantOf(tt.tagDt))) {
0891: node.tag = tt.tagBr;
0892: node.element = "br";
0893: Node.trimSpaces(lexer, element);
0894: Node.insertNodeAtEnd(element, node);
0895: continue;
0896: }
0897:
0898: /* ignore unknown and PARAM tags */
0899: if (node.tag == null || node.tag == tt.tagParam) {
0900: Report.warning(lexer, element, node,
0901: Report.DISCARDING_UNEXPECTED);
0902: continue;
0903: }
0904:
0905: if (node.tag == tt.tagBr && node.type == Node.EndTag)
0906: node.type = Node.StartTag;
0907:
0908: if (node.type == Node.EndTag) {
0909: /* coerce </br> to <br> */
0910: if (node.tag == tt.tagBr)
0911: node.type = Node.StartTag;
0912: else if (node.tag == tt.tagP) {
0913: /* coerce unmatched </p> to <br><br> */
0914: if (!element.isDescendantOf(tt.tagP)) {
0915: Node.coerceNode(lexer, node, tt.tagBr);
0916: Node.trimSpaces(lexer, element);
0917: Node.insertNodeAtEnd(element, node);
0918: node = lexer.inferredTag("br");
0919: continue;
0920: }
0921: } else if ((node.tag.model & Dict.CM_INLINE) != 0
0922: && node.tag != tt.tagA
0923: && !((node.tag.model & Dict.CM_OBJECT) != 0)
0924: && (element.tag.model & Dict.CM_INLINE) != 0) {
0925: /* allow any inline end tag to end current element */
0926: lexer.popInline(element);
0927:
0928: if (element.tag != tt.tagA) {
0929: if (node.tag == tt.tagA
0930: && node.tag != element.tag) {
0931: Report.warning(lexer, element, node,
0932: Report.MISSING_ENDTAG_BEFORE);
0933: lexer.ungetToken();
0934: } else {
0935: Report.warning(lexer, element, node,
0936: Report.NON_MATCHING_ENDTAG);
0937: }
0938:
0939: if (!((mode & Lexer.Preformatted) != 0))
0940: Node.trimSpaces(lexer, element);
0941: Node.trimEmptyElement(lexer, element);
0942: return;
0943: }
0944:
0945: /* if parent is <a> then discard unexpected inline end tag */
0946: Report.warning(lexer, element, node,
0947: Report.DISCARDING_UNEXPECTED);
0948: continue;
0949: } /* special case </tr> etc. for stuff moved in front of table */
0950: else if (lexer.exiled && node.tag.model != 0
0951: && (node.tag.model & Dict.CM_TABLE) != 0) {
0952: lexer.ungetToken();
0953: Node.trimSpaces(lexer, element);
0954: Node.trimEmptyElement(lexer, element);
0955: return;
0956: }
0957: }
0958:
0959: /* allow any header tag to end current header */
0960: if ((node.tag.model & Dict.CM_HEADING) != 0
0961: && (element.tag.model & Dict.CM_HEADING) != 0) {
0962: if (node.tag == element.tag) {
0963: Report.warning(lexer, element, node,
0964: Report.NON_MATCHING_ENDTAG);
0965: } else {
0966: Report.warning(lexer, element, node,
0967: Report.MISSING_ENDTAG_BEFORE);
0968: lexer.ungetToken();
0969: }
0970: if (!((mode & Lexer.Preformatted) != 0))
0971: Node.trimSpaces(lexer, element);
0972: Node.trimEmptyElement(lexer, element);
0973: return;
0974: }
0975:
0976: /*
0977: an <A> tag to ends any open <A> element
0978: but <A href=...> is mapped to </A><A href=...>
0979: */
0980: if (node.tag == tt.tagA && !node.implicit
0981: && lexer.isPushed(node)) {
0982: /* coerce <a> to </a> unless it has some attributes */
0983: if (node.attributes == null) {
0984: node.type = Node.EndTag;
0985: Report.warning(lexer, element, node,
0986: Report.COERCE_TO_ENDTAG);
0987: lexer.popInline(node);
0988: lexer.ungetToken();
0989: continue;
0990: }
0991:
0992: lexer.ungetToken();
0993: Report.warning(lexer, element, node,
0994: Report.MISSING_ENDTAG_BEFORE);
0995: lexer.popInline(element);
0996: if (!((mode & Lexer.Preformatted) != 0))
0997: Node.trimSpaces(lexer, element);
0998: Node.trimEmptyElement(lexer, element);
0999: return;
1000: }
1001:
1002: if ((element.tag.model & Dict.CM_HEADING) != 0) {
1003: if (node.tag == tt.tagCenter
1004: || node.tag == tt.tagDiv) {
1005: if (node.type != Node.StartTag
1006: && node.type != Node.StartEndTag) {
1007: Report.warning(lexer, element, node,
1008: Report.DISCARDING_UNEXPECTED);
1009: continue;
1010: }
1011:
1012: Report.warning(lexer, element, node,
1013: Report.TAG_NOT_ALLOWED_IN);
1014:
1015: /* insert center as parent if heading is empty */
1016: if (element.content == null) {
1017: Node.insertNodeAsParent(element, node);
1018: continue;
1019: }
1020:
1021: /* split heading and make center parent of 2nd part */
1022: Node.insertNodeAfterElement(element, node);
1023:
1024: if (!((mode & Lexer.Preformatted) != 0))
1025: Node.trimSpaces(lexer, element);
1026:
1027: element = lexer.cloneNode(element);
1028: element.start = lexer.lexsize;
1029: element.end = lexer.lexsize;
1030: Node.insertNodeAtEnd(node, element);
1031: continue;
1032: }
1033:
1034: if (node.tag == tt.tagHr) {
1035: if (node.type != Node.StartTag
1036: && node.type != Node.StartEndTag) {
1037: Report.warning(lexer, element, node,
1038: Report.DISCARDING_UNEXPECTED);
1039: continue;
1040: }
1041:
1042: Report.warning(lexer, element, node,
1043: Report.TAG_NOT_ALLOWED_IN);
1044:
1045: /* insert hr before heading if heading is empty */
1046: if (element.content == null) {
1047: Node.insertNodeBeforeElement(element, node);
1048: continue;
1049: }
1050:
1051: /* split heading and insert hr before 2nd part */
1052: Node.insertNodeAfterElement(element, node);
1053:
1054: if (!((mode & Lexer.Preformatted) != 0))
1055: Node.trimSpaces(lexer, element);
1056:
1057: element = lexer.cloneNode(element);
1058: element.start = lexer.lexsize;
1059: element.end = lexer.lexsize;
1060: Node.insertNodeAfterElement(node, element);
1061: continue;
1062: }
1063: }
1064:
1065: if (element.tag == tt.tagDt) {
1066: if (node.tag == tt.tagHr) {
1067: Node dd;
1068:
1069: if (node.type != Node.StartTag
1070: && node.type != Node.StartEndTag) {
1071: Report.warning(lexer, element, node,
1072: Report.DISCARDING_UNEXPECTED);
1073: continue;
1074: }
1075:
1076: Report.warning(lexer, element, node,
1077: Report.TAG_NOT_ALLOWED_IN);
1078: dd = lexer.inferredTag("dd");
1079:
1080: /* insert hr within dd before dt if dt is empty */
1081: if (element.content == null) {
1082: Node.insertNodeBeforeElement(element, dd);
1083: Node.insertNodeAtEnd(dd, node);
1084: continue;
1085: }
1086:
1087: /* split dt and insert hr within dd before 2nd part */
1088: Node.insertNodeAfterElement(element, dd);
1089: Node.insertNodeAtEnd(dd, node);
1090:
1091: if (!((mode & Lexer.Preformatted) != 0))
1092: Node.trimSpaces(lexer, element);
1093:
1094: element = lexer.cloneNode(element);
1095: element.start = lexer.lexsize;
1096: element.end = lexer.lexsize;
1097: Node.insertNodeAfterElement(dd, element);
1098: continue;
1099: }
1100: }
1101:
1102: /*
1103: if this is the end tag for an ancestor element
1104: then infer end tag for this element
1105: */
1106: if (node.type == Node.EndTag) {
1107: for (parent = element.parent; parent != null; parent = parent.parent) {
1108: if (node.tag == parent.tag) {
1109: if (!((element.tag.model & Dict.CM_OPT) != 0)
1110: && !element.implicit)
1111: Report.warning(lexer, element, node,
1112: Report.MISSING_ENDTAG_BEFORE);
1113:
1114: if (element.tag == tt.tagA)
1115: lexer.popInline(element);
1116:
1117: lexer.ungetToken();
1118:
1119: if (!((mode & Lexer.Preformatted) != 0))
1120: Node.trimSpaces(lexer, element);
1121:
1122: Node.trimEmptyElement(lexer, element);
1123: return;
1124: }
1125: }
1126: }
1127:
1128: /* block level tags end this element */
1129: if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
1130: if (node.type != Node.StartTag) {
1131: Report.warning(lexer, element, node,
1132: Report.DISCARDING_UNEXPECTED);
1133: continue;
1134: }
1135:
1136: if (!((element.tag.model & Dict.CM_OPT) != 0))
1137: Report.warning(lexer, element, node,
1138: Report.MISSING_ENDTAG_BEFORE);
1139:
1140: if ((node.tag.model & Dict.CM_HEAD) != 0
1141: && !((node.tag.model & Dict.CM_BLOCK) != 0)) {
1142: moveToHead(lexer, element, node);
1143: continue;
1144: }
1145:
1146: /*
1147: prevent anchors from propagating into block tags
1148: except for headings h1 to h6
1149: */
1150: if (element.tag == tt.tagA) {
1151: if (node.tag != null
1152: && !((node.tag.model & Dict.CM_HEADING) != 0))
1153: lexer.popInline(element);
1154: else if (!(element.content != null)) {
1155: Node.discardElement(element);
1156: lexer.ungetToken();
1157: return;
1158: }
1159: }
1160:
1161: lexer.ungetToken();
1162:
1163: if (!((mode & Lexer.Preformatted) != 0))
1164: Node.trimSpaces(lexer, element);
1165:
1166: Node.trimEmptyElement(lexer, element);
1167: return;
1168: }
1169:
1170: /* parse inline element */
1171: if (node.type == Node.StartTag
1172: || node.type == Node.StartEndTag) {
1173: if (node.implicit)
1174: Report.warning(lexer, element, node,
1175: Report.INSERTING_TAG);
1176:
1177: /* trim white space before <br> */
1178: if (node.tag == tt.tagBr)
1179: Node.trimSpaces(lexer, element);
1180:
1181: Node.insertNodeAtEnd(element, node);
1182: parseTag(lexer, node, mode);
1183: continue;
1184: }
1185:
1186: /* discard unexpected tags */
1187: Report.warning(lexer, element, node,
1188: Report.DISCARDING_UNEXPECTED);
1189: }
1190:
1191: if (!((element.tag.model & Dict.CM_OPT) != 0))
1192: Report.warning(lexer, element, node,
1193: Report.MISSING_ENDTAG_FOR);
1194:
1195: Node.trimEmptyElement(lexer, element);
1196: }
1197: };
1198:
1199: public static class ParseList implements Parser {
1200:
1201: public void parse(Lexer lexer, Node list, short mode) {
1202: Node node;
1203: Node parent;
1204: TagTable tt = lexer.configuration.tt;
1205:
1206: if ((list.tag.model & Dict.CM_EMPTY) != 0)
1207: return;
1208:
1209: lexer.insert = -1; /* defer implicit inline start tags */
1210:
1211: while (true) {
1212: node = lexer.getToken(Lexer.IgnoreWhitespace);
1213: if (node == null)
1214: break;
1215:
1216: if (node.tag == list.tag && node.type == Node.EndTag) {
1217: if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1218: Node.coerceNode(lexer, list, tt.tagUl);
1219:
1220: list.closed = true;
1221: Node.trimEmptyElement(lexer, list);
1222: return;
1223: }
1224:
1225: /* deal with comments etc. */
1226: if (Node.insertMisc(list, node))
1227: continue;
1228:
1229: if (node.type != Node.TextNode && node.tag == null) {
1230: Report.warning(lexer, list, node,
1231: Report.DISCARDING_UNEXPECTED);
1232: continue;
1233: }
1234:
1235: /*
1236: if this is the end tag for an ancestor element
1237: then infer end tag for this element
1238: */
1239: if (node.type == Node.EndTag) {
1240: if (node.tag == tt.tagForm) {
1241: lexer.badForm = 1;
1242: Report.warning(lexer, list, node,
1243: Report.DISCARDING_UNEXPECTED);
1244: continue;
1245: }
1246:
1247: if (node.tag != null
1248: && (node.tag.model & Dict.CM_INLINE) != 0) {
1249: Report.warning(lexer, list, node,
1250: Report.DISCARDING_UNEXPECTED);
1251: lexer.popInline(node);
1252: continue;
1253: }
1254:
1255: for (parent = list.parent; parent != null; parent = parent.parent) {
1256: if (node.tag == parent.tag) {
1257: Report.warning(lexer, list, node,
1258: Report.MISSING_ENDTAG_BEFORE);
1259: lexer.ungetToken();
1260:
1261: if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1262: Node.coerceNode(lexer, list, tt.tagUl);
1263:
1264: Node.trimEmptyElement(lexer, list);
1265: return;
1266: }
1267: }
1268:
1269: Report.warning(lexer, list, node,
1270: Report.DISCARDING_UNEXPECTED);
1271: continue;
1272: }
1273:
1274: if (node.tag != tt.tagLi) {
1275: lexer.ungetToken();
1276:
1277: if (node.tag != null
1278: && (node.tag.model & Dict.CM_BLOCK) != 0
1279: && lexer.excludeBlocks) {
1280: Report.warning(lexer, list, node,
1281: Report.MISSING_ENDTAG_BEFORE);
1282: Node.trimEmptyElement(lexer, list);
1283: return;
1284: }
1285:
1286: node = lexer.inferredTag("li");
1287: node.addAttribute("style", "list-style: none");
1288: Report.warning(lexer, list, node,
1289: Report.MISSING_STARTTAG);
1290: }
1291:
1292: /* node should be <LI> */
1293: Node.insertNodeAtEnd(list, node);
1294: parseTag(lexer, node, Lexer.IgnoreWhitespace);
1295: }
1296:
1297: if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1298: Node.coerceNode(lexer, list, tt.tagUl);
1299:
1300: Report
1301: .warning(lexer, list, node,
1302: Report.MISSING_ENDTAG_FOR);
1303: Node.trimEmptyElement(lexer, list);
1304: }
1305:
1306: };
1307:
1308: public static class ParseDefList implements Parser {
1309:
1310: public void parse(Lexer lexer, Node list, short mode) {
1311: Node node, parent;
1312: TagTable tt = lexer.configuration.tt;
1313:
1314: if ((list.tag.model & Dict.CM_EMPTY) != 0)
1315: return;
1316:
1317: lexer.insert = -1; /* defer implicit inline start tags */
1318:
1319: while (true) {
1320: node = lexer.getToken(Lexer.IgnoreWhitespace);
1321: if (node == null)
1322: break;
1323: if (node.tag == list.tag && node.type == Node.EndTag) {
1324: list.closed = true;
1325: Node.trimEmptyElement(lexer, list);
1326: return;
1327: }
1328:
1329: /* deal with comments etc. */
1330: if (Node.insertMisc(list, node))
1331: continue;
1332:
1333: if (node.type == Node.TextNode) {
1334: lexer.ungetToken();
1335: node = lexer.inferredTag("dt");
1336: Report.warning(lexer, list, node,
1337: Report.MISSING_STARTTAG);
1338: }
1339:
1340: if (node.tag == null) {
1341: Report.warning(lexer, list, node,
1342: Report.DISCARDING_UNEXPECTED);
1343: continue;
1344: }
1345:
1346: /*
1347: if this is the end tag for an ancestor element
1348: then infer end tag for this element
1349: */
1350: if (node.type == Node.EndTag) {
1351: if (node.tag == tt.tagForm) {
1352: lexer.badForm = 1;
1353: Report.warning(lexer, list, node,
1354: Report.DISCARDING_UNEXPECTED);
1355: continue;
1356: }
1357:
1358: for (parent = list.parent; parent != null; parent = parent.parent) {
1359: if (node.tag == parent.tag) {
1360: Report.warning(lexer, list, node,
1361: Report.MISSING_ENDTAG_BEFORE);
1362:
1363: lexer.ungetToken();
1364: Node.trimEmptyElement(lexer, list);
1365: return;
1366: }
1367: }
1368: }
1369:
1370: /* center in a dt or a dl breaks the dl list in two */
1371: if (node.tag == tt.tagCenter) {
1372: if (list.content != null)
1373: Node.insertNodeAfterElement(list, node);
1374: else /* trim empty dl list */
1375: {
1376: Node.insertNodeBeforeElement(list, node);
1377: Node.discardElement(list);
1378: }
1379:
1380: /* and parse contents of center */
1381: parseTag(lexer, node, mode);
1382:
1383: /* now create a new dl element */
1384: list = lexer.inferredTag("dl");
1385: Node.insertNodeAfterElement(node, list);
1386: continue;
1387: }
1388:
1389: if (!(node.tag == tt.tagDt || node.tag == tt.tagDd)) {
1390: lexer.ungetToken();
1391:
1392: if (!((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) {
1393: Report.warning(lexer, list, node,
1394: Report.TAG_NOT_ALLOWED_IN);
1395: Node.trimEmptyElement(lexer, list);
1396: return;
1397: }
1398:
1399: /* if DD appeared directly in BODY then exclude blocks */
1400: if (!((node.tag.model & Dict.CM_INLINE) != 0)
1401: && lexer.excludeBlocks) {
1402: Node.trimEmptyElement(lexer, list);
1403: return;
1404: }
1405:
1406: node = lexer.inferredTag("dd");
1407: Report.warning(lexer, list, node,
1408: Report.MISSING_STARTTAG);
1409: }
1410:
1411: if (node.type == Node.EndTag) {
1412: Report.warning(lexer, list, node,
1413: Report.DISCARDING_UNEXPECTED);
1414: continue;
1415: }
1416:
1417: /* node should be <DT> or <DD>*/
1418: Node.insertNodeAtEnd(list, node);
1419: parseTag(lexer, node, Lexer.IgnoreWhitespace);
1420: }
1421:
1422: Report
1423: .warning(lexer, list, node,
1424: Report.MISSING_ENDTAG_FOR);
1425: Node.trimEmptyElement(lexer, list);
1426: }
1427:
1428: };
1429:
1430: public static class ParsePre implements Parser {
1431:
1432: public void parse(Lexer lexer, Node pre, short mode) {
1433: Node node, parent;
1434: TagTable tt = lexer.configuration.tt;
1435:
1436: if ((pre.tag.model & Dict.CM_EMPTY) != 0)
1437: return;
1438:
1439: if ((pre.tag.model & Dict.CM_OBSOLETE) != 0)
1440: Node.coerceNode(lexer, pre, tt.tagPre);
1441:
1442: lexer.inlineDup(null); /* tell lexer to insert inlines if needed */
1443:
1444: while (true) {
1445: node = lexer.getToken(Lexer.Preformatted);
1446: if (node == null)
1447: break;
1448: if (node.tag == pre.tag && node.type == Node.EndTag) {
1449: Node.trimSpaces(lexer, pre);
1450: pre.closed = true;
1451: Node.trimEmptyElement(lexer, pre);
1452: return;
1453: }
1454:
1455: if (node.tag == tt.tagHtml) {
1456: if (node.type == Node.StartTag
1457: || node.type == Node.StartEndTag)
1458: Report.warning(lexer, pre, node,
1459: Report.DISCARDING_UNEXPECTED);
1460:
1461: continue;
1462: }
1463:
1464: if (node.type == Node.TextNode) {
1465: /* if first check for inital newline */
1466: if (pre.content == null) {
1467: if (node.textarray[node.start] == (byte) '\n')
1468: ++node.start;
1469:
1470: if (node.start >= node.end) {
1471: continue;
1472: }
1473: }
1474:
1475: Node.insertNodeAtEnd(pre, node);
1476: continue;
1477: }
1478:
1479: /* deal with comments etc. */
1480: if (Node.insertMisc(pre, node))
1481: continue;
1482:
1483: /* discard unknown and PARAM tags */
1484: if (node.tag == null || node.tag == tt.tagParam) {
1485: Report.warning(lexer, pre, node,
1486: Report.DISCARDING_UNEXPECTED);
1487: continue;
1488: }
1489:
1490: if (node.tag == tt.tagP) {
1491: if (node.type == Node.StartTag) {
1492: Report.warning(lexer, pre, node,
1493: Report.USING_BR_INPLACE_OF);
1494:
1495: /* trim white space before <p> in <pre>*/
1496: Node.trimSpaces(lexer, pre);
1497:
1498: /* coerce both <p> and </p> to <br> */
1499: Node.coerceNode(lexer, node, tt.tagBr);
1500: Node.insertNodeAtEnd(pre, node);
1501: } else {
1502: Report.warning(lexer, pre, node,
1503: Report.DISCARDING_UNEXPECTED);
1504: }
1505: continue;
1506: }
1507:
1508: if ((node.tag.model & Dict.CM_HEAD) != 0
1509: && !((node.tag.model & Dict.CM_BLOCK) != 0)) {
1510: moveToHead(lexer, pre, node);
1511: continue;
1512: }
1513:
1514: /*
1515: if this is the end tag for an ancestor element
1516: then infer end tag for this element
1517: */
1518: if (node.type == Node.EndTag) {
1519: if (node.tag == tt.tagForm) {
1520: lexer.badForm = 1;
1521: Report.warning(lexer, pre, node,
1522: Report.DISCARDING_UNEXPECTED);
1523: continue;
1524: }
1525:
1526: for (parent = pre.parent; parent != null; parent = parent.parent) {
1527: if (node.tag == parent.tag) {
1528: Report.warning(lexer, pre, node,
1529: Report.MISSING_ENDTAG_BEFORE);
1530:
1531: lexer.ungetToken();
1532: Node.trimSpaces(lexer, pre);
1533: Node.trimEmptyElement(lexer, pre);
1534: return;
1535: }
1536: }
1537: }
1538:
1539: /* what about head content, HEAD, BODY tags etc? */
1540: if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
1541: if (node.type != Node.StartTag) {
1542: Report.warning(lexer, pre, node,
1543: Report.DISCARDING_UNEXPECTED);
1544: continue;
1545: }
1546:
1547: Report.warning(lexer, pre, node,
1548: Report.MISSING_ENDTAG_BEFORE);
1549: lexer.excludeBlocks = true;
1550:
1551: /* check if we need to infer a container */
1552: if ((node.tag.model & Dict.CM_LIST) != 0) {
1553: lexer.ungetToken();
1554: node = lexer.inferredTag("ul");
1555: Node.addClass(node, "noindent");
1556: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
1557: lexer.ungetToken();
1558: node = lexer.inferredTag("dl");
1559: } else if ((node.tag.model & Dict.CM_TABLE) != 0) {
1560: lexer.ungetToken();
1561: node = lexer.inferredTag("table");
1562: }
1563:
1564: Node.insertNodeAfterElement(pre, node);
1565: pre = lexer.inferredTag("pre");
1566: Node.insertNodeAfterElement(node, pre);
1567: parseTag(lexer, node, Lexer.IgnoreWhitespace);
1568: lexer.excludeBlocks = false;
1569: continue;
1570: }
1571: /*
1572: if (!((node.tag.model & Dict.CM_INLINE) != 0))
1573: {
1574: Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_BEFORE);
1575: lexer.ungetToken();
1576: return;
1577: }
1578: */
1579: if (node.type == Node.StartTag
1580: || node.type == Node.StartEndTag) {
1581: /* trim white space before <br> */
1582: if (node.tag == tt.tagBr)
1583: Node.trimSpaces(lexer, pre);
1584:
1585: Node.insertNodeAtEnd(pre, node);
1586: parseTag(lexer, node, Lexer.Preformatted);
1587: continue;
1588: }
1589:
1590: /* discard unexpected tags */
1591: Report.warning(lexer, pre, node,
1592: Report.DISCARDING_UNEXPECTED);
1593: }
1594:
1595: Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_FOR);
1596: Node.trimEmptyElement(lexer, pre);
1597: }
1598:
1599: };
1600:
1601: public static class ParseBlock implements Parser {
1602:
1603: public void parse(Lexer lexer, Node element, short mode)
1604: /*
1605: element is node created by the lexer
1606: upon seeing the start tag, or by the
1607: parser when the start tag is inferred
1608: */
1609: {
1610: Node node, parent;
1611: boolean checkstack;
1612: int istackbase = 0;
1613: TagTable tt = lexer.configuration.tt;
1614:
1615: checkstack = true;
1616:
1617: if ((element.tag.model & Dict.CM_EMPTY) != 0)
1618: return;
1619:
1620: if (element.tag == tt.tagForm
1621: && element.isDescendantOf(tt.tagForm))
1622: Report.warning(lexer, element, null,
1623: Report.ILLEGAL_NESTING);
1624:
1625: /*
1626: InlineDup() asks the lexer to insert inline emphasis tags
1627: currently pushed on the istack, but take care to avoid
1628: propagating inline emphasis inside OBJECT or APPLET.
1629: For these elements a fresh inline stack context is created
1630: and disposed of upon reaching the end of the element.
1631: They thus behave like table cells in this respect.
1632: */
1633: if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1634: istackbase = lexer.istackbase;
1635: lexer.istackbase = lexer.istack.size();
1636: }
1637:
1638: if (!((element.tag.model & Dict.CM_MIXED) != 0))
1639: lexer.inlineDup(null);
1640:
1641: mode = Lexer.IgnoreWhitespace;
1642:
1643: while (true) {
1644: node = lexer.getToken(mode /*Lexer.MixedContent*/);
1645: if (node == null)
1646: break;
1647: /* end tag for this element */
1648: if (node.type == Node.EndTag
1649: && node.tag != null
1650: && (node.tag == element.tag || element.was == node.tag)) {
1651:
1652: if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1653: /* pop inline stack */
1654: while (lexer.istack.size() > lexer.istackbase)
1655: lexer.popInline(null);
1656: lexer.istackbase = istackbase;
1657: }
1658:
1659: element.closed = true;
1660: Node.trimSpaces(lexer, element);
1661: Node.trimEmptyElement(lexer, element);
1662: return;
1663: }
1664: // BEGIN RAVE MODIFICATIONS
1665: // if (node.tag == tt.tagHtml ||
1666: // node.tag == tt.tagHead ||
1667: // node.tag == tt.tagBody)
1668: if (lexer.configuration.inputJspMode
1669: && node.tag == tt.tagHtml) {
1670: Node.insertNodeAtEnd(element, node);
1671: getParseHTML().parse(lexer, node, (short) 0); // TODO?
1672: continue;
1673: } else if (!lexer.configuration.inputJspMode
1674: && (node.tag == tt.tagHtml
1675: || node.tag == tt.tagHead || node.tag == tt.tagBody))
1676: // END RAVE MODIFICATIONS
1677: {
1678: if (node.type == Node.StartTag
1679: || node.type == Node.StartEndTag)
1680: Report.warning(lexer, element, node,
1681: Report.DISCARDING_UNEXPECTED);
1682:
1683: continue;
1684: }
1685:
1686: if (node.type == Node.EndTag) {
1687: if (node.tag == null) {
1688: Report.warning(lexer, element, node,
1689: Report.DISCARDING_UNEXPECTED);
1690:
1691: continue;
1692: } else if (node.tag == tt.tagBr)
1693: node.type = Node.StartTag;
1694: else if (node.tag == tt.tagP) {
1695: Node.coerceNode(lexer, node, tt.tagBr);
1696: Node.insertNodeAtEnd(element, node);
1697: node = lexer.inferredTag("br");
1698: } else {
1699: /*
1700: if this is the end tag for an ancestor element
1701: then infer end tag for this element
1702: */
1703: for (parent = element.parent; parent != null; parent = parent.parent) {
1704: if (node.tag == parent.tag) {
1705: if (!((element.tag.model & Dict.CM_OPT) != 0))
1706: Report
1707: .warning(
1708: lexer,
1709: element,
1710: node,
1711: Report.MISSING_ENDTAG_BEFORE);
1712:
1713: lexer.ungetToken();
1714:
1715: if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1716: /* pop inline stack */
1717: while (lexer.istack.size() > lexer.istackbase)
1718: lexer.popInline(null);
1719: lexer.istackbase = istackbase;
1720: }
1721:
1722: Node.trimSpaces(lexer, element);
1723: Node.trimEmptyElement(lexer, element);
1724: return;
1725: }
1726: }
1727: /* special case </tr> etc. for stuff moved in front of table */
1728: if (lexer.exiled
1729: && node.tag.model != 0
1730: && (node.tag.model & Dict.CM_TABLE) != 0) {
1731: lexer.ungetToken();
1732: Node.trimSpaces(lexer, element);
1733: Node.trimEmptyElement(lexer, element);
1734: return;
1735: }
1736: }
1737: }
1738:
1739: /* mixed content model permits text */
1740: if (node.type == Node.TextNode) {
1741: boolean iswhitenode = false;
1742:
1743: if (node.type == Node.TextNode
1744: && node.end <= node.start + 1
1745: && lexer.lexbuf[node.start] == (byte) ' ')
1746: iswhitenode = true;
1747:
1748: if (lexer.configuration.EncloseBlockText
1749: && !iswhitenode) {
1750: lexer.ungetToken();
1751: node = lexer.inferredTag("p");
1752: Node.insertNodeAtEnd(element, node);
1753: parseTag(lexer, node, Lexer.MixedContent);
1754: continue;
1755: }
1756:
1757: if (checkstack) {
1758: checkstack = false;
1759:
1760: if (!((element.tag.model & Dict.CM_MIXED) != 0)) {
1761: if (lexer.inlineDup(node) > 0)
1762: continue;
1763: }
1764: }
1765:
1766: Node.insertNodeAtEnd(element, node);
1767: mode = Lexer.MixedContent;
1768: /*
1769: HTML4 strict doesn't allow mixed content for
1770: elements with %block; as their content model
1771: */
1772: lexer.versions &= ~Dict.VERS_HTML40_STRICT;
1773: continue;
1774: }
1775:
1776: if (Node.insertMisc(element, node))
1777: continue;
1778:
1779: /* allow PARAM elements? */
1780: if (node.tag == tt.tagParam) {
1781: if (((element.tag.model & Dict.CM_PARAM) != 0)
1782: && (node.type == Node.StartTag || node.type == Node.StartEndTag)) {
1783: Node.insertNodeAtEnd(element, node);
1784: continue;
1785: }
1786:
1787: /* otherwise discard it */
1788: Report.warning(lexer, element, node,
1789: Report.DISCARDING_UNEXPECTED);
1790: continue;
1791: }
1792:
1793: /* allow AREA elements? */
1794: if (node.tag == tt.tagArea) {
1795: if ((element.tag == tt.tagMap)
1796: && (node.type == Node.StartTag || node.type == Node.StartEndTag)) {
1797: Node.insertNodeAtEnd(element, node);
1798: continue;
1799: }
1800:
1801: /* otherwise discard it */
1802: Report.warning(lexer, element, node,
1803: Report.DISCARDING_UNEXPECTED);
1804: continue;
1805: }
1806:
1807: /* ignore unknown start/end tags */
1808: if (node.tag == null) {
1809: Report.warning(lexer, element, node,
1810: Report.DISCARDING_UNEXPECTED);
1811: continue;
1812: }
1813:
1814: /*
1815: Allow Dict.CM_INLINE elements here.
1816:
1817: Allow Dict.CM_BLOCK elements here unless
1818: lexer.excludeBlocks is yes.
1819:
1820: LI and DD are special cased.
1821:
1822: Otherwise infer end tag for this element.
1823: */
1824:
1825: if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
1826: if (node.type != Node.StartTag
1827: && node.type != Node.StartEndTag) {
1828: Report.warning(lexer, element, node,
1829: Report.DISCARDING_UNEXPECTED);
1830: continue;
1831: }
1832:
1833: if (element.tag == tt.tagTd
1834: || element.tag == tt.tagTh) {
1835: /* if parent is a table cell, avoid inferring the end of the cell */
1836:
1837: if ((node.tag.model & Dict.CM_HEAD) != 0) {
1838: moveToHead(lexer, element, node);
1839: continue;
1840: }
1841:
1842: if ((node.tag.model & Dict.CM_LIST) != 0) {
1843: lexer.ungetToken();
1844: node = lexer.inferredTag("ul");
1845: Node.addClass(node, "noindent");
1846: lexer.excludeBlocks = true;
1847: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
1848: lexer.ungetToken();
1849: node = lexer.inferredTag("dl");
1850: lexer.excludeBlocks = true;
1851: }
1852:
1853: /* infer end of current table cell */
1854: if (!((node.tag.model & Dict.CM_BLOCK) != 0)) {
1855: lexer.ungetToken();
1856: Node.trimSpaces(lexer, element);
1857: Node.trimEmptyElement(lexer, element);
1858: return;
1859: }
1860: } else if ((node.tag.model & Dict.CM_BLOCK) != 0) {
1861: if (lexer.excludeBlocks) {
1862: if (!((element.tag.model & Dict.CM_OPT) != 0))
1863: Report.warning(lexer, element, node,
1864: Report.MISSING_ENDTAG_BEFORE);
1865:
1866: lexer.ungetToken();
1867:
1868: if ((element.tag.model & Dict.CM_OBJECT) != 0)
1869: lexer.istackbase = istackbase;
1870:
1871: Node.trimSpaces(lexer, element);
1872: Node.trimEmptyElement(lexer, element);
1873: return;
1874: }
1875: } else /* things like list items */
1876: {
1877: if (!((element.tag.model & Dict.CM_OPT) != 0)
1878: && !element.implicit)
1879: Report.warning(lexer, element, node,
1880: Report.MISSING_ENDTAG_BEFORE);
1881:
1882: if ((node.tag.model & Dict.CM_HEAD) != 0) {
1883: moveToHead(lexer, element, node);
1884: continue;
1885: }
1886:
1887: lexer.ungetToken();
1888:
1889: if ((node.tag.model & Dict.CM_LIST) != 0) {
1890: if (element.parent != null
1891: && element.parent.tag != null
1892: && element.parent.tag.parser == getParseList()) {
1893: Node.trimSpaces(lexer, element);
1894: Node.trimEmptyElement(lexer, element);
1895: return;
1896: }
1897:
1898: node = lexer.inferredTag("ul");
1899: Node.addClass(node, "noindent");
1900: } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
1901: if (element.parent.tag == tt.tagDl) {
1902: Node.trimSpaces(lexer, element);
1903: Node.trimEmptyElement(lexer, element);
1904: return;
1905: }
1906:
1907: node = lexer.inferredTag("dl");
1908: } else if ((node.tag.model & Dict.CM_TABLE) != 0
1909: || (node.tag.model & Dict.CM_ROW) != 0) {
1910: node = lexer.inferredTag("table");
1911: } else if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1912: /* pop inline stack */
1913: while (lexer.istack.size() > lexer.istackbase)
1914: lexer.popInline(null);
1915: lexer.istackbase = istackbase;
1916: Node.trimSpaces(lexer, element);
1917: Node.trimEmptyElement(lexer, element);
1918: return;
1919:
1920: } else {
1921: Node.trimSpaces(lexer, element);
1922: Node.trimEmptyElement(lexer, element);
1923: return;
1924: }
1925: }
1926: }
1927:
1928: /* parse known element */
1929: if (node.type == Node.StartTag
1930: || node.type == Node.StartEndTag) {
1931: if ((node.tag.model & Dict.CM_INLINE) != 0) {
1932: if (checkstack && !node.implicit) {
1933: checkstack = false;
1934:
1935: if (lexer.inlineDup(node) > 0)
1936: continue;
1937: }
1938:
1939: mode = Lexer.MixedContent;
1940: } else {
1941: checkstack = true;
1942: mode = Lexer.IgnoreWhitespace;
1943: }
1944:
1945: /* trim white space before <br> */
1946: if (node.tag == tt.tagBr)
1947: Node.trimSpaces(lexer, element);
1948:
1949: Node.insertNodeAtEnd(element, node);
1950:
1951: if (node.implicit)
1952: Report.warning(lexer, element, node,
1953: Report.INSERTING_TAG);
1954:
1955: parseTag(lexer, node, Lexer.IgnoreWhitespace /*Lexer.MixedContent*/);
1956: continue;
1957: }
1958:
1959: /* discard unexpected tags */
1960: if (node.type == Node.EndTag)
1961: lexer.popInline(node); /* if inline end tag */
1962:
1963: Report.warning(lexer, element, node,
1964: Report.DISCARDING_UNEXPECTED);
1965: }
1966:
1967: if (!((element.tag.model & Dict.CM_OPT) != 0))
1968: Report.warning(lexer, element, node,
1969: Report.MISSING_ENDTAG_FOR);
1970:
1971: if ((element.tag.model & Dict.CM_OBJECT) != 0) {
1972: /* pop inline stack */
1973: while (lexer.istack.size() > lexer.istackbase)
1974: lexer.popInline(null);
1975: lexer.istackbase = istackbase;
1976: }
1977:
1978: Node.trimSpaces(lexer, element);
1979: Node.trimEmptyElement(lexer, element);
1980: }
1981:
1982: };
1983:
1984: public static class ParseTableTag implements Parser {
1985:
1986: public void parse(Lexer lexer, Node table, short mode) {
1987: Node node, parent;
1988: int istackbase;
1989: TagTable tt = lexer.configuration.tt;
1990:
1991: lexer.deferDup();
1992: istackbase = lexer.istackbase;
1993: lexer.istackbase = lexer.istack.size();
1994:
1995: while (true) {
1996: node = lexer.getToken(Lexer.IgnoreWhitespace);
1997: if (node == null)
1998: break;
1999: if (node.tag == table.tag && node.type == Node.EndTag) {
2000: lexer.istackbase = istackbase;
2001: table.closed = true;
2002: Node.trimEmptyElement(lexer, table);
2003: return;
2004: }
2005:
2006: /* deal with comments etc. */
2007: if (Node.insertMisc(table, node))
2008: continue;
2009:
2010: /* discard unknown tags */
2011: if (node.tag == null && node.type != Node.TextNode) {
2012: Report.warning(lexer, table, node,
2013: Report.DISCARDING_UNEXPECTED);
2014: continue;
2015: }
2016:
2017: /* if TD or TH or text or inline or block then infer <TR> */
2018:
2019: if (node.type != Node.EndTag) {
2020: if (node.tag == tt.tagTd || node.tag == tt.tagTh
2021: || node.tag == tt.tagTable) {
2022: lexer.ungetToken();
2023: node = lexer.inferredTag("tr");
2024: Report.warning(lexer, table, node,
2025: Report.MISSING_STARTTAG);
2026: } else if (node.type == Node.TextNode
2027: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2028: Node.insertNodeBeforeElement(table, node);
2029: Report.warning(lexer, table, node,
2030: Report.TAG_NOT_ALLOWED_IN);
2031: lexer.exiled = true;
2032:
2033: /* AQ: TODO
2034: Line 2040 of parser.c (13 Jan 2000) reads as follows:
2035: if (!node->type == TextNode)
2036: This will always evaluate to false.
2037: This has been reported to Dave Raggett <dsr@w3.org>
2038: */
2039: //Should be?: if (!(node.type == Node.TextNode))
2040: if (false)
2041: parseTag(lexer, node,
2042: Lexer.IgnoreWhitespace);
2043:
2044: lexer.exiled = false;
2045: continue;
2046: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2047: moveToHead(lexer, table, node);
2048: continue;
2049: }
2050: }
2051:
2052: /*
2053: if this is the end tag for an ancestor element
2054: then infer end tag for this element
2055: */
2056: if (node.type == Node.EndTag) {
2057: if (node.tag == tt.tagForm) {
2058: lexer.badForm = 1;
2059: Report.warning(lexer, table, node,
2060: Report.DISCARDING_UNEXPECTED);
2061: continue;
2062: }
2063:
2064: if (node.tag != null
2065: && (node.tag.model & (Dict.CM_TABLE | Dict.CM_ROW)) != 0) {
2066: Report.warning(lexer, table, node,
2067: Report.DISCARDING_UNEXPECTED);
2068: continue;
2069: }
2070:
2071: for (parent = table.parent; parent != null; parent = parent.parent) {
2072: if (node.tag == parent.tag) {
2073: Report.warning(lexer, table, node,
2074: Report.MISSING_ENDTAG_BEFORE);
2075: lexer.ungetToken();
2076: lexer.istackbase = istackbase;
2077: Node.trimEmptyElement(lexer, table);
2078: return;
2079: }
2080: }
2081: }
2082:
2083: if (!((node.tag.model & Dict.CM_TABLE) != 0)) {
2084: lexer.ungetToken();
2085: Report.warning(lexer, table, node,
2086: Report.TAG_NOT_ALLOWED_IN);
2087: lexer.istackbase = istackbase;
2088: Node.trimEmptyElement(lexer, table);
2089: return;
2090: }
2091:
2092: if (node.type == Node.StartTag
2093: || node.type == Node.StartEndTag) {
2094: Node.insertNodeAtEnd(table, node);
2095: ;
2096: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2097: continue;
2098: }
2099:
2100: /* discard unexpected text nodes and end tags */
2101: Report.warning(lexer, table, node,
2102: Report.DISCARDING_UNEXPECTED);
2103: }
2104:
2105: Report.warning(lexer, table, node,
2106: Report.MISSING_ENDTAG_FOR);
2107: Node.trimEmptyElement(lexer, table);
2108: lexer.istackbase = istackbase;
2109: }
2110:
2111: };
2112:
2113: public static class ParseColGroup implements Parser {
2114:
2115: public void parse(Lexer lexer, Node colgroup, short mode) {
2116: Node node, parent;
2117: TagTable tt = lexer.configuration.tt;
2118:
2119: if ((colgroup.tag.model & Dict.CM_EMPTY) != 0)
2120: return;
2121:
2122: while (true) {
2123: node = lexer.getToken(Lexer.IgnoreWhitespace);
2124: if (node == null)
2125: break;
2126: if (node.tag == colgroup.tag
2127: && node.type == Node.EndTag) {
2128: colgroup.closed = true;
2129: return;
2130: }
2131:
2132: /*
2133: if this is the end tag for an ancestor element
2134: then infer end tag for this element
2135: */
2136: if (node.type == Node.EndTag) {
2137: if (node.tag == tt.tagForm) {
2138: lexer.badForm = 1;
2139: Report.warning(lexer, colgroup, node,
2140: Report.DISCARDING_UNEXPECTED);
2141: continue;
2142: }
2143:
2144: for (parent = colgroup.parent; parent != null; parent = parent.parent) {
2145:
2146: if (node.tag == parent.tag) {
2147: lexer.ungetToken();
2148: return;
2149: }
2150: }
2151: }
2152:
2153: if (node.type == Node.TextNode) {
2154: lexer.ungetToken();
2155: return;
2156: }
2157:
2158: /* deal with comments etc. */
2159: if (Node.insertMisc(colgroup, node))
2160: continue;
2161:
2162: /* discard unknown tags */
2163: if (node.tag == null) {
2164: Report.warning(lexer, colgroup, node,
2165: Report.DISCARDING_UNEXPECTED);
2166: continue;
2167: }
2168:
2169: if (node.tag != tt.tagCol) {
2170: lexer.ungetToken();
2171: return;
2172: }
2173:
2174: if (node.type == Node.EndTag) {
2175: Report.warning(lexer, colgroup, node,
2176: Report.DISCARDING_UNEXPECTED);
2177: continue;
2178: }
2179:
2180: /* node should be <COL> */
2181: Node.insertNodeAtEnd(colgroup, node);
2182: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2183: }
2184: }
2185:
2186: };
2187:
2188: public static class ParseRowGroup implements Parser {
2189:
2190: public void parse(Lexer lexer, Node rowgroup, short mode) {
2191: Node node, parent;
2192: TagTable tt = lexer.configuration.tt;
2193:
2194: if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0)
2195: return;
2196:
2197: while (true) {
2198: node = lexer.getToken(Lexer.IgnoreWhitespace);
2199: if (node == null)
2200: break;
2201: if (node.tag == rowgroup.tag) {
2202: if (node.type == Node.EndTag) {
2203: rowgroup.closed = true;
2204: Node.trimEmptyElement(lexer, rowgroup);
2205: return;
2206: }
2207:
2208: lexer.ungetToken();
2209: return;
2210: }
2211:
2212: /* if </table> infer end tag */
2213: if (node.tag == tt.tagTable && node.type == Node.EndTag) {
2214: lexer.ungetToken();
2215: Node.trimEmptyElement(lexer, rowgroup);
2216: return;
2217: }
2218:
2219: /* deal with comments etc. */
2220: if (Node.insertMisc(rowgroup, node))
2221: continue;
2222:
2223: /* discard unknown tags */
2224: if (node.tag == null && node.type != Node.TextNode) {
2225: Report.warning(lexer, rowgroup, node,
2226: Report.DISCARDING_UNEXPECTED);
2227: continue;
2228: }
2229:
2230: /*
2231: if TD or TH then infer <TR>
2232: if text or inline or block move before table
2233: if head content move to head
2234: */
2235:
2236: if (node.type != Node.EndTag) {
2237: if (node.tag == tt.tagTd || node.tag == tt.tagTh) {
2238: lexer.ungetToken();
2239: node = lexer.inferredTag("tr");
2240: Report.warning(lexer, rowgroup, node,
2241: Report.MISSING_STARTTAG);
2242: } else if (node.type == Node.TextNode
2243: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2244: Node.moveBeforeTable(rowgroup, node, tt);
2245: Report.warning(lexer, rowgroup, node,
2246: Report.TAG_NOT_ALLOWED_IN);
2247: lexer.exiled = true;
2248:
2249: if (node.type != Node.TextNode)
2250: parseTag(lexer, node,
2251: Lexer.IgnoreWhitespace);
2252:
2253: lexer.exiled = false;
2254: continue;
2255: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2256: Report.warning(lexer, rowgroup, node,
2257: Report.TAG_NOT_ALLOWED_IN);
2258: moveToHead(lexer, rowgroup, node);
2259: continue;
2260: }
2261: }
2262:
2263: /*
2264: if this is the end tag for ancestor element
2265: then infer end tag for this element
2266: */
2267: if (node.type == Node.EndTag) {
2268: if (node.tag == tt.tagForm) {
2269: lexer.badForm = 1;
2270: Report.warning(lexer, rowgroup, node,
2271: Report.DISCARDING_UNEXPECTED);
2272: continue;
2273: }
2274:
2275: if (node.tag == tt.tagTr || node.tag == tt.tagTd
2276: || node.tag == tt.tagTh) {
2277: Report.warning(lexer, rowgroup, node,
2278: Report.DISCARDING_UNEXPECTED);
2279: continue;
2280: }
2281:
2282: for (parent = rowgroup.parent; parent != null; parent = parent.parent) {
2283: if (node.tag == parent.tag) {
2284: lexer.ungetToken();
2285: Node.trimEmptyElement(lexer, rowgroup);
2286: return;
2287: }
2288: }
2289: }
2290:
2291: /*
2292: if THEAD, TFOOT or TBODY then implied end tag
2293:
2294: */
2295: if ((node.tag.model & Dict.CM_ROWGRP) != 0) {
2296: if (node.type != Node.EndTag)
2297: lexer.ungetToken();
2298:
2299: Node.trimEmptyElement(lexer, rowgroup);
2300: return;
2301: }
2302:
2303: if (node.type == Node.EndTag) {
2304: Report.warning(lexer, rowgroup, node,
2305: Report.DISCARDING_UNEXPECTED);
2306: continue;
2307: }
2308:
2309: if (!(node.tag == tt.tagTr)) {
2310: node = lexer.inferredTag("tr");
2311: Report.warning(lexer, rowgroup, node,
2312: Report.MISSING_STARTTAG);
2313: lexer.ungetToken();
2314: }
2315:
2316: /* node should be <TR> */
2317: Node.insertNodeAtEnd(rowgroup, node);
2318: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2319: }
2320:
2321: Node.trimEmptyElement(lexer, rowgroup);
2322: }
2323:
2324: };
2325:
2326: public static class ParseRow implements Parser {
2327:
2328: public void parse(Lexer lexer, Node row, short mode) {
2329: Node node, parent;
2330: boolean exclude_state;
2331: TagTable tt = lexer.configuration.tt;
2332:
2333: if ((row.tag.model & Dict.CM_EMPTY) != 0)
2334: return;
2335:
2336: while (true) {
2337: node = lexer.getToken(Lexer.IgnoreWhitespace);
2338: if (node == null)
2339: break;
2340: if (node.tag == row.tag) {
2341: if (node.type == Node.EndTag) {
2342: row.closed = true;
2343: Node.fixEmptyRow(lexer, row);
2344: return;
2345: }
2346:
2347: lexer.ungetToken();
2348: Node.fixEmptyRow(lexer, row);
2349: return;
2350: }
2351:
2352: /*
2353: if this is the end tag for an ancestor element
2354: then infer end tag for this element
2355: */
2356: if (node.type == Node.EndTag) {
2357: if (node.tag == tt.tagForm) {
2358: lexer.badForm = 1;
2359: Report.warning(lexer, row, node,
2360: Report.DISCARDING_UNEXPECTED);
2361: continue;
2362: }
2363:
2364: if (node.tag == tt.tagTd || node.tag == tt.tagTh) {
2365: Report.warning(lexer, row, node,
2366: Report.DISCARDING_UNEXPECTED);
2367: continue;
2368: }
2369:
2370: for (parent = row.parent; parent != null; parent = parent.parent) {
2371: if (node.tag == parent.tag) {
2372: lexer.ungetToken();
2373: Node.trimEmptyElement(lexer, row);
2374: return;
2375: }
2376: }
2377: }
2378:
2379: /* deal with comments etc. */
2380: if (Node.insertMisc(row, node))
2381: continue;
2382:
2383: /* discard unknown tags */
2384: if (node.tag == null && node.type != Node.TextNode) {
2385: Report.warning(lexer, row, node,
2386: Report.DISCARDING_UNEXPECTED);
2387: continue;
2388: }
2389:
2390: /* discard unexpected <table> element */
2391: if (node.tag == tt.tagTable) {
2392: Report.warning(lexer, row, node,
2393: Report.DISCARDING_UNEXPECTED);
2394: continue;
2395: }
2396:
2397: /* THEAD, TFOOT or TBODY */
2398: if (node.tag != null
2399: && (node.tag.model & Dict.CM_ROWGRP) != 0) {
2400: lexer.ungetToken();
2401: Node.trimEmptyElement(lexer, row);
2402: return;
2403: }
2404:
2405: if (node.type == Node.EndTag) {
2406: Report.warning(lexer, row, node,
2407: Report.DISCARDING_UNEXPECTED);
2408: continue;
2409: }
2410:
2411: /*
2412: if text or inline or block move before table
2413: if head content move to head
2414: */
2415:
2416: if (node.type != Node.EndTag) {
2417: if (node.tag == tt.tagForm) {
2418: lexer.ungetToken();
2419: node = lexer.inferredTag("td");
2420: Report.warning(lexer, row, node,
2421: Report.MISSING_STARTTAG);
2422: } else if (node.type == Node.TextNode
2423: || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
2424: Node.moveBeforeTable(row, node, tt);
2425: Report.warning(lexer, row, node,
2426: Report.TAG_NOT_ALLOWED_IN);
2427: lexer.exiled = true;
2428:
2429: if (node.type != Node.TextNode)
2430: parseTag(lexer, node,
2431: Lexer.IgnoreWhitespace);
2432:
2433: lexer.exiled = false;
2434: continue;
2435: } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
2436: Report.warning(lexer, row, node,
2437: Report.TAG_NOT_ALLOWED_IN);
2438: moveToHead(lexer, row, node);
2439: continue;
2440: }
2441: }
2442:
2443: if (!(node.tag == tt.tagTd || node.tag == tt.tagTh)) {
2444: Report.warning(lexer, row, node,
2445: Report.TAG_NOT_ALLOWED_IN);
2446: continue;
2447: }
2448:
2449: /* node should be <TD> or <TH> */
2450: Node.insertNodeAtEnd(row, node);
2451: exclude_state = lexer.excludeBlocks;
2452: lexer.excludeBlocks = false;
2453: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2454: lexer.excludeBlocks = exclude_state;
2455:
2456: /* pop inline stack */
2457:
2458: while (lexer.istack.size() > lexer.istackbase)
2459: lexer.popInline(null);
2460: }
2461:
2462: Node.trimEmptyElement(lexer, row);
2463: }
2464:
2465: };
2466:
2467: public static class ParseNoFrames implements Parser {
2468:
2469: public void parse(Lexer lexer, Node noframes, short mode) {
2470: Node node;
2471: boolean checkstack;
2472: TagTable tt = lexer.configuration.tt;
2473:
2474: lexer.badAccess |= Report.USING_NOFRAMES;
2475: mode = Lexer.IgnoreWhitespace;
2476: checkstack = true;
2477:
2478: while (true) {
2479: node = lexer.getToken(mode);
2480: if (node == null)
2481: break;
2482: if (node.tag == noframes.tag
2483: && node.type == Node.EndTag) {
2484: noframes.closed = true;
2485: Node.trimSpaces(lexer, noframes);
2486: return;
2487: }
2488:
2489: if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset)) {
2490: Report.warning(lexer, noframes, node,
2491: Report.MISSING_ENDTAG_BEFORE);
2492: Node.trimSpaces(lexer, noframes);
2493: lexer.ungetToken();
2494: return;
2495: }
2496:
2497: if (node.tag == tt.tagHtml) {
2498: if (node.type == Node.StartTag
2499: || node.type == Node.StartEndTag)
2500: Report.warning(lexer, noframes, node,
2501: Report.DISCARDING_UNEXPECTED);
2502:
2503: continue;
2504: }
2505:
2506: /* deal with comments etc. */
2507: if (Node.insertMisc(noframes, node))
2508: continue;
2509:
2510: if (node.tag == tt.tagBody
2511: && node.type == Node.StartTag) {
2512: Node.insertNodeAtEnd(noframes, node);
2513: parseTag(lexer, node, Lexer.IgnoreWhitespace /*MixedContent*/);
2514: continue;
2515: }
2516:
2517: /* implicit body element inferred */
2518: if (node.type == Node.TextNode || node.tag != null) {
2519: lexer.ungetToken();
2520: node = lexer.inferredTag("body");
2521: if (lexer.configuration.XmlOut)
2522: Report.warning(lexer, noframes, node,
2523: Report.INSERTING_TAG);
2524: Node.insertNodeAtEnd(noframes, node);
2525: parseTag(lexer, node, Lexer.IgnoreWhitespace /*MixedContent*/);
2526: continue;
2527: }
2528: /* discard unexpected end tags */
2529: Report.warning(lexer, noframes, node,
2530: Report.DISCARDING_UNEXPECTED);
2531: }
2532:
2533: Report.warning(lexer, noframes, node,
2534: Report.MISSING_ENDTAG_FOR);
2535: }
2536:
2537: };
2538:
2539: public static class ParseSelect implements Parser {
2540:
2541: public void parse(Lexer lexer, Node field, short mode) {
2542: Node node;
2543: TagTable tt = lexer.configuration.tt;
2544:
2545: lexer.insert = -1; /* defer implicit inline start tags */
2546:
2547: while (true) {
2548: node = lexer.getToken(Lexer.IgnoreWhitespace);
2549: if (node == null)
2550: break;
2551: if (node.tag == field.tag && node.type == Node.EndTag) {
2552: field.closed = true;
2553: Node.trimSpaces(lexer, field);
2554: return;
2555: }
2556:
2557: /* deal with comments etc. */
2558: if (Node.insertMisc(field, node))
2559: continue;
2560:
2561: if (node.type == Node.StartTag
2562: && (node.tag == tt.tagOption
2563: || node.tag == tt.tagOptgroup || node.tag == tt.tagScript)) {
2564: Node.insertNodeAtEnd(field, node);
2565: parseTag(lexer, node, Lexer.IgnoreWhitespace);
2566: continue;
2567: }
2568:
2569: /* discard unexpected tags */
2570: Report.warning(lexer, field, node,
2571: Report.DISCARDING_UNEXPECTED);
2572: }
2573:
2574: Report.warning(lexer, field, node,
2575: Report.MISSING_ENDTAG_FOR);
2576: }
2577:
2578: };
2579:
2580: public static class ParseText implements Parser {
2581:
2582: public void parse(Lexer lexer, Node field, short mode) {
2583: Node node;
2584: TagTable tt = lexer.configuration.tt;
2585:
2586: lexer.insert = -1; /* defer implicit inline start tags */
2587:
2588: if (field.tag == tt.tagTextarea)
2589: mode = Lexer.Preformatted;
2590:
2591: while (true) {
2592: node = lexer.getToken(mode);
2593: if (node == null)
2594: break;
2595: if (node.tag == field.tag && node.type == Node.EndTag) {
2596: field.closed = true;
2597: Node.trimSpaces(lexer, field);
2598: return;
2599: }
2600:
2601: /* deal with comments etc. */
2602: if (Node.insertMisc(field, node))
2603: continue;
2604:
2605: if (node.type == Node.TextNode) {
2606: /* only called for 1st child */
2607: if (field.content == null
2608: && !((mode & Lexer.Preformatted) != 0))
2609: Node.trimSpaces(lexer, field);
2610:
2611: if (node.start >= node.end) {
2612: continue;
2613: }
2614:
2615: Node.insertNodeAtEnd(field, node);
2616: continue;
2617: }
2618:
2619: if (node.tag == tt.tagFont) {
2620: Report.warning(lexer, field, node,
2621: Report.DISCARDING_UNEXPECTED);
2622: continue;
2623: }
2624:
2625: /* terminate element on other tags */
2626: if (!((field.tag.model & Dict.CM_OPT) != 0))
2627: Report.warning(lexer, field, node,
2628: Report.MISSING_ENDTAG_BEFORE);
2629:
2630: lexer.ungetToken();
2631: Node.trimSpaces(lexer, field);
2632: return;
2633: }
2634:
2635: if (!((field.tag.model & Dict.CM_OPT) != 0))
2636: Report.warning(lexer, field, node,
2637: Report.MISSING_ENDTAG_FOR);
2638: }
2639:
2640: };
2641:
2642: public static class ParseOptGroup implements Parser {
2643:
2644: public void parse(Lexer lexer, Node field, short mode) {
2645: Node node;
2646: TagTable tt = lexer.configuration.tt;
2647:
2648: lexer.insert = -1; /* defer implicit inline start tags */
2649:
2650: while (true) {
2651: node = lexer.getToken(Lexer.IgnoreWhitespace);
2652: if (node == null)
2653: break;
2654: if (node.tag == field.tag && node.type == Node.EndTag) {
2655: field.closed = true;
2656: Node.trimSpaces(lexer, field);
2657: return;
2658: }
2659:
2660: /* deal with comments etc. */
2661: if (Node.insertMisc(field, node))
2662: continue;
2663:
2664: if (node.type == Node.StartTag
2665: && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup)) {
2666: if (node.tag == tt.tagOptgroup)
2667: Report.warning(lexer, field, node,
2668: Report.CANT_BE_NESTED);
2669:
2670: Node.insertNodeAtEnd(field, node);
2671: parseTag(lexer, node, Lexer.MixedContent);
2672: continue;
2673: }
2674:
2675: /* discard unexpected tags */
2676: Report.warning(lexer, field, node,
2677: Report.DISCARDING_UNEXPECTED);
2678: }
2679: }
2680:
2681: };
2682:
2683: public static Parser getParseHTML() {
2684: return _parseHTML;
2685: }
2686:
2687: public static Parser getParseHead() {
2688: return _parseHead;
2689: }
2690:
2691: public static Parser getParseTitle() {
2692: return _parseTitle;
2693: }
2694:
2695: public static Parser getParseScript() {
2696: return _parseScript;
2697: }
2698:
2699: public static Parser getParseBody() {
2700: return _parseBody;
2701: }
2702:
2703: public static Parser getParseFrameSet() {
2704: return _parseFrameSet;
2705: }
2706:
2707: public static Parser getParseInline() {
2708: return _parseInline;
2709: }
2710:
2711: public static Parser getParseList() {
2712: return _parseList;
2713: }
2714:
2715: public static Parser getParseDefList() {
2716: return _parseDefList;
2717: }
2718:
2719: public static Parser getParsePre() {
2720: return _parsePre;
2721: }
2722:
2723: public static Parser getParseBlock() {
2724: return _parseBlock;
2725: }
2726:
2727: public static Parser getParseTableTag() {
2728: return _parseTableTag;
2729: }
2730:
2731: public static Parser getParseColGroup() {
2732: return _parseColGroup;
2733: }
2734:
2735: public static Parser getParseRowGroup() {
2736: return _parseRowGroup;
2737: }
2738:
2739: public static Parser getParseRow() {
2740: return _parseRow;
2741: }
2742:
2743: public static Parser getParseNoFrames() {
2744: return _parseNoFrames;
2745: }
2746:
2747: public static Parser getParseSelect() {
2748: return _parseSelect;
2749: }
2750:
2751: public static Parser getParseText() {
2752: return _parseText;
2753: }
2754:
2755: public static Parser getParseOptGroup() {
2756: return _parseOptGroup;
2757: }
2758:
2759: private static Parser _parseHTML = new ParseHTML();
2760: private static Parser _parseHead = new ParseHead();
2761: private static Parser _parseTitle = new ParseTitle();
2762: private static Parser _parseScript = new ParseScript();
2763: private static Parser _parseBody = new ParseBody();
2764: private static Parser _parseFrameSet = new ParseFrameSet();
2765: private static Parser _parseInline = new ParseInline();
2766: private static Parser _parseList = new ParseList();
2767: private static Parser _parseDefList = new ParseDefList();
2768: private static Parser _parsePre = new ParsePre();
2769: private static Parser _parseBlock = new ParseBlock();
2770: private static Parser _parseTableTag = new ParseTableTag();
2771: private static Parser _parseColGroup = new ParseColGroup();
2772: private static Parser _parseRowGroup = new ParseRowGroup();
2773: private static Parser _parseRow = new ParseRow();
2774: private static Parser _parseNoFrames = new ParseNoFrames();
2775: private static Parser _parseSelect = new ParseSelect();
2776: private static Parser _parseText = new ParseText();
2777: private static Parser _parseOptGroup = new ParseOptGroup();
2778:
2779: /*
2780: HTML is the top level element
2781: */
2782: public static Node parseDocument(Lexer lexer) {
2783: Node node, document, html;
2784: Node doctype = null;
2785: TagTable tt = lexer.configuration.tt;
2786:
2787: document = lexer.newNode();
2788: document.type = Node.RootNode;
2789:
2790: while (true) {
2791: node = lexer.getToken(Lexer.IgnoreWhitespace);
2792: if (node == null)
2793: break;
2794:
2795: /* deal with comments etc. */
2796: if (Node.insertMisc(document, node))
2797: continue;
2798:
2799: if (node.type == Node.DocTypeTag) {
2800: if (doctype == null) {
2801: Node.insertNodeAtEnd(document, node);
2802: doctype = node;
2803: } else
2804: Report.warning(lexer, document, node,
2805: Report.DISCARDING_UNEXPECTED);
2806: continue;
2807: }
2808:
2809: if (node.type == Node.EndTag) {
2810: Report.warning(lexer, document, node,
2811: Report.DISCARDING_UNEXPECTED); //TODO?
2812: continue;
2813: }
2814:
2815: // BEGIN RAVE MODIFICATIONS
2816: if (node.tag == tt.tagJspRoot) {
2817: lexer.configuration.inputJspMode = true;
2818: Node root = node;
2819: Node.insertNodeAtEnd(document, root);
2820: getParseBlock().parse(lexer, root, (short) 0); // TODO?
2821: break;
2822: } else
2823: // END RAVE MODIFICATIONS
2824:
2825: if (node.type != Node.StartTag || node.tag != tt.tagHtml) {
2826: lexer.ungetToken();
2827: html = lexer.inferredTag("html");
2828: } else
2829: html = node;
2830:
2831: Node.insertNodeAtEnd(document, html);
2832: getParseHTML().parse(lexer, html, (short) 0); // TODO?
2833: break;
2834: }
2835:
2836: return document;
2837: }
2838:
2839: /**
2840: * Indicates whether or not whitespace should be preserved for this element.
2841: * If an <code>xml:space</code> attribute is found, then if the attribute value is
2842: * <code>preserve</code>, returns <code>true</code>. For any other value, returns
2843: * <code>false</code>. If an <code>xml:space</code> attribute was <em>not</em>
2844: * found, then the following element names result in a return value of <code>true:
2845: * pre, script, style,</code> and <code>xsl:text</code>. Finally, if a
2846: * <code>TagTable</code> was passed in and the element appears as the "pre" element
2847: * in the <code>TagTable</code>, then <code>true</code> will be returned.
2848: * Otherwise, <code>false</code> is returned.
2849: * @param element The <code>Node</code> to test to see if whitespace should be
2850: * preserved.
2851: * @param tt The <code>TagTable</code> to test for the <code>getNodePre()</code>
2852: * function. This may be <code>null</code>, in which case this test
2853: * is bypassed.
2854: * @return <code>true</code> or <code>false</code>, as explained above.
2855: */
2856:
2857: public static boolean XMLPreserveWhiteSpace(Node element,
2858: TagTable tt) {
2859: AttVal attribute;
2860:
2861: /* search attributes for xml:space */
2862: for (attribute = element.attributes; attribute != null; attribute = attribute.next) {
2863: if (attribute.attribute.equals("xml:space")) {
2864: if (attribute.value.equals("preserve"))
2865: return true;
2866:
2867: return false;
2868: }
2869: }
2870:
2871: /* kludge for html docs without explicit xml:space attribute */
2872: if (Lexer.wstrcasecmp(element.element, "pre") == 0
2873: || Lexer.wstrcasecmp(element.element, "script") == 0
2874: || Lexer.wstrcasecmp(element.element, "style") == 0)
2875: return true;
2876:
2877: if ((tt != null) && (tt.findParser(element) == getParsePre()))
2878: return true;
2879:
2880: /* kludge for XSL docs */
2881: if (Lexer.wstrcasecmp(element.element, "xsl:text") == 0)
2882: return true;
2883:
2884: return false;
2885: }
2886:
2887: /*
2888: XML documents
2889: */
2890: public static void parseXMLElement(Lexer lexer, Node element,
2891: short mode) {
2892: Node node;
2893:
2894: /* Jeff Young's kludge for XSL docs */
2895:
2896: if (Lexer.wstrcasecmp(element.element, "xsl:text") == 0)
2897: return;
2898:
2899: /* if node is pre or has xml:space="preserve" then do so */
2900:
2901: if (XMLPreserveWhiteSpace(element, lexer.configuration.tt))
2902: mode = Lexer.Preformatted;
2903:
2904: while (true) {
2905: node = lexer.getToken(mode);
2906: if (node == null)
2907: break;
2908: if (node.type == Node.EndTag
2909: && node.element.equals(element.element)) {
2910: element.closed = true;
2911: break;
2912: }
2913:
2914: /* discard unexpected end tags */
2915: if (node.type == Node.EndTag) {
2916: Report.error(lexer, element, node,
2917: Report.UNEXPECTED_ENDTAG);
2918: continue;
2919: }
2920:
2921: /* parse content on seeing start tag */
2922: if (node.type == Node.StartTag)
2923: parseXMLElement(lexer, node, mode);
2924:
2925: Node.insertNodeAtEnd(element, node);
2926: }
2927:
2928: /*
2929: if first child is text then trim initial space and
2930: delete text node if it is empty.
2931: */
2932:
2933: node = element.content;
2934:
2935: if (node != null && node.type == Node.TextNode
2936: && mode != Lexer.Preformatted) {
2937: if (node.textarray[node.start] == (byte) ' ') {
2938: node.start++;
2939:
2940: if (node.start >= node.end)
2941: Node.discardElement(node);
2942: }
2943: }
2944:
2945: /*
2946: if last child is text then trim final space and
2947: delete the text node if it is empty
2948: */
2949:
2950: node = element.last;
2951:
2952: if (node != null && node.type == Node.TextNode
2953: && mode != Lexer.Preformatted) {
2954: if (node.textarray[node.end - 1] == (byte) ' ') {
2955: node.end--;
2956:
2957: if (node.start >= node.end)
2958: Node.discardElement(node);
2959: }
2960: }
2961: }
2962:
2963: public static Node parseXMLDocument(Lexer lexer) {
2964: Node node, document, doctype;
2965:
2966: document = lexer.newNode();
2967: document.type = Node.RootNode;
2968: doctype = null;
2969: lexer.configuration.XmlTags = true;
2970:
2971: while (true) {
2972: node = lexer.getToken(Lexer.IgnoreWhitespace);
2973: if (node == null)
2974: break;
2975: /* discard unexpected end tags */
2976: if (node.type == Node.EndTag) {
2977: Report.warning(lexer, null, node,
2978: Report.UNEXPECTED_ENDTAG);
2979: continue;
2980: }
2981:
2982: /* deal with comments etc. */
2983: if (Node.insertMisc(document, node))
2984: continue;
2985:
2986: if (node.type == Node.DocTypeTag) {
2987: if (doctype == null) {
2988: Node.insertNodeAtEnd(document, node);
2989: doctype = node;
2990: } else
2991: Report.warning(lexer, document, node,
2992: Report.DISCARDING_UNEXPECTED); // TODO
2993: continue;
2994: }
2995:
2996: /* if start tag then parse element's content */
2997: if (node.type == Node.StartTag) {
2998: Node.insertNodeAtEnd(document, node);
2999: parseXMLElement(lexer, node, Lexer.IgnoreWhitespace);
3000: }
3001:
3002: }
3003:
3004: if (false) { //#if 0
3005: /* discard the document type */
3006: node = document.findDocType();
3007:
3008: if (node != null)
3009: Node.discardElement(node);
3010: } // #endif
3011:
3012: if (doctype != null && !lexer.checkDocTypeKeyWords(doctype))
3013: Report.warning(lexer, doctype, null,
3014: Report.DTYPE_NOT_UPPER_CASE);
3015:
3016: /* ensure presence of initial <?XML version="1.0"?> */
3017: if (lexer.configuration.XmlPi)
3018: lexer.fixXMLPI(document);
3019:
3020: return document;
3021: }
3022:
3023: public static boolean isJavaScript(Node node) {
3024: boolean result = false;
3025: AttVal attr;
3026:
3027: if (node.attributes == null)
3028: return true;
3029:
3030: for (attr = node.attributes; attr != null; attr = attr.next) {
3031: if ((Lexer.wstrcasecmp(attr.attribute, "language") == 0 || Lexer
3032: .wstrcasecmp(attr.attribute, "type") == 0)
3033: && Lexer.wsubstr(attr.value, "javascript"))
3034: result = true;
3035: }
3036:
3037: return result;
3038: }
3039:
3040: }
|