0001: /*
0002: * Java HTML Tidy - JTidy
0003: * HTML parser and pretty printer
0004: *
0005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0006: * Institute of Technology, Institut National de Recherche en
0007: * Informatique et en Automatique, Keio University). All Rights
0008: * Reserved.
0009: *
0010: * Contributing Author(s):
0011: *
0012: * Dave Raggett <dsr@w3.org>
0013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0014: * Gary L Peskin <garyp@firstech.com> (Java development)
0015: * Sami Lempinen <sami@lempinen.net> (release management)
0016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
0017: *
0018: * The contributing author(s) would like to thank all those who
0019: * helped with testing, bug fixes, and patience. This wouldn't
0020: * have been possible without all of you.
0021: *
0022: * COPYRIGHT NOTICE:
0023: *
0024: * This software and documentation is provided "as is," and
0025: * the copyright holders and contributing author(s) make no
0026: * representations or warranties, express or implied, including
0027: * but not limited to, warranties of merchantability or fitness
0028: * for any particular purpose or that the use of the software or
0029: * documentation will not infringe any third party patents,
0030: * copyrights, trademarks or other rights.
0031: *
0032: * The copyright holders and contributing author(s) will not be
0033: * liable for any direct, indirect, special or consequential damages
0034: * arising out of any use of the software or documentation, even if
0035: * advised of the possibility of such damage.
0036: *
0037: * Permission is hereby granted to use, copy, modify, and distribute
0038: * this source code, or portions hereof, documentation and executables,
0039: * for any purpose, without fee, subject to the following restrictions:
0040: *
0041: * 1. The origin of this source code must not be misrepresented.
0042: * 2. Altered versions must be plainly marked as such and must
0043: * not be misrepresented as being the original source.
0044: * 3. This Copyright notice may not be removed or altered from any
0045: * source or altered source distribution.
0046: *
0047: * The copyright holders and contributing author(s) specifically
0048: * permit, without fee, and encourage the use of this source code
0049: * as a component for supporting the Hypertext Markup Language in
0050: * commercial products. If you use this source code in a product,
0051: * acknowledgment is not required but would be appreciated.
0052: *
0053: */
0054: package org.w3c.tidy;
0055:
0056: import java.io.PrintWriter;
0057: import java.util.List;
0058: import java.util.Stack;
0059: import java.util.Vector;
0060:
0061: /**
 * Lexer for html parser.
 * <p>
 * Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token; UngetToken(fp) provides
 * one level of undo. The tags include an attribute list: a linked list of attribute/value nodes, where each node has
 * 2 null-terminated strings. Entities are replaced in attribute values. White space is compacted if not in
 * preformatted mode: leading white space is discarded and subsequent white space sequences are compacted to single
 * space chars. If XmlTags is no then tag names are folded to lower case (see parseTagName) and attribute names to
 * lower case. Not yet done: Doctype subset and marked sections.
 * </p>
0071: * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
0072: * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
0073: * @author Fabrizio Giustina
0074: * @version $Revision: 1.93 $ ($Author: fgiust $)
0075: */
0076: public class Lexer {
0077:
/**
 * state: ignore whitespace.
 */
public static final short IGNORE_WHITESPACE = 0;

/**
 * state: mixed content.
 */
public static final short MIXED_CONTENT = 1;

/**
 * state: preformatted. <code>parseEntity</code> tests this value as a bit of its <code>mode</code> argument and, when
 * it is set, stores a non-breaking space (160) as a plain space.
 */
public static final short PREFORMATTED = 2;

/**
 * state: ignore markup.
 */
public static final short IGNORE_MARKUP = 3;

/**
 * URI for XHTML 1.0 transitional DTD.
 */
private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";

/**
 * URI for XHTML 1.0 strict DTD.
 */
private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";

/**
 * URI for XHTML 1.0 frameset DTD.
 */
private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";

/**
 * URI for XHTML 1.1.
 * NOTE(review): the XHTML 1.1 row of W3CVERSION below passes VOYAGER_STRICT rather than this constant - confirm
 * whether that is intentional.
 */
private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";

// URI for XHTML Basic 1.0 (currently disabled, kept for reference).
// private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";
/**
 * xhtml namespace.
 */
private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";

/**
 * Lists all the known versions. Each row is built from an HTML version name, the corresponding XHTML name, a DTD
 * URI and a <code>Dict</code> version code; rows are looked up by their <code>name</code>, <code>voyagerName</code>
 * and <code>code</code> members. Note that the name "HTML 4.01" appears twice (first and last row) with different
 * codes, so name-based lookups depend on the row order.
 */
private static final Lexer.W3CVersionInfo[] W3CVERSION = {
        new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict",
                VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
        new W3CVersionInfo("HTML 4.01 Transitional",
                "XHTML 1.0 Transitional", VOYAGER_LOOSE,
                Dict.VERS_HTML40_LOOSE),
        new W3CVersionInfo("HTML 4.01 Frameset",
                "XHTML 1.0 Frameset", VOYAGER_FRAMESET,
                Dict.VERS_FRAMESET),
        new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict",
                VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
        new W3CVersionInfo("HTML 4.0 Transitional",
                "XHTML 1.0 Transitional", VOYAGER_LOOSE,
                Dict.VERS_HTML40_LOOSE),
        new W3CVersionInfo("HTML 4.0 Frameset",
                "XHTML 1.0 Frameset", VOYAGER_FRAMESET,
                Dict.VERS_FRAMESET),
        new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional",
                VOYAGER_LOOSE, Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 3.2 Final",
                "XHTML 1.0 Transitional", VOYAGER_LOOSE,
                Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 3.2 Draft",
                "XHTML 1.0 Transitional", VOYAGER_LOOSE,
                Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict",
                VOYAGER_STRICT, Dict.VERS_HTML20),
        new W3CVersionInfo("HTML 4.01", "XHTML 1.1",
                VOYAGER_STRICT, Dict.VERS_XHTML11) };

/**
 * getToken state: content. This is the state a new Lexer starts in.
 */
private static final short LEX_CONTENT = 0;

/**
 * getToken state: gt.
 */
private static final short LEX_GT = 1;

/**
 * getToken state: endtag.
 */
private static final short LEX_ENDTAG = 2;

/**
 * getToken state: start tag.
 */
private static final short LEX_STARTTAG = 3;

/**
 * getToken state: comment.
 */
private static final short LEX_COMMENT = 4;

/**
 * getToken state: doctype.
 */
private static final short LEX_DOCTYPE = 5;

/**
 * getToken state: procinstr.
 */
private static final short LEX_PROCINSTR = 6;

/**
 * getToken state: cdata. (The value 7 is not assigned to any state declared here.)
 */
private static final short LEX_CDATA = 8;

/**
 * getToken state: section.
 */
private static final short LEX_SECTION = 9;

/**
 * getToken state: asp.
 */
private static final short LEX_ASP = 10;

/**
 * getToken state: jste.
 */
private static final short LEX_JSTE = 11;

/**
 * getToken state: php.
 */
private static final short LEX_PHP = 12;

/**
 * getToken state: xml declaration.
 */
private static final short LEX_XMLDECL = 13;
0224:
/**
 * File stream the lexer reads characters from (and pushes characters back to).
 */
protected StreamIn in;

/**
 * Error output stream.
 */
protected PrintWriter errout;

/**
 * For accessibility errors.
 */
protected short badAccess;

/**
 * For bad style errors.
 */
protected short badLayout;

/**
 * For bad char encodings.
 */
protected short badChars;

/**
 * For mismatched/mispositioned form tags.
 */
protected short badForm;

/**
 * Count of warnings in this document.
 */
protected short warnings;

/**
 * Count of errors.
 */
protected short errors;

/**
 * Lines seen. Starts at 1; parseEntity also sets it to the stream's current line before reporting an entity
 * problem.
 */
protected int lines;

/**
 * Column at start of current token. Starts at 1; parseEntity also sets it to the column of the offending
 * '&amp;' before reporting an entity problem.
 */
protected int columns;

/**
 * Used to collapse contiguous white space.
 */
protected boolean waswhite;

/**
 * True after token has been pushed back.
 */
protected boolean pushed;

/**
 * When space is moved after end tag.
 */
protected boolean insertspace;

/**
 * Netscape compatibility.
 */
protected boolean excludeBlocks;

/**
 * True if moved out of table.
 */
protected boolean exiled;

/**
 * True if xmlns attribute on html element. When set, htmlVersionName reports the XHTML name of the detected
 * version instead of the HTML name.
 */
protected boolean isvoyager;

/**
 * Bit vector of HTML versions. Initialised to <code>Dict.VERS_ALL | Dict.VERS_PROPRIETARY</code>.
 */
protected short versions;

/**
 * Version as given by doctype (if any). Initialised to <code>Dict.VERS_UNKNOWN</code>.
 */
protected int doctype;

/**
 * Set if html or PUBLIC is missing.
 */
protected boolean badDoctype;

/**
 * Start of current node (offset into <code>lexbuf</code>).
 */
protected int txtstart;

/**
 * End of current node (offset into <code>lexbuf</code>); parseTagName sets it to <code>lexsize</code>.
 */
protected int txtend;

/**
 * State of lexer's finite state machine (one of the LEX_* constants).
 */
protected short state;

/**
 * Current node.
 */
protected Node token;

/**
 * Lexer character buffer. Parse tree nodes span onto this buffer, which contains the concatenated text contents of
 * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars. addByte replaces this array
 * with a larger one when it fills up and repoints registered nodes to the new array.
 */
protected byte[] lexbuf;

/**
 * Number of bytes allocated for <code>lexbuf</code>.
 */
protected int lexlength;

/**
 * Number of bytes of <code>lexbuf</code> in use.
 */
protected int lexsize;

/**
 * For deferring text node (part of the inline stack support for compatibility with Mosaic).
 */
protected Node inode;

/**
 * For inferring inline tags. Initialised to -1.
 */
protected int insert;

/**
 * Inline stack. Created empty by the constructor.
 */
protected Stack istack;

/**
 * Start of frame.
 */
protected int istackbase;

/**
 * Used for cleaning up presentation markup.
 */
protected Style styles;

/**
 * Configuration.
 */
protected Configuration configuration;

/**
 * Already seen end body tag?
 */
protected boolean seenEndBody;

/**
 * Already seen end html tag?
 */
protected boolean seenEndHtml;

/**
 * Report instance used for warnings and errors.
 */
protected Report report;

/**
 * Root node is saved here.
 */
protected Node root;

/**
 * Node list: every node created through newNode or cloneNode (plus asp/php nodes of cloned attributes) is recorded
 * here so that updateNodeTextArrays can repoint them when the lexer buffer is reallocated.
 */
private List nodeList;
0410:
/**
 * Creates a lexer reading from the given stream.
 * <p>
 * The new lexer starts at line 1, column 1, in the content state, with every HTML version (including proprietary
 * ones) still considered possible, no doctype seen, nothing pending for insertion, and an empty inline stack and
 * node list.
 * </p>
 * @param in StreamIn to read characters from
 * @param configuration configuration instance
 * @param report report instance, for reporting errors
 */
public Lexer(StreamIn in, Configuration configuration, Report report) {
    // collaborators
    this.in = in;
    this.configuration = configuration;
    this.report = report;

    // position tracking and state machine
    this.lines = 1;
    this.columns = 1;
    this.state = LEX_CONTENT;

    // version detection
    this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);
    this.doctype = Dict.VERS_UNKNOWN;

    // inline stack and node registry
    this.insert = -1;
    this.istack = new Stack();
    this.nodeList = new Vector();
}
0430:
0431: /**
0432: * Creates a new node and add it to nodelist.
0433: * @return Node
0434: */
0435: public Node newNode() {
0436: Node node = new Node();
0437: this .nodeList.add(node);
0438: return node;
0439: }
0440:
0441: /**
0442: * Creates a new node and add it to nodelist.
0443: * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
0444: * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
0445: * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
0446: * @param textarray array of bytes contained in the Node
0447: * @param start start position
0448: * @param end end position
0449: * @return Node
0450: */
0451: public Node newNode(short type, byte[] textarray, int start, int end) {
0452: Node node = new Node(type, textarray, start, end);
0453: this .nodeList.add(node);
0454: return node;
0455: }
0456:
0457: /**
0458: * Creates a new node and add it to nodelist.
0459: * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
0460: * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
0461: * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
0462: * @param textarray array of bytes contained in the Node
0463: * @param start start position
0464: * @param end end position
0465: * @param element tag name
0466: * @return Node
0467: */
0468: public Node newNode(short type, byte[] textarray, int start,
0469: int end, String element) {
0470: Node node = new Node(type, textarray, start, end, element,
0471: this .configuration.tt);
0472: this .nodeList.add(node);
0473: return node;
0474: }
0475:
0476: /**
0477: * Clones a node and add it to node list.
0478: * @param node Node
0479: * @return cloned Node
0480: */
0481: public Node cloneNode(Node node) {
0482: Node cnode = (Node) node.clone();
0483: this .nodeList.add(cnode);
0484: for (AttVal att = cnode.attributes; att != null; att = att.next) {
0485: if (att.asp != null) {
0486: this .nodeList.add(att.asp);
0487: }
0488: if (att.php != null) {
0489: this .nodeList.add(att.php);
0490: }
0491: }
0492: return cnode;
0493: }
0494:
0495: /**
0496: * Clones an attribute value and add eventual asp or php node to node list.
0497: * @param attrs original AttVal
0498: * @return cloned AttVal
0499: */
0500: public AttVal cloneAttributes(AttVal attrs) {
0501: AttVal cattrs = (AttVal) attrs.clone();
0502: for (AttVal att = cattrs; att != null; att = att.next) {
0503: if (att.asp != null) {
0504: this .nodeList.add(att.asp);
0505: }
0506: if (att.php != null) {
0507: this .nodeList.add(att.php);
0508: }
0509: }
0510: return cattrs;
0511: }
0512:
0513: /**
0514: * Update <code>oldtextarray</code> in the current nodes.
0515: * @param oldtextarray previous text array
0516: * @param newtextarray new text array
0517: */
0518: protected void updateNodeTextArrays(byte[] oldtextarray,
0519: byte[] newtextarray) {
0520: Node node;
0521: for (int i = 0; i < this .nodeList.size(); i++) {
0522: node = (Node) (this .nodeList.get(i));
0523: if (node.textarray == oldtextarray) {
0524: node.textarray = newtextarray;
0525: }
0526: }
0527: }
0528:
0529: /**
0530: * Adds a new line node. Used for creating preformatted text from Word2000.
0531: * @return new line node
0532: */
0533: public Node newLineNode() {
0534: Node node = newNode();
0535:
0536: node.textarray = this .lexbuf;
0537: node.start = this .lexsize;
0538: addCharToLexer('\n');
0539: node.end = this .lexsize;
0540: return node;
0541: }
0542:
0543: /**
0544: * Has end of input stream been reached?
0545: * @return <code>true</code> if end of input stream been reached
0546: */
0547: public boolean endOfInput() {
0548: return this .in.isEndOfStream();
0549: }
0550:
0551: /**
0552: * Adds a byte to lexer buffer.
0553: * @param c byte to add
0554: */
0555: public void addByte(int c) {
0556: if (this .lexsize + 1 >= this .lexlength) {
0557: while (this .lexsize + 1 >= this .lexlength) {
0558: if (this .lexlength == 0) {
0559: this .lexlength = 8192;
0560: } else {
0561: this .lexlength = this .lexlength * 2;
0562: }
0563: }
0564:
0565: byte[] temp = this .lexbuf;
0566: this .lexbuf = new byte[this .lexlength];
0567: if (temp != null) {
0568: System.arraycopy(temp, 0, this .lexbuf, 0, temp.length);
0569: updateNodeTextArrays(temp, this .lexbuf);
0570: }
0571: }
0572:
0573: this .lexbuf[this .lexsize++] = (byte) c;
0574: this .lexbuf[this .lexsize] = (byte) '\0'; // debug
0575: }
0576:
0577: /**
0578: * Substitute the last char in buffer.
0579: * @param c new char
0580: */
0581: public void changeChar(byte c) {
0582: if (this .lexsize > 0) {
0583: this .lexbuf[this .lexsize - 1] = c;
0584: }
0585: }
0586:
0587: /**
0588: * Store char c as UTF-8 encoded byte stream.
0589: * @param c char to store
0590: */
0591: public void addCharToLexer(int c) {
0592: // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
0593: // Fix by Pablo Mayrgundter 17-08-2004
0594:
0595: if ((this .configuration.xmlOut || this .configuration.xHTML) // only for xml output
0596: && !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first.
0597: || c == 0x9 || c == 0xA || c == 0xD // Then white-space.
0598: || (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode.
0599: || (c >= 0x10000 && c <= 0x10FFFF))) {
0600: return;
0601: }
0602:
0603: int i = 0;
0604: int[] count = new int[] { 0 };
0605: byte[] buf = new byte[10]; // unsigned char
0606:
0607: boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null,
0608: count);
0609: if (err) {
0610: // replacement char 0xFFFD encoded as UTF-8
0611: buf[0] = (byte) 0xEF;
0612: buf[1] = (byte) 0xBF;
0613: buf[2] = (byte) 0xBD;
0614: count[0] = 3;
0615: }
0616:
0617: for (i = 0; i < count[0]; i++) {
0618: addByte(buf[i]); // uint
0619: }
0620:
0621: }
0622:
0623: /**
0624: * Adds a string to lexer buffer.
0625: * @param str String to add
0626: */
0627: public void addStringToLexer(String str) {
0628: for (int i = 0; i < str.length(); i++) {
0629: addCharToLexer(str.charAt(i));
0630: }
0631: }
0632:
0633: /**
0634: * Parse an html entity.
0635: * @param mode mode
0636: */
0637: public void parseEntity(short mode) {
0638: // No longer attempts to insert missing ';' for unknown
0639: // entities unless one was present already, since this
0640: // gives unexpected results.
0641: //
0642: // For example: <a href="something.htm?foo&bar&fred">
0643: // was tidied to: <a href="something.htm?foo&bar;&fred;">
0644: // rather than: <a href="something.htm?foo&bar&fred">
0645: //
0646: // My thanks for Maurice Buxton for spotting this.
0647: //
0648: // Also Randy Waki pointed out the following case for the
0649: // 04 Aug 00 version (bug #433012):
0650: //
0651: // For example: <a href="something.htm?id=1&lang=en">
0652: // was tidied to: <a href="something.htm?id=1⟨=en">
0653: // rather than: <a href="something.htm?id=1&lang=en">
0654: //
0655: // where "lang" is a known entity (#9001), but browsers would
0656: // misinterpret "⟨" because it had a value > 256.
0657: //
0658: // So the case of an apparently known entity with a value > 256 and
0659: // missing a semicolon is handled specially.
0660: //
0661: // "ParseEntity" is also a bit of a misnomer - it handles entities and
0662: // numeric character references. Invalid NCR's are now reported.
0663:
0664: int start;
0665: boolean first = true;
0666: boolean semicolon = false;
0667: int c, ch, startcol;
0668: String str;
0669:
0670: start = this .lexsize - 1; // to start at "&"
0671: startcol = this .in.getCurcol() - 1;
0672:
0673: while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
0674: if (c == ';') {
0675: semicolon = true;
0676: break;
0677: }
0678:
0679: if (first && c == '#') {
0680: // #431953 - start RJ
0681: if (!this .configuration.ncr
0682: || this .configuration.getInCharEncoding() == Configuration.BIG5
0683: || this .configuration.getInCharEncoding() == Configuration.SHIFTJIS) {
0684: this .in.ungetChar(c);
0685: return;
0686: }
0687: // #431953 - end RJ
0688:
0689: addCharToLexer(c);
0690: first = false;
0691: continue;
0692: }
0693:
0694: first = false;
0695:
0696: if (TidyUtils.isNamechar((char) c)) {
0697: addCharToLexer(c);
0698: continue;
0699: }
0700:
0701: // otherwise put it back
0702: this .in.ungetChar(c);
0703: break;
0704: }
0705:
0706: str = TidyUtils.getString(this .lexbuf, start, this .lexsize
0707: - start);
0708:
0709: if ("&apos".equals(str) && !configuration.xmlOut
0710: && !this .isvoyager && !configuration.xHTML) {
0711: report.entityError(this , Report.APOS_UNDEFINED, str, 39);
0712: }
0713:
0714: ch = EntityTable.getDefaultEntityTable().entityCode(str);
0715:
0716: // drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
0717: // if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
0718: // && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
0719: // || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
0720: // || (ch >= 0xE000 && ch <= 0xFFFD)))
0721: // {
0722: // this.lexsize = start;
0723: // return;
0724: // }
0725:
0726: // deal with unrecognized or invalid entities
0727: // #433012 - fix by Randy Waki 17 Feb 01
0728: // report invalid NCR's - Terry Teague 01 Sep 01
0729: if (ch <= 0 || (ch >= 256 && c != ';')) {
0730: // set error position just before offending character
0731: this .lines = this .in.getCurline();
0732: this .columns = startcol;
0733:
0734: if (this .lexsize > start + 1) {
0735: if (ch >= 128 && ch <= 159) {
0736: // invalid numeric character reference
0737: int c1 = 0;
0738:
0739: if (configuration.replacementCharEncoding == Configuration.WIN1252) {
0740: c1 = EncodingUtils.decodeWin1252(ch);
0741: } else if (configuration.replacementCharEncoding == Configuration.MACROMAN) {
0742: c1 = EncodingUtils.decodeMacRoman(ch);
0743: }
0744:
0745: // "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
0746:
0747: int replaceMode = c1 != 0 ? Report.REPLACED_CHAR
0748: : Report.DISCARDED_CHAR;
0749:
0750: if (c != ';') /* issue warning if not terminated by ';' */
0751: {
0752: report.entityError(this ,
0753: Report.MISSING_SEMICOLON_NCR, str, c);
0754: }
0755:
0756: report.encodingError(this ,
0757: (short) (Report.INVALID_NCR | replaceMode),
0758: ch);
0759:
0760: if (c1 != 0) {
0761: // make the replacement
0762: this .lexsize = start;
0763: addCharToLexer(c1);
0764: semicolon = false;
0765: } else {
0766: /* discard */
0767: this .lexsize = start;
0768: semicolon = false;
0769: }
0770:
0771: } else {
0772: report.entityError(this , Report.UNKNOWN_ENTITY,
0773: str, ch);
0774: }
0775:
0776: if (semicolon) {
0777: addCharToLexer(';');
0778: }
0779: } else {
0780: // naked &
0781: report.entityError(this , Report.UNESCAPED_AMPERSAND,
0782: str, ch);
0783: }
0784: } else {
0785: // issue warning if not terminated by ';'
0786: if (c != ';') {
0787: // set error position just before offending character
0788: this .lines = this .in.getCurline();
0789: this .columns = startcol;
0790: report.entityError(this , Report.MISSING_SEMICOLON, str,
0791: c);
0792: }
0793:
0794: this .lexsize = start;
0795:
0796: if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED)) {
0797: ch = ' ';
0798: }
0799:
0800: addCharToLexer(ch);
0801:
0802: if (ch == '&' && !this .configuration.quoteAmpersand) {
0803: addCharToLexer('a');
0804: addCharToLexer('m');
0805: addCharToLexer('p');
0806: addCharToLexer(';');
0807: }
0808: }
0809: }
0810:
0811: /**
0812: * Parses a tag name.
0813: * @return first char after the tag name
0814: */
0815: public char parseTagName() {
0816: int c;
0817:
0818: // fold case of first char in buffer
0819: c = this .lexbuf[this .txtstart];
0820:
0821: if (!this .configuration.xmlTags && TidyUtils.isUpper((char) c)) {
0822: c = TidyUtils.toLower((char) c);
0823: this .lexbuf[this .txtstart] = (byte) c;
0824: }
0825:
0826: while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
0827: if (!TidyUtils.isNamechar((char) c)) {
0828: break;
0829: }
0830:
0831: // fold case of subsequent chars
0832: if (!this .configuration.xmlTags
0833: && TidyUtils.isUpper((char) c)) {
0834: c = TidyUtils.toLower((char) c);
0835: }
0836:
0837: addCharToLexer(c);
0838: }
0839:
0840: this .txtend = this .lexsize;
0841: return (char) c;
0842: }
0843:
0844: /**
0845: * calls addCharToLexer for any char in the string.
0846: * @param str input String
0847: */
0848: public void addStringLiteral(String str) {
0849: int len = str.length();
0850: for (int i = 0; i < len; i++) {
0851: addCharToLexer(str.charAt(i));
0852: }
0853: }
0854:
0855: /**
0856: * calls addCharToLexer for any char in the string till len is reached.
0857: * @param str input String
0858: * @param len length of the substring to be added
0859: */
0860: void addStringLiteralLen(String str, int len) {
0861: int strlen = str.length();
0862: if (strlen < len) {
0863: len = strlen;
0864: }
0865: for (int i = 0; i < len; i++) {
0866: addCharToLexer(str.charAt(i));
0867: }
0868: }
0869:
0870: /**
0871: * Choose what version to use for new doctype.
0872: * @return html version constant
0873: */
0874: public short htmlVersion() {
0875: if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20)) {
0876: return Dict.VERS_HTML20;
0877: }
0878:
0879: if (!(this .configuration.xmlOut | this .configuration.xmlTags | this .isvoyager)
0880: && TidyUtils.toBoolean(versions & Dict.VERS_HTML32)) {
0881: return Dict.VERS_HTML32;
0882: }
0883: if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11)) {
0884: return Dict.VERS_XHTML11;
0885: }
0886: if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT)) {
0887: return Dict.VERS_HTML40_STRICT;
0888: }
0889:
0890: if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE)) {
0891: return Dict.VERS_HTML40_LOOSE;
0892: }
0893:
0894: if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET)) {
0895: return Dict.VERS_FRAMESET;
0896: }
0897:
0898: return Dict.VERS_UNKNOWN;
0899: }
0900:
0901: /**
0902: * Choose what version to use for new doctype.
0903: * @return html version name
0904: */
0905: public String htmlVersionName() {
0906: short guessed;
0907: int j;
0908:
0909: guessed = apparentVersion();
0910:
0911: for (j = 0; j < W3CVERSION.length; ++j) {
0912: if (guessed == W3CVERSION[j].code) {
0913: if (this .isvoyager) {
0914: return W3CVERSION[j].voyagerName;
0915: }
0916:
0917: return W3CVERSION[j].name;
0918: }
0919: }
0920:
0921: return null;
0922: }
0923:
0924: /**
0925: * Add meta element for Tidy. If the meta tag is already present, update release date.
0926: * @param root root node
0927: * @return <code>true</code> if the tag has been added
0928: */
0929: public boolean addGenerator(Node root) {
0930: AttVal attval;
0931: Node node;
0932: Node head = root.findHEAD(this .configuration.tt);
0933:
0934: if (head != null) {
0935: String meta = "HTML Tidy for Java (vers. "
0936: + Report.RELEASE_DATE_STRING + "), see www.w3.org";
0937:
0938: for (node = head.content; node != null; node = node.next) {
0939: if (node.tag == this .configuration.tt.tagMeta) {
0940: attval = node.getAttrByName("name");
0941:
0942: if (attval != null
0943: && attval.value != null
0944: && "generator"
0945: .equalsIgnoreCase(attval.value)) {
0946: attval = node.getAttrByName("content");
0947:
0948: if (attval != null
0949: && attval.value != null
0950: && attval.value.length() >= 9
0951: && "HTML Tidy"
0952: .equalsIgnoreCase(attval.value
0953: .substring(0, 9))) {
0954: attval.value = meta;
0955: return false;
0956: }
0957: }
0958: }
0959: }
0960:
0961: node = this .inferredTag("meta");
0962: node.addAttribute("content", meta);
0963: node.addAttribute("name", "generator");
0964: head.insertNodeAtStart(node);
0965: return true;
0966: }
0967:
0968: return false;
0969: }
0970:
0971: /**
0972: * Check system keywords (keywords should be uppercase).
0973: * @param doctype doctype node
0974: * @return true if doctype keywords are all uppercase
0975: */
0976: public boolean checkDocTypeKeyWords(Node doctype) {
0977: int len = doctype.end - doctype.start;
0978: String s = TidyUtils.getString(this .lexbuf, doctype.start, len);
0979:
0980: return !(TidyUtils.findBadSubString("SYSTEM", s, len)
0981: || TidyUtils.findBadSubString("PUBLIC", s, len)
0982: || TidyUtils.findBadSubString("//DTD", s, len)
0983: || TidyUtils.findBadSubString("//W3C", s, len) || TidyUtils
0984: .findBadSubString("//EN", s, len));
0985: }
0986:
0987: /**
0988: * Examine DOCTYPE to identify version.
0989: * @param doctype doctype node
0990: * @return version code
0991: */
0992: public short findGivenVersion(Node doctype) {
0993: String p, s;
0994: int i, j;
0995: int len;
0996: String str1;
0997: String str2;
0998:
0999: // if root tag for doctype isn't html give up now
1000: str1 = TidyUtils.getString(this .lexbuf, doctype.start, 5);
1001: if (!"html ".equalsIgnoreCase(str1)) {
1002: return 0;
1003: }
1004:
1005: if (!checkDocTypeKeyWords(doctype)) {
1006: report.warning(this , doctype, null,
1007: Report.DTYPE_NOT_UPPER_CASE);
1008: }
1009:
1010: // give up if all we are given is the system id for the doctype
1011: str1 = TidyUtils.getString(this .lexbuf, doctype.start + 5, 7);
1012: if ("SYSTEM ".equalsIgnoreCase(str1)) {
1013: // but at least ensure the case is correct
1014: if (!str1.substring(0, 6).equals("SYSTEM")) {
1015: System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0,
1016: this .lexbuf, doctype.start + 5, 6);
1017: }
1018: return 0; // unrecognized
1019: }
1020:
1021: if ("PUBLIC ".equalsIgnoreCase(str1)) {
1022: if (!str1.substring(0, 6).equals("PUBLIC")) {
1023: System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0,
1024: this .lexbuf, doctype.start + 5, 6);
1025: }
1026: } else {
1027: this .badDoctype = true;
1028: }
1029:
1030: for (i = doctype.start; i < doctype.end; ++i) {
1031: if (this .lexbuf[i] == (byte) '"') {
1032: str1 = TidyUtils.getString(this .lexbuf, i + 1, 12);
1033: str2 = TidyUtils.getString(this .lexbuf, i + 1, 13);
1034: if (str1.equals("-//W3C//DTD ")) {
1035: // compute length of identifier e.g. "HTML 4.0 Transitional"
1036: for (j = i + 13; j < doctype.end
1037: && this .lexbuf[j] != (byte) '/'; ++j) {
1038: //
1039: }
1040: len = j - i - 13;
1041: p = TidyUtils.getString(this .lexbuf, i + 13, len);
1042:
1043: for (j = 1; j < W3CVERSION.length; ++j) {
1044: s = W3CVERSION[j].name;
1045: if (len == s.length() && s.equals(p)) {
1046: return W3CVERSION[j].code;
1047: }
1048: }
1049:
1050: // else unrecognized version
1051: } else if (str2.equals("-//IETF//DTD ")) {
1052: // compute length of identifier e.g. "HTML 2.0"
1053: for (j = i + 14; j < doctype.end
1054: && this .lexbuf[j] != (byte) '/'; ++j) {
1055: //
1056: }
1057: len = j - i - 14;
1058:
1059: p = TidyUtils.getString(this .lexbuf, i + 14, len);
1060: s = W3CVERSION[0].name;
1061: if (len == s.length() && s.equals(p)) {
1062: return W3CVERSION[0].code;
1063: }
1064:
1065: // else unrecognized version
1066: }
1067: break;
1068: }
1069: }
1070:
1071: return 0;
1072: }
1073:
1074: /**
1075: * Fix xhtml namespace.
1076: * @param root root Node
1077: * @param profile current profile
1078: */
1079: public void fixHTMLNameSpace(Node root, String profile) {
1080: Node node;
1081: AttVal attr;
1082:
1083: node = root.content;
1084: while (node != null
1085: && node.tag != this .configuration.tt.tagHtml) {
1086: node = node.next;
1087: }
1088:
1089: if (node != null) {
1090:
1091: for (attr = node.attributes; attr != null; attr = attr.next) {
1092: if (attr.attribute.equals("xmlns")) {
1093: break;
1094: }
1095:
1096: }
1097:
1098: if (attr != null) {
1099: if (!attr.value.equals(profile)) {
1100: report.warning(this , node, null,
1101: Report.INCONSISTENT_NAMESPACE);
1102: attr.value = profile;
1103: }
1104: } else {
1105: attr = new AttVal(node.attributes, null, '"', "xmlns",
1106: profile);
1107: attr.dict = AttributeTable.getDefaultAttributeTable()
1108: .findAttribute(attr);
1109: node.attributes = attr;
1110: }
1111: }
1112: }
1113:
1114: /**
1115: * Put DOCTYPE declaration between the <:?xml version "1.0" ... ?> declaration, if any, and the
1116: * <code>html</code> tag. Should also work for any comments, etc. that may precede the <code>html</code> tag.
1117: * @param root root node
1118: * @return new doctype node
1119: */
1120: Node newXhtmlDocTypeNode(Node root) {
1121: Node html = root.findHTML(this .configuration.tt);
1122: if (html == null) {
1123: return null;
1124: }
1125:
1126: Node newdoctype = newNode();
1127: newdoctype.setType(Node.DOCTYPE_TAG);
1128: newdoctype.next = html;
1129: newdoctype.parent = root;
1130: newdoctype.prev = null;
1131:
1132: if (html == root.content) {
1133: // No <?xml ... ?> declaration.
1134: root.content.prev = newdoctype;
1135: root.content = newdoctype;
1136: newdoctype.prev = null;
1137: } else {
1138: // we have an <?xml ... ?> declaration.
1139: newdoctype.prev = html.prev;
1140: newdoctype.prev.next = newdoctype;
1141: }
1142: html.prev = newdoctype;
1143: return newdoctype;
1144: }
1145:
1146: /**
1147: * Adds a new xhtml doctype to the document.
1148: * @param root root node
1149: * @return <code>true</code> if a doctype has been added
1150: */
1151: public boolean setXHTMLDocType(Node root) {
1152: String fpi = " ";
1153: String sysid = "";
1154: String namespace = XHTML_NAMESPACE;
1155: String dtdsub = null;
1156: Node doctype;
1157: int dtdlen = 0;
1158:
1159: doctype = root.findDocType();
1160:
1161: fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00
1162:
1163: if (this .configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
1164: if (doctype != null) {
1165: Node.discardElement(doctype);
1166: }
1167: return true;
1168: }
1169:
1170: if (this .configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
1171: // see what flavor of XHTML this document matches
1172: if (TidyUtils.toBoolean(this .versions
1173: & Dict.VERS_HTML40_STRICT)) {
1174: // use XHTML strict
1175: fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1176: sysid = VOYAGER_STRICT;
1177: } else if (TidyUtils.toBoolean(this .versions
1178: & Dict.VERS_FRAMESET)) {
1179: // use XHTML frames
1180: fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
1181: sysid = VOYAGER_FRAMESET;
1182: } else if (TidyUtils.toBoolean(this .versions
1183: & Dict.VERS_LOOSE)) {
1184: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1185: sysid = VOYAGER_LOOSE;
1186: } else if (TidyUtils.toBoolean(this .versions
1187: & Dict.VERS_XHTML11)) {
1188: // use XHTML 1.1
1189: fpi = "-//W3C//DTD XHTML 1.1//EN";
1190: sysid = VOYAGER_11;
1191: } else {
1192: // proprietary
1193: fpi = null;
1194: sysid = "";
1195: if (doctype != null)// #473490 - fix by Bjšrn Hšhrmann 10 Oct 01
1196: {
1197: Node.discardElement(doctype);
1198: }
1199: }
1200: } else if (this .configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
1201: fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1202: sysid = VOYAGER_STRICT;
1203: } else if (this .configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
1204: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1205: sysid = VOYAGER_LOOSE;
1206: }
1207:
1208: if (this .configuration.docTypeMode == Configuration.DOCTYPE_USER
1209: && this .configuration.docTypeStr != null) {
1210: fpi = this .configuration.docTypeStr;
1211: sysid = "";
1212: }
1213:
1214: if (fpi == null) {
1215: return false;
1216: }
1217:
1218: if (doctype != null) {
1219: // Look for internal DTD subset
1220: if (configuration.xHTML || configuration.xmlOut) {
1221:
1222: int len = doctype.end - doctype.start + 1;
1223: String start = TidyUtils.getString(this .lexbuf,
1224: doctype.start, len);
1225:
1226: int dtdbeg = start.indexOf('[');
1227: if (dtdbeg >= 0) {
1228: int dtdend = start.substring(dtdbeg).indexOf(']');
1229: if (dtdend >= 0) {
1230: dtdlen = dtdend + 1;
1231: dtdsub = start.substring(dtdbeg);
1232: }
1233: }
1234: }
1235: } else {
1236: if ((doctype = newXhtmlDocTypeNode(root)) == null) {
1237: return false;
1238: }
1239: }
1240:
1241: this .txtstart = this .lexsize;
1242: this .txtend = this .lexsize;
1243:
1244: // add public identifier
1245: addStringLiteral("html PUBLIC ");
1246:
1247: // check if the fpi is quoted or not
1248: if (fpi.charAt(0) == '"') {
1249: addStringLiteral(fpi);
1250: } else {
1251: addStringLiteral("\"");
1252: addStringLiteral(fpi);
1253: addStringLiteral("\"");
1254: }
1255:
1256: if (this .configuration.wraplen != 0
1257: && sysid.length() + 6 >= this .configuration.wraplen) {
1258: addStringLiteral("\n\"");
1259: } else {
1260: // FG: don't wrap
1261: addStringLiteral(" \"");
1262: }
1263:
1264: // add system identifier
1265: addStringLiteral(sysid);
1266: addStringLiteral("\"");
1267:
1268: if (dtdlen > 0 && dtdsub != null) {
1269: addCharToLexer(' ');
1270: addStringLiteralLen(dtdsub, dtdlen);
1271: }
1272:
1273: this .txtend = this .lexsize;
1274:
1275: int length = this .txtend - this .txtstart;
1276: doctype.textarray = new byte[length];
1277:
1278: System.arraycopy(this .lexbuf, this .txtstart, doctype.textarray,
1279: 0, length);
1280: doctype.start = 0;
1281: doctype.end = length;
1282:
1283: return false;
1284: }
1285:
1286: /**
1287: * Return the html version used in document.
1288: * @return version code
1289: */
1290: public short apparentVersion() {
1291: switch (this .doctype) {
1292: case Dict.VERS_UNKNOWN:
1293: return htmlVersion();
1294:
1295: case Dict.VERS_HTML20:
1296: if (TidyUtils.toBoolean(this .versions & Dict.VERS_HTML20)) {
1297: return Dict.VERS_HTML20;
1298: }
1299:
1300: break;
1301:
1302: case Dict.VERS_HTML32:
1303: if (TidyUtils.toBoolean(this .versions & Dict.VERS_HTML32)) {
1304: return Dict.VERS_HTML32;
1305: }
1306:
1307: break; // to replace old version by new
1308:
1309: case Dict.VERS_HTML40_STRICT:
1310: if (TidyUtils.toBoolean(this .versions
1311: & Dict.VERS_HTML40_STRICT)) {
1312: return Dict.VERS_HTML40_STRICT;
1313: }
1314:
1315: break;
1316:
1317: case Dict.VERS_HTML40_LOOSE:
1318: if (TidyUtils.toBoolean(this .versions
1319: & Dict.VERS_HTML40_LOOSE)) {
1320: return Dict.VERS_HTML40_LOOSE;
1321: }
1322:
1323: break; // to replace old version by new
1324:
1325: case Dict.VERS_FRAMESET:
1326: if (TidyUtils.toBoolean(this .versions & Dict.VERS_FRAMESET)) {
1327: return Dict.VERS_FRAMESET;
1328: }
1329:
1330: break;
1331:
1332: case Dict.VERS_XHTML11:
1333: if (TidyUtils.toBoolean(this .versions & Dict.VERS_XHTML11)) {
1334: return Dict.VERS_XHTML11;
1335: }
1336:
1337: break;
1338: default:
1339: // should never reach here
1340: break;
1341: }
1342:
1343: // kludge to avoid error appearing at end of file
1344: // it would be better to note the actual position
1345: // when first encountering the doctype declaration
1346:
1347: this .lines = 1;
1348: this .columns = 1;
1349:
1350: report.warning(this , null, null, Report.INCONSISTENT_VERSION);
1351: return this .htmlVersion();
1352: }
1353:
1354: /**
1355: * Fixup doctype if missing.
1356: * @param root root node
1357: * @return <code>false</code> if current version has not been identified
1358: */
1359: public boolean fixDocType(Node root) {
1360: Node doctype;
1361: int guessed = Dict.VERS_HTML40_STRICT, i;
1362:
1363: if (this .badDoctype) {
1364: report.warning(this , null, null, Report.MALFORMED_DOCTYPE);
1365: }
1366:
1367: doctype = root.findDocType();
1368:
1369: if (this .configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
1370: if (doctype != null) {
1371: Node.discardElement(doctype);
1372: }
1373: return true;
1374: }
1375:
1376: if (this .configuration.xmlOut) {
1377: return true;
1378: }
1379:
1380: if (this .configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
1381: Node.discardElement(doctype);
1382: doctype = null;
1383: guessed = Dict.VERS_HTML40_STRICT;
1384: } else if (this .configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
1385: Node.discardElement(doctype);
1386: doctype = null;
1387: guessed = Dict.VERS_HTML40_LOOSE;
1388: } else if (this .configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
1389: if (doctype != null) {
1390: if (this .doctype == Dict.VERS_UNKNOWN) {
1391: return false;
1392: }
1393:
1394: switch (this .doctype) {
1395: case Dict.VERS_UNKNOWN:
1396: return false;
1397:
1398: case Dict.VERS_HTML20:
1399: if (TidyUtils.toBoolean(this .versions
1400: & Dict.VERS_HTML20)) {
1401: return true;
1402: }
1403:
1404: break; // to replace old version by new
1405:
1406: case Dict.VERS_HTML32:
1407: if (TidyUtils.toBoolean(this .versions
1408: & Dict.VERS_HTML32)) {
1409: return true;
1410: }
1411:
1412: break; // to replace old version by new
1413:
1414: case Dict.VERS_HTML40_STRICT:
1415: if (TidyUtils.toBoolean(this .versions
1416: & Dict.VERS_HTML40_STRICT)) {
1417: return true;
1418: }
1419:
1420: break; // to replace old version by new
1421:
1422: case Dict.VERS_HTML40_LOOSE:
1423: if (TidyUtils.toBoolean(this .versions
1424: & Dict.VERS_HTML40_LOOSE)) {
1425: return true;
1426: }
1427:
1428: break; // to replace old version by new
1429:
1430: case Dict.VERS_FRAMESET:
1431: if (TidyUtils.toBoolean(this .versions
1432: & Dict.VERS_FRAMESET)) {
1433: return true;
1434: }
1435:
1436: break; // to replace old version by new
1437:
1438: case Dict.VERS_XHTML11:
1439: if (TidyUtils.toBoolean(this .versions
1440: & Dict.VERS_XHTML11)) {
1441: return true;
1442: }
1443:
1444: break; // to replace old version by new
1445: default:
1446: // should never reach here
1447: break;
1448: }
1449:
1450: // INCONSISTENT_VERSION warning is now issued by ApparentVersion()
1451: }
1452:
1453: // choose new doctype
1454: guessed = htmlVersion();
1455: }
1456:
1457: if (guessed == Dict.VERS_UNKNOWN) {
1458: return false;
1459: }
1460:
1461: // for XML use the Voyager system identifier
1462: if (this .configuration.xmlOut || this .configuration.xmlTags
1463: || this .isvoyager) {
1464: if (doctype != null) {
1465: Node.discardElement(doctype);
1466: }
1467:
1468: fixHTMLNameSpace(root, XHTML_NAMESPACE);
1469:
1470: // Namespace is the same for all XHTML variants
1471: // Also, don't return yet. Still need to add DOCTYPE declaration.
1472: //
1473: // for (i = 0; i < W3CVersion.length; ++i)
1474: // {
1475: // if (guessed == W3CVersion[i].code)
1476: // {
1477: // fixHTMLNameSpace(root, W3CVersion[i].profile);
1478: // break;
1479: // }
1480: // }
1481: // return true;
1482: }
1483:
1484: if (doctype == null) {
1485: if ((doctype = newXhtmlDocTypeNode(root)) == null) {
1486: return false;
1487: }
1488: }
1489:
1490: this .txtstart = this .lexsize;
1491: this .txtend = this .lexsize;
1492:
1493: // use the appropriate public identifier
1494: addStringLiteral("html PUBLIC ");
1495:
1496: if (this .configuration.docTypeMode == Configuration.DOCTYPE_USER
1497: && this .configuration.docTypeStr != null
1498: && this .configuration.docTypeStr.length() > 0) {
1499: // check if the fpi is quoted or not
1500: if (this .configuration.docTypeStr.charAt(0) == '"') {
1501: addStringLiteral(this .configuration.docTypeStr);
1502: } else {
1503: addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1504: addStringLiteral(this .configuration.docTypeStr);
1505: addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1506: }
1507: } else if (guessed == Dict.VERS_HTML20) {
1508: addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
1509: } else {
1510: addStringLiteral("\"-//W3C//DTD ");
1511:
1512: for (i = 0; i < W3CVERSION.length; ++i) {
1513: if (guessed == W3CVERSION[i].code) {
1514: addStringLiteral(W3CVERSION[i].name);
1515: break;
1516: }
1517: }
1518:
1519: addStringLiteral("//EN\"");
1520: }
1521:
1522: this .txtend = this .lexsize;
1523:
1524: int length = this .txtend - this .txtstart;
1525: doctype.textarray = new byte[length];
1526:
1527: System.arraycopy(this .lexbuf, this .txtstart, doctype.textarray,
1528: 0, length);
1529: doctype.start = 0;
1530: doctype.end = length;
1531:
1532: return true;
1533: }
1534:
1535: /**
1536: * Ensure XML document starts with <code><?XML version="1.0"?></code>. Add encoding attribute if not using
1537: * ASCII or UTF-8 output.
1538: * @param root root node
1539: * @return always true
1540: */
1541: public boolean fixXmlDecl(Node root) {
1542: Node xml;
1543: AttVal version;
1544: AttVal encoding;
1545:
1546: if (root.content != null && root.content.type == Node.XML_DECL) {
1547: xml = root.content;
1548: } else {
1549: xml = newNode(Node.XML_DECL, this .lexbuf, 0, 0);
1550: xml.next = root.content;
1551:
1552: if (root.content != null) {
1553: root.content.prev = xml;
1554: xml.next = root.content;
1555: }
1556:
1557: root.content = xml;
1558: }
1559:
1560: version = xml.getAttrByName("version");
1561: encoding = xml.getAttrByName("encoding");
1562:
1563: // We need to insert a check if declared encoding and output encoding mismatch
1564: // and fix the Xml declaration accordingly!!!
1565: if (encoding == null
1566: && this .configuration.getOutCharEncoding() != Configuration.UTF8) {
1567: if (this .configuration.getOutCharEncoding() == Configuration.LATIN1) {
1568: xml.addAttribute("encoding", "iso-8859-1");
1569: }
1570: if (this .configuration.getOutCharEncoding() == Configuration.ISO2022) {
1571: xml.addAttribute("encoding", "iso-2022");
1572: }
1573: }
1574:
1575: if (version == null) {
1576: xml.addAttribute("version", "1.0");
1577: }
1578:
1579: return true;
1580: }
1581:
1582: /**
1583: * Generates and inserts a new node.
1584: * @param name tag name
1585: * @return generated node
1586: */
1587: public Node inferredTag(String name) {
1588: Node node;
1589:
1590: node = newNode(Node.START_TAG, this .lexbuf, this .txtstart,
1591: this .txtend, name);
1592: node.implicit = true;
1593: return node;
1594: }
1595:
/**
 * Create a text node for the contents of a CDATA element like style or script which ends with &lt;/foo&gt; for
 * some foo. Characters are read from the input and appended to the lexer buffer until an unquoted tag is seen:
 * the matching end tag of <code>container</code> ends the content silently, any other unquoted tag ends it with
 * a BAD_CDATA_CONTENT warning. Reaching the end of the stream reports MISSING_ENDTAG_FOR for the container.
 * @param container container node
 * @return text node covering the collected content, or <code>null</code> if no content was collected
 */
public Node getCDATA(Node container) {
    int c, lastc, start, len, i;
    int qt = 0; // quote character of the string literal being read, 0 when outside a string
    int esc = 0; // escape character (backslash for JavaScript), 0 when there is none
    String str;
    boolean endtag = false; // set once "</" has been seen outside a string
    boolean begtag = false; // set once "<" has been seen outside a string

    if (container.isJavaScript()) {
        esc = '\\';
    }

    this .lines = this .in.getCurline();
    this .columns = this .in.getCurcol();
    this .waswhite = false;
    this .txtstart = this .lexsize;
    this .txtend = this .lexsize;

    lastc = '\0';
    start = -1; // buffer index of the first letter of a candidate tag name, -1 when there is none

    while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
        if (qt > 0) {
            // #598860 script parsing fails with quote chars
            // A quoted string is ended by the quotation character, or end of line
            // (unless that character is preceded by the escape character)
            if ((c == '\r' || c == '\n' || c == qt)
                    && (!TidyUtils.toBoolean(esc) || lastc != esc)) {
                qt = 0;
            } else if (c == '/' && lastc == '<') {
                // "</" inside a string: '/' has not been stored yet, so lexsize + 1 is the
                // position the character following it will get
                start = this .lexsize + 1; // to first letter
            }

            else if (c == '>' && start >= 0) {
                // "</...>" inside a string: warn, but do not end the element
                len = this .lexsize - start;

                this .lines = this .in.getCurline();
                this .columns = this .in.getCurcol() - 3;

                report.warning(this , null, null,
                        Report.BAD_CDATA_CONTENT);

                // if javascript insert backslash before /
                if (TidyUtils.toBoolean(esc)) {
                    // shift everything from the '/' onwards one position to the right ...
                    for (i = this .lexsize; i > start - 1; --i) {
                        this .lexbuf[i] = this .lexbuf[i - 1];
                    }

                    // ... and put the escape character where the '/' used to be
                    this .lexbuf[start - 1] = (byte) esc;
                    this .lexsize++;
                }

                start = -1;
            }
        } else if (TidyUtils.isQuote(c)
                && (!TidyUtils.toBoolean(esc) || lastc != esc)) {
            // unescaped quote character: a string literal begins
            qt = c;
        } else if (c == '<') {
            start = this .lexsize + 1; // to first letter
            endtag = false;
            begtag = true;
        } else if (c == '!' && lastc == '<') // Cancel start tag
        {
            start = -1;
            endtag = false;
            begtag = false;
        } else if (c == '/' && lastc == '<') {
            start = this .lexsize + 1; // to first letter
            endtag = true;
            begtag = false;
        } else if (c == '>' && start >= 0) // End of begin or end tag
        {
            // number of buffered characters in front of 'start' to drop: "</" for an end tag
            int decr = 2;

            if (endtag
                    && ((len = this .lexsize - start) == container.element
                            .length())) {

                str = TidyUtils.getString(this .lexbuf, start, len);
                if (container.element.equalsIgnoreCase(str)) {
                    // end tag of the container: cut the "</name" off the collected text and stop
                    this .txtend = start - decr;
                    this .lexsize = start - decr; // #433857 - fix by Huajun Zeng 26 Apr 01
                    break;
                }
            }

            // Unquoted markup will end SCRIPT or STYLE elements

            this .lines = this .in.getCurline();
            this .columns = this .in.getCurcol() - 3;

            report.warning(this , null, null,
                    Report.BAD_CDATA_CONTENT);
            if (begtag) {
                // a start tag is only preceded by "<"
                decr = 1;
            }
            this .txtend = start - decr;
            this .lexsize = start - decr;
            break;
        }
        // #427844 - fix by Markus Hoenicka 21 Oct 00
        else if (c == '\r') {
            if (begtag || endtag) {
                continue; // discard whitespace in endtag
            }

            // treat \r\n as \n and \r as \n
            c = this .in.readChar();

            if (c != '\n') {
                this .in.ungetChar(c);
            }

            c = '\n';

        } else if ((c == '\n' || c == '\t' || c == ' ')
                && (begtag || endtag)) {
            continue; // discard whitespace in endtag
        }

        // store the character and remember it for the "<" / escape look-behind checks above
        addCharToLexer(c);
        this .txtend = this .lexsize;
        lastc = c;
    }

    // the loop ran to the end of the input without finding a closing tag
    if (c == StreamIn.END_OF_STREAM) {
        report.warning(this , container, null,
                Report.MISSING_ENDTAG_FOR);
    }

    // only create a text node when some content was collected
    if (this .txtend > this .txtstart) {
        this .token = newNode(Node.TEXT_NODE, this .lexbuf,
                this .txtstart, this .txtend);
        return this .token;
    }

    return null;
}
1739:
1740: /**
1741: *
1742: *
1743: */
1744: public void ungetToken() {
1745: this .pushed = true;
1746: }
1747:
1748: /**
1749: * Gets a token.
1750: * @param mode one of the following:
1751: * <ul>
1752: * <li><code>MixedContent</code>-- for elements which don't accept PCDATA</li>
1753: * <li><code>Preformatted</code>-- white spacepreserved as is</li>
1754: * <li><code>IgnoreMarkup</code>-- for CDATA elements such as script, style</li>
1755: * </ul>
1756: * @return next Node
1757: */
1758: public Node getToken(short mode) {
1759: int c = 0;
1760: int badcomment = 0;
1761: // pass by reference
1762: boolean[] isempty = new boolean[1];
1763: boolean inDTDSubset = false;
1764: AttVal attributes = null;
1765:
1766: if (this .pushed) {
1767: // duplicate inlines in preference to pushed text nodes when appropriate
1768: if (this .token.type != Node.TEXT_NODE
1769: || (this .insert == -1 && this .inode == null)) {
1770: this .pushed = false;
1771: return this .token;
1772: }
1773: }
1774:
1775: // at start of block elements, unclosed inline
1776: if (this .insert != -1 || this .inode != null) {
1777: return insertedToken();
1778: }
1779:
1780: this .lines = this .in.getCurline();
1781: this .columns = this .in.getCurcol();
1782: this .waswhite = false;
1783:
1784: this .txtstart = this .lexsize;
1785: this .txtend = this .lexsize;
1786:
1787: while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
1788: // FG fix for [427846] different from tidy
1789: // if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1790: if (this .insertspace && mode != IGNORE_WHITESPACE) {
1791: addCharToLexer(' ');
1792: }
1793: if (this .insertspace
1794: && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE))) {
1795: this .waswhite = true;
1796: this .insertspace = false;
1797: }
1798:
1799: // treat \r\n as \n and \r as \n
1800: if (c == '\r') {
1801: c = this .in.readChar();
1802:
1803: if (c != '\n') {
1804: this .in.ungetChar(c);
1805: }
1806:
1807: c = '\n';
1808: }
1809:
1810: addCharToLexer(c);
1811:
1812: switch (this .state) {
1813: case LEX_CONTENT:
1814: // element content
1815:
1816: // Discard white space if appropriate.
1817: // Its cheaper to do this here rather than in parser methods for elements that
1818: // don't have mixed content.
1819: if (TidyUtils.isWhite((char) c)
1820: && (mode == IGNORE_WHITESPACE)
1821: && this .lexsize == this .txtstart + 1) {
1822: --this .lexsize;
1823: this .waswhite = false;
1824: this .lines = this .in.getCurline();
1825: this .columns = this .in.getCurcol();
1826: continue;
1827: }
1828:
1829: if (c == '<') {
1830: this .state = LEX_GT;
1831: continue;
1832: }
1833:
1834: if (TidyUtils.isWhite((char) c)) {
1835: // was previous char white?
1836: if (this .waswhite) {
1837: if (mode != PREFORMATTED
1838: && mode != IGNORE_MARKUP) {
1839: --this .lexsize;
1840: this .lines = this .in.getCurline();
1841: this .columns = this .in.getCurcol();
1842: }
1843: } else {
1844: // prev char wasn't white
1845: this .waswhite = true;
1846:
1847: if (mode != PREFORMATTED
1848: && mode != IGNORE_MARKUP && c != ' ') {
1849: changeChar((byte) ' ');
1850: }
1851: }
1852:
1853: continue;
1854: } else if (c == '&' && mode != IGNORE_MARKUP) {
1855: parseEntity(mode);
1856: }
1857:
1858: // this is needed to avoid trimming trailing whitespace
1859: if (mode == IGNORE_WHITESPACE) {
1860: mode = MIXED_CONTENT;
1861: }
1862:
1863: this .waswhite = false;
1864: continue;
1865:
1866: case LEX_GT:
1867: // <
1868:
1869: // check for endtag
1870: if (c == '/') {
1871: c = this .in.readChar();
1872: if (c == StreamIn.END_OF_STREAM) {
1873: this .in.ungetChar(c);
1874: continue;
1875: }
1876:
1877: addCharToLexer(c);
1878:
1879: if (TidyUtils.isLetter((char) c)) {
1880: this .lexsize -= 3;
1881: this .txtend = this .lexsize;
1882: this .in.ungetChar(c);
1883: this .state = LEX_ENDTAG;
1884: this .lexbuf[this .lexsize] = (byte) '\0'; // debug
1885:
1886: // changed from
1887: // this.in.curcol -= 2;
1888: this .columns -= 2;
1889:
1890: // if some text before the </ return it now
1891: if (this .txtend > this .txtstart) {
1892: // trim space char before end tag
1893: if (mode == IGNORE_WHITESPACE
1894: && this .lexbuf[this .lexsize - 1] == (byte) ' ') {
1895: this .lexsize -= 1;
1896: this .txtend = this .lexsize;
1897: }
1898:
1899: this .token = newNode(Node.TEXT_NODE,
1900: this .lexbuf, this .txtstart,
1901: this .txtend);
1902: return this .token;
1903: }
1904:
1905: continue; // no text so keep going
1906: }
1907:
1908: // otherwise treat as CDATA
1909: this .waswhite = false;
1910: this .state = LEX_CONTENT;
1911: continue;
1912: }
1913:
1914: if (mode == IGNORE_MARKUP) {
1915: // otherwise treat as CDATA
1916: this .waswhite = false;
1917: this .state = LEX_CONTENT;
1918: continue;
1919: }
1920:
1921: // look out for comments, doctype or marked sections this isn't quite right, but its getting there
1922: if (c == '!') {
1923: c = this .in.readChar();
1924:
1925: if (c == '-') {
1926: c = this .in.readChar();
1927:
1928: if (c == '-') {
1929: this .state = LEX_COMMENT; // comment
1930: this .lexsize -= 2;
1931: this .txtend = this .lexsize;
1932:
1933: // if some text before < return it now
1934: if (this .txtend > this .txtstart) {
1935: this .token = newNode(Node.TEXT_NODE,
1936: this .lexbuf, this .txtstart,
1937: this .txtend);
1938: return this .token;
1939: }
1940:
1941: this .txtstart = this .lexsize;
1942: continue;
1943: }
1944:
1945: report.warning(this , null, null,
1946: Report.MALFORMED_COMMENT);
1947: } else if (c == 'd' || c == 'D') {
1948: this .state = LEX_DOCTYPE; // doctype
1949: this .lexsize -= 2;
1950: this .txtend = this .lexsize;
1951: mode = IGNORE_WHITESPACE;
1952:
1953: // skip until white space or '>'
1954:
1955: for (;;) {
1956: c = this .in.readChar();
1957:
1958: if (c == StreamIn.END_OF_STREAM || c == '>') {
1959: this .in.ungetChar(c);
1960: break;
1961: }
1962:
1963: if (!TidyUtils.isWhite((char) c)) {
1964: continue;
1965: }
1966:
1967: // and skip to end of whitespace
1968:
1969: for (;;) {
1970: c = this .in.readChar();
1971:
1972: if (c == StreamIn.END_OF_STREAM
1973: || c == '>') {
1974: this .in.ungetChar(c);
1975: break;
1976: }
1977:
1978: if (TidyUtils.isWhite((char) c)) {
1979: continue;
1980: }
1981:
1982: this .in.ungetChar(c);
1983: break;
1984: }
1985:
1986: break;
1987: }
1988:
1989: // if some text before < return it now
1990: if (this .txtend > this .txtstart) {
1991: this .token = newNode(Node.TEXT_NODE,
1992: this .lexbuf, this .txtstart,
1993: this .txtend);
1994: return this .token;
1995: }
1996:
1997: this .txtstart = this .lexsize;
1998: continue;
1999: } else if (c == '[') {
2000: // Word 2000 embeds <![if ...]> ... <![endif]> sequences
2001: this .lexsize -= 2;
2002: this .state = LEX_SECTION;
2003: this .txtend = this .lexsize;
2004:
2005: // if some text before < return it now
2006: if (this .txtend > this .txtstart) {
2007: this .token = newNode(Node.TEXT_NODE,
2008: this .lexbuf, this .txtstart,
2009: this .txtend);
2010: return this .token;
2011: }
2012:
2013: this .txtstart = this .lexsize;
2014: continue;
2015: }
2016:
2017: // otherwise swallow chars up to and including next '>'
2018: while (true) {
2019: c = this .in.readChar();
2020: if (c == '>') {
2021: break;
2022: }
2023: if (c == -1) {
2024: this .in.ungetChar(c);
2025: break;
2026: }
2027: }
2028:
2029: this .lexsize -= 2;
2030: this .lexbuf[this .lexsize] = (byte) '\0';
2031: this .state = LEX_CONTENT;
2032: continue;
2033: }
2034:
2035: // processing instructions
2036:
2037: if (c == '?') {
2038: this .lexsize -= 2;
2039: this .state = LEX_PROCINSTR;
2040: this .txtend = this .lexsize;
2041:
2042: // if some text before < return it now
2043: if (this .txtend > this .txtstart) {
2044: this .token = newNode(Node.TEXT_NODE,
2045: this .lexbuf, this .txtstart, this .txtend);
2046: return this .token;
2047: }
2048:
2049: this .txtstart = this .lexsize;
2050: continue;
2051: }
2052:
2053: // Microsoft ASP's e.g. <% ... server-code ... %>
2054: if (c == '%') {
2055: this .lexsize -= 2;
2056: this .state = LEX_ASP;
2057: this .txtend = this .lexsize;
2058:
2059: // if some text before < return it now
2060: if (this .txtend > this .txtstart) {
2061: this .token = newNode(Node.TEXT_NODE,
2062: this .lexbuf, this .txtstart, this .txtend);
2063: return this .token;
2064: }
2065:
2066: this .txtstart = this .lexsize;
2067: continue;
2068: }
2069:
2070: // Netscapes JSTE e.g. <# ... server-code ... #>
2071: if (c == '#') {
2072: this .lexsize -= 2;
2073: this .state = LEX_JSTE;
2074: this .txtend = this .lexsize;
2075:
2076: // if some text before < return it now
2077: if (this .txtend > this .txtstart) {
2078: this .token = newNode(Node.TEXT_NODE,
2079: this .lexbuf, this .txtstart, this .txtend);
2080: return this .token;
2081: }
2082:
2083: this .txtstart = this .lexsize;
2084: continue;
2085: }
2086:
2087: // check for start tag
2088: if (TidyUtils.isLetter((char) c)) {
2089: this .in.ungetChar(c); // push back letter
2090: this .lexsize -= 2; // discard " <" + letter
2091: this .txtend = this .lexsize;
2092: this .state = LEX_STARTTAG; // ready to read tag name
2093:
2094: // if some text before < return it now
2095: if (this .txtend > this .txtstart) {
2096: this .token = newNode(Node.TEXT_NODE,
2097: this .lexbuf, this .txtstart, this .txtend);
2098: return this .token;
2099: }
2100:
2101: continue; // no text so keep going
2102: }
2103:
2104: // otherwise treat as CDATA
2105: this .state = LEX_CONTENT;
2106: this .waswhite = false;
2107: continue;
2108:
2109: case LEX_ENDTAG:
2110: // </letter
2111: this .txtstart = this .lexsize - 1;
2112:
2113: // changed from
2114: // this.in.curcol -= 2;
2115: this .columns -= 2;
2116:
2117: c = parseTagName();
2118: this .token = newNode(
2119: Node.END_TAG, // create endtag token
2120: this .lexbuf, this .txtstart, this .txtend,
2121: TidyUtils.getString(this .lexbuf, this .txtstart,
2122: this .txtend - this .txtstart));
2123: this .lexsize = this .txtstart;
2124: this .txtend = this .txtstart;
2125:
2126: // skip to '>'
2127: while (c != '>') {
2128: c = this .in.readChar();
2129:
2130: if (c == StreamIn.END_OF_STREAM) {
2131: break;
2132: }
2133: }
2134:
2135: if (c == StreamIn.END_OF_STREAM) {
2136: this .in.ungetChar(c);
2137: continue;
2138: }
2139:
2140: this .state = LEX_CONTENT;
2141: this .waswhite = false;
2142: return this .token; // the endtag token
2143:
2144: case LEX_STARTTAG:
2145: // first letter of tagname
2146: this .txtstart = this .lexsize - 1; // set txtstart to first letter
2147: c = parseTagName();
2148: isempty[0] = false;
2149: attributes = null;
2150: this .token = newNode((isempty[0] ? Node.START_END_TAG
2151: : Node.START_TAG), this .lexbuf, this .txtstart,
2152: this .txtend, TidyUtils.getString(this .lexbuf,
2153: this .txtstart, this .txtend
2154: - this .txtstart));
2155:
2156: // parse attributes, consuming closing ">"
2157: if (c != '>') {
2158: if (c == '/') {
2159: this .in.ungetChar(c);
2160: }
2161:
2162: attributes = parseAttrs(isempty);
2163: }
2164:
2165: if (isempty[0]) {
2166: this .token.type = Node.START_END_TAG;
2167: }
2168:
2169: this .token.attributes = attributes;
2170: this .lexsize = this .txtstart;
2171: this .txtend = this .txtstart;
2172:
2173: // swallow newline following start tag
2174: // special check needed for CRLF sequence
2175: // this doesn't apply to empty elements
2176: // nor to preformatted content that needs escaping
2177:
2178: if (
2179:
2180: (mode != PREFORMATTED || preContent(this .token))
2181: && (this .token.expectsContent() || this .token.tag == this .configuration.tt.tagBr)) {
2182:
2183: c = this .in.readChar();
2184:
2185: if (c == '\r') {
2186: c = this .in.readChar();
2187:
2188: if (c != '\n') {
2189: this .in.ungetChar(c);
2190: }
2191: } else if (c != '\n' && c != '\f') {
2192: this .in.ungetChar(c);
2193: }
2194:
2195: this .waswhite = true; // to swallow leading whitespace
2196: } else {
2197: this .waswhite = false;
2198: }
2199:
2200: this .state = LEX_CONTENT;
2201:
2202: if (this .token.tag == null) {
2203: report.error(this , null, this .token,
2204: Report.UNKNOWN_ELEMENT);
2205: } else if (!this .configuration.xmlTags) {
2206: constrainVersion(this .token.tag.versions);
2207:
2208: if (TidyUtils.toBoolean(this .token.tag.versions
2209: & Dict.VERS_PROPRIETARY)) {
2210: // #427810 - fix by Gary Deschaines 24 May 00
2211: if (this .configuration.makeClean
2212: && (this .token.tag != this .configuration.tt.tagNobr && //
2213: this .token.tag != this .configuration.tt.tagWbr)) {
2214: report.warning(this , null, this .token,
2215: Report.PROPRIETARY_ELEMENT);
2216: }
2217: // #427810 - fix by Terry Teague 2 Jul 01
2218: else if (!this .configuration.makeClean) {
2219: report.warning(this , null, this .token,
2220: Report.PROPRIETARY_ELEMENT);
2221: }
2222: }
2223:
2224: if (this .token.tag.getChkattrs() != null) {
2225: this .token.tag.getChkattrs().check(this ,
2226: this .token);
2227: } else {
2228: this .token.checkAttributes(this );
2229: }
2230:
2231: // should this be called before attribute checks?
2232: this .token.repairDuplicateAttributes(this );
2233:
2234: }
2235:
2236: return this .token; // return start tag
2237:
2238: case LEX_COMMENT:
2239: // seen <!-- so look for -->
2240:
2241: if (c != '-') {
2242: continue;
2243: }
2244:
2245: c = this .in.readChar();
2246: addCharToLexer(c);
2247:
2248: if (c != '-') {
2249: continue;
2250: }
2251:
2252: end_comment: while (true) {
2253: c = this .in.readChar();
2254:
2255: if (c == '>') {
2256: if (badcomment != 0) {
2257: report.warning(this , null, null,
2258: Report.MALFORMED_COMMENT);
2259: }
2260:
2261: this .txtend = this .lexsize - 2; // AQ 8Jul2000
2262: this .lexbuf[this .lexsize] = (byte) '\0';
2263: this .state = LEX_CONTENT;
2264: this .waswhite = false;
2265: this .token = newNode(Node.COMMENT_TAG,
2266: this .lexbuf, this .txtstart, this .txtend);
2267:
2268: // now look for a line break
2269:
2270: c = this .in.readChar();
2271:
2272: if (c == '\r') {
2273: c = this .in.readChar();
2274:
2275: if (c != '\n') {
2276: this .token.linebreak = true;
2277: }
2278: }
2279:
2280: if (c == '\n') {
2281: this .token.linebreak = true;
2282: } else {
2283: this .in.ungetChar(c);
2284: }
2285:
2286: return this .token;
2287: }
2288:
2289: // note position of first such error in the comment
2290: if (badcomment == 0) {
2291: this .lines = this .in.getCurline();
2292: this .columns = this .in.getCurcol() - 3;
2293: }
2294:
2295: badcomment++;
2296: if (this .configuration.fixComments) {
2297: this .lexbuf[this .lexsize - 2] = (byte) '=';
2298: }
2299:
2300: addCharToLexer(c);
2301:
2302: // if '-' then look for '>' to end the comment
2303: if (c != '-') {
2304: break end_comment;
2305: }
2306:
2307: }
2308: // otherwise continue to look for -->
2309: this .lexbuf[this .lexsize - 2] = (byte) '=';
2310: continue;
2311:
2312: case LEX_DOCTYPE:
2313: // seen <!d so look for '> ' munging whitespace
2314:
2315: if (TidyUtils.isWhite((char) c)) {
2316: if (this .waswhite) {
2317: this .lexsize -= 1;
2318: }
2319:
2320: this .waswhite = true;
2321: } else {
2322: this .waswhite = false;
2323: }
2324:
2325: if (inDTDSubset) {
2326: if (c == ']') {
2327: inDTDSubset = false;
2328: }
2329: } else if (c == '[') {
2330: inDTDSubset = true;
2331: }
2332: if (inDTDSubset || c != '>') {
2333: continue;
2334: }
2335:
2336: this .lexsize -= 1;
2337: this .txtend = this .lexsize;
2338: this .lexbuf[this .lexsize] = (byte) '\0';
2339: this .state = LEX_CONTENT;
2340: this .waswhite = false;
2341: this .token = newNode(Node.DOCTYPE_TAG, this .lexbuf,
2342: this .txtstart, this .txtend);
2343: // make a note of the version named by the doctype
2344: this .doctype = findGivenVersion(this .token);
2345: return this .token;
2346:
2347: case LEX_PROCINSTR:
2348: // seen <? so look for '> '
2349: // check for PHP preprocessor instructions <?php ... ?>
2350:
2351: if (this .lexsize - this .txtstart == 3) {
2352: if ((TidyUtils.getString(this .lexbuf,
2353: this .txtstart, 3)).equals("php")) {
2354: this .state = LEX_PHP;
2355: continue;
2356: }
2357: }
2358:
2359: if (this .lexsize - this .txtstart == 4) {
2360: if ((TidyUtils.getString(this .lexbuf,
2361: this .txtstart, 3)).equals("xml")
2362: && TidyUtils
2363: .isWhite((char) this .lexbuf[this .txtstart + 3])) {
2364: this .state = LEX_XMLDECL;
2365: attributes = null;
2366: continue;
2367: }
2368: }
2369:
2370: if (this .configuration.xmlPIs) // insist on ?> as terminator
2371: {
2372: if (c != '?') {
2373: continue;
2374: }
2375:
2376: // now look for '>'
2377: c = this .in.readChar();
2378:
2379: if (c == StreamIn.END_OF_STREAM) {
2380: report.warning(this , null, null,
2381: Report.UNEXPECTED_END_OF_FILE);
2382: this .in.ungetChar(c);
2383: continue;
2384: }
2385:
2386: addCharToLexer(c);
2387: }
2388:
2389: if (c != '>') {
2390: continue;
2391: }
2392:
2393: this .lexsize -= 1;
2394: this .txtend = this .lexsize;
2395: this .lexbuf[this .lexsize] = (byte) '\0';
2396: this .state = LEX_CONTENT;
2397: this .waswhite = false;
2398: this .token = newNode(Node.PROC_INS_TAG, this .lexbuf,
2399: this .txtstart, this .txtend);
2400: return this .token;
2401:
2402: case LEX_ASP:
2403: // seen <% so look for "%> "
2404: if (c != '%') {
2405: continue;
2406: }
2407:
2408: // now look for '>'
2409: c = this .in.readChar();
2410:
2411: if (c != '>') {
2412: this .in.ungetChar(c);
2413: continue;
2414: }
2415:
2416: this .lexsize -= 1;
2417: this .txtend = this .lexsize;
2418: this .lexbuf[this .lexsize] = (byte) '\0';
2419: this .state = LEX_CONTENT;
2420: this .waswhite = false;
2421: this .token = newNode(Node.ASP_TAG, this .lexbuf,
2422: this .txtstart, this .txtend);
2423: return this .token;
2424:
2425: case LEX_JSTE:
2426: // seen <# so look for "#> "
2427: if (c != '#') {
2428: continue;
2429: }
2430:
2431: // now look for '>'
2432: c = this .in.readChar();
2433:
2434: if (c != '>') {
2435: this .in.ungetChar(c);
2436: continue;
2437: }
2438:
2439: this .lexsize -= 1;
2440: this .txtend = this .lexsize;
2441: this .lexbuf[this .lexsize] = (byte) '\0';
2442: this .state = LEX_CONTENT;
2443: this .waswhite = false;
2444: this .token = newNode(Node.JSTE_TAG, this .lexbuf,
2445: this .txtstart, this .txtend);
2446: return this .token;
2447:
2448: case LEX_PHP:
2449: // seen " <?php" so look for "?> "
2450: if (c != '?') {
2451: continue;
2452: }
2453:
2454: // now look for '>'
2455: c = this .in.readChar();
2456:
2457: if (c != '>') {
2458: this .in.ungetChar(c);
2459: continue;
2460: }
2461:
2462: this .lexsize -= 1;
2463: this .txtend = this .lexsize;
2464: this .lexbuf[this .lexsize] = (byte) '\0';
2465: this .state = LEX_CONTENT;
2466: this .waswhite = false;
2467: this .token = newNode(Node.PHP_TAG, this .lexbuf,
2468: this .txtstart, this .txtend);
2469: return this .token;
2470:
2471: case LEX_XMLDECL: // seen "<?xml" so look for "?>"
2472:
2473: if (TidyUtils.isWhite((char) c) && c != '?') {
2474: continue;
2475: }
2476:
2477: // get pseudo-attribute
2478: if (c != '?') {
2479: String name;
2480: Node[] asp = new Node[1];
2481: Node[] php = new Node[1];
2482: AttVal av = new AttVal();
2483: int[] pdelim = new int[1];
2484: isempty[0] = false;
2485:
2486: this .in.ungetChar(c);
2487:
2488: name = this .parseAttribute(isempty, asp, php);
2489: av.attribute = name;
2490:
2491: av.value = this .parseValue(name, true, isempty,
2492: pdelim);
2493: av.delim = pdelim[0];
2494: av.next = attributes;
2495:
2496: attributes = av;
2497: // continue;
2498: }
2499:
2500: // now look for '>'
2501: c = this .in.readChar();
2502:
2503: if (c != '>') {
2504: this .in.ungetChar(c);
2505: continue;
2506: }
2507: this .lexsize -= 1;
2508: this .txtend = this .txtstart;
2509: this .lexbuf[this .txtend] = '\0';
2510: this .state = LEX_CONTENT;
2511: this .waswhite = false;
2512: this .token = newNode(Node.XML_DECL, this .lexbuf,
2513: this .txtstart, this .txtend);
2514: this .token.attributes = attributes;
2515: return this .token;
2516:
2517: case LEX_SECTION:
2518: // seen " <![" so look for "]> "
2519: if (c == '[') {
2520: if (this .lexsize == (this .txtstart + 6)
2521: && (TidyUtils.getString(this .lexbuf,
2522: this .txtstart, 6)).equals("CDATA[")) {
2523: this .state = LEX_CDATA;
2524: this .lexsize -= 6;
2525: continue;
2526: }
2527: }
2528:
2529: if (c != ']') {
2530: continue;
2531: }
2532:
2533: // now look for '>'
2534: c = this .in.readChar();
2535:
2536: if (c != '>') {
2537: this .in.ungetChar(c);
2538: continue;
2539: }
2540:
2541: this .lexsize -= 1;
2542: this .txtend = this .lexsize;
2543: this .lexbuf[this .lexsize] = (byte) '\0';
2544: this .state = LEX_CONTENT;
2545: this .waswhite = false;
2546: this .token = newNode(Node.SECTION_TAG, this .lexbuf,
2547: this .txtstart, this .txtend);
2548: return this .token;
2549:
2550: case LEX_CDATA:
2551: // seen " <![CDATA[" so look for "]]> "
2552: if (c != ']') {
2553: continue;
2554: }
2555:
2556: // now look for ']'
2557: c = this .in.readChar();
2558:
2559: if (c != ']') {
2560: this .in.ungetChar(c);
2561: continue;
2562: }
2563:
2564: // now look for '>'
2565: c = this .in.readChar();
2566:
2567: if (c != '>') {
2568: this .in.ungetChar(c);
2569: continue;
2570: }
2571:
2572: this .lexsize -= 1;
2573: this .txtend = this .lexsize;
2574: this .lexbuf[this .lexsize] = (byte) '\0';
2575: this .state = LEX_CONTENT;
2576: this .waswhite = false;
2577: this .token = newNode(Node.CDATA_TAG, this .lexbuf,
2578: this .txtstart, this .txtend);
2579: return this .token;
2580:
2581: default:
2582: // should never reach here
2583: break;
2584: }
2585: }
2586:
2587: if (this .state == LEX_CONTENT) // text string
2588: {
2589: this .txtend = this .lexsize;
2590:
2591: if (this .txtend > this .txtstart) {
2592: this .in.ungetChar(c);
2593:
2594: if (this .lexbuf[this .lexsize - 1] == (byte) ' ') {
2595: this .lexsize -= 1;
2596: this .txtend = this .lexsize;
2597: }
2598:
2599: this .token = newNode(Node.TEXT_NODE, this .lexbuf,
2600: this .txtstart, this .txtend);
2601: return this .token;
2602: }
2603: } else if (this .state == LEX_COMMENT) // comment
2604: {
2605: if (c == StreamIn.END_OF_STREAM) {
2606: report.warning(this , null, null,
2607: Report.MALFORMED_COMMENT);
2608: }
2609:
2610: this .txtend = this .lexsize;
2611: this .lexbuf[this .lexsize] = (byte) '\0';
2612: this .state = LEX_CONTENT;
2613: this .waswhite = false;
2614: this .token = newNode(Node.COMMENT_TAG, this .lexbuf,
2615: this .txtstart, this .txtend);
2616: return this .token;
2617: }
2618:
2619: return null;
2620: }
2621:
/**
 * Parser for ASP within start tags. Some people use ASP to customize attributes, and Tidy isn't really well suited
 * to dealing with ASP. This is a workaround for attributes, but it won't deal with the case where the ASP is used to
 * tailor the attribute value. Here is an example of a workaround for using ASP in attribute values:
 * <code>href='<%=rsSchool.Fields("ID").Value%>'</code> where the ASP that generates the attribute value is
 * masked from Tidy by the quotemarks.
 * @return parsed Node
 */
2630: public Node parseAsp() {
2631: int c;
2632: Node asp = null;
2633:
2634: this .txtstart = this .lexsize;
2635:
2636: while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
2637:
2638: addCharToLexer(c);
2639:
2640: if (c != '%') {
2641: continue;
2642: }
2643:
2644: if ((c = this .in.readChar()) == StreamIn.END_OF_STREAM) {
2645: break;
2646: }
2647: addCharToLexer(c);
2648:
2649: if (c == '>') {
2650: break;
2651: }
2652: }
2653:
2654: this .lexsize -= 2;
2655: this .txtend = this .lexsize;
2656:
2657: if (this .txtend > this .txtstart) {
2658: asp = newNode(Node.ASP_TAG, this .lexbuf, this .txtstart,
2659: this .txtend);
2660: }
2661:
2662: this .txtstart = this .txtend;
2663: return asp;
2664: }
2665:
2666: /**
2667: * PHP is like ASP but is based upon XML processing instructions, e.g. <code><?php ... ?></code>.
2668: * @return parsed Node
2669: */
2670: public Node parsePhp() {
2671: int c;
2672: Node php = null;
2673:
2674: this .txtstart = this .lexsize;
2675:
2676: while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
2677: addCharToLexer(c);
2678:
2679: if (c != '?') {
2680: continue;
2681: }
2682:
2683: if ((c = this .in.readChar()) == StreamIn.END_OF_STREAM) {
2684: break;
2685: }
2686: addCharToLexer(c);
2687:
2688: if (c == '>') {
2689: break;
2690: }
2691: }
2692:
2693: this .lexsize -= 2;
2694: this .txtend = this .lexsize;
2695:
2696: if (this .txtend > this .txtstart) {
2697: php = newNode(Node.PHP_TAG, this .lexbuf, this .txtstart,
2698: this .txtend);
2699: }
2700:
2701: this .txtstart = this .txtend;
2702: return php;
2703: }
2704:
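/**
 * Parses the next attribute name inside a start tag. Also consumes the '>' (or "/>") terminating the start tag, in
 * which case <code>null</code> is returned.
 * @param isempty flag is passed as array so it can be modified; set to <code>true</code> when "/>" is seen
 * @param asp asp Node, passed as array so it can be modified
 * @param php php Node, passed as array so it can be modified
 * @return parsed attribute name, or <code>null</code> if no attribute name was read
 */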
2712: public String parseAttribute(boolean[] isempty, Node[] asp,
2713: Node[] php) {
2714: int start = 0;
2715: String attr;
2716: int c = 0;
2717: int lastc = 0;
2718:
2719: asp[0] = null; // clear asp pointer
2720: php[0] = null; // clear php pointer
2721: // skip white space before the attribute
2722:
2723: for (;;) {
2724: c = this .in.readChar();
2725:
2726: if (c == '/') {
2727: c = this .in.readChar();
2728:
2729: if (c == '>') {
2730: isempty[0] = true;
2731: return null;
2732: }
2733:
2734: this .in.ungetChar(c);
2735: c = '/';
2736: break;
2737: }
2738:
2739: if (c == '>') {
2740: return null;
2741: }
2742:
2743: if (c == '<') {
2744: c = this .in.readChar();
2745:
2746: if (c == '%') {
2747: asp[0] = parseAsp();
2748: return null;
2749: } else if (c == '?') {
2750: php[0] = parsePhp();
2751: return null;
2752: }
2753:
2754: this .in.ungetChar(c);
2755: if (this .state != LEX_XMLDECL) // FG fix for 532535
2756: {
2757: this .in.ungetChar('<'); // fix for 433360
2758: }
2759: report.attrError(this , this .token, null,
2760: Report.UNEXPECTED_GT);
2761: return null;
2762: }
2763:
2764: if (c == '=') {
2765: report.attrError(this , this .token, null,
2766: Report.UNEXPECTED_EQUALSIGN);
2767: continue;
2768: }
2769:
2770: if (c == '"' || c == '\'') {
2771: report.attrError(this , this .token, null,
2772: Report.UNEXPECTED_QUOTEMARK);
2773: continue;
2774: }
2775:
2776: if (c == StreamIn.END_OF_STREAM) {
2777: report.attrError(this , this .token, null,
2778: Report.UNEXPECTED_END_OF_FILE);
2779: this .in.ungetChar(c);
2780: return null;
2781: }
2782:
2783: if (!TidyUtils.isWhite((char) c)) {
2784: break;
2785: }
2786: }
2787:
2788: start = this .lexsize;
2789: lastc = c;
2790:
2791: for (;;) {
2792: // but push back '=' for parseValue()
2793: if (c == '=' || c == '>') {
2794: this .in.ungetChar(c);
2795: break;
2796: }
2797:
2798: if (c == '<' || c == StreamIn.END_OF_STREAM) {
2799: this .in.ungetChar(c);
2800: break;
2801: }
2802: if (lastc == '-' && (c == '"' || c == '\'')) {
2803: this .lexsize--;
2804: this .in.ungetChar(c);
2805: break;
2806: }
2807: if (TidyUtils.isWhite((char) c)) {
2808: break;
2809: }
2810:
2811: // what should be done about non-namechar characters?
2812: // currently these are incorporated into the attr name
2813:
2814: if (!this .configuration.xmlTags
2815: && TidyUtils.isUpper((char) c)) {
2816: c = TidyUtils.toLower((char) c);
2817: }
2818:
2819: // ++len; #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
2820: addCharToLexer(c);
2821:
2822: lastc = c;
2823: c = this .in.readChar();
2824: }
2825:
2826: // #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
2827: int len = this .lexsize - start;
2828: attr = (len > 0 ? TidyUtils.getString(this .lexbuf, start, len)
2829: : null);
2830: this .lexsize = start;
2831:
2832: return attr;
2833: }
2834:
/**
 * Invoked when < is seen in place of an attribute value. Terminates on whitespace if the value is not ASP, PHP or
 * Tango. This routine recognizes ' and " quoted strings.
 * @return delimiter, or 0 if the input ends or a '>' is found inside a quoted string
 */
2840: public int parseServerInstruction() {
2841: int c, delim = '"';
2842: boolean isrule = false;
2843:
2844: c = this .in.readChar();
2845: addCharToLexer(c);
2846:
2847: // check for ASP, PHP or Tango
2848: if (c == '%' || c == '?' || c == '@') {
2849: isrule = true;
2850: }
2851:
2852: for (;;) {
2853: c = this .in.readChar();
2854:
2855: if (c == StreamIn.END_OF_STREAM) {
2856: break;
2857: }
2858:
2859: if (c == '>') {
2860: if (isrule) {
2861: addCharToLexer(c);
2862: } else {
2863: this .in.ungetChar(c);
2864: }
2865:
2866: break;
2867: }
2868:
2869: // if not recognized as ASP, PHP or Tango
2870: // then also finish value on whitespace
2871: if (!isrule) {
2872: if (TidyUtils.isWhite((char) c)) {
2873: break;
2874: }
2875: }
2876:
2877: addCharToLexer(c);
2878:
2879: if (c == '"') {
2880: do {
2881: c = this .in.readChar();
2882:
2883: if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
2884: {
2885: report.attrError(this , this .token, null,
2886: Report.UNEXPECTED_END_OF_FILE);
2887: this .in.ungetChar(c);
2888: return 0;
2889: }
2890: if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
2891: {
2892: this .in.ungetChar(c);
2893: report.attrError(this , this .token, null,
2894: Report.UNEXPECTED_GT);
2895: return 0;
2896: }
2897:
2898: addCharToLexer(c);
2899: } while (c != '"');
2900: delim = '\'';
2901: continue;
2902: }
2903:
2904: if (c == '\'') {
2905: do {
2906: c = this .in.readChar();
2907:
2908: if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
2909: {
2910: report.attrError(this , this .token, null,
2911: Report.UNEXPECTED_END_OF_FILE);
2912: this .in.ungetChar(c);
2913: return 0;
2914: }
2915: if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
2916: {
2917: this .in.ungetChar(c);
2918: report.attrError(this , this .token, null,
2919: Report.UNEXPECTED_GT);
2920: return 0;
2921: }
2922:
2923: addCharToLexer(c);
2924: } while (c != '\'');
2925: }
2926: }
2927:
2928: return delim;
2929: }
2930:
2931: /**
2932: * Parse an attribute value.
2933: * @param name attribute name
2934: * @param foldCase fold case?
2935: * @param isempty is attribute empty? Passed as an array reference to allow modification
2936: * @param pdelim delimiter, passed as an array reference to allow modification
2937: * @return parsed value
2938: */
2939: public String parseValue(String name, boolean foldCase,
2940: boolean[] isempty, int[] pdelim) {
2941: // values start with "=" or " = " etc.
2942: // doesn't consume the ">" at end of start tag
2943:
2944: int len = 0;
2945: int start;
2946: boolean seenGt = false;
2947: boolean munge = true;
2948: int c = 0;
2949: int lastc, delim, quotewarning;
2950: String value;
2951:
2952: delim = 0;
2953: pdelim[0] = '"';
2954:
2955: // Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are
2956: // significant and must be preserved
2957:
2958: if (this .configuration.literalAttribs) {
2959: munge = false;
2960: }
2961:
2962: // skip white space before the '='
2963: while (true) {
2964: c = this .in.readChar();
2965:
2966: if (c == StreamIn.END_OF_STREAM) {
2967: this .in.ungetChar(c);
2968: break;
2969: }
2970:
2971: if (!TidyUtils.isWhite((char) c)) {
2972: break;
2973: }
2974: }
2975:
2976: // c should be '=' if there is a value other legal possibilities are white space, '/' and '>'
2977:
2978: if (c != '=' && c != '"' && c != '\'') {
2979: this .in.ungetChar(c);
2980: return null;
2981: }
2982:
2983: // skip white space after '='
2984:
2985: while (true) {
2986: c = this .in.readChar();
2987:
2988: if (c == StreamIn.END_OF_STREAM) {
2989: this .in.ungetChar(c);
2990: break;
2991: }
2992:
2993: if (!TidyUtils.isWhite((char) c)) {
2994: break;
2995: }
2996: }
2997:
2998: // check for quote marks
2999:
3000: if (c == '"' || c == '\'') {
3001: delim = c;
3002: } else if (c == '<') {
3003: start = this .lexsize;
3004: addCharToLexer(c);
3005: pdelim[0] = parseServerInstruction();
3006: len = this .lexsize - start;
3007: this .lexsize = start;
3008: return (len > 0 ? TidyUtils.getString(this .lexbuf, start,
3009: len) : null);
3010: } else {
3011: this .in.ungetChar(c);
3012: }
3013:
3014: // and read the value string check for quote mark if needed
3015:
3016: quotewarning = 0;
3017: start = this .lexsize;
3018: c = '\0';
3019:
3020: while (true) {
3021: lastc = c; // track last character
3022: c = this .in.readChar();
3023:
3024: if (c == StreamIn.END_OF_STREAM) {
3025: report.attrError(this , this .token, null,
3026: Report.UNEXPECTED_END_OF_FILE);
3027: this .in.ungetChar(c);
3028: break;
3029: }
3030:
3031: if (delim == (char) 0) {
3032: if (c == '>') {
3033: this .in.ungetChar(c);
3034: break;
3035: }
3036:
3037: if (c == '"' || c == '\'') {
3038: report.attrError(this , this .token, null,
3039: Report.UNEXPECTED_QUOTEMARK);
3040: break;
3041: }
3042:
3043: if (c == '<') {
3044: this .in.ungetChar(c); // fix for 433360
3045: c = '>';
3046: this .in.ungetChar(c);
3047: report.attrError(this , this .token, null,
3048: Report.UNEXPECTED_GT);
3049: break;
3050: }
3051:
3052: // For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however
3053: // care is needed to avoid so treating <a href=http://www.acme.com /> in this way, which would map the
3054: // <a> tag to <a href="http://www.acme.com"/>
3055:
3056: if (c == '/') {
3057: // peek ahead in case of />
3058: c = this .in.readChar();
3059:
3060: if (c == '>'
3061: && !AttributeTable
3062: .getDefaultAttributeTable().isUrl(
3063: name)) {
3064: isempty[0] = true;
3065: this .in.ungetChar(c);
3066: break;
3067: }
3068:
3069: // unget peeked char
3070: this .in.ungetChar(c);
3071: c = '/';
3072: }
3073: } else {
3074: // delim is '\'' or '"'
3075: if (c == delim) {
3076: break;
3077: }
3078:
3079: // treat CRLF, CR and LF as single line break
3080:
3081: if (c == '\r') {
3082: c = this .in.readChar();
3083: if (c != '\n') {
3084: this .in.ungetChar(c);
3085: }
3086:
3087: c = '\n';
3088: }
3089:
3090: if (c == '\n' || c == '<' || c == '>') {
3091: ++quotewarning;
3092: }
3093:
3094: if (c == '>') {
3095: seenGt = true;
3096: }
3097: }
3098:
3099: if (c == '&') {
3100: // no entities in ID attributes
3101: if ("id".equalsIgnoreCase(name)) {
3102: report.attrError(this , null, null,
3103: Report.ENTITY_IN_ID);
3104: continue;
3105: }
3106:
3107: addCharToLexer(c);
3108: parseEntity((short) 0);
3109: continue;
3110:
3111: }
3112:
3113: // kludge for JavaScript attribute values with line continuations in string literals
3114:
3115: if (c == '\\') {
3116: c = this .in.readChar();
3117:
3118: if (c != '\n') {
3119: this .in.ungetChar(c);
3120: c = '\\';
3121: }
3122: }
3123:
3124: if (TidyUtils.isWhite((char) c)) {
3125: if (delim == (char) 0) {
3126: break;
3127: }
3128:
3129: if (munge) {
3130: // discard line breaks in quoted URLs
3131: // #438650 - fix by Randy Waki
3132: if (c == '\n'
3133: && AttributeTable
3134: .getDefaultAttributeTable().isUrl(
3135: name)) {
3136: // warn that we discard this newline
3137: report.attrError(this , this .token, null,
3138: Report.NEWLINE_IN_URI);
3139: continue;
3140: }
3141:
3142: c = ' ';
3143:
3144: if (lastc == ' ') {
3145: continue;
3146: }
3147: }
3148: } else if (foldCase && TidyUtils.isUpper((char) c)) {
3149: c = TidyUtils.toLower((char) c);
3150: }
3151:
3152: addCharToLexer(c);
3153: }
3154:
3155: if (quotewarning > 10 && seenGt && munge) {
3156: // there is almost certainly a missing trailing quote mark as we have see too many newlines, < or >
3157: // characters. an exception is made for Javascript attributes and the javascript URL scheme which may
3158: // legitimately include < and >, and for attributes starting with "<xml " as generated by Microsoft Office.
3159:
3160: if (!AttributeTable.getDefaultAttributeTable().isScript(
3161: name)
3162: && !(AttributeTable.getDefaultAttributeTable()
3163: .isUrl(name) && "javascript:"
3164: .equals(TidyUtils.getString(this .lexbuf,
3165: start, 11)))
3166: && !"<xml ".equals(TidyUtils.getString(this .lexbuf,
3167: start, 5))) // #500236 - fix by Klaus Johannes Rusch
3168: // 06 Jan 02
3169: {
3170: report.error(this , null, null,
3171: Report.SUSPECTED_MISSING_QUOTE);
3172: }
3173: }
3174:
3175: len = this .lexsize - start;
3176: this .lexsize = start;
3177:
3178: if (len > 0 || delim != 0) {
3179: // ignore leading and trailing white space for all but title, alt, value and prompts attributes unless
3180: // --literal-attributes is set to yes
3181: // #994841 - Whitespace is removed from value attributes
3182:
3183: if (munge
3184: && !TidyUtils.isInValuesIgnoreCase(new String[] {
3185: "alt", "title", "value", "prompt" }, name)) {
3186: while (TidyUtils.isWhite((char) this .lexbuf[start + len
3187: - 1])) {
3188: --len;
3189: }
3190:
3191: while (TidyUtils.isWhite((char) this .lexbuf[start])
3192: && start < len) {
3193: ++start;
3194: --len;
3195: }
3196: }
3197:
3198: value = TidyUtils.getString(this .lexbuf, start, len);
3199: } else {
3200: value = null;
3201: }
3202:
3203: // note delimiter if given
3204: if (delim != 0) {
3205: pdelim[0] = delim;
3206: } else {
3207: pdelim[0] = '"';
3208: }
3209:
3210: return value;
3211: }
3212:
3213: /**
3214: * Check if attr is a valid name.
3215: * @param attr String to check, must be non-null
3216: * @return <code>true</code> if attr is a valid name.
3217: */
3218: public static boolean isValidAttrName(String attr) {
3219: char c;
3220: int i;
3221:
3222: // first character should be a letter
3223: c = attr.charAt(0);
3224:
3225: if (!TidyUtils.isLetter(c)) {
3226: return false;
3227: }
3228:
3229: // remaining characters should be namechars
3230: for (i = 1; i < attr.length(); i++) {
3231: c = attr.charAt(i);
3232:
3233: if (TidyUtils.isNamechar(c)) {
3234: continue;
3235: }
3236:
3237: return false;
3238: }
3239:
3240: return true;
3241: }
3242:
3243: /**
3244: * In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they
3245: * cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a
3246: * numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the
3247: * Unicode character with that number. Any character except a hexadecimal digit can be escaped to remove its special
3248: * meaning, by putting a backslash in front.
3249: * @param buf css selector name
3250: * @return <code>true</code> if the given string is a valid css1 selector name
3251: */
3252: public static boolean isCSS1Selector(String buf) {
3253: if (buf == null) {
3254: return false;
3255: }
3256:
3257: // #508936 - CSS class naming for -clean option
3258: boolean valid = true;
3259: int esclen = 0;
3260: char c;
3261: int pos;
3262:
3263: for (pos = 0; valid && pos < buf.length(); ++pos) {
3264: c = buf.charAt(pos);
3265: if (c == '\\') {
3266: esclen = 1; // ab\555\444 is 4 chars {'a', 'b', \555, \444}
3267: } else if (Character.isDigit(c)) {
3268: // Digit not 1st, unless escaped (Max length "\112F")
3269: if (esclen > 0) {
3270: valid = (++esclen < 6);
3271: }
3272: if (valid) {
3273: valid = (pos > 0 || esclen > 0);
3274: }
3275: } else {
3276: valid = (esclen > 0 // Escaped? Anything goes.
3277: || (pos > 0 && c == '-') // Dash cannot be 1st char
3278: || Character.isLetter(c) // a-z, A-Z anywhere
3279: || (c >= 161 && c <= 255)); // Unicode 161-255 anywhere
3280: esclen = 0;
3281: }
3282: }
3283: return valid;
3284: }
3285:
3286: /**
3287: * Parse tag attributes.
3288: * @param isempty is tag empty?
3289: * @return parsed attribute/value list
3290: */
3291: public AttVal parseAttrs(boolean[] isempty) {
3292: AttVal av, list;
3293: String attribute, value;
3294: int[] delim = new int[1];
3295: Node[] asp = new Node[1];
3296: Node[] php = new Node[1];
3297:
3298: list = null;
3299:
3300: while (!endOfInput()) {
3301: attribute = parseAttribute(isempty, asp, php);
3302:
3303: if (attribute == null) {
3304: // check if attributes are created by ASP markup
3305: if (asp[0] != null) {
3306: av = new AttVal(list, null, asp[0], null, '\0',
3307: null, null);
3308: list = av;
3309: continue;
3310: }
3311:
3312: // check if attributes are created by PHP markup
3313: if (php[0] != null) {
3314: av = new AttVal(list, null, null, php[0], '\0',
3315: null, null);
3316: list = av;
3317: continue;
3318: }
3319:
3320: break;
3321: }
3322:
3323: value = parseValue(attribute, false, isempty, delim);
3324:
3325: if (attribute != null && isValidAttrName(attribute)) {
3326: av = new AttVal(list, null, null, null, delim[0],
3327: attribute, value);
3328: av.dict = AttributeTable.getDefaultAttributeTable()
3329: .findAttribute(av);
3330: list = av;
3331: } else {
3332: av = new AttVal(null, null, null, null, 0, attribute,
3333: value);
3334:
3335: // #427664 - fix by Gary Peskin 04 Aug 00; other fixes by Dave Raggett
3336: if (value != null) {
3337: report.attrError(this , this .token, av,
3338: Report.BAD_ATTRIBUTE_VALUE);
3339: } else if (TidyUtils.lastChar(attribute) == '"') {
3340: report.attrError(this , this .token, av,
3341: Report.MISSING_QUOTEMARK);
3342: } else {
3343: report.attrError(this , this .token, av,
3344: Report.UNKNOWN_ATTRIBUTE);
3345: }
3346: }
3347: }
3348:
3349: return list;
3350: }
3351:
/**
 * Push a copy of an inline node onto the stack, but don't push if it is implicit or OBJECT or APPLET (implicit tags
 * are ones generated from the istack). One issue arises with pushing inlines when the tag is already pushed. For
 * instance: <code><p><em> text <p><em> more text</code> shouldn't be mapped to
 * <code><p><em> text </em></p><p><em><em> more text </em></em></code>.
 * @param node Node to be pushed
 */
3359: public void pushInline(Node node) {
3360: IStack is;
3361:
3362: if (node.implicit) {
3363: return;
3364: }
3365:
3366: if (node.tag == null) {
3367: return;
3368: }
3369:
3370: if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) {
3371: return;
3372: }
3373:
3374: if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT)) {
3375: return;
3376: }
3377:
3378: if (node.tag != this .configuration.tt.tagFont && isPushed(node)) {
3379: return;
3380: }
3381:
3382: // make sure there is enough space for the stack
3383: is = new IStack();
3384: is.tag = node.tag;
3385: is.element = node.element;
3386: if (node.attributes != null) {
3387: is.attributes = cloneAttributes(node.attributes);
3388: }
3389: this .istack.push(is);
3390: }
3391:
3392: /**
3393: * Pop a copy of an inline node from the stack.
3394: * @param node Node to be popped
3395: */
3396: public void popInline(Node node) {
3397: IStack is;
3398:
3399: if (node != null) {
3400:
3401: if (node.tag == null) {
3402: return;
3403: }
3404:
3405: if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) {
3406: return;
3407: }
3408:
3409: if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT)) {
3410: return;
3411: }
3412:
3413: // if node is </a> then pop until we find an <a>
3414: if (node.tag == this .configuration.tt.tagA) {
3415:
3416: while (this .istack.size() > 0) {
3417: is = (IStack) this .istack.pop();
3418: if (is.tag == this .configuration.tt.tagA) {
3419: break;
3420: }
3421: }
3422:
3423: if (this .insert >= this .istack.size()) {
3424: this .insert = -1;
3425: }
3426: return;
3427: }
3428: }
3429:
3430: if (this .istack.size() > 0) {
3431: is = (IStack) this .istack.pop();
3432: if (this .insert >= this .istack.size()) {
3433: this .insert = -1;
3434: }
3435: }
3436: }
3437:
/**
 * Is the node in the stack?
 * @param node Node
 * @return <code>true</code> if the node is found in the stack
 */
3443: public boolean isPushed(Node node) {
3444: int i;
3445: IStack is;
3446:
3447: for (i = this .istack.size() - 1; i >= 0; --i) {
3448: is = (IStack) this .istack.elementAt(i);
3449: if (is.tag == node.tag) {
3450: return true;
3451: }
3452: }
3453:
3454: return false;
3455: }
3456:
3457: /**
3458: * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P,
3459: * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as
3460: * will be the case in: <code><i><h1>italic heading</h1></i></code> which is then treated as
3461: * equivalent to <code><h1><i>italic heading</i></h1></code> This is implemented by setting the lexer
3462: * into a mode where it gets tokens from the inline stack rather than from the input stream.
3463: * @param node original node
3464: * @return stack size
3465: */
3466: public int inlineDup(Node node) {
3467: int n;
3468:
3469: n = this .istack.size() - this .istackbase;
3470: if (n > 0) {
3471: this .insert = this .istackbase;
3472: this .inode = node;
3473: }
3474:
3475: return n;
3476: }
3477:
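/**
 * Returns the next token while the lexer is taking tokens from the inline stack: either an implicit start tag copied
 * from the inline stack entry at the current insert position or, when there is no insert position, the node stored
 * in <code>inode</code>.
 * @return implicit start tag built from the inline stack, or the stored node
 */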
3481: public Node insertedToken() {
3482: Node node;
3483: IStack is;
3484: int n;
3485:
3486: // this will only be null if inode != null
3487: if (this .insert == -1) {
3488: node = this .inode;
3489: this .inode = null;
3490: return node;
3491: }
3492:
3493: // is this is the "latest" node then update the position, otherwise use current values
3494: if (this .inode == null) {
3495: this .lines = this .in.getCurline();
3496: this .columns = this .in.getCurcol();
3497: }
3498:
3499: node = newNode(Node.START_TAG, this .lexbuf, this .txtstart,
3500: this .txtend);
3501:
3502: // GLP: Bugfix 126261. Remove when this change is fixed in istack.c in the original Tidy
3503: node.implicit = true;
3504: is = (IStack) this .istack.elementAt(this .insert);
3505: node.element = is.element;
3506: node.tag = is.tag;
3507: if (is.attributes != null) {
3508: node.attributes = cloneAttributes(is.attributes);
3509: }
3510:
3511: // advance lexer to next item on the stack
3512: n = this .insert;
3513:
3514: // and recover state if we have reached the end
3515: if (++n < this .istack.size()) {
3516: this .insert = n;
3517: } else {
3518: this .insert = -1;
3519: }
3520:
3521: return node;
3522: }
3523:
3524: /**
3525: * Can the given element be removed?
3526: * @param element node
3527: * @return <code>true</code> if he element can be removed
3528: */
3529: public boolean canPrune(Node element) {
3530: if (element.type == Node.TEXT_NODE) {
3531: return true;
3532: }
3533:
3534: if (element.content != null) {
3535: return false;
3536: }
3537:
3538: if (element.tag == this .configuration.tt.tagA
3539: && element.attributes != null) {
3540: return false;
3541: }
3542:
3543: if (element.tag == this .configuration.tt.tagP
3544: && !this .configuration.dropEmptyParas) {
3545: return false;
3546: }
3547:
3548: if (element.tag == null) {
3549: return false;
3550: }
3551:
3552: if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW)) {
3553: return false;
3554: }
3555:
3556: if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY)) {
3557: return false;
3558: }
3559:
3560: if (element.tag == this .configuration.tt.tagApplet) {
3561: return false;
3562: }
3563:
3564: if (element.tag == this .configuration.tt.tagObject) {
3565: return false;
3566: }
3567:
3568: if (element.tag == this .configuration.tt.tagScript
3569: && element.getAttrByName("src") != null) {
3570: return false;
3571: }
3572:
3573: // #540555 Empty title tag is trimmed
3574: if (element.tag == this .configuration.tt.tagTitle) {
3575: return false;
3576: }
3577:
3578: // #433359 - fix by Randy Waki 12 Mar 01 - Empty iframe is trimmed
3579: if (element.tag == this .configuration.tt.tagIframe) {
3580: return false;
3581: }
3582:
3583: if (element.getAttrByName("id") != null
3584: || element.getAttrByName("name") != null) {
3585: return false;
3586: }
3587:
3588: return true;
3589: }
3590:
3591: /**
3592: * duplicate name attribute as an id and check if id and name match.
3593: * @param node Node to check for name/it attributes
3594: */
3595: public void fixId(Node node) {
3596: AttVal name = node.getAttrByName("name");
3597: AttVal id = node.getAttrByName("id");
3598:
3599: if (name != null) {
3600: if (id != null) {
3601: if (id.value != null && !id.value.equals(name.value)) {
3602: report.attrError(this , node, name,
3603: Report.ID_NAME_MISMATCH);
3604: }
3605: } else if (this .configuration.xmlOut) {
3606: node.addAttribute("id", name.value);
3607: }
3608: }
3609: }
3610:
3611: /**
3612: * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.
3613: */
3614: public void deferDup() {
3615: this .insert = -1;
3616: this .inode = null;
3617: }
3618:
3619: /**
3620: * Constraint the html version in the document to the given one. Everything is allowed in proprietary version of
3621: * HTML this is handled here rather than in the tag/attr dicts.
3622: * @param vers html version code
3623: */
3624: void constrainVersion(int vers) {
3625: this .versions &= (vers | Dict.VERS_PROPRIETARY);
3626: }
3627:
3628: /**
3629: * Is content acceptable for pre elements?
3630: * @param node content
3631: * @return <code>true</code> if node is acceptable in pre elements
3632: */
3633: protected boolean preContent(Node node) {
3634: // p is coerced to br's
3635: if (node.tag == this .configuration.tt.tagP) {
3636: return true;
3637: }
3638:
3639: if (node.tag == null
3640: || node.tag == this .configuration.tt.tagP
3641: || !TidyUtils.toBoolean(node.tag.model
3642: & (Dict.CM_INLINE | Dict.CM_NEW))) {
3643: return false;
3644: }
3645: return true;
3646: }
3647:
3648: /**
3649: * document type.
3650: */
3651: private static class W3CVersionInfo {
3652:
3653: /**
3654: * name.
3655: */
3656: String name;
3657:
3658: /**
3659: * voyager name.
3660: */
3661: String voyagerName;
3662:
3663: /**
3664: * profile.
3665: */
3666: String profile;
3667:
3668: /**
3669: * code.
3670: */
3671: short code;
3672:
3673: /**
3674: * Instantiates a new W3CVersionInfo.
3675: * @param name version name
3676: * @param voyagerName voyager (xhtml) name
3677: * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET
3678: * @param code unique code for this version info
3679: */
3680: public W3CVersionInfo(String name, String voyagerName,
3681: String profile, short code) {
3682: this.name = name;
3683: this.voyagerName = voyagerName;
3684: this.profile = profile;
3685: this.code = code;
3686: }
3687: }
3688:
3689: }
|