0001: /*
0002: * @(#)Lexer.java 1.11 2000/08/16
0003: *
0004: */
0005:
0006: package org.w3c.tidy;
0007:
0008: /**
0009: *
0010: * Lexer for html parser
0011: *
0012: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
0013: * See Tidy.java for the copyright notice.
0014: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
0015: * HTML Tidy Release 4 Aug 2000</a>
0016: *
0017: * @author Dave Raggett <dsr@w3.org>
0018: * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0019: * @version 1.0, 1999/05/22
0020: * @version 1.0.1, 1999/05/29
0021: * @version 1.1, 1999/06/18 Java Bean
0022: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
0023: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
0024: * @version 1.4, 1999/09/04 DOM support
0025: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
0026: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
0027: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
0028: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
0029: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
0030: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
0031: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
0032: */
0033:
0034: /*
0035: Given a file stream fp it returns a sequence of tokens.
0036:
0037: GetToken(fp) gets the next token
0038: UngetToken(fp) provides one level undo
0039:
0040: The tags include an attribute list:
0041:
0042: - linked list of attribute/value nodes
0043: - each node has 2 null-terminated strings.
0044: - entities are replaced in attribute values
0045:
0046: White space is compacted if not in preformatted mode:
0047: leading white space is discarded and subsequent
0048: white space sequences are compacted to single
0049: space chars.
0050:
0051: If XmlTags is no then tag names and attribute
0052: names are folded to lower case.
0053:
0054: Not yet done:
0055: - Doctype subset and marked sections
0056: */
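
/*
For example (illustrative): outside preformatted mode the input
"   Hello,\n    world!" comes back as the single text token
"Hello, world!".
*/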
0057:
0058: import java.io.PrintWriter;
0059: import java.util.Stack;
0060: import java.util.Vector;
0061:
0062: public class Lexer {
0063:
0064: public StreamIn in; /* file stream */
0065: public PrintWriter errout; /* error output stream */
0066: public short badAccess; /* for accessibility errors */
0067: public short badLayout; /* for bad style errors */
0068: public short badChars; /* for bad char encodings */
0069: public short badForm; /* for mismatched/mispositioned form tags */
0070: public short warnings; /* count of warnings in this document */
0071: public short errors; /* count of errors */
0072: public int lines; /* lines seen */
0073: public int columns; /* at start of current token */
0074: public boolean waswhite; /* used to collapse contiguous white space */
0075: public boolean pushed; /* true after token has been pushed back */
0076: public boolean insertspace; /* when space is moved after end tag */
0077: public boolean excludeBlocks; /* Netscape compatibility */
0078: public boolean exiled; /* true if moved out of table */
0079: public boolean isvoyager; /* true if xmlns attribute on html element */
0080: public short versions; /* bit vector of HTML versions */
0081: public int doctype; /* version as given by doctype (if any) */
0082: public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
0083: public int txtstart; /* start of current node */
0084: public int txtend; /* end of current node */
0085: public short state; /* state of lexer's finite state machine */
0086: public Node token;
0087:
0088: /*
0089: lexer character buffer
0090:
0091: parse tree nodes refer to spans in this buffer,
0092: which contains the concatenated text
0093: content of all of the elements.
0094:
0095: lexsize must be reset for each file.
0096: */
0097: public byte[] lexbuf; /* byte buffer of UTF-8 chars */
0098: public int lexlength; /* allocated */
0099: public int lexsize; /* used */
0100:
0101: /* Inline stack for compatibility with Mosaic */
0102: public Node inode; /* for deferring text node */
0103: public int insert; /* for inferring inline tags */
0104: public Stack istack;
0105: public int istackbase; /* start of frame */
0106:
0107: public Style styles; /* used for cleaning up presentation markup */
0108:
0109: public Configuration configuration;
0110: protected int seenBodyEndTag; /* used by parser */
0111: private Vector nodeList;
0112:
0113: public Lexer(StreamIn in, Configuration configuration) {
0114: this .in = in;
0115: this .lines = 1;
0116: this .columns = 1;
0117: this .state = LEX_CONTENT;
0118: this .badAccess = 0;
0119: this .badLayout = 0;
0120: this .badChars = 0;
0121: this .badForm = 0;
0122: this .warnings = 0;
0123: this .errors = 0;
0124: this .waswhite = false;
0125: this .pushed = false;
0126: this .insertspace = false;
0127: this .exiled = false;
0128: this .isvoyager = false;
0129: this .versions = Dict.VERS_EVERYTHING;
0130: this .doctype = Dict.VERS_UNKNOWN;
0131: this .badDoctype = false;
0132: this .txtstart = 0;
0133: this .txtend = 0;
0134: this .token = null;
0135: this .lexbuf = null;
0136: this .lexlength = 0;
0137: this .lexsize = 0;
0138: this .inode = null;
0139: this .insert = -1;
0140: this .istack = new Stack();
0141: this .istackbase = 0;
0142: this .styles = null;
0143: this .configuration = configuration;
0144: this .seenBodyEndTag = 0;
0145: this .nodeList = new Vector();
0146: }
0147:
0148: public Node newNode() {
0149: Node node = new Node();
0150: nodeList.addElement(node);
0151: return node;
0152: }
0153:
0154: public Node newNode(short type, byte[] textarray, int start, int end) {
0155: Node node = new Node(type, textarray, start, end);
0156: nodeList.addElement(node);
0157: return node;
0158: }
0159:
0160: public Node newNode(short type, byte[] textarray, int start,
0161: int end, String element) {
0162: Node node = new Node(type, textarray, start, end, element,
0163: configuration.tt);
0164: nodeList.addElement(node);
0165: return node;
0166: }
0167:
0168: public Node cloneNode(Node node) {
0169: Node cnode = (Node) node.clone();
0170: nodeList.addElement(cnode);
0171: for (AttVal att = cnode.attributes; att != null; att = att.next) {
0172: if (att.asp != null)
0173: nodeList.addElement(att.asp);
0174: if (att.php != null)
0175: nodeList.addElement(att.php);
0176: }
0177: return cnode;
0178: }
0179:
0180: public AttVal cloneAttributes(AttVal attrs) {
0181: AttVal cattrs = (AttVal) attrs.clone();
0182: for (AttVal att = cattrs; att != null; att = att.next) {
0183: if (att.asp != null)
0184: nodeList.addElement(att.asp);
0185: if (att.php != null)
0186: nodeList.addElement(att.php);
0187: }
0188: return cattrs;
0189: }
0190:
0191: protected void updateNodeTextArrays(byte[] oldtextarray,
0192: byte[] newtextarray) {
0193: Node node;
0194: for (int i = 0; i < nodeList.size(); i++) {
0195: node = (Node) (nodeList.elementAt(i));
0196: if (node.textarray == oldtextarray)
0197: node.textarray = newtextarray;
0198: }
0199: }
0200:
0201: /* used for creating preformatted text from Word2000 */
0202: public Node newLineNode() {
0203: Node node = newNode();
0204:
0205: node.textarray = this .lexbuf;
0206: node.start = this .lexsize;
0207: addCharToLexer((int) '\n');
0208: node.end = this .lexsize;
0209: return node;
0210: }
0211:
0212: // Should always be able to convert to/from UTF-8, so encoding exceptions
0213: // are converted to an Error to avoid adding throws declarations in
0214: // lots of methods.
0215:
0216: public static byte[] getBytes(String str) {
0217: try {
0218: return str.getBytes("UTF8");
0219: } catch (java.io.UnsupportedEncodingException e) {
0220: throw new Error("string to UTF-8 conversion failed: "
0221: + e.getMessage());
0222: }
0223: }
0224:
0225: public static String getString(byte[] bytes, int offset, int length) {
0226: try {
0227: return new String(bytes, offset, length, "UTF8");
0228: } catch (java.io.UnsupportedEncodingException e) {
0229: throw new Error("UTF-8 to string conversion failed: "
0230: + e.getMessage());
0231: }
0232: }
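
/*
Round-trip sketch (illustrative): getBytes("café") yields five bytes,
since 'é' (U+00E9) occupies two bytes in UTF-8, and
getString(getBytes("café"), 0, 5) returns "café" again.
*/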
0233:
0234: public boolean endOfInput() {
0235: return this .in.isEndOfStream();
0236: }
0237:
0238: public void addByte(int c) {
0239: if (this .lexsize + 1 >= this .lexlength) {
0240: while (this .lexsize + 1 >= this .lexlength) {
0241: if (this .lexlength == 0)
0242: this .lexlength = 8192;
0243: else
0244: this .lexlength = this .lexlength * 2;
0245: }
0246:
0247: byte[] temp = this .lexbuf;
0248: this .lexbuf = new byte[this .lexlength];
0249: if (temp != null) {
0250: System.arraycopy(temp, 0, this .lexbuf, 0, temp.length);
0251: updateNodeTextArrays(temp, this .lexbuf);
0252: }
0253: }
0254:
0255: this .lexbuf[this .lexsize++] = (byte) c;
0256: this .lexbuf[this .lexsize] = (byte) '\0'; /* debug */
0257: }
0258:
0259: public void changeChar(byte c) {
0260: if (this .lexsize > 0) {
0261: this .lexbuf[this .lexsize - 1] = c;
0262: }
0263: }
0264:
0265: /* store char c as UTF-8 encoded byte stream */
0266: public void addCharToLexer(int c) {
0267: if (c < 128)
0268: addByte(c);
0269: else if (c <= 0x7FF) {
0270: addByte(0xC0 | (c >> 6));
0271: addByte(0x80 | (c & 0x3F));
0272: } else if (c <= 0xFFFF) {
0273: addByte(0xE0 | (c >> 12));
0274: addByte(0x80 | ((c >> 6) & 0x3F));
0275: addByte(0x80 | (c & 0x3F));
0276: } else if (c <= 0x1FFFFF) {
0277: addByte(0xF0 | (c >> 18));
0278: addByte(0x80 | ((c >> 12) & 0x3F));
0279: addByte(0x80 | ((c >> 6) & 0x3F));
0280: addByte(0x80 | (c & 0x3F));
0281: } else {
0282: addByte(0xF8 | (c >> 24));
0283: addByte(0x80 | ((c >> 18) & 0x3F));
0284: addByte(0x80 | ((c >> 12) & 0x3F));
0285: addByte(0x80 | ((c >> 6) & 0x3F));
0286: addByte(0x80 | (c & 0x3F));
0287: }
0288: }
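
/*
Worked examples (illustrative): U+00A9 (the copyright sign, 0xA9) is
stored as the two bytes 0xC2 0xA9, and U+20AC (the euro sign) as the
three bytes 0xE2 0x82 0xAC, following the branches above.
*/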
0289:
0290: public void addStringToLexer(String str) {
0291: for (int i = 0; i < str.length(); i++) {
0292: addCharToLexer((int) str.charAt(i));
0293: }
0294: }
0295:
0296: /*
0297: No longer attempts to insert missing ';' for unknown
0298: entities unless one was present already, since this
0299: gives unexpected results.
0300:
0301: For example: <a href="something.htm?foo&bar&fred">
0302: was tidied to: <a href="something.htm?foo&bar;&fred;">
0303: rather than: <a href="something.htm?foo&bar&fred">
0304:
0305: My thanks to Maurice Buxton for spotting this.
0306: */
0307: public void parseEntity(short mode) {
0308: short map;
0309: int start;
0310: boolean first = true;
0311: boolean semicolon = false;
0312: boolean numeric = false;
0313: int c, ch, startcol;
0314: String str;
0315:
0316: start = this .lexsize - 1; /* to start at "&" */
0317: startcol = this .in.curcol - 1;
0318:
0319: while (true) {
0320: c = this .in.readChar();
0321: if (c == StreamIn.EndOfStream)
0322: break;
0323: if (c == ';') {
0324: semicolon = true;
0325: break;
0326: }
0327:
0328: if (first && c == '#') {
0329: addCharToLexer(c);
0330: first = false;
0331: numeric = true;
0332: continue;
0333: }
0334:
0335: first = false;
0336: map = MAP((char) c);
0337:
0338: /* AQ: Added flag for numeric entities so that numeric entities
0339: with missing semi-colons are recognized.
0340: Eg. "&#114" (missing the closing ';') is still recognized as "r".
0341: */
0342: if (numeric && ((c == 'x') || ((map & DIGIT) != 0))) {
0343: addCharToLexer(c);
0344: continue;
0345: }
0346: if (!numeric && ((map & NAMECHAR) != 0)) {
0347: addCharToLexer(c);
0348: continue;
0349: }
0350:
0351: /* otherwise put it back */
0352:
0353: this .in.ungetChar(c);
0354: break;
0355: }
0356:
0357: str = getString(this .lexbuf, start, this .lexsize - start);
0358: ch = EntityTable.getDefaultEntityTable().entityCode(str);
0359:
0360: /* deal with unrecognized entities */
0361: if (ch <= 0) {
0362: /* set error position just before offending character */
0363: this .lines = this .in.curline;
0364: this .columns = startcol;
0365:
0366: if (this .lexsize > start + 1) {
0367: Report
0368: .entityError(this , Report.UNKNOWN_ENTITY, str,
0369: ch);
0370:
0371: if (semicolon)
0372: addCharToLexer(';');
0373: } else /* naked & */
0374: {
0375: Report.entityError(this , Report.UNESCAPED_AMPERSAND,
0376: str, ch);
0377: }
0378: } else {
0379: if (c != ';') /* issue warning if not terminated by ';' */
0380: {
0381: /* set error position just before offending character */
0382: this .lines = this .in.curline;
0383: this .columns = startcol;
0384: Report.entityError(this , Report.MISSING_SEMICOLON, str,
0385: c);
0386: }
0387:
0388: this .lexsize = start;
0389:
0390: if (ch == 160 && (mode & Preformatted) != 0)
0391: ch = ' ';
0392:
0393: addCharToLexer(ch);
0394:
0395: if (ch == '&' && !this .configuration.QuoteAmpersand) {
0396: addCharToLexer('a');
0397: addCharToLexer('m');
0398: addCharToLexer('p');
0399: addCharToLexer(';');
0400: }
0401: }
0402: }
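
/*
Behaviour sketch (illustrative): "&lt;" is replaced by '<' in the lexer
buffer; "&#169" without the closing ';' is still replaced by the
copyright sign but triggers a MISSING_SEMICOLON warning; an unrecognized
name such as "&foo;" is left as typed and reported as UNKNOWN_ENTITY;
a bare "&" is reported as UNESCAPED_AMPERSAND.
*/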
0403:
0404: public char parseTagName() {
0405: short map;
0406: int c;
0407:
0408: /* fold case of first char in buffer */
0409:
0410: c = this .lexbuf[this .txtstart];
0411: map = MAP((char) c);
0412:
0413: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0) {
0414: c += (int) ((int) 'a' - (int) 'A');
0415: this .lexbuf[this .txtstart] = (byte) c;
0416: }
0417:
0418: while (true) {
0419: c = this .in.readChar();
0420: if (c == StreamIn.EndOfStream)
0421: break;
0422: map = MAP((char) c);
0423:
0424: if ((map & NAMECHAR) == 0)
0425: break;
0426:
0427: /* fold case of subsequent chars */
0428:
0429: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0)
0430: c += (int) ((int) 'a' - (int) 'A');
0431:
0432: addCharToLexer(c);
0433: }
0434:
0435: this .txtend = this .lexsize;
0436: return (char) c;
0437: }
0438:
0439: public void addStringLiteral(String str) {
0440: for (int i = 0; i < str.length(); i++) {
0441: addCharToLexer((int) str.charAt(i));
0442: }
0443: }
0444:
0445: /* choose what version to use for new doctype */
0446: public short HTMLVersion() {
0447: short versions;
0448:
0449: versions = this .versions;
0450:
0451: if ((versions & Dict.VERS_HTML20) != 0)
0452: return Dict.VERS_HTML20;
0453:
0454: if ((versions & Dict.VERS_HTML32) != 0)
0455: return Dict.VERS_HTML32;
0456:
0457: if ((versions & Dict.VERS_HTML40_STRICT) != 0)
0458: return Dict.VERS_HTML40_STRICT;
0459:
0460: if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
0461: return Dict.VERS_HTML40_LOOSE;
0462:
0463: if ((versions & Dict.VERS_FRAMES) != 0)
0464: return Dict.VERS_FRAMES;
0465:
0466: return Dict.VERS_UNKNOWN;
0467: }
0468:
0469: public String HTMLVersionName() {
0470: short guessed;
0471: int j;
0472:
0473: guessed = apparentVersion();
0474:
0475: for (j = 0; j < W3CVersion.length; ++j) {
0476: if (guessed == W3CVersion[j].code) {
0477: if (this .isvoyager)
0478: return W3CVersion[j].voyagerName;
0479:
0480: return W3CVersion[j].name;
0481: }
0482: }
0483:
0484: return null;
0485: }
0486:
0487: /* add meta element for Tidy */
0488: public boolean addGenerator(Node root) {
0489: AttVal attval;
0490: Node node;
0491: Node head = root.findHEAD(configuration.tt);
0492:
0493: if (head != null) {
0494: for (node = head.content; node != null; node = node.next) {
0495: if (node.tag == configuration.tt.tagMeta) {
0496: attval = node.getAttrByName("name");
0497:
0498: if (attval != null
0499: && attval.value != null
0500: && Lexer.wstrcasecmp(attval.value,
0501: "generator") == 0) {
0502: attval = node.getAttrByName("content");
0503:
0504: if (attval != null
0505: && attval.value != null
0506: && attval.value.length() >= 9
0507: && Lexer.wstrcasecmp(attval.value
0508: .substring(0, 9), "HTML Tidy") == 0) {
0509: return false;
0510: }
0511: }
0512: }
0513: }
0514:
0515: node = this .inferredTag("meta");
0516: node.addAttribute("content", "HTML Tidy, see www.w3.org");
0517: node.addAttribute("name", "generator");
0518: Node.insertNodeAtStart(head, node);
0519: return true;
0520: }
0521:
0522: return false;
0523: }
0524:
0525: /* return true if substring s is in p and isn't all in upper case */
0526: /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
0527: /* len is how many chars to check in p */
0528: private static boolean findBadSubString(String s, String p, int len) {
0529: int n = s.length();
0530: int i = 0;
0531: String ps;
0532:
0533: while (n < len) {
0534: ps = p.substring(i, i + n);
0535: if (wstrcasecmp(s, ps) == 0)
0536: return (!ps.equals(s.substring(0, n)));
0537:
0538: ++i;
0539: --len;
0540: }
0541:
0542: return false;
0543: }
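
/*
Example (illustrative): for a doctype written as
    <!doctype html public "-//w3c//dtd html 4.0//en">
findBadSubString("PUBLIC", s, len) returns true, since "public" matches
the keyword case-insensitively but is not upper case, so
checkDocTypeKeyWords() below returns false and findGivenVersion()
reports DTYPE_NOT_UPPER_CASE.
*/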
0544:
0545: public boolean checkDocTypeKeyWords(Node doctype) {
0546: int len = doctype.end - doctype.start;
0547: String s = getString(this .lexbuf, doctype.start, len);
0548:
0549: return !(findBadSubString("SYSTEM", s, len)
0550: || findBadSubString("PUBLIC", s, len)
0551: || findBadSubString("//DTD", s, len)
0552: || findBadSubString("//W3C", s, len) || findBadSubString(
0553: "//EN", s, len));
0554: }
0555:
0556: /* examine <!DOCTYPE> to identify version */
0557: public short findGivenVersion(Node doctype) {
0558: String p, s;
0559: int i, j;
0560: int len;
0561: String str1;
0562: String str2;
0563:
0564: /* if root tag for doctype isn't html give up now */
0565: str1 = getString(this .lexbuf, doctype.start, 5);
0566: if (wstrcasecmp(str1, "html ") != 0)
0567: return 0;
0568:
0569: if (!checkDocTypeKeyWords(doctype))
0570: Report.warning(this , doctype, null,
0571: Report.DTYPE_NOT_UPPER_CASE);
0572:
0573: /* give up if all we are given is the system id for the doctype */
0574: str1 = getString(this .lexbuf, doctype.start + 5, 7);
0575: if (wstrcasecmp(str1, "SYSTEM ") == 0) {
0576: /* but at least ensure the case is correct */
0577: if (!str1.substring(0, 6).equals("SYSTEM"))
0578: System.arraycopy(getBytes("SYSTEM"), 0, this .lexbuf,
0579: doctype.start + 5, 6);
0580: return 0; /* unrecognized */
0581: }
0582:
0583: if (wstrcasecmp(str1, "PUBLIC ") == 0) {
0584: if (!str1.substring(0, 6).equals("PUBLIC"))
0585: System.arraycopy(getBytes("PUBLIC "), 0, this .lexbuf,
0586: doctype.start + 5, 6);
0587: } else
0588: this .badDoctype = true;
0589:
0590: for (i = doctype.start; i < doctype.end; ++i) {
0591: if (this .lexbuf[i] == (byte) '"') {
0592: str1 = getString(this .lexbuf, i + 1, 12);
0593: str2 = getString(this .lexbuf, i + 1, 13);
0594: if (str1.equals("-//W3C//DTD ")) {
0595: /* compute length of identifier e.g. "HTML 4.0 Transitional" */
0596: for (j = i + 13; j < doctype.end
0597: && this .lexbuf[j] != (byte) '/'; ++j)
0598: ;
0599: len = j - i - 13;
0600: p = getString(this .lexbuf, i + 13, len);
0601:
0602: for (j = 1; j < W3CVersion.length; ++j) {
0603: s = W3CVersion[j].name;
0604: if (len == s.length() && s.equals(p))
0605: return W3CVersion[j].code;
0606: }
0607:
0608: /* else unrecognized version */
0609: } else if (str2.equals("-//IETF//DTD ")) {
0610: /* compute length of identifier e.g. "HTML 2.0" */
0611: for (j = i + 14; j < doctype.end
0612: && this .lexbuf[j] != (byte) '/'; ++j)
0613: ;
0614: len = j - i - 14;
0615:
0616: p = getString(this .lexbuf, i + 14, len);
0617: s = W3CVersion[0].name;
0618: if (len == s.length() && s.equals(p))
0619: return W3CVersion[0].code;
0620:
0621: /* else unrecognized version */
0622: }
0623: break;
0624: }
0625: }
0626:
0627: return 0;
0628: }
0629:
0630: public void fixHTMLNameSpace(Node root, String profile) {
0631: Node node;
0632: AttVal prev, attr;
0633:
0634: for (node = root.content; node != null
0635: && node.tag != configuration.tt.tagHtml; node = node.next)
0636: ;
0637:
0638: if (node != null) {
0639: prev = null;
0640:
0641: for (attr = node.attributes; attr != null; attr = attr.next) {
0642: if (attr.attribute.equals("xmlns"))
0643: break;
0644:
0645: prev = attr;
0646: }
0647:
0648: if (attr != null) {
0649: if (!attr.value.equals(profile)) {
0650: Report.warning(this , node, null,
0651: Report.INCONSISTENT_NAMESPACE);
0652: attr.value = profile;
0653: }
0654: } else {
0655: attr = new AttVal(node.attributes, null, (int) '"',
0656: "xmlns", profile);
0657: attr.dict = AttributeTable.getDefaultAttributeTable()
0658: .findAttribute(attr);
0659: node.attributes = attr;
0660: }
0661: }
0662: }
0663:
0664: public boolean setXHTMLDocType(Node root) {
0665: String fpi = " ";
0666: String sysid = "";
0667: String namespace = XHTML_NAMESPACE;
0668: Node doctype;
0669:
0670: doctype = root.findDocType();
0671:
0672: if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
0673: if (doctype != null)
0674: Node.discardElement(doctype);
0675: return true;
0676: }
0677:
0678: if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
0679: /* see what flavor of XHTML this document matches */
0680: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0) { /* use XHTML strict */
0681: fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
0682: sysid = voyager_strict;
0683: } else if ((this .versions & Dict.VERS_LOOSE) != 0) {
0684: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0685: sysid = voyager_loose;
0686: } else if ((this .versions & Dict.VERS_FRAMES) != 0) { /* use XHTML frames */
0687: fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
0688: sysid = voyager_frameset;
0689: } else /* let's assume XHTML transitional */
0690: {
0691: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0692: sysid = voyager_loose;
0693: }
0694: } else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
0695: fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
0696: sysid = voyager_strict;
0697: } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
0698: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0699: sysid = voyager_loose;
0700: }
0701:
0702: fixHTMLNameSpace(root, namespace);
0703:
0704: if (doctype == null) {
0705: doctype = newNode(Node.DocTypeTag, this .lexbuf, 0, 0);
0706: doctype.next = root.content;
0707: doctype.parent = root;
0708: doctype.prev = null;
0709: root.content = doctype;
0710: }
0711:
0712: if (configuration.docTypeMode == Configuration.DOCTYPE_USER
0713: && configuration.docTypeStr != null) {
0714: fpi = configuration.docTypeStr;
0715: sysid = "";
0716: }
0717:
0718: this .txtstart = this .lexsize;
0719: this .txtend = this .lexsize;
0720:
0721: /* add public identifier */
0722: addStringLiteral("html PUBLIC ");
0723:
0724: /* check if the fpi is quoted or not */
0725: if (fpi.charAt(0) == '"')
0726: addStringLiteral(fpi);
0727: else {
0728: addStringLiteral("\"");
0729: addStringLiteral(fpi);
0730: addStringLiteral("\"");
0731: }
0732:
0733: if (sysid.length() + 6 >= this .configuration.wraplen)
0734: addStringLiteral("\n\"");
0735: else
0736: addStringLiteral("\n \"");
0737:
0738: /* add system identifier */
0739: addStringLiteral(sysid);
0740: addStringLiteral("\"");
0741:
0742: this .txtend = this .lexsize;
0743:
0744: doctype.start = this .txtstart;
0745: doctype.end = this .txtend;
0746:
0747: return false;
0748: }
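
/*
For illustration: with doctype-mode "strict" the DocType node's text
becomes
    html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
followed on the next line by the quoted voyager_strict system
identifier.
*/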
0749:
0750: public short apparentVersion() {
0751: switch (this .doctype) {
0752: case Dict.VERS_UNKNOWN:
0753: return HTMLVersion();
0754:
0755: case Dict.VERS_HTML20:
0756: if ((this .versions & Dict.VERS_HTML20) != 0)
0757: return Dict.VERS_HTML20;
0758:
0759: break;
0760:
0761: case Dict.VERS_HTML32:
0762: if ((this .versions & Dict.VERS_HTML32) != 0)
0763: return Dict.VERS_HTML32;
0764:
0765: break; /* to replace old version by new */
0766:
0767: case Dict.VERS_HTML40_STRICT:
0768: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0)
0769: return Dict.VERS_HTML40_STRICT;
0770:
0771: break;
0772:
0773: case Dict.VERS_HTML40_LOOSE:
0774: if ((this .versions & Dict.VERS_HTML40_LOOSE) != 0)
0775: return Dict.VERS_HTML40_LOOSE;
0776:
0777: break; /* to replace old version by new */
0778:
0779: case Dict.VERS_FRAMES:
0780: if ((this .versions & Dict.VERS_FRAMES) != 0)
0781: return Dict.VERS_FRAMES;
0782:
0783: break;
0784: }
0785:
0786: Report.warning(this , null, null, Report.INCONSISTENT_VERSION);
0787: return this .HTMLVersion();
0788: }
0789:
0790: /* fixup doctype if missing */
0791: public boolean fixDocType(Node root) {
0792: Node doctype;
0793: int guessed = Dict.VERS_HTML40_STRICT, i;
0794:
0795: if (this .badDoctype)
0796: Report.warning(this , null, null, Report.MALFORMED_DOCTYPE);
0797:
0798: if (configuration.XmlOut)
0799: return true;
0800:
0801: doctype = root.findDocType();
0802:
0803: if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
0804: if (doctype != null)
0805: Node.discardElement(doctype);
0806: return true;
0807: }
0808:
0809: if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
0810: Node.discardElement(doctype);
0811: doctype = null;
0812: guessed = Dict.VERS_HTML40_STRICT;
0813: } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
0814: Node.discardElement(doctype);
0815: doctype = null;
0816: guessed = Dict.VERS_HTML40_LOOSE;
0817: } else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
0818: if (doctype != null) {
0819: if (this .doctype == Dict.VERS_UNKNOWN)
0820: return false;
0821:
0822: switch (this .doctype) {
0823: case Dict.VERS_UNKNOWN:
0824: return false;
0825:
0826: case Dict.VERS_HTML20:
0827: if ((this .versions & Dict.VERS_HTML20) != 0)
0828: return true;
0829:
0830: break; /* to replace old version by new */
0831:
0832: case Dict.VERS_HTML32:
0833: if ((this .versions & Dict.VERS_HTML32) != 0)
0834: return true;
0835:
0836: break; /* to replace old version by new */
0837:
0838: case Dict.VERS_HTML40_STRICT:
0839: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0)
0840: return true;
0841:
0842: break; /* to replace old version by new */
0843:
0844: case Dict.VERS_HTML40_LOOSE:
0845: if ((this .versions & Dict.VERS_HTML40_LOOSE) != 0)
0846: return true;
0847:
0848: break; /* to replace old version by new */
0849:
0850: case Dict.VERS_FRAMES:
0851: if ((this .versions & Dict.VERS_FRAMES) != 0)
0852: return true;
0853:
0854: break; /* to replace old version by new */
0855: }
0856:
0857: /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
0858: }
0859:
0860: /* choose new doctype */
0861: guessed = HTMLVersion();
0862: }
0863:
0864: if (guessed == Dict.VERS_UNKNOWN)
0865: return false;
0866:
0867: /* for XML use the Voyager system identifier */
0868: if (this .configuration.XmlOut || this .configuration.XmlTags
0869: || this .isvoyager) {
0870: if (doctype != null)
0871: Node.discardElement(doctype);
0872:
0873: for (i = 0; i < W3CVersion.length; ++i) {
0874: if (guessed == W3CVersion[i].code) {
0875: fixHTMLNameSpace(root, W3CVersion[i].profile);
0876: break;
0877: }
0878: }
0879:
0880: return true;
0881: }
0882:
0883: if (doctype == null) {
0884: doctype = newNode(Node.DocTypeTag, this .lexbuf, 0, 0);
0885: doctype.next = root.content;
0886: doctype.parent = root;
0887: doctype.prev = null;
0888: root.content = doctype;
0889: }
0890:
0891: this .txtstart = this .lexsize;
0892: this .txtend = this .lexsize;
0893:
0894: /* use the appropriate public identifier */
0895: addStringLiteral("html PUBLIC ");
0896:
0897: if (configuration.docTypeMode == Configuration.DOCTYPE_USER
0898: && configuration.docTypeStr != null)
0899: addStringLiteral(configuration.docTypeStr);
0900: else if (guessed == Dict.VERS_HTML20)
0901: addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
0902: else {
0903: addStringLiteral("\"-//W3C//DTD ");
0904:
0905: for (i = 0; i < W3CVersion.length; ++i) {
0906: if (guessed == W3CVersion[i].code) {
0907: addStringLiteral(W3CVersion[i].name);
0908: break;
0909: }
0910: }
0911:
0912: addStringLiteral("//EN\"");
0913: }
0914:
0915: this .txtend = this .lexsize;
0916:
0917: doctype.start = this .txtstart;
0918: doctype.end = this .txtend;
0919:
0920: return true;
0921: }
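
/*
Example (illustrative): when the guessed version is HTML 2.0 and no
user doctype is configured, the code above stores the text
    html PUBLIC "-//IETF//DTD HTML 2.0//EN"
as the DocType node's content.
*/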
0922:
0923: /* ensure XML document starts with <?xml version="1.0"?> */
0924: public boolean fixXMLPI(Node root) {
0925: Node xml;
0926: int s;
0927:
0928: if (root.content != null
0929: && root.content.type == Node.ProcInsTag) {
0930: s = root.content.start;
0931:
0932: if (this .lexbuf[s] == (byte) 'x'
0933: && this .lexbuf[s + 1] == (byte) 'm'
0934: && this .lexbuf[s + 2] == (byte) 'l')
0935: return true;
0936: }
0937:
0938: xml = newNode(Node.ProcInsTag, this .lexbuf, 0, 0);
0939: xml.next = root.content;
0940:
0941: if (root.content != null) {
0942: root.content.prev = xml;
0943: xml.next = root.content;
0944: }
0945:
0946: root.content = xml;
0947:
0948: this .txtstart = this .lexsize;
0949: this .txtend = this .lexsize;
0950: addStringLiteral("xml version=\"1.0\"");
0951: if (this .configuration.CharEncoding == Configuration.LATIN1)
0952: addStringLiteral(" encoding=\"ISO-8859-1\"");
0953: this .txtend = this .lexsize;
0954:
0955: xml.start = this .txtstart;
0956: xml.end = this .txtend;
0957: return false;
0958: }
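
/*
Net effect (sketch): the inserted processing instruction node carries
the text xml version="1.0", plus encoding="ISO-8859-1" when the
configured character encoding is Latin-1, giving the document the XML
declaration described above.
*/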
0959:
0960: public Node inferredTag(String name) {
0961: Node node;
0962:
0963: node = newNode(Node.StartTag, this .lexbuf, this .txtstart,
0964: this .txtend, name);
0965: node.implicit = true;
0966: return node;
0967: }
0968:
0969: public static boolean expectsContent(Node node) {
0970: if (node.type != Node.StartTag)
0971: return false;
0972:
0973: /* unknown element? */
0974: if (node.tag == null)
0975: return true;
0976:
0977: if ((node.tag.model & Dict.CM_EMPTY) != 0)
0978: return false;
0979:
0980: return true;
0981: }
0982:
0983: /*
0984: create a text node for the contents of
0985: a CDATA element like style or script
0986: which ends with </foo> for some foo.
0987: */
0988: public Node getCDATA(Node container) {
0989: int c, lastc, start, len, i;
0990: String str;
0991: boolean endtag = false;
0992:
0993: this .lines = this .in.curline;
0994: this .columns = this .in.curcol;
0995: this .waswhite = false;
0996: this .txtstart = this .lexsize;
0997: this .txtend = this .lexsize;
0998:
0999: lastc = (int) '\0';
1000: start = -1;
1001:
1002: while (true) {
1003: c = this .in.readChar();
1004: if (c == StreamIn.EndOfStream)
1005: break;
1006: /* treat \r\n as \n and \r as \n */
1007:
1008: if (c == (int) '/' && lastc == (int) '<') {
1009: if (endtag) {
1010: this .lines = this .in.curline;
1011: this .columns = this .in.curcol - 3;
1012:
1013: Report.warning(this , null, null,
1014: Report.BAD_CDATA_CONTENT);
1015: }
1016:
1017: start = this .lexsize + 1; /* to first letter */
1018: endtag = true;
1019: } else if (c == (int) '>' && start >= 0) {
1020: len = this .lexsize - start;
1021: if (len == container.element.length()) {
1022: str = getString(this .lexbuf, start, len);
1023: if (Lexer.wstrcasecmp(str, container.element) == 0) {
1024: this .txtend = start - 2;
1025: break;
1026: }
1027: }
1028:
1029: this .lines = this .in.curline;
1030: this .columns = this .in.curcol - 3;
1031:
1032: Report.warning(this , null, null,
1033: Report.BAD_CDATA_CONTENT);
1034:
1035: /* if javascript insert backslash before / */
1036:
1037: if (ParserImpl.isJavaScript(container)) {
1038: for (i = this .lexsize; i > start - 1; --i)
1039: this .lexbuf[i] = this .lexbuf[i - 1];
1040:
1041: this .lexbuf[start - 1] = (byte) '\\';
1042: this .lexsize++;
1043: }
1044:
1045: start = -1;
1046: } else if (c == (int) '\r') {
1047: c = this .in.readChar();
1048:
1049: if (c != (int) '\n')
1050: this .in.ungetChar(c);
1051:
1052: c = (int) '\n';
1053: }
1054:
1055: addCharToLexer((int) c);
1056: this .txtend = this .lexsize;
1057: lastc = c;
1058: }
1059:
1060: if (c == StreamIn.EndOfStream)
1061: Report.warning(this , container, null,
1062: Report.MISSING_ENDTAG_FOR);
1063:
1064: if (this .txtend > this .txtstart) {
1065: this .token = newNode(Node.TextNode, this .lexbuf,
1066: this .txtstart, this .txtend);
1067: return this .token;
1068: }
1069:
1070: return null;
1071: }
1072:
1073: public void ungetToken() {
1074: this .pushed = true;
1075: }
1076:
1077: public static final short IgnoreWhitespace = 0;
1078: public static final short MixedContent = 1;
1079: public static final short Preformatted = 2;
1080: public static final short IgnoreMarkup = 3;
1081:
1082: /*
1083: modes for GetToken()
1084:
1085: MixedContent -- for elements which don't accept PCDATA
1086: Preformatted -- white space preserved as is
1087: IgnoreMarkup -- for CDATA elements such as script, style
1088: */
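
/*
Typical use (sketch only, assuming a configured Lexer instance named
lexer):

    Node t;
    while ((t = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
        // inspect t.type: TextNode, StartTag, EndTag, CommentTag, ...
    }

ungetToken() provides one level of undo, as noted in the header comment,
so the next getToken() call normally returns the same token again.
*/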
1089:
1090: public Node getToken(short mode) {
1091: short map;
1092: int c = 0;
1093: int lastc;
1094: int badcomment = 0;
1095: MutableBoolean isempty = new MutableBoolean();
1096: AttVal attributes;
1097:
1098: if (this .pushed) {
1099: /* duplicate inlines in preference to pushed text nodes when appropriate */
1100: if (this .token.type != Node.TextNode
1101: || (this .insert == -1 && this .inode == null)) {
1102: this .pushed = false;
1103: return this .token;
1104: }
1105: }
1106:
1107: /* at start of block elements, unclosed inline
1108: elements are inserted into the token stream */
1109:
1110: if (this .insert != -1 || this .inode != null)
1111: return insertedToken();
1112:
1113: this .lines = this .in.curline;
1114: this .columns = this .in.curcol;
1115: this .waswhite = false;
1116:
1117: this .txtstart = this .lexsize;
1118: this .txtend = this .lexsize;
1119:
1120: while (true) {
1121: c = this .in.readChar();
1122: if (c == StreamIn.EndOfStream)
1123: break;
1124: if (this .insertspace && mode != IgnoreWhitespace) {
1125: addCharToLexer(' ');
1126: this .waswhite = true;
1127: this .insertspace = false;
1128: }
1129:
1130: /* treat \r\n as \n and \r as \n */
1131:
1132: if (c == '\r') {
1133: c = this .in.readChar();
1134:
1135: if (c != '\n')
1136: this .in.ungetChar(c);
1137:
1138: c = '\n';
1139: }
1140:
1141: addCharToLexer(c);
1142:
1143: switch (this .state) {
1144: case LEX_CONTENT: /* element content */
1145: map = MAP((char) c);
1146:
1147: /*
1148: Discard white space if appropriate. It's cheaper
1149: to do this here rather than in parser methods
1150: for elements that don't have mixed content.
1151: */
1152: if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
1153: && this .lexsize == this .txtstart + 1) {
1154: --this .lexsize;
1155: this .waswhite = false;
1156: this .lines = this .in.curline;
1157: this .columns = this .in.curcol;
1158: continue;
1159: }
1160:
1161: if (c == '<') {
1162: this .state = LEX_GT;
1163: continue;
1164: }
1165:
1166: if ((map & WHITE) != 0) {
1167: /* was previous char white? */
1168: if (this .waswhite) {
1169: if (mode != Preformatted
1170: && mode != IgnoreMarkup) {
1171: --this .lexsize;
1172: this .lines = this .in.curline;
1173: this .columns = this .in.curcol;
1174: }
1175: } else /* prev char wasn't white */
1176: {
1177: this .waswhite = true;
1178: lastc = c;
1179:
1180: if (mode != Preformatted
1181: && mode != IgnoreMarkup && c != ' ')
1182: changeChar((byte) ' ');
1183: }
1184:
1185: continue;
1186: } else if (c == '&' && mode != IgnoreMarkup)
1187: parseEntity(mode);
1188:
1189: /* this is needed to avoid trimming trailing whitespace */
1190: if (mode == IgnoreWhitespace)
1191: mode = MixedContent;
1192:
1193: this .waswhite = false;
1194: continue;
1195:
1196: case LEX_GT: /* < */
1197:
1198: /* check for endtag */
1199: if (c == '/') {
1200: c = this .in.readChar();
1201: if (c == StreamIn.EndOfStream) {
1202: this .in.ungetChar(c);
1203: continue;
1204: }
1205:
1206: addCharToLexer(c);
1207: map = MAP((char) c);
1208:
1209: if ((map & LETTER) != 0) {
1210: this .lexsize -= 3;
1211: this .txtend = this .lexsize;
1212: this .in.ungetChar(c);
1213: this .state = LEX_ENDTAG;
1214: this .lexbuf[this .lexsize] = (byte) '\0'; /* debug */
1215: this .in.curcol -= 2;
1216:
1217: /* if some text before the </ return it now */
1218: if (this .txtend > this .txtstart) {
1219: /* trim space char before end tag */
1220: if (mode == IgnoreWhitespace
1221: && this .lexbuf[this .lexsize - 1] == (byte) ' ') {
1222: this .lexsize -= 1;
1223: this .txtend = this .lexsize;
1224: }
1225:
1226: this .token = newNode(Node.TextNode,
1227: this .lexbuf, this .txtstart,
1228: this .txtend);
1229: return this .token;
1230: }
1231:
1232: continue; /* no text so keep going */
1233: }
1234:
1235: /* otherwise treat as CDATA */
1236: this .waswhite = false;
1237: this .state = LEX_CONTENT;
1238: continue;
1239: }
1240:
1241: if (mode == IgnoreMarkup) {
1242: /* otherwise treat as CDATA */
1243: this .waswhite = false;
1244: this .state = LEX_CONTENT;
1245: continue;
1246: }
1247:
1248: /*
1249: look out for comments, doctype or marked sections
1250: this isn't quite right, but it's getting there ...
1251: */
1252: if (c == '!') {
1253: c = this .in.readChar();
1254:
1255: if (c == '-') {
1256: c = this .in.readChar();
1257:
1258: if (c == '-') {
1259: this .state = LEX_COMMENT; /* comment */
1260: this .lexsize -= 2;
1261: this .txtend = this .lexsize;
1262:
1263: /* if some text before < return it now */
1264: if (this .txtend > this .txtstart) {
1265: this .token = newNode(Node.TextNode,
1266: this .lexbuf, this .txtstart,
1267: this .txtend);
1268: return this .token;
1269: }
1270:
1271: this .txtstart = this .lexsize;
1272: continue;
1273: }
1274:
1275: Report.warning(this , null, null,
1276: Report.MALFORMED_COMMENT);
1277: } else if (c == 'd' || c == 'D') {
1278: this .state = LEX_DOCTYPE; /* doctype */
1279: this .lexsize -= 2;
1280: this .txtend = this .lexsize;
1281: mode = IgnoreWhitespace;
1282:
1283: /* skip until white space or '>' */
1284:
1285: for (;;) {
1286: c = this .in.readChar();
1287:
1288: if (c == StreamIn.EndOfStream || c == '>') {
1289: this .in.ungetChar(c);
1290: break;
1291: }
1292:
1293: map = MAP((char) c);
1294:
1295: if ((map & WHITE) == 0)
1296: continue;
1297:
1298: /* and skip to end of whitespace */
1299:
1300: for (;;) {
1301: c = this .in.readChar();
1302:
1303: if (c == StreamIn.EndOfStream
1304: || c == '>') {
1305: this .in.ungetChar(c);
1306: break;
1307: }
1308:
1309: map = MAP((char) c);
1310:
1311: if ((map & WHITE) != 0)
1312: continue;
1313:
1314: this .in.ungetChar(c);
1315: break;
1316: }
1317:
1318: break;
1319: }
1320:
1321: /* if some text before < return it now */
1322: if (this .txtend > this .txtstart) {
1323: this .token = newNode(Node.TextNode,
1324: this .lexbuf, this .txtstart,
1325: this .txtend);
1326: return this .token;
1327: }
1328:
1329: this .txtstart = this .lexsize;
1330: continue;
1331: } else if (c == '[') {
1332: /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
1333: this .lexsize -= 2;
1334: this .state = LEX_SECTION;
1335: this .txtend = this .lexsize;
1336:
1337: /* if some text before < return it now */
1338: if (this .txtend > this .txtstart) {
1339: this .token = newNode(Node.TextNode,
1340: this .lexbuf, this .txtstart,
1341: this .txtend);
1342: return this .token;
1343: }
1344:
1345: this .txtstart = this .lexsize;
1346: continue;
1347: }
1348:
1349: /* otherwise swallow chars up to and including next '>' */
1350: while (true) {
1351: c = this .in.readChar();
1352: if (c == '>')
1353: break;
1354: if (c == -1) {
1355: this .in.ungetChar(c);
1356: break;
1357: }
1358: }
1359:
1360: this .lexsize -= 2;
1361: this .lexbuf[this .lexsize] = (byte) '\0';
1362: this .state = LEX_CONTENT;
1363: continue;
1364: }
1365:
1366: /*
1367: processing instructions
1368: */
1369:
1370: if (c == '?') {
1371: this .lexsize -= 2;
1372: this .state = LEX_PROCINSTR;
1373: this .txtend = this .lexsize;
1374:
1375: /* if some text before < return it now */
1376: if (this .txtend > this .txtstart) {
1377: this .token = newNode(Node.TextNode,
1378: this .lexbuf, this .txtstart, this .txtend);
1379: return this .token;
1380: }
1381:
1382: this .txtstart = this .lexsize;
1383: continue;
1384: }
1385:
1386: /* Microsoft ASP, e.g. <% ... server-code ... %> */
1387: if (c == '%') {
1388: this .lexsize -= 2;
1389: this .state = LEX_ASP;
1390: this .txtend = this .lexsize;
1391:
1392: /* if some text before < return it now */
1393: if (this .txtend > this .txtstart) {
1394: this .token = newNode(Node.TextNode,
1395: this .lexbuf, this .txtstart, this .txtend);
1396: return this .token;
1397: }
1398:
1399: this .txtstart = this .lexsize;
1400: continue;
1401: }
1402:
1403: /* Netscape's JSTE, e.g. <# ... server-code ... #> */
1404: if (c == '#') {
1405: this .lexsize -= 2;
1406: this .state = LEX_JSTE;
1407: this .txtend = this .lexsize;
1408:
1409: /* if some text before < return it now */
1410: if (this .txtend > this .txtstart) {
1411: this .token = newNode(Node.TextNode,
1412: this .lexbuf, this .txtstart, this .txtend);
1413: return this .token;
1414: }
1415:
1416: this .txtstart = this .lexsize;
1417: continue;
1418: }
1419:
1420: map = MAP((char) c);
1421:
1422: /* check for start tag */
1423: if ((map & LETTER) != 0) {
1424: this .in.ungetChar(c); /* push back letter */
1425: this .lexsize -= 2; /* discard "<" + letter */
1426: this .txtend = this .lexsize;
1427: this .state = LEX_STARTTAG; /* ready to read tag name */
1428:
1429: /* if some text before < return it now */
1430: if (this .txtend > this .txtstart) {
1431: this .token = newNode(Node.TextNode,
1432: this .lexbuf, this .txtstart, this .txtend);
1433: return this .token;
1434: }
1435:
1436: continue; /* no text so keep going */
1437: }
1438:
1439: /* otherwise treat as CDATA */
1440: this .state = LEX_CONTENT;
1441: this .waswhite = false;
1442: continue;
1443:
1444: case LEX_ENDTAG: /* </letter */
1445: this .txtstart = this .lexsize - 1;
1446: this .in.curcol += 2;
1447: c = parseTagName();
1448: this .token = newNode(Node.EndTag, /* create endtag token */
1449: this .lexbuf, this .txtstart, this .txtend, getString(
1450: this .lexbuf, this .txtstart, this .txtend
1451: - this .txtstart));
1452: this .lexsize = this .txtstart;
1453: this .txtend = this .txtstart;
1454:
1455: /* skip to '>' */
1456: while (c != '>') {
1457: c = this .in.readChar();
1458:
1459: if (c == StreamIn.EndOfStream)
1460: break;
1461: }
1462:
1463: if (c == StreamIn.EndOfStream) {
1464: this .in.ungetChar(c);
1465: continue;
1466: }
1467:
1468: this .state = LEX_CONTENT;
1469: this .waswhite = false;
1470: return this .token; /* the endtag token */
1471:
1472: case LEX_STARTTAG: /* first letter of tagname */
1473: this .txtstart = this .lexsize - 1; /* set txtstart to first letter */
1474: c = parseTagName();
1475: isempty.value = false;
1476: attributes = null;
1477: this .token = newNode((isempty.value ? Node.StartEndTag
1478: : Node.StartTag), this .lexbuf, this .txtstart,
1479: this .txtend, getString(this .lexbuf,
1480: this .txtstart, this .txtend
1481: - this .txtstart));
1482:
1483: /* parse attributes, consuming closing ">" */
1484: if (c != '>') {
1485: if (c == '/')
1486: this .in.ungetChar(c);
1487:
1488: attributes = parseAttrs(isempty);
1489: }
1490:
1491: if (isempty.value)
1492: this .token.type = Node.StartEndTag;
1493:
1494: this .token.attributes = attributes;
1495: this .lexsize = this .txtstart;
1496: this .txtend = this .txtstart;
1497:
1498: /* swallow newline following start tag */
1499: /* special check needed for CRLF sequence */
1500: /* this doesn't apply to empty elements */
1501:
1502: if (expectsContent(this .token)
1503: || this .token.tag == configuration.tt.tagBr) {
1504:
1505: c = this .in.readChar();
1506:
1507: if (c == '\r') {
1508: c = this .in.readChar();
1509:
1510: if (c != '\n')
1511: this .in.ungetChar(c);
1512: } else if (c != '\n' && c != '\f')
1513: this .in.ungetChar(c);
1514:
1515: this .waswhite = true; /* to swallow leading whitespace */
1516: } else
1517: this .waswhite = false;
1518:
1519: this .state = LEX_CONTENT;
1520:
1521: if (this .token.tag == null)
1522: Report.error(this , null, this .token,
1523: Report.UNKNOWN_ELEMENT);
1524: else if (!this .configuration.XmlTags) {
1525: this .versions &= this .token.tag.versions;
1526:
1527: if ((this .token.tag.versions & Dict.VERS_PROPRIETARY) != 0) {
1528: if (!this .configuration.MakeClean
1529: && (this .token.tag == configuration.tt.tagNobr || this .token.tag == configuration.tt.tagWbr))
1530: Report.warning(this , null, this .token,
1531: Report.PROPRIETARY_ELEMENT);
1532: }
1533:
1534: if (this .token.tag.chkattrs != null) {
1535: this .token.checkUniqueAttributes(this );
1536: this .token.tag.chkattrs.check(this , this .token);
1537: } else
1538: this .token.checkAttributes(this );
1539: }
1540:
1541: return this .token; /* return start tag */
1542:
1543: case LEX_COMMENT: /* seen <!-- so look for --> */
1544:
1545: if (c != '-')
1546: continue;
1547:
1548: c = this .in.readChar();
1549: addCharToLexer(c);
1550:
1551: if (c != '-')
1552: continue;
1553:
1554: end_comment: while (true) {
1555: c = this .in.readChar();
1556:
1557: if (c == '>') {
1558: if (badcomment != 0)
1559: Report.warning(this , null, null,
1560: Report.MALFORMED_COMMENT);
1561:
1562: this .txtend = this .lexsize - 2; // AQ 8Jul2000
1563: this .lexbuf[this .lexsize] = (byte) '\0';
1564: this .state = LEX_CONTENT;
1565: this .waswhite = false;
1566: this .token = newNode(Node.CommentTag,
1567: this .lexbuf, this .txtstart, this .txtend);
1568:
1569: /* now look for a line break */
1570:
1571: c = this .in.readChar();
1572:
1573: if (c == '\r') {
1574: c = this .in.readChar();
1575:
1576: if (c != '\n')
1577: this .token.linebreak = true;
1578: }
1579:
1580: if (c == '\n')
1581: this .token.linebreak = true;
1582: else
1583: this .in.ungetChar(c);
1584:
1585: return this .token;
1586: }
1587:
1588: /* note position of first such error in the comment */
1589: if (badcomment == 0) {
1590: this .lines = this .in.curline;
1591: this .columns = this .in.curcol - 3;
1592: }
1593:
1594: badcomment++;
1595: if (this .configuration.FixComments)
1596: this .lexbuf[this .lexsize - 2] = (byte) '=';
1597:
1598: addCharToLexer(c);
1599:
1600: /* if '-' then look for '>' to end the comment */
1601: if (c != '-')
1602: break end_comment;
1603:
1604: }
1605: /* otherwise continue to look for --> */
1606: this .lexbuf[this .lexsize - 2] = (byte) '=';
1607: continue;
1608:
1609: case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
1610: map = MAP((char) c);
1611:
1612: if ((map & WHITE) != 0) {
1613: if (this .waswhite)
1614: this .lexsize -= 1;
1615:
1616: this .waswhite = true;
1617: } else
1618: this .waswhite = false;
1619:
1620: if (c != '>')
1621: continue;
1622:
1623: this .lexsize -= 1;
1624: this .txtend = this .lexsize;
1625: this .lexbuf[this .lexsize] = (byte) '\0';
1626: this .state = LEX_CONTENT;
1627: this .waswhite = false;
1628: this .token = newNode(Node.DocTypeTag, this .lexbuf,
1629: this .txtstart, this .txtend);
1630: /* make a note of the version named by the doctype */
1631: this .doctype = findGivenVersion(this .token);
1632: return this .token;
1633:
1634: case LEX_PROCINSTR: /* seen <? so look for '>' */
1635: /* check for PHP preprocessor instructions <?php ... ?> */
1636:
1637: if (this .lexsize - this .txtstart == 3) {
1638: if ((getString(this .lexbuf, this .txtstart, 3))
1639: .equals("php")) {
1640: this .state = LEX_PHP;
1641: continue;
1642: }
1643: }
1644:
1645: if (this .configuration.XmlPIs) /* insist on ?> as terminator */
1646: {
1647: if (c != '?')
1648: continue;
1649:
1650: /* now look for '>' */
1651: c = this .in.readChar();
1652:
1653: if (c == StreamIn.EndOfStream) {
1654: Report.warning(this , null, null,
1655: Report.UNEXPECTED_END_OF_FILE);
1656: this .in.ungetChar(c);
1657: continue;
1658: }
1659:
1660: addCharToLexer(c);
1661: }
1662:
1663: if (c != '>')
1664: continue;
1665:
1666: this .lexsize -= 1;
1667: this .txtend = this .lexsize;
1668: this .lexbuf[this .lexsize] = (byte) '\0';
1669: this .state = LEX_CONTENT;
1670: this .waswhite = false;
1671: this .token = newNode(Node.ProcInsTag, this .lexbuf,
1672: this .txtstart, this .txtend);
1673: return this .token;
1674:
1675: case LEX_ASP: /* seen <% so look for "%>" */
1676: if (c != '%')
1677: continue;
1678:
1679: /* now look for '>' */
1680: c = this .in.readChar();
1681:
1682: if (c != '>') {
1683: this .in.ungetChar(c);
1684: continue;
1685: }
1686:
1687: this .lexsize -= 1;
1688: this .txtend = this .lexsize;
1689: this .lexbuf[this .lexsize] = (byte) '\0';
1690: this .state = LEX_CONTENT;
1691: this .waswhite = false;
1692: this .token = newNode(Node.AspTag, this .lexbuf,
1693: this .txtstart, this .txtend);
1694: return this .token;
1695:
1696: case LEX_JSTE: /* seen <# so look for "#>" */
1697: if (c != '#')
1698: continue;
1699:
1700: /* now look for '>' */
1701: c = this .in.readChar();
1702:
1703: if (c != '>') {
1704: this .in.ungetChar(c);
1705: continue;
1706: }
1707:
1708: this .lexsize -= 1;
1709: this .txtend = this .lexsize;
1710: this .lexbuf[this .lexsize] = (byte) '\0';
1711: this .state = LEX_CONTENT;
1712: this .waswhite = false;
1713: this .token = newNode(Node.JsteTag, this .lexbuf,
1714: this .txtstart, this .txtend);
1715: return this .token;
1716:
1717: case LEX_PHP: /* seen "<?php" so look for "?>" */
1718: if (c != '?')
1719: continue;
1720:
1721: /* now look for '>' */
1722: c = this .in.readChar();
1723:
1724: if (c != '>') {
1725: this .in.ungetChar(c);
1726: continue;
1727: }
1728:
1729: this .lexsize -= 1;
1730: this .txtend = this .lexsize;
1731: this .lexbuf[this .lexsize] = (byte) '\0';
1732: this .state = LEX_CONTENT;
1733: this .waswhite = false;
1734: this .token = newNode(Node.PhpTag, this .lexbuf,
1735: this .txtstart, this .txtend);
1736: return this .token;
1737:
1738: case LEX_SECTION: /* seen "<![" so look for "]>" */
1739: if (c == '[') {
1740: if (this .lexsize == (this .txtstart + 6)
1741: && (getString(this .lexbuf, this .txtstart, 6))
1742: .equals("CDATA[")) {
1743: this .state = LEX_CDATA;
1744: this .lexsize -= 6;
1745: continue;
1746: }
1747: }
1748:
1749: if (c != ']')
1750: continue;
1751:
1752: /* now look for '>' */
1753: c = this .in.readChar();
1754:
1755: if (c != '>') {
1756: this .in.ungetChar(c);
1757: continue;
1758: }
1759:
1760: this .lexsize -= 1;
1761: this .txtend = this .lexsize;
1762: this .lexbuf[this .lexsize] = (byte) '\0';
1763: this .state = LEX_CONTENT;
1764: this .waswhite = false;
1765: this .token = newNode(Node.SectionTag, this .lexbuf,
1766: this .txtstart, this .txtend);
1767: return this .token;
1768:
1769: case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
1770: if (c != ']')
1771: continue;
1772:
1773: /* now look for ']' */
1774: c = this .in.readChar();
1775:
1776: if (c != ']') {
1777: this .in.ungetChar(c);
1778: continue;
1779: }
1780:
1781: /* now look for '>' */
1782: c = this .in.readChar();
1783:
1784: if (c != '>') {
1785: this .in.ungetChar(c);
1786: continue;
1787: }
1788:
1789: this .lexsize -= 1;
1790: this .txtend = this .lexsize;
1791: this .lexbuf[this .lexsize] = (byte) '\0';
1792: this .state = LEX_CONTENT;
1793: this .waswhite = false;
1794: this .token = newNode(Node.CDATATag, this .lexbuf,
1795: this .txtstart, this .txtend);
1796: return this .token;
1797: }
1798: }
1799:
1800: if (this .state == LEX_CONTENT) /* text string */
1801: {
1802: this .txtend = this .lexsize;
1803:
1804: if (this .txtend > this .txtstart) {
1805: this .in.ungetChar(c);
1806:
1807: if (this .lexbuf[this .lexsize - 1] == (byte) ' ') {
1808: this .lexsize -= 1;
1809: this .txtend = this .lexsize;
1810: }
1811:
1812: this .token = newNode(Node.TextNode, this .lexbuf,
1813: this .txtstart, this .txtend);
1814: return this .token;
1815: }
1816: } else if (this .state == LEX_COMMENT) /* comment */
1817: {
1818: if (c == StreamIn.EndOfStream)
1819: Report.warning(this , null, null,
1820: Report.MALFORMED_COMMENT);
1821:
1822: this .txtend = this .lexsize;
1823: this .lexbuf[this .lexsize] = (byte) '\0';
1824: this .state = LEX_CONTENT;
1825: this .waswhite = false;
1826: this .token = newNode(Node.CommentTag, this .lexbuf,
1827: this .txtstart, this .txtend);
1828: return this .token;
1829: }
1830:
1831: return null;
1832: }
1833:
1834: /*
1835: parser for ASP within start tags
1836:
1837: Some people use ASP to customize attributes.
1838: Tidy isn't really well suited to dealing with ASP.
1839: This is a workaround for attributes, but won't
1840: deal with the case where the ASP is used to tailor
1841: the attribute value. Here is an example of a
1842: workaround for using ASP in attribute values:
1843:
1844: href="<%=rsSchool.Fields("ID").Value%>"
1845:
1846: where the ASP that generates the attribute value
1847: is masked from Tidy by the quotemarks.
1848:
1849: */
1850:
1851: public Node parseAsp() {
1852: int c;
1853: Node asp = null;
1854:
1855: this .txtstart = this .lexsize;
1856:
1857: for (;;) {
1858: c = this .in.readChar();
1859: addCharToLexer(c);
1860:
1861: if (c != '%')
1862: continue;
1863:
1864: c = this .in.readChar();
1865: addCharToLexer(c);
1866:
1867: if (c == '>')
1868: break;
1869: }
1870:
1871: this .lexsize -= 2;
1872: this .txtend = this .lexsize;
1873:
1874: if (this .txtend > this .txtstart)
1875: asp = newNode(Node.AspTag, this .lexbuf, this .txtstart,
1876: this .txtend);
1877:
1878: this .txtstart = this .txtend;
1879: return asp;
1880: }
1881:
1882: /*
1883: PHP is like ASP but is based upon XML
1884: processing instructions, e.g. <?php ... ?>
1885: */
1886: public Node parsePhp() {
1887: int c;
1888: Node php = null;
1889:
1890: this .txtstart = this .lexsize;
1891:
1892: for (;;) {
1893: c = this .in.readChar();
1894: addCharToLexer(c);
1895:
1896: if (c != '?')
1897: continue;
1898:
1899: c = this .in.readChar();
1900: addCharToLexer(c);
1901:
1902: if (c == '>')
1903: break;
1904: }
1905:
1906: this .lexsize -= 2;
1907: this .txtend = this .lexsize;
1908:
1909: if (this .txtend > this .txtstart)
1910: php = newNode(Node.PhpTag, this .lexbuf, this .txtstart,
1911: this .txtend);
1912:
1913: this .txtstart = this .txtend;
1914: return php;
1915: }
1916:
1917: /* consumes the '>' terminating start tags */
1918: public String parseAttribute(MutableBoolean isempty,
1919: MutableObject asp, MutableObject php) {
1920: int start = 0;
1921: // int len = 0; Removed by BUGFIX for 126265
1922: short map;
1923: String attr;
1924: int c = 0;
1925:
1926: asp.setObject(null); /* clear asp pointer */
1927: php.setObject(null); /* clear php pointer */
1928: /* skip white space before the attribute */
1929:
1930: for (;;) {
1931: c = this .in.readChar();
1932:
1933: if (c == '/') {
1934: c = this .in.readChar();
1935:
1936: if (c == '>') {
1937: isempty.value = true;
1938: return null;
1939: }
1940:
1941: this .in.ungetChar(c);
1942: c = '/';
1943: break;
1944: }
1945:
1946: if (c == '>')
1947: return null;
1948:
1949: if (c == '<') {
1950: c = this .in.readChar();
1951:
1952: if (c == '%') {
1953: asp.setObject(parseAsp());
1954: return null;
1955: } else if (c == '?') {
1956: php.setObject(parsePhp());
1957: return null;
1958: }
1959:
1960: this .in.ungetChar(c);
1961: Report.attrError(this , this .token, null,
1962: Report.UNEXPECTED_GT);
1963: return null;
1964: }
1965:
1966: if (c == '"' || c == '\'') {
1967: Report.attrError(this , this .token, null,
1968: Report.UNEXPECTED_QUOTEMARK);
1969: continue;
1970: }
1971:
1972: if (c == StreamIn.EndOfStream) {
1973: Report.attrError(this , this .token, null,
1974: Report.UNEXPECTED_END_OF_FILE);
1975: this .in.ungetChar(c);
1976: return null;
1977: }
1978:
1979: map = MAP((char) c);
1980:
1981: if ((map & WHITE) == 0)
1982: break;
1983: }
1984:
1985: start = this .lexsize;
1986:
1987: for (;;) {
1988: /* but push back '=' for parseValue() */
1989: if (c == '=' || c == '>') {
1990: this .in.ungetChar(c);
1991: break;
1992: }
1993:
1994: if (c == '<' || c == StreamIn.EndOfStream) {
1995: this .in.ungetChar(c);
1996: break;
1997: }
1998:
1999: map = MAP((char) c);
2000:
2001: if ((map & WHITE) != 0)
2002: break;
2003:
2004: /* what should be done about non-namechar characters? */
2005: /* currently these are incorporated into the attr name */
2006:
2007: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0)
2008: c += (int) ('a' - 'A');
2009:
2010: // ++len; Removed by BUGFIX for 126265
2011: addCharToLexer(c);
2012:
2013: c = this .in.readChar();
2014: }
2015:
2016: // Following line added by GLP to fix BUG 126265. This is a temporary comment
2017: // and should be removed when Tidy is fixed.
2018: int len = this .lexsize - start;
2019: attr = (len > 0 ? getString(this .lexbuf, start, len) : null);
2020: this .lexsize = start;
2021:
2022: return attr;
2023: }
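
/*
Example (illustrative): given the remaining input ' HREF = "x.html">',
parseAttribute() returns "href" (folded to lower case unless XmlTags is
set) and pushes the '=' back so that parseValue() can read the value.
*/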
2024:
2025: /*
2026: invoked when '<' is seen in place of an attribute value,
2027: but terminates on whitespace if not ASP, PHP or Tango;
2028: this routine recognizes ' and " quoted strings
2029: */
2030: public int parseServerInstruction() {
2031: int c, map, delim = '"';
2032: boolean isrule = false;
2033:
2034: c = this .in.readChar();
2035: addCharToLexer(c);
2036:
2037: /* check for ASP, PHP or Tango */
2038: if (c == '%' || c == '?' || c == '@')
2039: isrule = true;
2040:
2041: for (;;) {
2042: c = this .in.readChar();
2043:
2044: if (c == StreamIn.EndOfStream)
2045: break;
2046:
2047: if (c == '>') {
2048: if (isrule)
2049: addCharToLexer(c);
2050: else
2051: this .in.ungetChar(c);
2052:
2053: break;
2054: }
2055:
2056: /* if not recognized as ASP, PHP or Tango */
2057: /* then also finish value on whitespace */
2058: if (!isrule) {
2059: map = MAP((char) c);
2060:
2061: if ((map & WHITE) != 0)
2062: break;
2063: }
2064:
2065: addCharToLexer(c);
2066:
2067: if (c == '"') {
2068: do {
2069: c = this .in.readChar();
2070: addCharToLexer(c);
2071: } while (c != '"');
2072: delim = '\'';
2073: continue;
2074: }
2075:
2076: if (c == '\'') {
2077: do {
2078: c = this .in.readChar();
2079: addCharToLexer(c);
2080: } while (c != '\'');
2081: }
2082: }
2083:
2084: return delim;
2085: }
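/*
 Example (informal): for a value written as <%=user.name%> the whole
 instruction, including the closing '>', is copied into the lexer buffer.
 The return value is the quote mark the caller should later wrap around
 the attribute value: '\'' if the instruction itself contained a '"',
 otherwise the default '"'.
*/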
2086:
2087: /* values start with "=" or " = " etc. */
2088: /* doesn't consume the ">" at end of start tag */
2089:
2090: public String parseValue(String name, boolean foldCase,
2091: MutableBoolean isempty, MutableInteger pdelim) {
2092: int len = 0;
2093: int start;
2094: short map;
2095: boolean seen_gt = false;
2096: boolean munge = true;
2097: int c = 0;
2098: int lastc, delim, quotewarning;
2099: String value;
2100:
2101: delim = 0;
2102: pdelim.value = (int) '"';
2103:
2104: /*
2105: Henry Zrepa reports that some folk are using the
2106: embed element with script attributes where newlines
2107: are significant and must be preserved
2108: */
2109: if (configuration.LiteralAttribs)
2110: munge = false;
2111:
2112: /* skip white space before the '=' */
2113:
2114: for (;;) {
2115: c = this .in.readChar();
2116:
2117: if (c == StreamIn.EndOfStream) {
2118: this .in.ungetChar(c);
2119: break;
2120: }
2121:
2122: map = MAP((char) c);
2123:
2124: if ((map & WHITE) == 0)
2125: break;
2126: }
2127:
2128: /*
2129: c should be '=' if there is a value
2130: other legal possibilities are white
2131: space, '/' and '>'
2132: */
2133:
2134: if (c != '=') {
2135: this .in.ungetChar(c);
2136: return null;
2137: }
2138:
2139: /* skip white space after '=' */
2140:
2141: for (;;) {
2142: c = this .in.readChar();
2143:
2144: if (c == StreamIn.EndOfStream) {
2145: this .in.ungetChar(c);
2146: break;
2147: }
2148:
2149: map = MAP((char) c);
2150:
2151: if ((map & WHITE) == 0)
2152: break;
2153: }
2154:
2155: /* check for quote marks */
2156:
2157: if (c == '"' || c == '\'')
2158: delim = c;
2159: else if (c == '<') {
2160: start = this .lexsize;
2161: addCharToLexer(c);
2162: pdelim.value = parseServerInstruction();
2163: len = this .lexsize - start;
2164: this .lexsize = start;
2165: return (len > 0 ? getString(this .lexbuf, start, len) : null);
2166: } else
2167: this .in.ungetChar(c);
2168:
2169: /*
2170: and read the value string
2171: check for quote mark if needed
2172: */
2173:
2174: quotewarning = 0;
2175: start = this .lexsize;
2176: c = '\0';
2177:
2178: for (;;) {
2179: lastc = c; /* track last character */
2180: c = this .in.readChar();
2181:
2182: if (c == StreamIn.EndOfStream) {
2183: Report.attrError(this , this .token, null,
2184: Report.UNEXPECTED_END_OF_FILE);
2185: this .in.ungetChar(c);
2186: break;
2187: }
2188:
2189: if (delim == (char) 0) {
2190: if (c == '>') {
2191: this .in.ungetChar(c);
2192: break;
2193: }
2194:
2195: if (c == '"' || c == '\'') {
2196: Report.attrError(this , this .token, null,
2197: Report.UNEXPECTED_QUOTEMARK);
2198: break;
2199: }
2200:
2201: if (c == '<') {
2202: /* this.in.ungetChar(c); */
2203: Report.attrError(this , this .token, null,
2204: Report.UNEXPECTED_GT);
2205: /* break; */
2206: }
2207:
2208: /*
2209: For cases like <br clear=all/> need to avoid treating /> as
2210: part of the attribute value, however care is needed to avoid
2211: so treating <a href=http://www.acme.com/> in this way, which
2212: would map the <a> tag to <a href="http://www.acme.com"/>
2213: */
2214: if (c == '/') {
2215: /* peek ahead in case of /> */
2216: c = this .in.readChar();
2217:
2218: if (c == '>'
2219: && !AttributeTable.getDefaultAttributeTable().isUrl(name)) {
2222: isempty.value = true;
2223: this .in.ungetChar(c);
2224: break;
2225: }
2226:
2227: /* unget peeked char */
2228: this .in.ungetChar(c);
2229: c = '/';
2230: }
2231: } else /* delim is '\'' or '"' */
2232: {
2233: if (c == delim)
2234: break;
2235:
2236: /* treat CRLF, CR and LF as single line break */
2237:
2238: if (c == '\r') {
2239: c = this .in.readChar();
2240: if (c != '\n')
2241: this .in.ungetChar(c);
2242:
2243: c = '\n';
2244: }
2245:
2246: if (c == '\n' || c == '<' || c == '>')
2247: ++quotewarning;
2248:
2249: if (c == '>')
2250: seen_gt = true;
2251: }
2252:
2253: if (c == '&') {
2254: addCharToLexer(c);
2255: parseEntity((short) 0);
2256: continue;
2257: }
2258:
2259: /*
2260: kludge for JavaScript attribute values
2261: with line continuations in string literals
2262: */
2263: if (c == '\\') {
2264: c = this .in.readChar();
2265:
2266: if (c != '\n') {
2267: this .in.ungetChar(c);
2268: c = '\\';
2269: }
2270: }
2271:
2272: map = MAP((char) c);
2273:
2274: if ((map & WHITE) != 0) {
2275: if (delim == (char) 0)
2276: break;
2277:
2278: if (munge) {
2279: c = ' ';
2280:
2281: if (lastc == ' ')
2282: continue;
2283: }
2284: } else if (foldCase && (map & UPPERCASE) != 0)
2285: c += (int) ('a' - 'A');
2286:
2287: addCharToLexer(c);
2288: }
2289:
2290: if (quotewarning > 10 && seen_gt && munge) {
2291: /*
2292: there is almost certainly a missing trailing quote mark,
2293: as we have seen too many newlines, '<' or '>' characters.
2294:
2295: an exception is made for Javascript attributes and the
2296: javascript URL scheme which may legitimately include < and >
2297: */
2298: if (!AttributeTable.getDefaultAttributeTable().isScript(name)
2299: && !(AttributeTable.getDefaultAttributeTable().isUrl(name)
2300: && getString(this .lexbuf, start, 11).equals("javascript:")))
2303: Report.error(this , null, null,
2304: Report.SUSPECTED_MISSING_QUOTE);
2305: }
2306:
2307: len = this .lexsize - start;
2308: this .lexsize = start;
2309:
2310: if (len > 0 || delim != 0)
2311: value = getString(this .lexbuf, start, len);
2312: else
2313: value = null;
2314:
2315: /* note delimiter if given */
2316: if (delim != 0)
2317: pdelim.value = delim;
2318: else
2319: pdelim.value = (int) '"';
2320:
2321: return value;
2322: }
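/*
 Example (informal): alt="a > b" is returned verbatim with pdelim set to '"',
 since '<' and '>' are legal inside a quoted value (they only feed the
 missing-quote heuristic above); an unquoted value such as width=100 ends at
 white space or '>'; and for clear=all/> the trailing "/>" is not swallowed
 into the value, isempty is set instead (URL attributes keep a trailing '/').
*/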
2323:
2324: /* attr must be non-null */
2325: public static boolean isValidAttrName(String attr) {
2326: short map;
2327: char c;
2328: int i;
2329:
2330: /* first character should be a letter */
2331: c = attr.charAt(0);
2332: map = MAP(c);
2333:
2334: if (!((map & LETTER) != 0))
2335: return false;
2336:
2337: /* remaining characters should be namechars */
2338: for (i = 1; i < attr.length(); i++) {
2339: c = attr.charAt(i);
2340: map = MAP(c);
2341:
2342: if ((map & NAMECHAR) != 0)
2343: continue;
2344:
2345: return false;
2346: }
2347:
2348: return true;
2349: }
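/*
 Example (informal): "xml:lang", "data-x" and "size2" pass (a letter followed
 by letters, digits or "-.:_"); "2cols" and "-foo" fail because the first
 character must be a letter.
*/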
2350:
2351: /* swallows closing '>' */
2352:
2353: public AttVal parseAttrs(MutableBoolean isempty) {
2354: AttVal av, list;
2355: String attribute, value;
2356: MutableInteger delim = new MutableInteger();
2357: MutableObject asp = new MutableObject();
2358: MutableObject php = new MutableObject();
2359:
2360: list = null;
2361:
2362: for (; !endOfInput();) {
2363: attribute = parseAttribute(isempty, asp, php);
2364:
2365: if (attribute == null) {
2366: /* check if attributes are created by ASP markup */
2367: if (asp.getObject() != null) {
2368: av = new AttVal(list, null, (Node) asp.getObject(),
2369: null, '\0', null, null);
2370: list = av;
2371: continue;
2372: }
2373:
2374: /* check if attributes are created by PHP markup */
2375: if (php.getObject() != null) {
2376: av = new AttVal(list, null, null,
2377: (Node) php.getObject(), '\0', null, null);
2378: list = av;
2379: continue;
2380: }
2381:
2382: break;
2383: }
2384:
2385: value = parseValue(attribute, false, isempty, delim);
2386:
2387: if (attribute != null && isValidAttrName(attribute)) {
2388: av = new AttVal(list, null, null, null, delim.value,
2389: attribute, value);
2390: av.dict = AttributeTable.getDefaultAttributeTable()
2391: .findAttribute(av);
2392: list = av;
2393: } else {
2394: av = new AttVal(null, null, null, null, 0, attribute,
2395: value);
2396: Report.attrError(this , this .token, value,
2397: Report.BAD_ATTRIBUTE_VALUE);
2398: }
2399: }
2400:
2401: return list;
2402: }
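/*
 A minimal sketch of consuming the result (hypothetical caller code, assuming
 the AttVal fields next/attribute/value taken by its constructor above); note
 that each new AttVal is linked in front of the previous one, so the head of
 the returned list is the attribute that appeared last in the tag:

     for (AttVal av = lexer.parseAttrs(isempty); av != null; av = av.next)
         System.out.println(av.attribute + "=" + av.value);
*/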
2403:
2404: /*
2405: push a copy of an inline node onto stack
2406: but don't push if implicit or OBJECT or APPLET
2407: (implicit tags are ones generated from the istack)
2408:
2409: One issue arises with pushing inlines when
2410: the tag is already pushed. For instance:
2411:
2412: <p><em>text
2413: <p><em>more text
2414:
2415: Shouldn't be mapped to
2416:
2417: <p><em>text</em></p>
2418: <p><em><em>more text</em></em>
2419: */
2420: public void pushInline(Node node) {
2421: IStack is;
2422:
2423: if (node.implicit)
2424: return;
2425:
2426: if (node.tag == null)
2427: return;
2428:
2429: if ((node.tag.model & Dict.CM_INLINE) == 0)
2430: return;
2431:
2432: if ((node.tag.model & Dict.CM_OBJECT) != 0)
2433: return;
2434:
2435: if (node.tag != configuration.tt.tagFont && isPushed(node))
2436: return;
2437:
2438: // record the tag, element name and a copy of the attributes on the inline stack
2439: is = new IStack();
2440: is.tag = node.tag;
2441: is.element = node.element;
2442: if (node.attributes != null)
2443: is.attributes = cloneAttributes(node.attributes);
2444: this .istack.push(is);
2445: }
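/*
 Example (informal): after lexing <p><em>, pushInline records the em element
 (its tag and a copy of its attributes) on istack so that a later
 inlineDup()/insertedToken() pass can re-open <em> inside the next block;
 a second <em> is not pushed again (see isPushed below), but font elements
 are always pushed so that nested size/color changes can be replayed.
*/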
2446:
2447: /* pop inline stack */
2448: public void popInline(Node node) {
2449: AttVal av;
2450: IStack is;
2451:
2452: if (node != null) {
2453:
2454: if (node.tag == null)
2455: return;
2456:
2457: if ((node.tag.model & Dict.CM_INLINE) == 0)
2458: return;
2459:
2460: if ((node.tag.model & Dict.CM_OBJECT) != 0)
2461: return;
2462:
2463: // if node is </a> then pop until we find an <a>
2464: if (node.tag == configuration.tt.tagA) {
2465:
2466: while (this .istack.size() > 0) {
2467: is = (IStack) this .istack.pop();
2468: if (is.tag == configuration.tt.tagA) {
2469: break;
2470: }
2471: }
2472:
2473: if (this .insert >= this .istack.size())
2474: this .insert = -1;
2475: return;
2476: }
2477: }
2478:
2479: if (this .istack.size() > 0) {
2480: is = (IStack) this .istack.pop();
2481: if (this .insert >= this .istack.size())
2482: this .insert = -1;
2483: }
2484: }
2485:
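/* true if an element with the same tag as node is currently on the inline stack */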
2486: public boolean isPushed(Node node) {
2487: int i;
2488: IStack is;
2489:
2490: for (i = this .istack.size() - 1; i >= 0; --i) {
2491: is = (IStack) this .istack.elementAt(i);
2492: if (is.tag == node.tag)
2493: return true;
2494: }
2495:
2496: return false;
2497: }
2498:
2499: /*
2500: This has the effect of inserting "missing" inline
2501: elements around the contents of blocklevel elements
2502: such as P, TD, TH, DIV, PRE etc. This procedure is
2503: called at the start of ParseBlock when the inline
2504: stack is not empty, as will be the case in:
2505:
2506: <i><h1>italic heading</h1></i>
2507:
2508: which is then treated as equivalent to
2509:
2510: <h1><i>italic heading</i></h1>
2511:
2512: This is implemented by setting the lexer into a mode
2513: where it gets tokens from the inline stack rather than
2514: from the input stream.
2515: */
2516: public int inlineDup(Node node) {
2517: int n;
2518:
2519: n = this .istack.size() - this .istackbase;
2520: if (n > 0) {
2521: this .insert = this .istackbase;
2522: this .inode = node;
2523: }
2524:
2525: return n;
2526: }
2527:
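/*
 while the lexer is replaying the inline stack (insert != -1, as set up by
 inlineDup) this synthesizes an implicit start tag from the next istack entry;
 once the stack entries have been replayed, the deferred node saved in inode
 is returned
*/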
2528: public Node insertedToken() {
2529: Node node;
2530: IStack is;
2531: int n;
2532:
2533: // insert will only be -1 here if inode != null
2534: if (this .insert == -1) {
2535: node = this .inode;
2536: this .inode = null;
2537: return node;
2538: }
2539:
2540: // if this is the "latest" node then update
2541: // the position, otherwise use current values
2542:
2543: if (this .inode == null) {
2544: this .lines = this .in.curline;
2545: this .columns = this .in.curcol;
2546: }
2547:
2548: node = newNode(Node.StartTag, this .lexbuf, this .txtstart,
2549: this .txtend); // GLP: Bugfix 126261. Remove when this change
2550: // is fixed in istack.c in the original Tidy
2551: node.implicit = true;
2552: is = (IStack) this .istack.elementAt(this .insert);
2553: node.element = is.element;
2554: node.tag = is.tag;
2555: if (is.attributes != null)
2556: node.attributes = cloneAttributes(is.attributes);
2557:
2558: // advance lexer to next item on the stack
2559: n = this .insert;
2560:
2561: // and recover state if we have reached the end
2562: if (++n < this .istack.size()) {
2563: this .insert = n;
2564: } else {
2565: this .insert = -1;
2566: }
2567:
2568: return node;
2569: }
2570:
2571: /* AQ: Try this for speed optimization */
2572: public static int wstrcasecmp(String s1, String s2) {
2573: return (s1.equalsIgnoreCase(s2) ? 0 : 1);
2574: }
2575:
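/*
 case-insensitive lexicographic comparison: returns 0 for equal strings,
 -1 or 1 when one string is a prefix of the other, otherwise the sign of
 the raw character difference at the first mismatching position
*/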
2576: public static int wstrcaselexcmp(String s1, String s2) {
2577: char c;
2578: int i = 0;
2579:
2580: while (i < s1.length() && i < s2.length()) {
2581: c = s1.charAt(i);
2582: if (toLower(c) != toLower(s2.charAt(i))) {
2583: break;
2584: }
2585: i += 1;
2586: }
2587: if (i == s1.length() && i == s2.length()) {
2588: return 0;
2589: } else if (i == s1.length()) {
2590: return -1;
2591: } else if (i == s2.length()) {
2592: return 1;
2593: } else {
2594: return (s1.charAt(i) > s2.charAt(i) ? 1 : -1);
2595: }
2596: }
2597:
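/* case-insensitive test for s2 occurring anywhere within s1 */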
2598: public static boolean wsubstr(String s1, String s2) {
2599: int i;
2600: int len1 = s1.length();
2601: int len2 = s2.length();
2602:
2603: for (i = 0; i <= len1 - len2; ++i) {
2604: if (s1.regionMatches(true, i, s2, 0, len2))
2605: return true;
2606: }
2607:
2608: return false;
2609: }
2610:
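/*
 true when an empty element can be discarded: text nodes always can;
 an element is kept if it still has content, is an <a> with attributes,
 is a <p> while DropEmptyParas is off, has no known tag, belongs to the
 table row content model, is an applet or object, or carries an id or
 name attribute
*/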
2611: public boolean canPrune(Node element) {
2612: if (element.type == Node.TextNode)
2613: return true;
2614:
2615: if (element.content != null)
2616: return false;
2617:
2618: if (element.tag == configuration.tt.tagA
2619: && element.attributes != null)
2620: return false;
2621:
2622: if (element.tag == configuration.tt.tagP
2623: && !this .configuration.DropEmptyParas)
2624: return false;
2625:
2626: if (element.tag == null)
2627: return false;
2628:
2629: if ((element.tag.model & Dict.CM_ROW) != 0)
2630: return false;
2631:
2632: if (element.tag == configuration.tt.tagApplet)
2633: return false;
2634:
2635: if (element.tag == configuration.tt.tagObject)
2636: return false;
2637:
2638: if (element.attributes != null
2639: && (element.getAttrByName("id") != null
2640: || element.getAttrByName("name") != null))
2641: return false;
2642:
2643: return true;
2644: }
2645:
2646: /* duplicate name attribute as an id */
2647: public void fixId(Node node) {
2648: AttVal name = node.getAttrByName("name");
2649: AttVal id = node.getAttrByName("id");
2650:
2651: if (name != null) {
2652: if (id != null) {
2653: if (!id.value.equals(name.value))
2654: Report.attrError(this , node, "name",
2655: Report.ID_NAME_MISMATCH);
2656: } else if (this .configuration.XmlOut)
2657: node.addAttribute("id", name.value);
2658: }
2659: }
2660:
2661: /*
2662: defer duplicates when entering a table or other
2663: element where the inlines shouldn't be duplicated
2664: */
2665: public void deferDup() {
2666: this .insert = -1;
2667: this .inode = null;
2668: }
2669:
2670: /* Private methods and fields */
2671:
2672: /* lexer char types */
2673: private static final short DIGIT = 1;
2674: private static final short LETTER = 2;
2675: private static final short NAMECHAR = 4;
2676: private static final short WHITE = 8;
2677: private static final short NEWLINE = 16;
2678: private static final short LOWERCASE = 32;
2679: private static final short UPPERCASE = 64;
2680:
2681: /* lexer GetToken states */
2682:
2683: private static final short LEX_CONTENT = 0;
2684: private static final short LEX_GT = 1;
2685: private static final short LEX_ENDTAG = 2;
2686: private static final short LEX_STARTTAG = 3;
2687: private static final short LEX_COMMENT = 4;
2688: private static final short LEX_DOCTYPE = 5;
2689: private static final short LEX_PROCINSTR = 6;
2690: private static final short LEX_ENDCOMMENT = 7;
2691: private static final short LEX_CDATA = 8;
2692: private static final short LEX_SECTION = 9;
2693: private static final short LEX_ASP = 10;
2694: private static final short LEX_JSTE = 11;
2695: private static final short LEX_PHP = 12;
2696:
2697: /* used to classify chars for lexical purposes */
2698: private static short[] lexmap = new short[128];
2699:
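/* OR the given character-class bits into lexmap for every character of str */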
2700: private static void mapStr(String str, short code) {
2701: int j;
2702:
2703: for (int i = 0; i < str.length(); i++) {
2704: j = (int) str.charAt(i);
2705: lexmap[j] |= code;
2706: }
2707: }
2708:
2709: static {
2710: mapStr("\r\n\f", (short) (NEWLINE | WHITE));
2711: mapStr(" \t", WHITE);
2712: mapStr("-.:_", NAMECHAR);
2713: mapStr("0123456789", (short) (DIGIT | NAMECHAR));
2714: mapStr("abcdefghijklmnopqrstuvwxyz", (short) (LOWERCASE
2715: | LETTER | NAMECHAR));
2716: mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE
2717: | LETTER | NAMECHAR));
2718: }
2719:
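/*
 return the character-class bits for c, or 0 for characters outside US-ASCII;
 for example (informal) (MAP('A') & UPPERCASE) != 0, (MAP(' ') & WHITE) != 0
 and MAP('>') == 0
*/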
2720: private static short MAP(char c) {
2721: return ((int) c < 128 ? lexmap[(int) c] : 0);
2722: }
2723:
2724: private static boolean isWhite(char c) {
2725: short m = MAP(c);
2726:
2727: return (m & WHITE) != 0;
2728: }
2729:
2730: private static boolean isDigit(char c) {
2731: short m;
2732:
2733: m = MAP(c);
2734:
2735: return (m & DIGIT) != 0;
2736: }
2737:
2738: private static boolean isLetter(char c) {
2739: short m;
2740:
2741: m = MAP(c);
2742:
2743: return (m & LETTER) != 0;
2744: }
2745:
2746: private static char toLower(char c) {
2747: short m = MAP(c);
2748:
2749: if ((m & UPPERCASE) != 0)
2750: c = (char) ((int) c + (int) 'a' - (int) 'A');
2751:
2752: return c;
2753: }
2754:
2755: private static char toUpper(char c) {
2756: short m = MAP(c);
2757:
2758: if ((m & LOWERCASE) != 0)
2759: c = (char) ((int) c + (int) 'A' - (int) 'a');
2760:
2761: return c;
2762: }
2763:
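/*
 fold c to upper or lower case (as selected by tocaps) for HTML output;
 a no-op when xmlTags is set, since XML element and attribute names are
 case sensitive
*/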
2764: public static char foldCase(char c, boolean tocaps, boolean xmlTags) {
2765: short m;
2766:
2767: if (!xmlTags) {
2768: m = MAP(c);
2769:
2770: if (tocaps) {
2771: if ((m & LOWERCASE) != 0)
2772: c = (char) ((int) c + (int) 'A' - (int) 'a');
2773: } else /* force to lower case */
2774: {
2775: if ((m & UPPERCASE) != 0)
2776: c = (char) ((int) c + (int) 'a' - (int) 'A');
2777: }
2778: }
2779:
2780: return c;
2781: }
2782:
2783: private static class W3CVersionInfo {
2784: String name;
2785: String voyagerName;
2786: String profile;
2787: short code;
2788:
2789: public W3CVersionInfo(String name, String voyagerName,
2790: String profile, short code) {
2791: this .name = name;
2792: this .voyagerName = voyagerName;
2793: this .profile = profile;
2794: this .code = code;
2795: }
2796: }
2797:
2798: /* the 3 URIs for the XHTML 1.0 DTDs */
2799: private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
2800: private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
2801: private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
2802:
2803: private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
2804:
2805: private static Lexer.W3CVersionInfo[] W3CVersion = {
2806: new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict",
2807: voyager_strict, Dict.VERS_HTML40_STRICT),
2808: new W3CVersionInfo("HTML 4.01 Transitional",
2809: "XHTML 1.0 Transitional", voyager_loose,
2810: Dict.VERS_HTML40_LOOSE),
2811: new W3CVersionInfo("HTML 4.01 Frameset",
2812: "XHTML 1.0 Frameset", voyager_frameset,
2813: Dict.VERS_FRAMES),
2814: new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict",
2815: voyager_strict, Dict.VERS_HTML40_STRICT),
2816: new W3CVersionInfo("HTML 4.0 Transitional",
2817: "XHTML 1.0 Transitional", voyager_loose,
2818: Dict.VERS_HTML40_LOOSE),
2819: new W3CVersionInfo("HTML 4.0 Frameset",
2820: "XHTML 1.0 Frameset", voyager_frameset,
2821: Dict.VERS_FRAMES),
2822: new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional",
2823: voyager_loose, Dict.VERS_HTML32),
2824: new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict",
2825: voyager_strict, Dict.VERS_HTML20) };
2826:
2827: }