0001: /*
0002: * @(#)Lexer.java 1.11 2000/08/16
0003: *
0004: */
0005:
0006: package org.w3c.tidy;
0007:
0008: /**
0009: *
* Lexer for the HTML parser
0011: *
0012: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
0013: * See Tidy.java for the copyright notice.
0014: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
0015: * HTML Tidy Release 4 Aug 2000</a>
0016: *
0017: * @author Dave Raggett <dsr@w3.org>
0018: * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0019: * @version 1.0, 1999/05/22
0020: * @version 1.0.1, 1999/05/29
0021: * @version 1.1, 1999/06/18 Java Bean
0022: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
0023: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
0024: * @version 1.4, 1999/09/04 DOM support
0025: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
0026: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
0027: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
0028: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
0029: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
0030: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
0031: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
0032: */
0033:
0034: /*
Given a file stream fp, it returns a sequence of tokens.
0036:
0037: GetToken(fp) gets the next token
0038: UngetToken(fp) provides one level undo
0039:
0040: The tags include an attribute list:
0041:
0042: - linked list of attribute/value nodes
0043: - each node has 2 null-terminated strings.
0044: - entities are replaced in attribute values
0045:
White space is compacted when not in preformatted mode:
leading white space is discarded and subsequent white
space sequences are compacted to single space chars.
0050:
If XmlTags is no then tag names and attribute names
are folded to lower case.
0053:
0054: Not yet done:
0055: - Doctype subset and marked sections
0056: */
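
/*
   A minimal usage sketch (not part of this class): a parser typically drives
   the lexer in a loop and pushes a token back when it belongs to the caller.
   The names "input" and "config" below are assumed, not defined here.

       Lexer lexer = new Lexer(input, config);
       Node tok;
       while ((tok = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
           if (tok.type == Node.EndTag) {
               lexer.ungetToken();   // one level of undo; the caller re-reads it
               break;
           }
           // ... handle start tags, text nodes, comments, etc.
       }
*/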
0057:
0058: import java.io.PrintWriter;
0059: import java.util.Stack;
0060: import java.util.Vector;
0061:
0062: public class Lexer {
0063:
0064: public StreamIn in; /* file stream */
0065: public PrintWriter errout; /* error output stream */
0066: public short badAccess; /* for accessibility errors */
0067: public short badLayout; /* for bad style errors */
0068: public short badChars; /* for bad char encodings */
0069: public short badForm; /* for mismatched/mispositioned form tags */
0070: public short warnings; /* count of warnings in this document */
0071: public short errors; /* count of errors */
0072: public int lines; /* lines seen */
0073: public int columns; /* at start of current token */
0074: public boolean waswhite; /* used to collapse contiguous white space */
0075: public boolean pushed; /* true after token has been pushed back */
0076: public boolean insertspace; /* when space is moved after end tag */
0077: public boolean excludeBlocks; /* Netscape compatibility */
0078: public boolean exiled; /* true if moved out of table */
0079: public boolean isvoyager; /* true if xmlns attribute on html element */
0080: public short versions; /* bit vector of HTML versions */
0081: public int doctype; /* version as given by doctype (if any) */
0082: public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
0083: public int txtstart; /* start of current node */
0084: public int txtend; /* end of current node */
0085: public short state; /* state of lexer's finite state machine */
0086: public Node token;
0087:
0088: /*
0089: lexer character buffer
0090:
0091: parse tree nodes span onto this buffer
0092: which contains the concatenated text
0093: contents of all of the elements.
0094:
0095: lexsize must be reset for each file.
0096: */
0097: public byte[] lexbuf; /* byte buffer of UTF-8 chars */
0098: public int lexlength; /* allocated */
0099: public int lexsize; /* used */
0100:
0101: /* Inline stack for compatibility with Mosaic */
0102: public Node inode; /* for deferring text node */
0103: public int insert; /* for inferring inline tags */
0104: public Stack istack;
0105: public int istackbase; /* start of frame */
0106:
0107: public Style styles; /* used for cleaning up presentation markup */
0108:
0109: public Configuration configuration;
0110: protected int seenBodyEndTag; /* used by parser */
0111: private Vector nodeList;
0112:
0113: public Lexer(StreamIn in, Configuration configuration) {
0114: this .in = in;
0115: this .lines = 1;
0116: this .columns = 1;
0117: this .state = LEX_CONTENT;
0118: this .badAccess = 0;
0119: this .badLayout = 0;
0120: this .badChars = 0;
0121: this .badForm = 0;
0122: this .warnings = 0;
0123: this .errors = 0;
0124: this .waswhite = false;
0125: this .pushed = false;
0126: this .insertspace = false;
0127: this .exiled = false;
0128: this .isvoyager = false;
0129: this .versions = Dict.VERS_EVERYTHING;
0130: this .doctype = Dict.VERS_UNKNOWN;
0131: this .badDoctype = false;
0132: this .txtstart = 0;
0133: this .txtend = 0;
0134: this .token = null;
0135: this .lexbuf = null;
0136: this .lexlength = 0;
0137: this .lexsize = 0;
0138: this .inode = null;
0139: this .insert = -1;
0140: this .istack = new Stack();
0141: this .istackbase = 0;
0142: this .styles = null;
0143: this .configuration = configuration;
0144: this .seenBodyEndTag = 0;
0145: this .nodeList = new Vector();
0146: }
0147:
0148: public Node newNode() {
0149: Node node = new Node();
0150: nodeList.addElement(node);
0151: return node;
0152: }
0153:
0154: public Node newNode(short type, byte[] textarray, int start, int end) {
0155: Node node = new Node(type, textarray, start, end);
0156: nodeList.addElement(node);
0157: return node;
0158: }
0159:
0160: public Node newNode(short type, byte[] textarray, int start,
0161: int end, String element) {
0162: Node node = new Node(type, textarray, start, end, element,
0163: configuration.tt);
0164: nodeList.addElement(node);
0165: return node;
0166: }
0167:
0168: public Node cloneNode(Node node) {
0169: Node cnode = (Node) node.clone();
0170: nodeList.addElement(cnode);
0171: for (AttVal att = cnode.attributes; att != null; att = att.next) {
0172: if (att.asp != null)
0173: nodeList.addElement(att.asp);
0174: if (att.php != null)
0175: nodeList.addElement(att.php);
0176: }
0177: return cnode;
0178: }
0179:
0180: public AttVal cloneAttributes(AttVal attrs) {
0181: AttVal cattrs = (AttVal) attrs.clone();
0182: for (AttVal att = cattrs; att != null; att = att.next) {
0183: if (att.asp != null)
0184: nodeList.addElement(att.asp);
0185: if (att.php != null)
0186: nodeList.addElement(att.php);
0187: }
0188: return cattrs;
0189: }
0190:
0191: protected void updateNodeTextArrays(byte[] oldtextarray,
0192: byte[] newtextarray) {
0193: Node node;
0194: for (int i = 0; i < nodeList.size(); i++) {
0195: node = (Node) (nodeList.elementAt(i));
0196: if (node.textarray == oldtextarray)
0197: node.textarray = newtextarray;
0198: }
0199: }
0200:
0201: /* used for creating preformatted text from Word2000 */
0202: public Node newLineNode() {
0203: Node node = newNode();
0204:
0205: node.textarray = this .lexbuf;
0206: node.start = this .lexsize;
0207: addCharToLexer((int) '\n');
0208: node.end = this .lexsize;
0209: return node;
0210: }
0211:
// Should always be able to convert to/from UTF-8, so encoding exceptions are
0213: // converted to an Error to avoid adding throws declarations in
0214: // lots of methods.
0215:
0216: public static byte[] getBytes(String str) {
0217: try {
0218: return str.getBytes("UTF8");
0219: } catch (java.io.UnsupportedEncodingException e) {
0220: throw new Error("string to UTF-8 conversion failed: "
0221: + e.getMessage());
0222: }
0223: }
0224:
0225: public static String getString(byte[] bytes, int offset, int length) {
0226: try {
0227: return new String(bytes, offset, length, "UTF8");
0228: } catch (java.io.UnsupportedEncodingException e) {
0229: throw new Error("UTF-8 to string conversion failed: "
0230: + e.getMessage());
0231: }
0232: }
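
/*
   Round-trip sketch for the two helpers above: strings are stored in lexbuf
   as UTF-8 bytes and slices are turned back into Strings on demand.
   Purely illustrative; the values are made up.

       byte[] raw = Lexer.getBytes("caf\u00e9");          // 'c','a','f',0xC3,0xA9
       String s   = Lexer.getString(raw, 0, raw.length);  // "café" again
*/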
0233:
0234: public boolean endOfInput() {
0235: return this .in.isEndOfStream();
0236: }
0237:
0238: public void addByte(int c) {
0239: if (this .lexsize + 1 >= this .lexlength) {
0240: while (this .lexsize + 1 >= this .lexlength) {
0241: if (this .lexlength == 0)
0242: this .lexlength = 8192;
0243: else
0244: this .lexlength = this .lexlength * 2;
0245: }
0246:
0247: byte[] temp = this .lexbuf;
0248: this .lexbuf = new byte[this .lexlength];
0249: if (temp != null) {
0250: System.arraycopy(temp, 0, this .lexbuf, 0, temp.length);
0251: updateNodeTextArrays(temp, this .lexbuf);
0252: }
0253: }
0254:
0255: this .lexbuf[this .lexsize++] = (byte) c;
0256: this .lexbuf[this .lexsize] = (byte) '\0'; /* debug */
0257: }
0258:
0259: public void changeChar(byte c) {
0260: if (this .lexsize > 0) {
0261: this .lexbuf[this .lexsize - 1] = c;
0262: }
0263: }
0264:
0265: /* store char c as UTF-8 encoded byte stream */
0266: public void addCharToLexer(int c) {
0267: if (c < 128)
0268: addByte(c);
0269: else if (c <= 0x7FF) {
0270: addByte(0xC0 | (c >> 6));
0271: addByte(0x80 | (c & 0x3F));
0272: } else if (c <= 0xFFFF) {
0273: addByte(0xE0 | (c >> 12));
0274: addByte(0x80 | ((c >> 6) & 0x3F));
0275: addByte(0x80 | (c & 0x3F));
0276: } else if (c <= 0x1FFFFF) {
0277: addByte(0xF0 | (c >> 18));
0278: addByte(0x80 | ((c >> 12) & 0x3F));
0279: addByte(0x80 | ((c >> 6) & 0x3F));
0280: addByte(0x80 | (c & 0x3F));
0281: } else {
0282: addByte(0xF8 | (c >> 24));
0283: addByte(0x80 | ((c >> 18) & 0x3F));
0284: addByte(0x80 | ((c >> 12) & 0x3F));
0285: addByte(0x80 | ((c >> 6) & 0x3F));
0286: addByte(0x80 | (c & 0x3F));
0287: }
0288: }
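
/*
   Encoding examples for addCharToLexer (standard UTF-8 byte sequences,
   for reference only):

       addCharToLexer('A');      // 0x41                 (1 byte,  c < 0x80)
       addCharToLexer(0x00E9);   // 0xC3 0xA9            (2 bytes, c <= 0x7FF)
       addCharToLexer(0x20AC);   // 0xE2 0x82 0xAC       (3 bytes, c <= 0xFFFF)
       addCharToLexer(0x1F600);  // 0xF0 0x9F 0x98 0x80  (4 bytes, c <= 0x1FFFFF)
*/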
0289:
0290: public void addStringToLexer(String str) {
0291: for (int i = 0; i < str.length(); i++) {
0292: addCharToLexer((int) str.charAt(i));
0293: }
0294: }
0295:
0296: /*
0297: No longer attempts to insert missing ';' for unknown
entities unless one was present already, since this
0299: gives unexpected results.
0300:
0301: For example: <a href="something.htm?foo&bar&fred">
0302: was tidied to: <a href="something.htm?foo&bar;&fred;">
0303: rather than: <a href="something.htm?foo&bar&fred">
0304:
My thanks to Maurice Buxton for spotting this.
0306: */
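
/*
   Behaviour sketch for parseEntity (illustrative, based on the code below):

       "&nbsp;"  ->  non-breaking space char (or ' ' when in preformatted mode)
       "&#65"    ->  'A', with a MISSING_SEMICOLON warning
       "&foo"    ->  left as the literal text "&foo", reported as an unknown entity
       "&"       ->  reported as an unescaped ampersand
*/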
0307: public void parseEntity(short mode) {
0308: short map;
0309: int start;
0310: boolean first = true;
0311: boolean semicolon = false;
0312: boolean numeric = false;
0313: int c, ch, startcol;
0314: String str;
0315:
0316: start = this .lexsize - 1; /* to start at "&" */
0317: startcol = this .in.curcol - 1;
0318:
0319: while (true) {
0320: c = this .in.readChar();
0321: if (c == StreamIn.EndOfStream)
0322: break;
0323: if (c == ';') {
0324: semicolon = true;
0325: break;
0326: }
0327:
0328: if (first && c == '#') {
0329: addCharToLexer(c);
0330: first = false;
0331: numeric = true;
0332: continue;
0333: }
0334:
0335: first = false;
0336: map = MAP((char) c);
0337:
0338: /* AQ: Added flag for numeric entities so that numeric entities
0339: with missing semi-colons are recognized.
0340: Eg. "rep..." is recognized as "rep"
0341: */
0342: if (numeric && ((c == 'x') || ((map & DIGIT) != 0))) {
0343: addCharToLexer(c);
0344: continue;
0345: }
0346: if (!numeric && ((map & NAMECHAR) != 0)) {
0347: addCharToLexer(c);
0348: continue;
0349: }
0350:
0351: /* otherwise put it back */
0352:
0353: this .in.ungetChar(c);
0354: break;
0355: }
0356:
0357: str = getString(this .lexbuf, start, this .lexsize - start);
0358: ch = EntityTable.getDefaultEntityTable().entityCode(str);
0359:
0360: /* deal with unrecognized entities */
0361: if (ch <= 0) {
/* set error position just before offending character */
0363: this .lines = this .in.curline;
0364: this .columns = startcol;
0365:
0366: if (this .lexsize > start + 1) {
0367: Report
0368: .entityError(this , Report.UNKNOWN_ENTITY, str,
0369: ch);
0370:
0371: if (semicolon)
0372: addCharToLexer(';');
0373: } else /* naked & */
0374: {
0375: Report.entityError(this , Report.UNESCAPED_AMPERSAND,
0376: str, ch);
0377: }
0378: } else {
0379: if (c != ';') /* issue warning if not terminated by ';' */
0380: {
/* set error position just before offending character */
0382: this .lines = this .in.curline;
0383: this .columns = startcol;
0384: Report.entityError(this , Report.MISSING_SEMICOLON, str,
0385: c);
0386: }
0387:
0388: this .lexsize = start;
0389:
0390: if (ch == 160 && (mode & Preformatted) != 0)
0391: ch = ' ';
0392:
0393: addCharToLexer(ch);
0394:
0395: if (ch == '&' && !this .configuration.QuoteAmpersand) {
0396: addCharToLexer('a');
0397: addCharToLexer('m');
0398: addCharToLexer('p');
0399: addCharToLexer(';');
0400: }
0401: }
0402: }
0403:
0404: public char parseTagName() {
0405: short map;
0406: int c;
0407:
0408: /* fold case of first char in buffer */
0409:
0410: c = this .lexbuf[this .txtstart];
0411: map = MAP((char) c);
0412:
0413: // BEGIN RAVE MODIFICATIONS
0414: boolean wasColon = c == ':';
if (this.configuration.inputJspMode) { // don't change case of tag names
0416: ;
0417: } else
0418: // END RAVE MODIFICATIONS
0419: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0) {
0420: c += (int) ((int) 'a' - (int) 'A');
0421: this .lexbuf[this .txtstart] = (byte) c;
0422: }
0423:
0424: while (true) {
0425: c = this .in.readChar();
0426: if (c == StreamIn.EndOfStream)
0427: break;
0428: map = MAP((char) c);
0429:
0430: if ((map & NAMECHAR) == 0)
0431: break;
0432:
0433: /* fold case of subsequent chars */
0434:
0435: // BEGIN RAVE MODIFICATIONS
0436: if (c == ':') {
0437: wasColon = true;
0438: }
if (this.configuration.inputJspMode) { // don't change case of tag names
0440: ;
0441: } else
0442: // END RAVE MODIFICATIONS
0443: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0)
0444: c += (int) ((int) 'a' - (int) 'A');
0445:
0446: addCharToLexer(c);
0447: }
0448:
0449: this .txtend = this .lexsize;
0450:
0451: // BEGIN RAVE MODIFICATIONS
0452: if (!this .configuration.XmlTags && !wasColon) {
0453: lowercaseBuf();
0454: }
0455: // END RAVE MODIFICATIONS
0456:
0457: return (char) c;
0458: }
0459:
0460: // BEGIN RAVE MODIFICATIONS
/** Force the byte sequence in the buffer to lowercase if applicable.
* This assumes we're only dealing with ASCII chars; as soon as it
* sees anything else in the lex buffer it gives up converting the
* case (since the lex buffer is a UTF-8 encoded byte array, not
* Unicode characters). */
0466: private void lowercaseBuf() {
0467: for (int i = this .txtstart; i < this .txtend; i++) {
byte c = this.lexbuf[i];
/* Java bytes are signed, so UTF-8 bytes >= 0x80 are negative;
   test the sign bit rather than "c >= 128", which a byte can never satisfy. */
if (c < 0)
break; // non-ASCII byte: stop case conversion
0471: short map = MAP((char) c);
0472: if ((map & UPPERCASE) != 0) {
0473: c += (int) ((int) 'a' - (int) 'A');
0474: this .lexbuf[i] = (byte) c;
0475: }
0476: }
0477: }
0478:
0479: // END RAVE MODIFICATIONS
0480:
0481: public void addStringLiteral(String str) {
0482: for (int i = 0; i < str.length(); i++) {
0483: addCharToLexer((int) str.charAt(i));
0484: }
0485: }
0486:
0487: /* choose what version to use for new doctype */
0488: public short HTMLVersion() {
0489: short versions;
0490:
0491: versions = this .versions;
0492:
0493: if ((versions & Dict.VERS_HTML20) != 0)
0494: return Dict.VERS_HTML20;
0495:
0496: if ((versions & Dict.VERS_HTML32) != 0)
0497: return Dict.VERS_HTML32;
0498:
0499: if ((versions & Dict.VERS_HTML40_STRICT) != 0)
0500: return Dict.VERS_HTML40_STRICT;
0501:
0502: if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
0503: return Dict.VERS_HTML40_LOOSE;
0504:
0505: if ((versions & Dict.VERS_FRAMES) != 0)
0506: return Dict.VERS_FRAMES;
0507:
0508: return Dict.VERS_UNKNOWN;
0509: }
0510:
0511: public String HTMLVersionName() {
0512: short guessed;
0513: int j;
0514:
0515: guessed = apparentVersion();
0516:
0517: for (j = 0; j < W3CVersion.length; ++j) {
0518: if (guessed == W3CVersion[j].code) {
0519: if (this .isvoyager)
0520: return W3CVersion[j].voyagerName;
0521:
0522: return W3CVersion[j].name;
0523: }
0524: }
0525:
0526: return null;
0527: }
0528:
0529: /* add meta element for Tidy */
0530: public boolean addGenerator(Node root) {
0531:
0532: // BEGIN RAVE MODIFICATIONS
0533: if (configuration.outputJspMode) {
0534: return false;
0535: }
0536: // END RAVE MODIFICATIONS
0537:
0538: AttVal attval;
0539: Node node;
0540: Node head = root.findHEAD(configuration.tt);
0541:
0542: if (head != null) {
0543: for (node = head.content; node != null; node = node.next) {
0544: if (node.tag == configuration.tt.tagMeta) {
0545: attval = node.getAttrByName("name");
0546:
0547: if (attval != null
0548: && attval.value != null
0549: && Lexer.wstrcasecmp(attval.value,
0550: "generator") == 0) {
0551: attval = node.getAttrByName("content");
0552:
0553: if (attval != null
0554: && attval.value != null
0555: && attval.value.length() >= 9
0556: && Lexer.wstrcasecmp(attval.value
0557: .substring(0, 9), "HTML Tidy") == 0) {
0558: return false;
0559: }
0560: }
0561: }
0562: }
0563:
0564: node = this .inferredTag("meta");
0565: node.addAttribute("content", "HTML Tidy, see www.w3.org");
0566: node.addAttribute("name", "generator");
0567: Node.insertNodeAtStart(head, node);
0568: return true;
0569: }
0570:
0571: return false;
0572: }
0573:
0574: /* return true if substring s is in p and isn't all in upper case */
0575: /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
0576: /* len is how many chars to check in p */
0577: private static boolean findBadSubString(String s, String p, int len) {
0578: int n = s.length();
0579: int i = 0;
0580: String ps;
0581:
0582: while (n < len) {
0583: ps = p.substring(i, i + n);
0584: if (wstrcasecmp(s, ps) == 0)
0585: return (!ps.equals(s.substring(0, n)));
0586:
0587: ++i;
0588: --len;
0589: }
0590:
0591: return false;
0592: }
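
/*
   Example of what findBadSubString flags (illustrative):

       findBadSubString("PUBLIC", "public \"-//W3C//DTD ...", len)
           -> true   ("public" matches case-insensitively but isn't upper case)
       findBadSubString("PUBLIC", "PUBLIC \"-//W3C//DTD ...", len)
           -> false  (already upper case)
*/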
0593:
0594: public boolean checkDocTypeKeyWords(Node doctype) {
0595: int len = doctype.end - doctype.start;
0596: String s = getString(this .lexbuf, doctype.start, len);
0597:
0598: return !(findBadSubString("SYSTEM", s, len)
0599: || findBadSubString("PUBLIC", s, len)
0600: || findBadSubString("//DTD", s, len)
0601: || findBadSubString("//W3C", s, len) || findBadSubString(
0602: "//EN", s, len));
0603: }
0604:
0605: /* examine <!DOCTYPE> to identify version */
0606: public short findGivenVersion(Node doctype) {
0607: String p, s;
0608: int i, j;
0609: int len;
0610: String str1;
0611: String str2;
0612:
0613: /* if root tag for doctype isn't html give up now */
0614: str1 = getString(this .lexbuf, doctype.start, 5);
0615: if (wstrcasecmp(str1, "html ") != 0)
0616: return 0;
0617:
0618: if (!checkDocTypeKeyWords(doctype))
0619: Report.warning(this , doctype, null,
0620: Report.DTYPE_NOT_UPPER_CASE);
0621:
0622: /* give up if all we are given is the system id for the doctype */
0623: str1 = getString(this .lexbuf, doctype.start + 5, 7);
0624: if (wstrcasecmp(str1, "SYSTEM ") == 0) {
0625: /* but at least ensure the case is correct */
0626: if (!str1.substring(0, 6).equals("SYSTEM"))
0627: System.arraycopy(getBytes("SYSTEM"), 0, this .lexbuf,
0628: doctype.start + 5, 6);
0629: return 0; /* unrecognized */
0630: }
0631:
0632: if (wstrcasecmp(str1, "PUBLIC ") == 0) {
0633: if (!str1.substring(0, 6).equals("PUBLIC"))
0634: System.arraycopy(getBytes("PUBLIC "), 0, this .lexbuf,
0635: doctype.start + 5, 6);
0636: } else
0637: this .badDoctype = true;
0638:
0639: for (i = doctype.start; i < doctype.end; ++i) {
0640: if (this .lexbuf[i] == (byte) '"') {
0641: str1 = getString(this .lexbuf, i + 1, 12);
0642: str2 = getString(this .lexbuf, i + 1, 13);
0643: if (str1.equals("-//W3C//DTD ")) {
0644: /* compute length of identifier e.g. "HTML 4.0 Transitional" */
0645: for (j = i + 13; j < doctype.end
0646: && this .lexbuf[j] != (byte) '/'; ++j)
0647: ;
0648: len = j - i - 13;
0649: p = getString(this .lexbuf, i + 13, len);
0650:
0651: for (j = 1; j < W3CVersion.length; ++j) {
0652: s = W3CVersion[j].name;
0653: if (len == s.length() && s.equals(p))
0654: return W3CVersion[j].code;
0655: }
0656:
0657: /* else unrecognized version */
0658: } else if (str2.equals("-//IETF//DTD ")) {
0659: /* compute length of identifier e.g. "HTML 2.0" */
0660: for (j = i + 14; j < doctype.end
0661: && this .lexbuf[j] != (byte) '/'; ++j)
0662: ;
0663: len = j - i - 14;
0664:
0665: p = getString(this .lexbuf, i + 14, len);
0666: s = W3CVersion[0].name;
0667: if (len == s.length() && s.equals(p))
0668: return W3CVersion[0].code;
0669:
0670: /* else unrecognized version */
0671: }
0672: break;
0673: }
0674: }
0675:
0676: return 0;
0677: }
0678:
0679: public void fixHTMLNameSpace(Node root, String profile) {
0680: Node node;
0681: AttVal prev, attr;
0682:
0683: for (node = root.content; node != null
0684: && node.tag != configuration.tt.tagHtml; node = node.next)
0685: ;
0686:
0687: if (node != null) {
0688: prev = null;
0689:
0690: for (attr = node.attributes; attr != null; attr = attr.next) {
0691: if (attr.attribute.equals("xmlns"))
0692: break;
0693:
0694: prev = attr;
0695: }
0696:
0697: if (attr != null) {
0698: if (!attr.value.equals(profile)) {
0699: Report.warning(this , node, null,
0700: Report.INCONSISTENT_NAMESPACE);
0701: attr.value = profile;
0702: }
0703: } else {
0704: attr = new AttVal(node.attributes, null, (int) '"',
0705: "xmlns", profile);
0706: attr.dict = AttributeTable.getDefaultAttributeTable()
0707: .findAttribute(attr);
0708: node.attributes = attr;
0709: }
0710: }
0711: }
0712:
0713: public boolean setXHTMLDocType(Node root) {
0714: String fpi = " ";
0715: String sysid = "";
0716: String namespace = XHTML_NAMESPACE;
0717: Node doctype;
0718:
0719: doctype = root.findDocType();
0720:
0721: // BEGIN RAVE MODIFICATIONS
0722: //if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
0723: if (configuration.outputJspMode
0724: || configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
0725: // END RAVE MODIFICATIONS
0726: {
0727: if (doctype != null)
0728: Node.discardElement(doctype);
0729: return true;
0730: }
0731:
0732: if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
0733: /* see what flavor of XHTML this document matches */
0734: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0) { /* use XHTML strict */
0735: fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
0736: sysid = voyager_strict;
0737: } else if ((this .versions & Dict.VERS_LOOSE) != 0) {
0738: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0739: sysid = voyager_loose;
0740: } else if ((this .versions & Dict.VERS_FRAMES) != 0) { /* use XHTML frames */
0741: fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
0742: sysid = voyager_frameset;
0743: } else /* lets assume XHTML transitional */
0744: {
0745: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0746: sysid = voyager_loose;
0747: }
0748: } else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
0749: fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
0750: sysid = voyager_strict;
0751: } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
0752: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0753: sysid = voyager_loose;
0754: }
0755:
0756: fixHTMLNameSpace(root, namespace);
0757:
0758: if (doctype == null) {
0759: doctype = newNode(Node.DocTypeTag, this .lexbuf, 0, 0);
0760: doctype.next = root.content;
0761: doctype.parent = root;
0762: doctype.prev = null;
0763: root.content = doctype;
0764: }
0765:
0766: if (configuration.docTypeMode == Configuration.DOCTYPE_USER
0767: && configuration.docTypeStr != null) {
0768: fpi = configuration.docTypeStr;
0769: sysid = "";
0770: }
0771:
0772: this .txtstart = this .lexsize;
0773: this .txtend = this .lexsize;
0774:
0775: /* add public identifier */
0776: addStringLiteral("html PUBLIC ");
0777:
0778: /* check if the fpi is quoted or not */
0779: if (fpi.charAt(0) == '"')
0780: addStringLiteral(fpi);
0781: else {
0782: addStringLiteral("\"");
0783: addStringLiteral(fpi);
0784: addStringLiteral("\"");
0785: }
0786:
0787: if (sysid.length() + 6 >= this .configuration.wraplen)
0788: addStringLiteral("\n\"");
0789: else
0790: addStringLiteral("\n \"");
0791:
0792: /* add system identifier */
0793: addStringLiteral(sysid);
0794: addStringLiteral("\"");
0795:
0796: this .txtend = this .lexsize;
0797:
0798: doctype.start = this .txtstart;
0799: doctype.end = this .txtend;
0800:
0801: return false;
0802: }
0803:
0804: public short apparentVersion() {
0805: switch (this .doctype) {
0806: case Dict.VERS_UNKNOWN:
0807: return HTMLVersion();
0808:
0809: case Dict.VERS_HTML20:
0810: if ((this .versions & Dict.VERS_HTML20) != 0)
0811: return Dict.VERS_HTML20;
0812:
0813: break;
0814:
0815: case Dict.VERS_HTML32:
0816: if ((this .versions & Dict.VERS_HTML32) != 0)
0817: return Dict.VERS_HTML32;
0818:
0819: break; /* to replace old version by new */
0820:
0821: case Dict.VERS_HTML40_STRICT:
0822: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0)
0823: return Dict.VERS_HTML40_STRICT;
0824:
0825: break;
0826:
0827: case Dict.VERS_HTML40_LOOSE:
0828: if ((this .versions & Dict.VERS_HTML40_LOOSE) != 0)
0829: return Dict.VERS_HTML40_LOOSE;
0830:
0831: break; /* to replace old version by new */
0832:
0833: case Dict.VERS_FRAMES:
0834: if ((this .versions & Dict.VERS_FRAMES) != 0)
0835: return Dict.VERS_FRAMES;
0836:
0837: break;
0838: }
0839:
0840: Report.warning(this , null, null, Report.INCONSISTENT_VERSION);
0841: return this .HTMLVersion();
0842: }
0843:
0844: /* fixup doctype if missing */
0845: public boolean fixDocType(Node root) {
0846: Node doctype;
0847: int guessed = Dict.VERS_HTML40_STRICT, i;
0848:
0849: if (this .badDoctype)
0850: Report.warning(this , null, null, Report.MALFORMED_DOCTYPE);
0851:
0852: if (configuration.XmlOut)
0853: return true;
0854:
0855: doctype = root.findDocType();
0856:
0857: if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
0858: if (doctype != null)
0859: Node.discardElement(doctype);
0860: return true;
0861: }
0862:
0863: if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
0864: Node.discardElement(doctype);
0865: doctype = null;
0866: guessed = Dict.VERS_HTML40_STRICT;
0867: } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
0868: Node.discardElement(doctype);
0869: doctype = null;
0870: guessed = Dict.VERS_HTML40_LOOSE;
0871: } else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
0872: if (doctype != null) {
0873: if (this .doctype == Dict.VERS_UNKNOWN)
0874: return false;
0875:
0876: switch (this .doctype) {
0877: case Dict.VERS_UNKNOWN:
0878: return false;
0879:
0880: case Dict.VERS_HTML20:
0881: if ((this .versions & Dict.VERS_HTML20) != 0)
0882: return true;
0883:
0884: break; /* to replace old version by new */
0885:
0886: case Dict.VERS_HTML32:
0887: if ((this .versions & Dict.VERS_HTML32) != 0)
0888: return true;
0889:
0890: break; /* to replace old version by new */
0891:
0892: case Dict.VERS_HTML40_STRICT:
0893: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0)
0894: return true;
0895:
0896: break; /* to replace old version by new */
0897:
0898: case Dict.VERS_HTML40_LOOSE:
0899: if ((this .versions & Dict.VERS_HTML40_LOOSE) != 0)
0900: return true;
0901:
0902: break; /* to replace old version by new */
0903:
0904: case Dict.VERS_FRAMES:
0905: if ((this .versions & Dict.VERS_FRAMES) != 0)
0906: return true;
0907:
0908: break; /* to replace old version by new */
0909: }
0910:
0911: /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
0912: }
0913:
0914: /* choose new doctype */
0915: guessed = HTMLVersion();
0916: }
0917:
0918: if (guessed == Dict.VERS_UNKNOWN)
0919: return false;
0920:
0921: /* for XML use the Voyager system identifier */
0922: if (this .configuration.XmlOut || this .configuration.XmlTags
0923: || this .isvoyager) {
0924: if (doctype != null)
0925: Node.discardElement(doctype);
0926:
0927: for (i = 0; i < W3CVersion.length; ++i) {
0928: if (guessed == W3CVersion[i].code) {
0929: fixHTMLNameSpace(root, W3CVersion[i].profile);
0930: break;
0931: }
0932: }
0933:
0934: return true;
0935: }
0936:
0937: if (doctype == null) {
0938: doctype = newNode(Node.DocTypeTag, this .lexbuf, 0, 0);
0939: doctype.next = root.content;
0940: doctype.parent = root;
0941: doctype.prev = null;
0942: root.content = doctype;
0943: }
0944:
0945: this .txtstart = this .lexsize;
0946: this .txtend = this .lexsize;
0947:
0948: /* use the appropriate public identifier */
0949: addStringLiteral("html PUBLIC ");
0950:
0951: if (configuration.docTypeMode == Configuration.DOCTYPE_USER
0952: && configuration.docTypeStr != null)
0953: addStringLiteral(configuration.docTypeStr);
0954: else if (guessed == Dict.VERS_HTML20)
0955: addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
0956: else {
0957: addStringLiteral("\"-//W3C//DTD ");
0958:
0959: for (i = 0; i < W3CVersion.length; ++i) {
0960: if (guessed == W3CVersion[i].code) {
0961: addStringLiteral(W3CVersion[i].name);
0962: break;
0963: }
0964: }
0965:
0966: addStringLiteral("//EN\"");
0967: }
0968:
0969: this .txtend = this .lexsize;
0970:
0971: doctype.start = this .txtstart;
0972: doctype.end = this .txtend;
0973:
0974: return true;
0975: }
0976:
/* ensure XML document starts with <?xml version="1.0"?> */
0978: public boolean fixXMLPI(Node root) {
0979: Node xml;
0980: int s;
0981:
0982: if (root.content != null
0983: && root.content.type == Node.ProcInsTag) {
0984: s = root.content.start;
0985:
0986: if (this .lexbuf[s] == (byte) 'x'
0987: && this .lexbuf[s + 1] == (byte) 'm'
0988: && this .lexbuf[s + 2] == (byte) 'l')
0989: return true;
0990: }
0991:
xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
xml.next = root.content;

if (root.content != null)
root.content.prev = xml;
0999:
1000: root.content = xml;
1001:
1002: this .txtstart = this .lexsize;
1003: this .txtend = this .lexsize;
1004: addStringLiteral("xml version=\"1.0\"");
1005: if (this .configuration.CharEncoding == Configuration.LATIN1)
1006: addStringLiteral(" encoding=\"ISO-8859-1\"");
1007: this .txtend = this .lexsize;
1008:
1009: xml.start = this .txtstart;
1010: xml.end = this .txtend;
1011: return false;
1012: }
1013:
1014: public Node inferredTag(String name) {
1015: Node node;
1016:
1017: node = newNode(Node.StartTag, this .lexbuf, this .txtstart,
1018: this .txtend, name);
1019: node.implicit = true;
1020: return node;
1021: }
1022:
1023: public static boolean expectsContent(Node node) {
1024: if (node.type != Node.StartTag)
1025: return false;
1026:
1027: /* unknown element? */
1028: if (node.tag == null)
1029: return true;
1030:
1031: if ((node.tag.model & Dict.CM_EMPTY) != 0)
1032: return false;
1033:
1034: return true;
1035: }
1036:
1037: /*
1038: create a text node for the contents of
1039: a CDATA element like style or script
1040: which ends with </foo> for some foo.
1041: */
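
/*
   Sketch of how a parser might consume script/style content via getCDATA
   (illustrative; "scriptNode" stands for the already-open <script> element):

       Node text = lexer.getCDATA(scriptNode);
       if (text != null) {
           // attach the returned text node as the element's content
       }
*/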
1042: public Node getCDATA(Node container) {
1043: int c, lastc, start, len, i;
1044: String str;
1045: boolean endtag = false;
1046:
1047: this .lines = this .in.curline;
1048: this .columns = this .in.curcol;
1049: this .waswhite = false;
1050: this .txtstart = this .lexsize;
1051: this .txtend = this .lexsize;
1052:
1053: lastc = (int) '\0';
1054: start = -1;
1055:
1056: while (true) {
1057: c = this .in.readChar();
1058: if (c == StreamIn.EndOfStream)
1059: break;
1060: /* treat \r\n as \n and \r as \n */
1061:
1062: if (c == (int) '/' && lastc == (int) '<') {
1063: if (endtag) {
1064: this .lines = this .in.curline;
1065: this .columns = this .in.curcol - 3;
1066:
1067: Report.warning(this , null, null,
1068: Report.BAD_CDATA_CONTENT);
1069: }
1070:
1071: start = this .lexsize + 1; /* to first letter */
1072: endtag = true;
1073: } else if (c == (int) '>' && start >= 0) {
1074: len = this .lexsize - start;
1075: if (len == container.element.length()) {
1076: str = getString(this .lexbuf, start, len);
1077: if (Lexer.wstrcasecmp(str, container.element) == 0) {
1078: this .txtend = start - 2;
1079: break;
1080: }
1081: }
1082:
1083: this .lines = this .in.curline;
1084: this .columns = this .in.curcol - 3;
1085:
1086: Report.warning(this , null, null,
1087: Report.BAD_CDATA_CONTENT);
1088:
1089: /* if javascript insert backslash before / */
1090:
1091: if (ParserImpl.isJavaScript(container)) {
1092: for (i = this .lexsize; i > start - 1; --i)
1093: this .lexbuf[i] = this .lexbuf[i - 1];
1094:
1095: this .lexbuf[start - 1] = (byte) '\\';
1096: this .lexsize++;
1097: }
1098:
1099: start = -1;
1100: } else if (c == (int) '\r') {
1101: c = this .in.readChar();
1102:
1103: if (c != (int) '\n')
1104: this .in.ungetChar(c);
1105:
1106: c = (int) '\n';
1107: }
1108:
1109: addCharToLexer((int) c);
1110: this .txtend = this .lexsize;
1111: lastc = c;
1112: }
1113:
1114: if (c == StreamIn.EndOfStream)
1115: Report.warning(this , container, null,
1116: Report.MISSING_ENDTAG_FOR);
1117:
1118: if (this .txtend > this .txtstart) {
1119: this .txtend = removeCDATAMark(this .lexbuf,//wzw
1120: this .txtstart, this .txtend);
1121:
1122: this .token = newNode(Node.TextNode, this .lexbuf,
1123: this .txtstart, this .txtend);
1124: return this .token;
1125: }
1126:
1127: return null;
1128: }
1129:
1130: private int removeCDATAMark(byte[] lexbuf, int txtstart, int txtend)//wzw
1131: {
1132: String s = new String(lexbuf, txtstart, txtend - txtstart);
1133: s = s.replaceAll("<!\\[CDATA\\[", "");
1134: s = s.replaceAll("\\]\\]>", "");
1135: byte[] b = s.getBytes();
1136: if (b.length < (txtend - txtstart)) {
1137: System.arraycopy(b, 0, lexbuf, txtstart, b.length);
1138:
1139: return txtstart + b.length;
1140: } else {
1141: return txtend;
1142: }
1143: }
1144:
1145: public void ungetToken() {
1146: this .pushed = true;
1147: }
1148:
1149: public static final short IgnoreWhitespace = 0;
1150: public static final short MixedContent = 1;
1151: public static final short Preformatted = 2;
1152: public static final short IgnoreMarkup = 3;
1153:
1154: /*
1155: modes for GetToken()
1156:
1157: MixedContent -- for elements which don't accept PCDATA
1158: Preformatted -- white space preserved as is
1159: IgnoreMarkup -- for CDATA elements such as script, style
1160: */
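
/*
   Sketch of how the modes are typically chosen by callers (illustrative):

       lexer.getToken(Lexer.IgnoreWhitespace); // e.g. between structural elements
       lexer.getToken(Lexer.MixedContent);     // inside <p>, <td>, ... (PCDATA)
       lexer.getToken(Lexer.Preformatted);     // inside <pre>: keep whitespace as is
       lexer.getToken(Lexer.IgnoreMarkup);     // CDATA content of <script>, <style>
*/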
1161:
1162: public Node getToken(short mode) {
1163: short map;
1164: int c = 0;
1165: int lastc;
1166: int badcomment = 0;
1167: MutableBoolean isempty = new MutableBoolean();
1168: AttVal attributes;
1169:
1170: if (this .pushed) {
1171: /* duplicate inlines in preference to pushed text nodes when appropriate */
1172: if (this .token.type != Node.TextNode
1173: || (this .insert == -1 && this .inode == null)) {
1174: this .pushed = false;
1175: return this .token;
1176: }
1177: }
1178:
1179: /* at start of block elements, unclosed inline
1180: elements are inserted into the token stream */
1181:
1182: if (this .insert != -1 || this .inode != null)
1183: return insertedToken();
1184:
1185: this .lines = this .in.curline;
1186: this .columns = this .in.curcol;
1187: this .waswhite = false;
1188:
1189: this .txtstart = this .lexsize;
1190: this .txtend = this .lexsize;
1191:
1192: while (true) {
1193: c = this .in.readChar();
1194: if (c == StreamIn.EndOfStream)
1195: break;
1196: if (this .insertspace && mode != IgnoreWhitespace) {
1197: addCharToLexer(' ');
1198: this .waswhite = true;
1199: this .insertspace = false;
1200: }
1201:
1202: /* treat \r\n as \n and \r as \n */
1203:
1204: if (c == '\r') {
1205: c = this .in.readChar();
1206:
1207: if (c != '\n')
1208: this .in.ungetChar(c);
1209:
1210: c = '\n';
1211: }
1212:
1213: addCharToLexer(c);
1214:
1215: switch (this .state) {
1216: case LEX_CONTENT: /* element content */
1217: map = MAP((char) c);
1218:
1219: /*
Discard white space if appropriate. It's cheaper
1221: to do this here rather than in parser methods
1222: for elements that don't have mixed content.
1223: */
1224: if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
1225: && this .lexsize == this .txtstart + 1) {
1226: --this .lexsize;
1227: this .waswhite = false;
1228: this .lines = this .in.curline;
1229: this .columns = this .in.curcol;
1230: continue;
1231: }
1232:
1233: if (c == '<') {
1234: this .state = LEX_GT;
1235: continue;
1236: }
1237:
1238: if ((map & WHITE) != 0) {
1239: /* was previous char white? */
1240: if (this .waswhite) {
1241: if (mode != Preformatted
1242: && mode != IgnoreMarkup) {
1243: --this .lexsize;
1244: this .lines = this .in.curline;
1245: this .columns = this .in.curcol;
1246: }
1247: } else /* prev char wasn't white */
1248: {
1249: this .waswhite = true;
1250: lastc = c;
1251:
1252: if (mode != Preformatted
1253: && mode != IgnoreMarkup && c != ' ')
1254: changeChar((byte) ' ');
1255: }
1256:
1257: continue;
1258: } else if (c == '&' && mode != IgnoreMarkup)
1259: parseEntity(mode);
1260:
1261: /* this is needed to avoid trimming trailing whitespace */
1262: if (mode == IgnoreWhitespace)
1263: mode = MixedContent;
1264:
1265: this .waswhite = false;
1266: continue;
1267:
1268: case LEX_GT: /* < */
1269:
1270: /* check for endtag */
1271: if (c == '/') {
1272: c = this .in.readChar();
1273: if (c == StreamIn.EndOfStream) {
1274: this .in.ungetChar(c);
1275: continue;
1276: }
1277:
1278: addCharToLexer(c);
1279: map = MAP((char) c);
1280:
1281: if ((map & LETTER) != 0) {
1282: this .lexsize -= 3;
1283: this .txtend = this .lexsize;
1284: this .in.ungetChar(c);
1285: this .state = LEX_ENDTAG;
1286: this .lexbuf[this .lexsize] = (byte) '\0'; /* debug */
1287: this .in.curcol -= 2;
1288:
1289: /* if some text before the </ return it now */
1290: if (this .txtend > this .txtstart) {
1291: /* trim space char before end tag */
1292: if (mode == IgnoreWhitespace
1293: && this .lexbuf[this .lexsize - 1] == (byte) ' ') {
1294: this .lexsize -= 1;
1295: this .txtend = this .lexsize;
1296: }
1297:
1298: this .token = newNode(Node.TextNode,
1299: this .lexbuf, this .txtstart,
1300: this .txtend);
1301: return this .token;
1302: }
1303:
1304: continue; /* no text so keep going */
1305: }
1306:
1307: /* otherwise treat as CDATA */
1308: this .waswhite = false;
1309: this .state = LEX_CONTENT;
1310: continue;
1311: }
1312:
1313: if (mode == IgnoreMarkup) {
1314: /* otherwise treat as CDATA */
1315: this .waswhite = false;
1316: this .state = LEX_CONTENT;
1317: continue;
1318: }
1319:
1320: /*
1321: look out for comments, doctype or marked sections
this isn't quite right, but it's getting there ...
1323: */
1324: if (c == '!') {
1325: c = this .in.readChar();
1326:
1327: if (c == '-') {
1328: c = this .in.readChar();
1329:
1330: if (c == '-') {
1331: this .state = LEX_COMMENT; /* comment */
1332: this .lexsize -= 2;
1333: this .txtend = this .lexsize;
1334:
1335: /* if some text before < return it now */
1336: if (this .txtend > this .txtstart) {
1337: this .token = newNode(Node.TextNode,
1338: this .lexbuf, this .txtstart,
1339: this .txtend);
1340: return this .token;
1341: }
1342:
1343: this .txtstart = this .lexsize;
1344: continue;
1345: }
1346:
1347: Report.warning(this , null, null,
1348: Report.MALFORMED_COMMENT);
1349: } else if (c == 'd' || c == 'D') {
1350: this .state = LEX_DOCTYPE; /* doctype */
1351: this .lexsize -= 2;
1352: this .txtend = this .lexsize;
1353: mode = IgnoreWhitespace;
1354:
1355: /* skip until white space or '>' */
1356:
1357: for (;;) {
1358: c = this .in.readChar();
1359:
1360: if (c == StreamIn.EndOfStream || c == '>') {
1361: this .in.ungetChar(c);
1362: break;
1363: }
1364:
1365: map = MAP((char) c);
1366:
1367: if ((map & WHITE) == 0)
1368: continue;
1369:
1370: /* and skip to end of whitespace */
1371:
1372: for (;;) {
1373: c = this .in.readChar();
1374:
1375: if (c == StreamIn.EndOfStream
1376: || c == '>') {
1377: this .in.ungetChar(c);
1378: break;
1379: }
1380:
1381: map = MAP((char) c);
1382:
1383: if ((map & WHITE) != 0)
1384: continue;
1385:
1386: this .in.ungetChar(c);
1387: break;
1388: }
1389:
1390: break;
1391: }
1392:
1393: /* if some text before < return it now */
1394: if (this .txtend > this .txtstart) {
1395: this .token = newNode(Node.TextNode,
1396: this .lexbuf, this .txtstart,
1397: this .txtend);
1398: return this .token;
1399: }
1400:
1401: this .txtstart = this .lexsize;
1402: continue;
1403: } else if (c == '[') {
1404: /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
1405: this .lexsize -= 2;
1406: this .state = LEX_SECTION;
1407: this .txtend = this .lexsize;
1408:
1409: /* if some text before < return it now */
1410: if (this .txtend > this .txtstart) {
1411: this .token = newNode(Node.TextNode,
1412: this .lexbuf, this .txtstart,
1413: this .txtend);
1414: return this .token;
1415: }
1416:
1417: this .txtstart = this .lexsize;
1418: continue;
1419: }
1420:
1421: /* otherwise swallow chars up to and including next '>' */
1422: while (true) {
1423: c = this .in.readChar();
1424: if (c == '>')
1425: break;
if (c == StreamIn.EndOfStream) {
1427: this .in.ungetChar(c);
1428: break;
1429: }
1430: }
1431:
1432: this .lexsize -= 2;
1433: this .lexbuf[this .lexsize] = (byte) '\0';
1434: this .state = LEX_CONTENT;
1435: continue;
1436: }
1437:
1438: /*
1439: processing instructions
1440: */
1441:
1442: if (c == '?') {
1443: this .lexsize -= 2;
1444: this .state = LEX_PROCINSTR;
1445: this .txtend = this .lexsize;
1446:
1447: /* if some text before < return it now */
1448: if (this .txtend > this .txtstart) {
1449: this .token = newNode(Node.TextNode,
1450: this .lexbuf, this .txtstart, this .txtend);
1451: return this .token;
1452: }
1453:
1454: this .txtstart = this .lexsize;
1455: continue;
1456: }
1457:
/* Microsoft ASP, e.g. <% ... server-code ... %> */
1459: if (c == '%') {
1460: this .lexsize -= 2;
1461: this .state = LEX_ASP;
1462: this .txtend = this .lexsize;
1463:
1464: /* if some text before < return it now */
1465: if (this .txtend > this .txtstart) {
1466: this .token = newNode(Node.TextNode,
1467: this .lexbuf, this .txtstart, this .txtend);
1468: return this .token;
1469: }
1470:
1471: this .txtstart = this .lexsize;
1472: continue;
1473: }
1474:
/* Netscape's JSTE, e.g. <# ... server-code ... #> */
1476: if (c == '#') {
1477: this .lexsize -= 2;
1478: this .state = LEX_JSTE;
1479: this .txtend = this .lexsize;
1480:
1481: /* if some text before < return it now */
1482: if (this .txtend > this .txtstart) {
1483: this .token = newNode(Node.TextNode,
1484: this .lexbuf, this .txtstart, this .txtend);
1485: return this .token;
1486: }
1487:
1488: this .txtstart = this .lexsize;
1489: continue;
1490: }
1491:
1492: map = MAP((char) c);
1493:
1494: /* check for start tag */
1495: if ((map & LETTER) != 0) {
1496: this .in.ungetChar(c); /* push back letter */
1497: this .lexsize -= 2; /* discard "<" + letter */
1498: this .txtend = this .lexsize;
1499: this .state = LEX_STARTTAG; /* ready to read tag name */
1500:
1501: /* if some text before < return it now */
1502: if (this .txtend > this .txtstart) {
1503: this .token = newNode(Node.TextNode,
1504: this .lexbuf, this .txtstart, this .txtend);
1505: return this .token;
1506: }
1507:
1508: continue; /* no text so keep going */
1509: }
1510:
1511: /* otherwise treat as CDATA */
1512: this .state = LEX_CONTENT;
1513: this .waswhite = false;
1514: continue;
1515:
1516: case LEX_ENDTAG: /* </letter */
1517: this .txtstart = this .lexsize - 1;
1518: this .in.curcol += 2;
1519: c = parseTagName();
1520: this .token = newNode(Node.EndTag, /* create endtag token */
1521: this .lexbuf, this .txtstart, this .txtend, getString(
1522: this .lexbuf, this .txtstart, this .txtend
1523: - this .txtstart));
1524: this .lexsize = this .txtstart;
1525: this .txtend = this .txtstart;
1526:
1527: /* skip to '>' */
1528: while (c != '>') {
1529: c = this .in.readChar();
1530:
1531: if (c == StreamIn.EndOfStream)
1532: break;
1533: }
1534:
1535: if (c == StreamIn.EndOfStream) {
1536: this .in.ungetChar(c);
1537: continue;
1538: }
1539:
1540: this .state = LEX_CONTENT;
1541: this .waswhite = false;
1542: return this .token; /* the endtag token */
1543:
1544: case LEX_STARTTAG: /* first letter of tagname */
1545: this .txtstart = this .lexsize - 1; /* set txtstart to first letter */
1546: c = parseTagName();
1547: isempty.value = false;
1548: attributes = null;
1549: this .token = newNode((isempty.value ? Node.StartEndTag
1550: : Node.StartTag), this .lexbuf, this .txtstart,
1551: this .txtend, getString(this .lexbuf,
1552: this .txtstart, this .txtend
1553: - this .txtstart));
1554:
1555: /* parse attributes, consuming closing ">" */
1556: if (c != '>') {
1557: if (c == '/')
1558: this .in.ungetChar(c);
1559:
1560: attributes = parseAttrs(isempty);
1561: }
1562:
1563: if (isempty.value)
1564: this .token.type = Node.StartEndTag;
1565:
1566: this .token.attributes = attributes;
1567: this .lexsize = this .txtstart;
1568: this .txtend = this .txtstart;
1569:
1570: /* swallow newline following start tag */
1571: /* special check needed for CRLF sequence */
1572: /* this doesn't apply to empty elements */
1573:
1574: if (expectsContent(this .token)
1575: || this .token.tag == configuration.tt.tagBr) {
1576:
1577: c = this .in.readChar();
1578:
1579: if (c == '\r') {
1580: c = this .in.readChar();
1581:
1582: if (c != '\n')
1583: this .in.ungetChar(c);
1584: } else if (c != '\n' && c != '\f')
1585: this .in.ungetChar(c);
1586:
1587: this .waswhite = true; /* to swallow leading whitespace */
1588: } else
1589: this .waswhite = false;
1590:
1591: this .state = LEX_CONTENT;
1592:
1593: // BEGIN RAVE MODIFICATIONS
1594: //if (this.token.tag == null)
1595: // Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
1596: if (this .token.tag == null) {
1597: if (configuration.inputJspMode
1598: && this .token.element != null
1599: && this .token.element.indexOf(":") != -1) {
1600: // This is probably a JSP tag. We don't want errors on these. We'll
1601: // just treat them as inline tags.
1602: this .configuration.tt
1603: .defineInlineTag(this .token.element);
1604: this .token.tag = configuration.tt
1605: .lookup(this .token.element);
1606: } else {
1607: Report.error(this , null, this .token,
1608: Report.UNKNOWN_ELEMENT);
1609: }
1610: }
1611: // END RAVE MODIFICATIONS
1612: else if (!this .configuration.XmlTags) {
1613: this .versions &= this .token.tag.versions;
1614:
1615: if ((this .token.tag.versions & Dict.VERS_PROPRIETARY) != 0) {
1616: if (!this .configuration.MakeClean
1617: && (this .token.tag == configuration.tt.tagNobr || this .token.tag == configuration.tt.tagWbr))
1618: Report.warning(this , null, this .token,
1619: Report.PROPRIETARY_ELEMENT);
1620: }
1621:
1622: if (this .token.tag.chkattrs != null) {
1623: this .token.checkUniqueAttributes(this );
1624: this .token.tag.chkattrs.check(this , this .token);
1625: } else
1626: this .token.checkAttributes(this );
1627: }
1628:
1629: return this .token; /* return start tag */
1630:
1631: case LEX_COMMENT: /* seen <!-- so look for --> */
1632:
1633: if (c != '-')
1634: continue;
1635:
1636: c = this .in.readChar();
1637: addCharToLexer(c);
1638:
1639: if (c != '-')
1640: continue;
1641:
1642: end_comment: while (true) {
1643: c = this .in.readChar();
1644:
1645: if (c == '>') {
1646: if (badcomment != 0)
1647: Report.warning(this , null, null,
1648: Report.MALFORMED_COMMENT);
1649:
1650: this .txtend = this .lexsize - 2; // AQ 8Jul2000
1651: this .lexbuf[this .lexsize] = (byte) '\0';
1652: this .state = LEX_CONTENT;
1653: this .waswhite = false;
1654: this .token = newNode(Node.CommentTag,
1655: this .lexbuf, this .txtstart, this .txtend);
1656:
1657: /* now look for a line break */
1658:
1659: c = this .in.readChar();
1660:
1661: if (c == '\r') {
1662: c = this .in.readChar();
1663:
1664: if (c != '\n')
1665: this .token.linebreak = true;
1666: }
1667:
1668: if (c == '\n')
1669: this .token.linebreak = true;
1670: else
1671: this .in.ungetChar(c);
1672:
1673: return this .token;
1674: }
1675:
1676: /* note position of first such error in the comment */
1677: if (badcomment == 0) {
1678: this .lines = this .in.curline;
1679: this .columns = this .in.curcol - 3;
1680: }
1681:
1682: badcomment++;
1683: if (this .configuration.FixComments)
1684: this .lexbuf[this .lexsize - 2] = (byte) '=';
1685:
1686: addCharToLexer(c);
1687:
1688: /* if '-' then look for '>' to end the comment */
1689: if (c != '-')
1690: break end_comment;
1691:
1692: }
1693: /* otherwise continue to look for --> */
1694: this .lexbuf[this .lexsize - 2] = (byte) '=';
1695: continue;
1696:
1697: case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
1698: map = MAP((char) c);
1699:
1700: if ((map & WHITE) != 0) {
1701: if (this .waswhite)
1702: this .lexsize -= 1;
1703:
1704: this .waswhite = true;
1705: } else
1706: this .waswhite = false;
1707:
1708: if (c != '>')
1709: continue;
1710:
1711: this .lexsize -= 1;
1712: this .txtend = this .lexsize;
1713: this .lexbuf[this .lexsize] = (byte) '\0';
1714: this .state = LEX_CONTENT;
1715: this .waswhite = false;
1716: this .token = newNode(Node.DocTypeTag, this .lexbuf,
1717: this .txtstart, this .txtend);
1718: /* make a note of the version named by the doctype */
1719: this .doctype = findGivenVersion(this .token);
1720: return this .token;
1721:
1722: case LEX_PROCINSTR: /* seen <? so look for '>' */
1723: /* check for PHP preprocessor instructions <?php ... ?> */
1724:
1725: if (this .lexsize - this .txtstart == 3) {
1726: if ((getString(this .lexbuf, this .txtstart, 3))
1727: .equals("php")) {
1728: this .state = LEX_PHP;
1729: continue;
1730: }
1731: }
1732:
1733: if (this .configuration.XmlPIs) /* insist on ?> as terminator */
1734: {
1735: if (c != '?')
1736: continue;
1737:
1738: /* now look for '>' */
1739: c = this .in.readChar();
1740:
1741: if (c == StreamIn.EndOfStream) {
1742: Report.warning(this , null, null,
1743: Report.UNEXPECTED_END_OF_FILE);
1744: this .in.ungetChar(c);
1745: continue;
1746: }
1747:
1748: addCharToLexer(c);
1749: }
1750:
1751: if (c != '>')
1752: continue;
1753:
1754: this .lexsize -= 1;
1755: this .txtend = this .lexsize;
1756: this .lexbuf[this .lexsize] = (byte) '\0';
1757: this .state = LEX_CONTENT;
1758: this .waswhite = false;
1759: this .token = newNode(Node.ProcInsTag, this .lexbuf,
1760: this .txtstart, this .txtend);
1761: return this .token;
1762:
1763: case LEX_ASP: /* seen <% so look for "%>" */
1764: if (c != '%')
1765: continue;
1766:
1767: /* now look for '>' */
1768: c = this .in.readChar();
1769:
1770: if (c != '>') {
1771: this .in.ungetChar(c);
1772: continue;
1773: }
1774:
1775: this .lexsize -= 1;
1776: this .txtend = this .lexsize;
1777: this .lexbuf[this .lexsize] = (byte) '\0';
1778: this .state = LEX_CONTENT;
1779: this .waswhite = false;
1780: this .token = newNode(Node.AspTag, this .lexbuf,
1781: this .txtstart, this .txtend);
1782: return this .token;
1783:
1784: case LEX_JSTE: /* seen <# so look for "#>" */
1785: if (c != '#')
1786: continue;
1787:
1788: /* now look for '>' */
1789: c = this .in.readChar();
1790:
1791: if (c != '>') {
1792: this .in.ungetChar(c);
1793: continue;
1794: }
1795:
1796: this .lexsize -= 1;
1797: this .txtend = this .lexsize;
1798: this .lexbuf[this .lexsize] = (byte) '\0';
1799: this .state = LEX_CONTENT;
1800: this .waswhite = false;
1801: this .token = newNode(Node.JsteTag, this .lexbuf,
1802: this .txtstart, this .txtend);
1803: return this .token;
1804:
1805: case LEX_PHP: /* seen "<?php" so look for "?>" */
1806: if (c != '?')
1807: continue;
1808:
1809: /* now look for '>' */
1810: c = this .in.readChar();
1811:
1812: if (c != '>') {
1813: this .in.ungetChar(c);
1814: continue;
1815: }
1816:
1817: this .lexsize -= 1;
1818: this .txtend = this .lexsize;
1819: this .lexbuf[this .lexsize] = (byte) '\0';
1820: this .state = LEX_CONTENT;
1821: this .waswhite = false;
1822: this .token = newNode(Node.PhpTag, this .lexbuf,
1823: this .txtstart, this .txtend);
1824: return this .token;
1825:
1826: case LEX_SECTION: /* seen "<![" so look for "]>" */
1827: if (c == '[') {
1828: if (this .lexsize == (this .txtstart + 6)
1829: && (getString(this .lexbuf, this .txtstart, 6))
1830: .equals("CDATA[")) {
1831: this .state = LEX_CDATA;
1832: this .lexsize -= 6;
1833: continue;
1834: }
1835: }
1836:
1837: if (c != ']')
1838: continue;
1839:
1840: /* now look for '>' */
1841: c = this .in.readChar();
1842:
1843: if (c != '>') {
1844: this .in.ungetChar(c);
1845: continue;
1846: }
1847:
1848: this .lexsize -= 1;
1849: this .txtend = this .lexsize;
1850: this .lexbuf[this .lexsize] = (byte) '\0';
1851: this .state = LEX_CONTENT;
1852: this .waswhite = false;
1853: this .token = newNode(Node.SectionTag, this .lexbuf,
1854: this .txtstart, this .txtend);
1855: return this .token;
1856:
1857: case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
1858: if (c != ']')
1859: continue;
1860:
1861: /* now look for ']' */
1862: c = this .in.readChar();
1863:
1864: if (c != ']') {
1865: this .in.ungetChar(c);
1866: continue;
1867: }
1868:
1869: /* now look for '>' */
1870: c = this .in.readChar();
1871:
1872: if (c != '>') {
1873: this .in.ungetChar(c);
1874: continue;
1875: }
1876:
1877: this .lexsize -= 1;
1878: this .txtend = this .lexsize;
1879: this .lexbuf[this .lexsize] = (byte) '\0';
1880: this .state = LEX_CONTENT;
1881: this .waswhite = false;
1882: this .token = newNode(Node.CDATATag, this .lexbuf,
1883: this .txtstart, this .txtend);
1884: return this .token;
1885: }
1886: }
1887:
1888: if (this .state == LEX_CONTENT) /* text string */
1889: {
1890: this .txtend = this .lexsize;
1891:
1892: if (this .txtend > this .txtstart) {
1893: this .in.ungetChar(c);
1894:
1895: if (this .lexbuf[this .lexsize - 1] == (byte) ' ') {
1896: this .lexsize -= 1;
1897: this .txtend = this .lexsize;
1898: }
1899:
1900: this .token = newNode(Node.TextNode, this .lexbuf,
1901: this .txtstart, this .txtend);
1902: return this .token;
1903: }
1904: } else if (this .state == LEX_COMMENT) /* comment */
1905: {
1906: if (c == StreamIn.EndOfStream)
1907: Report.warning(this , null, null,
1908: Report.MALFORMED_COMMENT);
1909:
1910: this .txtend = this .lexsize;
1911: this .lexbuf[this .lexsize] = (byte) '\0';
1912: this .state = LEX_CONTENT;
1913: this .waswhite = false;
1914: this .token = newNode(Node.CommentTag, this .lexbuf,
1915: this .txtstart, this .txtend);
1916: return this .token;
1917: }
1918:
1919: return null;
1920: }
1921:
1922: /*
1923: parser for ASP within start tags
1924:
Some people use ASP to customize attributes.
Tidy isn't really well suited to dealing with ASP.
This is a workaround for attributes, but won't
deal with the case where the ASP is used to tailor
the attribute value. Here is an example of a
workaround for using ASP in attribute values:
1931:
1932: href="<%=rsSchool.Fields("ID").Value%>"
1933:
1934: where the ASP that generates the attribute value
1935: is masked from Tidy by the quotemarks.
1936:
1937: */
1938:
1939: public Node parseAsp() {
1940: int c;
1941: Node asp = null;
1942:
1943: this .txtstart = this .lexsize;
1944:
1945: for (;;) {
1946: c = this .in.readChar();
1947: addCharToLexer(c);
1948:
1949: if (c != '%')
1950: continue;
1951:
1952: c = this .in.readChar();
1953: addCharToLexer(c);
1954:
1955: if (c == '>')
1956: break;
1957: }
1958:
1959: this .lexsize -= 2;
1960: this .txtend = this .lexsize;
1961:
1962: if (this .txtend > this .txtstart)
1963: asp = newNode(Node.AspTag, this .lexbuf, this .txtstart,
1964: this .txtend);
1965:
1966: this .txtstart = this .txtend;
1967: return asp;
1968: }
1969:
1970: /*
1971: PHP is like ASP but is based upon XML
1972: processing instructions, e.g. <?php ... ?>
1973: */
1974: public Node parsePhp() {
1975: int c;
1976: Node php = null;
1977:
1978: this .txtstart = this .lexsize;
1979:
1980: for (;;) {
1981: c = this .in.readChar();
1982: addCharToLexer(c);
1983:
1984: if (c != '?')
1985: continue;
1986:
1987: c = this .in.readChar();
1988: addCharToLexer(c);
1989:
1990: if (c == '>')
1991: break;
1992: }
1993:
1994: this .lexsize -= 2;
1995: this .txtend = this .lexsize;
1996:
1997: if (this .txtend > this .txtstart)
1998: php = newNode(Node.PhpTag, this .lexbuf, this .txtstart,
1999: this .txtend);
2000:
2001: this .txtstart = this .txtend;
2002: return php;
2003: }
2004:
2005: /* consumes the '>' terminating start tags */
2006: public String parseAttribute(MutableBoolean isempty,
2007: MutableObject asp, MutableObject php) {
2008: int start = 0;
2009: // int len = 0; Removed by BUGFIX for 126265
2010: short map;
2011: String attr;
2012: int c = 0;
2013:
2014: asp.setObject(null); /* clear asp pointer */
2015: php.setObject(null); /* clear php pointer */
2016: /* skip white space before the attribute */
2017:
2018: for (;;) {
2019: c = this .in.readChar();
2020:
2021: if (c == '/') {
2022: c = this .in.readChar();
2023:
2024: if (c == '>') {
2025: isempty.value = true;
2026: return null;
2027: }
2028:
2029: this .in.ungetChar(c);
2030: c = '/';
2031: break;
2032: }
2033:
2034: if (c == '>')
2035: return null;
2036:
2037: if (c == '<') {
2038: c = this .in.readChar();
2039:
2040: if (c == '%') {
2041: asp.setObject(parseAsp());
2042: return null;
2043: } else if (c == '?') {
2044: php.setObject(parsePhp());
2045: return null;
2046: }
2047:
2048: this .in.ungetChar(c);
2049: Report.attrError(this , this .token, null,
2050: Report.UNEXPECTED_GT);
2051: return null;
2052: }
2053:
2054: if (c == '"' || c == '\'') {
2055: Report.attrError(this , this .token, null,
2056: Report.UNEXPECTED_QUOTEMARK);
2057: continue;
2058: }
2059:
2060: if (c == StreamIn.EndOfStream) {
2061: Report.attrError(this , this .token, null,
2062: Report.UNEXPECTED_END_OF_FILE);
2063: this .in.ungetChar(c);
2064: return null;
2065: }
2066:
2067: map = MAP((char) c);
2068:
2069: if ((map & WHITE) == 0)
2070: break;
2071: }
2072:
2073: start = this .lexsize;
2074:
2075: for (;;) {
2076: /* but push back '=' for parseValue() */
2077: if (c == '=' || c == '>') {
2078: this .in.ungetChar(c);
2079: break;
2080: }
2081:
2082: if (c == '<' || c == StreamIn.EndOfStream) {
2083: this .in.ungetChar(c);
2084: break;
2085: }
2086:
2087: map = MAP((char) c);
2088:
2089: if ((map & WHITE) != 0)
2090: break;
2091:
2092: /* what should be done about non-namechar characters? */
2093: /* currently these are incorporated into the attr name */
2094: // BEGIN RAVE MODIFICATIONS
2095: if (this .configuration.inputJspMode) { // don't change case of attributes
2096: ;
2097: } else
2098: // END RAVE MODIFICATIONS
2099: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0)
2100: c += (int) ('a' - 'A');
2101:
2102: // ++len; Removed by BUGFIX for 126265
2103: addCharToLexer(c);
2104:
2105: c = this .in.readChar();
2106: }
2107:
2108: // Following line added by GLP to fix BUG 126265. This is a temporary comment
2109: // and should be removed when Tidy is fixed.
2110: int len = this .lexsize - start;
2111: attr = (len > 0 ? getString(this .lexbuf, start, len) : null);
2112: this .lexsize = start;
2113:
2114: return attr;
2115: }
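/*
Editorial sketch (not in the original source): for a start tag such as

    <td align=center valign="top">

successive parseAttribute() calls return "align", then "valign", then
null once the terminating '>' is reached (isempty.value is set to true
instead when the tag ends in "/>"). Attribute names are folded to
lower case here unless XmlTags or the Rave inputJspMode option is in
effect; the values themselves are read by parseValue() below.
*/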
2116:
2117: /*
invoked when '<' is seen in place of an attribute value;
unless the content is ASP, PHP or Tango, the scan also
terminates on whitespace. This routine recognizes ' and "
quoted strings.
2121: */
2122: public int parseServerInstruction() {
2123: int c, map, delim = '"';
2124: boolean isrule = false;
2125:
2126: c = this .in.readChar();
2127: addCharToLexer(c);
2128:
2129: /* check for ASP, PHP or Tango */
2130: if (c == '%' || c == '?' || c == '@')
2131: isrule = true;
2132:
2133: for (;;) {
2134: c = this .in.readChar();
2135:
2136: if (c == StreamIn.EndOfStream)
2137: break;
2138:
2139: if (c == '>') {
2140: if (isrule)
2141: addCharToLexer(c);
2142: else
2143: this .in.ungetChar(c);
2144:
2145: break;
2146: }
2147:
2148: /* if not recognized as ASP, PHP or Tango */
2149: /* then also finish value on whitespace */
2150: if (!isrule) {
2151: map = MAP((char) c);
2152:
2153: if ((map & WHITE) != 0)
2154: break;
2155: }
2156:
2157: addCharToLexer(c);
2158:
if (c == '"') {
/* copy a double-quoted substring; also stop at end of input
so an unterminated string cannot loop forever */
do {
c = this.in.readChar();
addCharToLexer(c);
} while (c != '"' && c != StreamIn.EndOfStream);
delim = '\'';
continue;
}

if (c == '\'') {
do {
c = this.in.readChar();
addCharToLexer(c);
} while (c != '\'' && c != StreamIn.EndOfStream);
}
2174: }
2175:
2176: return delim;
2177: }
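/*
Editorial sketch (not in the original source): this routine is reached
from parseValue() when an unquoted value starts with '<', e.g.

    src=<%=imageUrl%>

The leading '%' (or '?' or '@') marks it as a server-side rule, so the
text is copied through to the closing '>', quoted substrings included;
for anything else the scan also stops at the first whitespace
character. The return value is the quote character to record for the
value: '"' by default, or '\'' when the instruction itself contained a
double-quoted string, presumably so the value can later be written out
without escaping.
*/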
2178:
2179: /* values start with "=" or " = " etc. */
2180: /* doesn't consume the ">" at end of start tag */
2181:
2182: public String parseValue(String name, boolean foldCase,
2183: MutableBoolean isempty, MutableInteger pdelim) {
2184: int len = 0;
2185: int start;
2186: short map;
2187: boolean seen_gt = false;
2188: boolean munge = true;
2189: int c = 0;
2190: int lastc, delim, quotewarning;
2191: String value;
2192:
2193: delim = 0;
2194: pdelim.value = (int) '"';
2195:
2196: /*
2197: Henry Zrepa reports that some folk are using the
2198: embed element with script attributes where newlines
2199: are significant and must be preserved
2200: */
2201: if (configuration.LiteralAttribs)
2202: munge = false;
2203:
2204: /* skip white space before the '=' */
2205:
2206: for (;;) {
2207: c = this .in.readChar();
2208:
2209: if (c == StreamIn.EndOfStream) {
2210: this .in.ungetChar(c);
2211: break;
2212: }
2213:
2214: map = MAP((char) c);
2215:
2216: if ((map & WHITE) == 0)
2217: break;
2218: }
2219:
2220: /*
2221: c should be '=' if there is a value
2222: other legal possibilities are white
2223: space, '/' and '>'
2224: */
2225:
2226: if (c != '=') {
2227: this .in.ungetChar(c);
2228: return null;
2229: }
2230:
2231: /* skip white space after '=' */
2232:
2233: for (;;) {
2234: c = this .in.readChar();
2235:
2236: if (c == StreamIn.EndOfStream) {
2237: this .in.ungetChar(c);
2238: break;
2239: }
2240:
2241: map = MAP((char) c);
2242:
2243: if ((map & WHITE) == 0)
2244: break;
2245: }
2246:
2247: /* check for quote marks */
2248:
2249: if (c == '"' || c == '\'')
2250: delim = c;
2251: else if (c == '<') {
2252: start = this .lexsize;
2253: addCharToLexer(c);
2254: pdelim.value = parseServerInstruction();
2255: len = this .lexsize - start;
2256: this .lexsize = start;
2257: return (len > 0 ? getString(this .lexbuf, start, len) : null);
2258: } else
2259: this .in.ungetChar(c);
2260:
2261: /*
2262: and read the value string
2263: check for quote mark if needed
2264: */
2265:
2266: quotewarning = 0;
2267: start = this .lexsize;
2268: c = '\0';
2269:
2270: for (;;) {
2271: lastc = c; /* track last character */
2272: c = this .in.readChar();
2273:
2274: if (c == StreamIn.EndOfStream) {
2275: Report.attrError(this , this .token, null,
2276: Report.UNEXPECTED_END_OF_FILE);
2277: this .in.ungetChar(c);
2278: break;
2279: }
2280:
2281: if (delim == (char) 0) {
2282: if (c == '>') {
2283: this .in.ungetChar(c);
2284: break;
2285: }
2286:
2287: if (c == '"' || c == '\'') {
2288: Report.attrError(this , this .token, null,
2289: Report.UNEXPECTED_QUOTEMARK);
2290: break;
2291: }
2292:
2293: if (c == '<') {
2294: /* this.in.ungetChar(c); */
2295: Report.attrError(this , this .token, null,
2296: Report.UNEXPECTED_GT);
2297: /* break; */
2298: }
2299:
2300: /*
For cases like <br clear=all/> we need to avoid treating "/>" as
part of the attribute value; however, care is needed not to treat
<a href=http://www.acme.com/> the same way, which would map the
<a> tag to <a href="http://www.acme.com"/>
2305: */
2306: if (c == '/') {
2307: /* peek ahead in case of /> */
2308: c = this .in.readChar();
2309:
2310: if (c == '>'
2311: && !AttributeTable
2312: .getDefaultAttributeTable().isUrl(
2313: name)) {
2314: isempty.value = true;
2315: this .in.ungetChar(c);
2316: break;
2317: }
2318:
2319: /* unget peeked char */
2320: this .in.ungetChar(c);
2321: c = '/';
2322: }
2323: } else /* delim is '\'' or '"' */
2324: {
2325: if (c == delim)
2326: break;
2327:
2328: /* treat CRLF, CR and LF as single line break */
2329:
2330: if (c == '\r') {
2331: c = this .in.readChar();
2332: if (c != '\n')
2333: this .in.ungetChar(c);
2334:
2335: c = '\n';
2336: }
2337:
2338: if (c == '\n' || c == '<' || c == '>')
2339: ++quotewarning;
2340:
2341: if (c == '>')
2342: seen_gt = true;
2343: }
2344:
2345: if (c == '&') {
2346: addCharToLexer(c);
2347: parseEntity((short) 0);
2348: continue;
2349: }
2350:
2351: /*
2352: kludge for JavaScript attribute values
2353: with line continuations in string literals
2354: */
2355: if (c == '\\') {
2356: c = this .in.readChar();
2357:
2358: if (c != '\n') {
2359: this .in.ungetChar(c);
2360: c = '\\';
2361: }
2362: }
2363:
2364: map = MAP((char) c);
2365:
2366: if ((map & WHITE) != 0) {
2367: if (delim == (char) 0)
2368: break;
2369:
2370: if (munge) {
2371: c = ' ';
2372:
2373: if (lastc == ' ')
2374: continue;
2375: }
2376: } else if (foldCase && (map & UPPERCASE) != 0)
2377: c += (int) ('a' - 'A');
2378:
2379: addCharToLexer(c);
2380: }
2381:
2382: if (quotewarning > 10 && seen_gt && munge) {
2383: /*
there is almost certainly a missing trailing quote mark,
as we have seen too many newlines, < or > characters.
2386:
2387: an exception is made for Javascript attributes and the
2388: javascript URL scheme which may legitimately include < and >
2389: */
2390: if (!AttributeTable.getDefaultAttributeTable().isScript(
2391: name)
2392: && !(AttributeTable.getDefaultAttributeTable()
2393: .isUrl(name) && (getString(this .lexbuf,
2394: start, 11)).equals("javascript:")))
2395: Report.error(this , null, null,
2396: Report.SUSPECTED_MISSING_QUOTE);
2397: }
2398:
2399: len = this .lexsize - start;
2400: this .lexsize = start;
2401:
2402: if (len > 0 || delim != 0)
2403: value = getString(this .lexbuf, start, len);
2404: else
2405: value = null;
2406:
2407: /* note delimiter if given */
2408: if (delim != 0)
2409: pdelim.value = delim;
2410: else
2411: pdelim.value = (int) '"';
2412:
2413: return value;
2414: }
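/*
Editorial sketch (not in the original source): unless LiteralAttribs is
set, parseValue() normalises the text it collects: CRLF and CR become
LF, runs of whitespace inside a quoted value collapse to a single
space, entities are expanded via parseEntity(), and a value with many
newlines, '<' or '>' before the closing quote is reported as a
suspected missing quote mark. For example:

    <img alt="a   &amp;
    b">                    yields the attribute value "a & b"
*/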
2415:
2416: /* attr must be non-null */
2417: public static boolean isValidAttrName(String attr) {
2418: short map;
2419: char c;
2420: int i;
2421:
2422: /* first character should be a letter */
2423: c = attr.charAt(0);
2424: map = MAP(c);
2425:
if ((map & LETTER) == 0)
2427: return false;
2428:
2429: /* remaining characters should be namechars */
2430: for (i = 1; i < attr.length(); i++) {
2431: c = attr.charAt(i);
2432: map = MAP(c);
2433:
2434: if ((map & NAMECHAR) != 0)
2435: continue;
2436:
2437: return false;
2438: }
2439:
2440: return true;
2441: }
2442:
2443: /* swallows closing '>' */
2444:
2445: public AttVal parseAttrs(MutableBoolean isempty) {
2446: AttVal av, list;
2447: String attribute, value;
2448: MutableInteger delim = new MutableInteger();
2449: MutableObject asp = new MutableObject();
2450: MutableObject php = new MutableObject();
2451:
2452: list = null;
2453:
2454: for (; !endOfInput();) {
2455: attribute = parseAttribute(isempty, asp, php);
2456:
2457: if (attribute == null) {
2458: /* check if attributes are created by ASP markup */
2459: if (asp.getObject() != null) {
2460: av = new AttVal(list, null, (Node) asp.getObject(),
2461: null, '\0', null, null);
2462: list = av;
2463: continue;
2464: }
2465:
2466: /* check if attributes are created by PHP markup */
2467: if (php.getObject() != null) {
2468: av = new AttVal(list, null, null, (Node) php
2469: .getObject(), '\0', null, null);
2470: list = av;
2471: continue;
2472: }
2473:
2474: break;
2475: }
2476:
2477: value = parseValue(attribute, false, isempty, delim);
2478:
2479: if (attribute != null && isValidAttrName(attribute)) {
2480: av = new AttVal(list, null, null, null, delim.value,
2481: attribute, value);
2482: av.dict = AttributeTable.getDefaultAttributeTable()
2483: .findAttribute(av);
2484: list = av;
2485: } else {
2486: av = new AttVal(null, null, null, null, 0, attribute,
2487: value);
2488: Report.attrError(this , this .token, value,
2489: Report.BAD_ATTRIBUTE_VALUE);
2490: }
2491: }
2492:
2493: return list;
2494: }
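/*
Editorial sketch (not in the original source): parseAttrs() is called
once a tag name has been read and returns the attributes as a singly
linked AttVal list whose head is the attribute parsed last. Assuming a
hypothetical Lexer "lexer" positioned just after a tag name, and the
usual AttVal fields (next, attribute, value), the result can be walked
like this:

    MutableBoolean isempty = new MutableBoolean();
    for (AttVal av = lexer.parseAttrs(isempty); av != null; av = av.next)
        System.out.println(av.attribute + "=" + av.value);
*/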
2495:
2496: /*
2497: push a copy of an inline node onto stack
2498: but don't push if implicit or OBJECT or APPLET
2499: (implicit tags are ones generated from the istack)
2500:
2501: One issue arises with pushing inlines when
2502: the tag is already pushed. For instance:
2503:
2504: <p><em>text
2505: <p><em>more text
2506:
2507: Shouldn't be mapped to
2508:
2509: <p><em>text</em></p>
2510: <p><em><em>more text</em></em>
2511: */
2512: public void pushInline(Node node) {
2513: IStack is;
2514:
2515: if (node.implicit)
2516: return;
2517:
2518: if (node.tag == null)
2519: return;
2520:
2521: if ((node.tag.model & Dict.CM_INLINE) == 0)
2522: return;
2523:
2524: if ((node.tag.model & Dict.CM_OBJECT) != 0)
2525: return;
2526:
2527: if (node.tag != configuration.tt.tagFont && isPushed(node))
2528: return;
2529:
// push a copy of the node's tag, element name and attributes;
// the Stack grows as needed
2531: is = new IStack();
2532: is.tag = node.tag;
2533: is.element = node.element;
2534: if (node.attributes != null)
2535: is.attributes = cloneAttributes(node.attributes);
2536: this .istack.push(is);
2537: }
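/*
Editorial sketch (not in the original source): for the fragment in the
comment above, the <em> of the first paragraph is pushed onto istack;
when the second "<p><em>" is seen, isPushed() finds the earlier <em>
and pushInline() returns without pushing a duplicate. FONT is the one
exception and is always pushed, presumably because nested font
elements are legitimate and each carries its own attributes.
*/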
2538:
2539: /* pop inline stack */
2540: public void popInline(Node node) {
2541: AttVal av;
2542: IStack is;
2543:
2544: if (node != null) {
2545:
2546: if (node.tag == null)
2547: return;
2548:
2549: if ((node.tag.model & Dict.CM_INLINE) == 0)
2550: return;
2551:
2552: if ((node.tag.model & Dict.CM_OBJECT) != 0)
2553: return;
2554:
2555: // if node is </a> then pop until we find an <a>
2556: if (node.tag == configuration.tt.tagA) {
2557:
2558: while (this .istack.size() > 0) {
2559: is = (IStack) this .istack.pop();
2560: if (is.tag == configuration.tt.tagA) {
2561: break;
2562: }
2563: }
2564:
2565: if (this .insert >= this .istack.size())
2566: this .insert = -1;
2567: return;
2568: }
2569: }
2570:
2571: if (this .istack.size() > 0) {
2572: is = (IStack) this .istack.pop();
2573: if (this .insert >= this .istack.size())
2574: this .insert = -1;
2575: }
2576: }
2577:
2578: public boolean isPushed(Node node) {
2579: int i;
2580: IStack is;
2581:
2582: for (i = this .istack.size() - 1; i >= 0; --i) {
2583: is = (IStack) this .istack.elementAt(i);
2584: if (is.tag == node.tag)
2585: return true;
2586: }
2587:
2588: return false;
2589: }
2590:
2591: /*
2592: This has the effect of inserting "missing" inline
2593: elements around the contents of blocklevel elements
2594: such as P, TD, TH, DIV, PRE etc. This procedure is
2595: called at the start of ParseBlock. when the inline
2596: stack is not empty, as will be the case in:
2597:
2598: <i><h1>italic heading</h1></i>
2599:
2600: which is then treated as equivalent to
2601:
2602: <h1><i>italic heading</i></h1>
2603:
2604: This is implemented by setting the lexer into a mode
2605: where it gets tokens from the inline stack rather than
2606: from the input stream.
2607: */
2608: public int inlineDup(Node node) {
2609: int n;
2610:
2611: n = this .istack.size() - this .istackbase;
2612: if (n > 0) {
2613: this .insert = this .istackbase;
2614: this .inode = node;
2615: }
2616:
2617: return n;
2618: }
2619:
2620: public Node insertedToken() {
2621: Node node;
2622: IStack is;
2623: int n;
2624:
// insert will only be -1 here if inode is non-null
2626: if (this .insert == -1) {
2627: node = this .inode;
2628: this .inode = null;
2629: return node;
2630: }
2631:
// if this is the "latest" node then update
2633: // the position, otherwise use current values
2634:
2635: if (this .inode == null) {
2636: this .lines = this .in.curline;
2637: this .columns = this .in.curcol;
2638: }
2639:
2640: node = newNode(Node.StartTag, this .lexbuf, this .txtstart,
2641: this .txtend); // GLP: Bugfix 126261. Remove when this change
2642: // is fixed in istack.c in the original Tidy
2643: node.implicit = true;
2644: is = (IStack) this .istack.elementAt(this .insert);
2645: node.element = is.element;
2646: node.tag = is.tag;
2647: if (is.attributes != null)
2648: node.attributes = cloneAttributes(is.attributes);
2649:
2650: // advance lexer to next item on the stack
2651: n = this .insert;
2652:
2653: // and recover state if we have reached the end
2654: if (++n < this .istack.size()) {
2655: this .insert = n;
2656: } else {
2657: this .insert = -1;
2658: }
2659:
2660: return node;
2661: }
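/*
Editorial sketch (not in the original source): inlineDup() and
insertedToken() together implement the mode described above. For
<i><h1>italic heading</h1></i> the <i> entry is still on the inline
stack when the H1 block is parsed, so the block parser would do
roughly:

    if (lexer.inlineDup(null) > 0) {
        Node inline = lexer.insertedToken();   // implicit <i> start tag
        // ... the parser then places this node inside the H1 ...
    }

Each insertedToken() call clones one stack entry (tag, element name,
attributes) as an implicit start tag and advances insert; once the
stack entries are exhausted the deferred inode, if any, is returned
one last time.
*/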
2662:
2663: /* AQ: Try this for speed optimization */
2664: public static int wstrcasecmp(String s1, String s2) {
2665: return (s1.equalsIgnoreCase(s2) ? 0 : 1);
2666: }
2667:
2668: public static int wstrcaselexcmp(String s1, String s2) {
2669: char c;
2670: int i = 0;
2671:
2672: while (i < s1.length() && i < s2.length()) {
2673: c = s1.charAt(i);
2674: if (toLower(c) != toLower(s2.charAt(i))) {
2675: break;
2676: }
2677: i += 1;
2678: }
2679: if (i == s1.length() && i == s2.length()) {
2680: return 0;
2681: } else if (i == s1.length()) {
2682: return -1;
2683: } else if (i == s2.length()) {
2684: return 1;
2685: } else {
2686: return (s1.charAt(i) > s2.charAt(i) ? 1 : -1);
2687: }
2688: }
2689:
2690: public static boolean wsubstr(String s1, String s2) {
2691: int i;
2692: int len1 = s1.length();
2693: int len2 = s2.length();
2694:
for (i = 0; i <= len1 - len2; ++i) {
/* compare s2 against the len2-char window at i, not the whole suffix */
if (s1.regionMatches(true, i, s2, 0, len2))
return true;
}
2699:
2700: return false;
2701: }
2702:
2703: public boolean canPrune(Node element) {
2704: if (element.type == Node.TextNode)
2705: return true;
2706:
2707: if (element.content != null)
2708: return false;
2709:
2710: if (element.tag == configuration.tt.tagA
2711: && element.attributes != null)
2712: return false;
2713:
2714: if (element.tag == configuration.tt.tagP
2715: && !this .configuration.DropEmptyParas)
2716: return false;
2717:
2718: if (element.tag == null)
2719: return false;
2720:
2721: if ((element.tag.model & Dict.CM_ROW) != 0)
2722: return false;
2723:
2724: if (element.tag == configuration.tt.tagApplet)
2725: return false;
2726:
2727: if (element.tag == configuration.tt.tagObject)
2728: return false;
2729:
2730: if (element.attributes != null
2731: && (element.getAttrByName("id") != null || element
2732: .getAttrByName("name") != null))
2733: return false;
2734:
2735: return true;
2736: }
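/*
Editorial sketch (not in the original source): examples of how the
tests above decide for empty elements:

    <p></p>              prunable only when DropEmptyParas is set
    <a name="x"></a>     kept: an anchor with attributes
    <span id="y"></span> kept: id/name values may be link targets
    <unknown></unknown>  kept: no tag entry in the dictionary
    <b></b>              prunable
*/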
2737:
2738: /* duplicate name attribute as an id */
2739: public void fixId(Node node) {
2740: AttVal name = node.getAttrByName("name");
2741: AttVal id = node.getAttrByName("id");
2742:
2743: if (name != null) {
2744: if (id != null) {
2745: if (!id.value.equals(name.value))
2746: Report.attrError(this , node, "name",
2747: Report.ID_NAME_MISMATCH);
2748: } else if (this .configuration.XmlOut)
2749: node.addAttribute("id", name.value);
2750: }
2751: }
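/*
Editorial sketch (not in the original source): with XmlOut enabled this
turns

    <a name="top">    into    <a name="top" id="top">

so the anchor stays addressable where fragment identifiers are defined
by id rather than name; if both attributes are already present with
different values, an ID_NAME_MISMATCH warning is reported instead.
*/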
2752:
2753: /*
2754: defer duplicates when entering a table or other
2755: element where the inlines shouldn't be duplicated
2756: */
2757: public void deferDup() {
2758: this .insert = -1;
2759: this .inode = null;
2760: }
2761:
2762: /* Private methods and fields */
2763:
2764: /* lexer char types */
2765: private static final short DIGIT = 1;
2766: private static final short LETTER = 2;
2767: private static final short NAMECHAR = 4;
2768: private static final short WHITE = 8;
2769: private static final short NEWLINE = 16;
2770: private static final short LOWERCASE = 32;
2771: private static final short UPPERCASE = 64;
2772:
2773: /* lexer GetToken states */
2774:
2775: private static final short LEX_CONTENT = 0;
2776: private static final short LEX_GT = 1;
2777: private static final short LEX_ENDTAG = 2;
2778: private static final short LEX_STARTTAG = 3;
2779: private static final short LEX_COMMENT = 4;
2780: private static final short LEX_DOCTYPE = 5;
2781: private static final short LEX_PROCINSTR = 6;
2782: private static final short LEX_ENDCOMMENT = 7;
2783: private static final short LEX_CDATA = 8;
2784: private static final short LEX_SECTION = 9;
2785: private static final short LEX_ASP = 10;
2786: private static final short LEX_JSTE = 11;
2787: private static final short LEX_PHP = 12;
2788:
2789: /* used to classify chars for lexical purposes */
2790: private static short[] lexmap = new short[128];
2791:
2792: private static void mapStr(String str, short code) {
2793: int j;
2794:
2795: for (int i = 0; i < str.length(); i++) {
2796: j = (int) str.charAt(i);
2797: lexmap[j] |= code;
2798: }
2799: }
2800:
2801: static {
2802: mapStr("\r\n\f", (short) (NEWLINE | WHITE));
2803: mapStr(" \t", WHITE);
2804: mapStr("-.:_", NAMECHAR);
2805: mapStr("0123456789", (short) (DIGIT | NAMECHAR));
2806: mapStr("abcdefghijklmnopqrstuvwxyz", (short) (LOWERCASE
2807: | LETTER | NAMECHAR));
2808: mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE
2809: | LETTER | NAMECHAR));
2810: }
2811:
2812: private static short MAP(char c) {
2813: return ((int) c < 128 ? lexmap[(int) c] : 0);
2814: }
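/*
Editorial sketch (not in the original source): lexmap packs the
character classes above into one bit set per ASCII code point, so a
single lookup answers several questions at once:

    MAP('Z') == (UPPERCASE | LETTER | NAMECHAR)   // true
    MAP('-') == NAMECHAR                          // true
    (MAP('\t') & WHITE) != 0                      // true
    MAP('~') == 0                                 // unmapped ASCII
    MAP('\u00e9') == 0                            // non-ASCII maps to 0
*/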
2815:
2816: private static boolean isWhite(char c) {
2817: short m = MAP(c);
2818:
2819: return (m & WHITE) != 0;
2820: }
2821:
2822: private static boolean isDigit(char c) {
2823: short m;
2824:
2825: m = MAP(c);
2826:
2827: return (m & DIGIT) != 0;
2828: }
2829:
2830: private static boolean isLetter(char c) {
2831: short m;
2832:
2833: m = MAP(c);
2834:
2835: return (m & LETTER) != 0;
2836: }
2837:
2838: private static char toLower(char c) {
2839: short m = MAP(c);
2840:
2841: if ((m & UPPERCASE) != 0)
2842: c = (char) ((int) c + (int) 'a' - (int) 'A');
2843:
2844: return c;
2845: }
2846:
2847: private static char toUpper(char c) {
2848: short m = MAP(c);
2849:
2850: if ((m & LOWERCASE) != 0)
2851: c = (char) ((int) c + (int) 'A' - (int) 'a');
2852:
2853: return c;
2854: }
2855:
2856: public static char foldCase(char c, boolean tocaps, boolean xmlTags) {
2857: short m;
2858:
2859: if (!xmlTags) {
2860: m = MAP(c);
2861:
2862: if (tocaps) {
2863: if ((m & LOWERCASE) != 0)
2864: c = (char) ((int) c + (int) 'A' - (int) 'a');
2865: } else /* force to lower case */
2866: {
2867: if ((m & UPPERCASE) != 0)
2868: c = (char) ((int) c + (int) 'a' - (int) 'A');
2869: }
2870: }
2871:
2872: return c;
2873: }
2874:
2875: private static class W3CVersionInfo {
2876: String name;
2877: String voyagerName;
2878: String profile;
2879: short code;
2880:
2881: public W3CVersionInfo(String name, String voyagerName,
2882: String profile, short code) {
2883: this .name = name;
2884: this .voyagerName = voyagerName;
2885: this .profile = profile;
2886: this .code = code;
2887: }
2888: }
2889:
2890: /* the 3 URIs for the XHTML 1.0 DTDs */
2891: private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
2892: private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
2893: private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
2894:
2895: private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
2896:
2897: private static Lexer.W3CVersionInfo[] W3CVersion = {
2898: new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict",
2899: voyager_strict, Dict.VERS_HTML40_STRICT),
2900: new W3CVersionInfo("HTML 4.01 Transitional",
2901: "XHTML 1.0 Transitional", voyager_loose,
2902: Dict.VERS_HTML40_LOOSE),
2903: new W3CVersionInfo("HTML 4.01 Frameset",
2904: "XHTML 1.0 Frameset", voyager_frameset,
2905: Dict.VERS_FRAMES),
2906: new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict",
2907: voyager_strict, Dict.VERS_HTML40_STRICT),
2908: new W3CVersionInfo("HTML 4.0 Transitional",
2909: "XHTML 1.0 Transitional", voyager_loose,
2910: Dict.VERS_HTML40_LOOSE),
2911: new W3CVersionInfo("HTML 4.0 Frameset",
2912: "XHTML 1.0 Frameset", voyager_frameset,
2913: Dict.VERS_FRAMES),
2914: new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional",
2915: voyager_loose, Dict.VERS_HTML32),
2916: new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict",
2917: voyager_strict, Dict.VERS_HTML20) };
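/*
Editorial sketch (not in the original source): the table above pairs
each HTML version bit with its display name, its XHTML 1.0 ("voyager")
equivalent and the corresponding DTD URI. A lookup by version code
could be written roughly as follows (versionName is a hypothetical
helper, not part of this class):

    private static String versionName(short code, boolean isvoyager) {
        for (int i = 0; i < W3CVersion.length; i++)
            if (W3CVersion[i].code == code)
                return isvoyager ? W3CVersion[i].voyagerName
                                 : W3CVersion[i].name;
        return null;
    }

The Lexer's doctype reporting presumably consults this array in much
the same way.
*/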
2918:
2919: }
|