0001: /*
0002: * @(#)Lexer.java 1.11 2000/08/16
0003: *
0004: */
0005:
0006: package org.w3c.tidy;
0007:
0008: /**
0009: *
* Lexer for the HTML parser
0011: *
0012: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
0013: * See Tidy.java for the copyright notice.
0014: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
0015: * HTML Tidy Release 4 Aug 2000</a>
0016: *
0017: * @author Dave Raggett <dsr@w3.org>
0018: * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0019: * @version 1.0, 1999/05/22
0020: * @version 1.0.1, 1999/05/29
0021: * @version 1.1, 1999/06/18 Java Bean
0022: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
0023: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
0024: * @version 1.4, 1999/09/04 DOM support
0025: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
0026: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
0027: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
0028: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
0029: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
0030: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
0031: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
0032: */
0033:
0034: /*
Given a file stream fp, it returns a sequence of tokens.
0036:
0037: GetToken(fp) gets the next token
0038: UngetToken(fp) provides one level undo
0039:
0040: The tags include an attribute list:
0041:
0042: - linked list of attribute/value nodes
0043: - each node has 2 null-terminated strings.
0044: - entities are replaced in attribute values
0045:
White space is compacted when not in preformatted mode:
leading white space is discarded and subsequent white
space sequences are compacted to single space chars.
0050:
If XmlTags is no then tag names and attribute names
are folded to lower case.
0053:
0054: Not yet done:
0055: - Doctype subset and marked sections
0056: */
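
/*
   A minimal usage sketch (not part of this class): a parser typically drives
   the lexer in a loop and pushes a token back when it belongs to the caller.
   The names "input" and "config" below are assumed, not defined here.

       Lexer lexer = new Lexer(input, config);
       Node tok;
       while ((tok = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
           if (tok.type == Node.EndTag) {
               lexer.ungetToken();   // one level of undo; the caller re-reads it
               break;
           }
           // ... handle start tags, text nodes, comments, etc.
       }
*/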
0057:
0058: import java.io.PrintWriter;
0059: import java.util.Stack;
0060: import java.util.Vector;
0061:
0062: public class Lexer {
0063:
0064: public StreamIn in; /* file stream */
0065: public PrintWriter errout; /* error output stream */
0066: public short badAccess; /* for accessibility errors */
0067: public short badLayout; /* for bad style errors */
0068: public short badChars; /* for bad char encodings */
0069: public short badForm; /* for mismatched/mispositioned form tags */
0070: public short warnings; /* count of warnings in this document */
0071: public short errors; /* count of errors */
0072: public int lines; /* lines seen */
0073: public int columns; /* at start of current token */
0074: public boolean waswhite; /* used to collapse contiguous white space */
0075: public boolean pushed; /* true after token has been pushed back */
0076: public boolean insertspace; /* when space is moved after end tag */
0077: public boolean excludeBlocks; /* Netscape compatibility */
0078: public boolean exiled; /* true if moved out of table */
0079: public boolean isvoyager; /* true if xmlns attribute on html element */
0080: public short versions; /* bit vector of HTML versions */
0081: public int doctype; /* version as given by doctype (if any) */
0082: public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
0083: public int txtstart; /* start of current node */
0084: public int txtend; /* end of current node */
0085: public short state; /* state of lexer's finite state machine */
0086: public Node token;
0087:
0088: /*
0089: lexer character buffer
0090:
0091: parse tree nodes span onto this buffer
0092: which contains the concatenated text
0093: contents of all of the elements.
0094:
0095: lexsize must be reset for each file.
0096: */
0097: public byte[] lexbuf; /* byte buffer of UTF-8 chars */
0098: public int lexlength; /* allocated */
0099: public int lexsize; /* used */
0100:
0101: /* Inline stack for compatibility with Mosaic */
0102: public Node inode; /* for deferring text node */
0103: public int insert; /* for inferring inline tags */
0104: public Stack istack;
0105: public int istackbase; /* start of frame */
0106:
0107: public Style styles; /* used for cleaning up presentation markup */
0108:
0109: public Configuration configuration;
0110: protected int seenBodyEndTag; /* used by parser */
0111: private Vector nodeList;
0112:
0113: public Lexer(StreamIn in, Configuration configuration) {
0114: this .in = in;
0115: this .lines = 1;
0116: this .columns = 1;
0117: this .state = LEX_CONTENT;
0118: this .badAccess = 0;
0119: this .badLayout = 0;
0120: this .badChars = 0;
0121: this .badForm = 0;
0122: this .warnings = 0;
0123: this .errors = 0;
0124: this .waswhite = false;
0125: this .pushed = false;
0126: this .insertspace = false;
0127: this .exiled = false;
0128: this .isvoyager = false;
0129: this .versions = Dict.VERS_EVERYTHING;
0130: this .doctype = Dict.VERS_UNKNOWN;
0131: this .badDoctype = false;
0132: this .txtstart = 0;
0133: this .txtend = 0;
0134: this .token = null;
0135: this .lexbuf = null;
0136: this .lexlength = 0;
0137: this .lexsize = 0;
0138: this .inode = null;
0139: this .insert = -1;
0140: this .istack = new Stack();
0141: this .istackbase = 0;
0142: this .styles = null;
0143: this .configuration = configuration;
0144: this .seenBodyEndTag = 0;
0145: this .nodeList = new Vector();
0146: }
0147:
0148: public Node newNode() {
0149: Node node = new Node();
0150: nodeList.addElement(node);
0151: return node;
0152: }
0153:
0154: public Node newNode(short type, byte[] textarray, int start, int end) {
0155: Node node = new Node(type, textarray, start, end);
0156: nodeList.addElement(node);
0157: return node;
0158: }
0159:
0160: public Node newNode(short type, byte[] textarray, int start,
0161: int end, String element) {
0162: Node node = new Node(type, textarray, start, end, element,
0163: configuration.tt);
0164: nodeList.addElement(node);
0165: return node;
0166: }
0167:
0168: public Node cloneNode(Node node) {
0169: Node cnode = (Node) node.clone();
0170: nodeList.addElement(cnode);
0171: for (AttVal att = cnode.attributes; att != null; att = att.next) {
0172: if (att.asp != null)
0173: nodeList.addElement(att.asp);
0174: if (att.php != null)
0175: nodeList.addElement(att.php);
0176: }
0177: return cnode;
0178: }
0179:
0180: public AttVal cloneAttributes(AttVal attrs) {
0181: AttVal cattrs = (AttVal) attrs.clone();
0182: for (AttVal att = cattrs; att != null; att = att.next) {
0183: if (att.asp != null)
0184: nodeList.addElement(att.asp);
0185: if (att.php != null)
0186: nodeList.addElement(att.php);
0187: }
0188: return cattrs;
0189: }
0190:
0191: protected void updateNodeTextArrays(byte[] oldtextarray,
0192: byte[] newtextarray) {
0193: Node node;
0194: for (int i = 0; i < nodeList.size(); i++) {
0195: node = (Node) (nodeList.elementAt(i));
0196: if (node.textarray == oldtextarray)
0197: node.textarray = newtextarray;
0198: }
0199: }
0200:
0201: /* used for creating preformatted text from Word2000 */
0202: public Node newLineNode() {
0203: Node node = newNode();
0204:
0205: node.textarray = this .lexbuf;
0206: node.start = this .lexsize;
0207: addCharToLexer((int) '\n');
0208: node.end = this .lexsize;
0209: return node;
0210: }
0211:
// Should always be able to convert to/from UTF-8, so encoding exceptions are
0213: // converted to an Error to avoid adding throws declarations in
0214: // lots of methods.
0215:
0216: public static byte[] getBytes(String str) {
0217: try {
0218: return str.getBytes("UTF8");
0219: } catch (java.io.UnsupportedEncodingException e) {
0220: throw new Error("string to UTF-8 conversion failed: "
0221: + e.getMessage());
0222: }
0223: }
0224:
0225: public static String getString(byte[] bytes, int offset, int length) {
0226: try {
0227: return new String(bytes, offset, length, "UTF8");
0228: } catch (java.io.UnsupportedEncodingException e) {
0229: throw new Error("UTF-8 to string conversion failed: "
0230: + e.getMessage());
0231: }
0232: }
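
/*
   Round-trip sketch for the two helpers above: strings are stored in lexbuf
   as UTF-8 bytes and slices are turned back into Strings on demand.
   Purely illustrative; the values are made up.

       byte[] raw = Lexer.getBytes("caf\u00e9");          // 'c','a','f',0xC3,0xA9
       String s   = Lexer.getString(raw, 0, raw.length);  // "café" again
*/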
0233:
0234: public boolean endOfInput() {
0235: return this .in.isEndOfStream();
0236: }
0237:
0238: public void addByte(int c) {
0239: if (this .lexsize + 1 >= this .lexlength) {
0240: while (this .lexsize + 1 >= this .lexlength) {
0241: if (this .lexlength == 0)
0242: this .lexlength = 8192;
0243: else
0244: this .lexlength = this .lexlength * 2;
0245: }
0246:
0247: byte[] temp = this .lexbuf;
0248: this .lexbuf = new byte[this .lexlength];
0249: if (temp != null) {
0250: System.arraycopy(temp, 0, this .lexbuf, 0, temp.length);
0251: updateNodeTextArrays(temp, this .lexbuf);
0252: }
0253: }
0254:
0255: this .lexbuf[this .lexsize++] = (byte) c;
0256: this .lexbuf[this .lexsize] = (byte) '\0'; /* debug */
0257: }
0258:
0259: public void changeChar(byte c) {
0260: if (this .lexsize > 0) {
0261: this .lexbuf[this .lexsize - 1] = c;
0262: }
0263: }
0264:
0265: /* store char c as UTF-8 encoded byte stream */
0266: public void addCharToLexer(int c) {
0267: if (c < 128)
0268: addByte(c);
0269: else if (c <= 0x7FF) {
0270: addByte(0xC0 | (c >> 6));
0271: addByte(0x80 | (c & 0x3F));
0272: } else if (c <= 0xFFFF) {
0273: addByte(0xE0 | (c >> 12));
0274: addByte(0x80 | ((c >> 6) & 0x3F));
0275: addByte(0x80 | (c & 0x3F));
0276: } else if (c <= 0x1FFFFF) {
0277: addByte(0xF0 | (c >> 18));
0278: addByte(0x80 | ((c >> 12) & 0x3F));
0279: addByte(0x80 | ((c >> 6) & 0x3F));
0280: addByte(0x80 | (c & 0x3F));
0281: } else {
0282: addByte(0xF8 | (c >> 24));
0283: addByte(0x80 | ((c >> 18) & 0x3F));
0284: addByte(0x80 | ((c >> 12) & 0x3F));
0285: addByte(0x80 | ((c >> 6) & 0x3F));
0286: addByte(0x80 | (c & 0x3F));
0287: }
0288: }
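
/*
   Encoding examples for addCharToLexer (standard UTF-8 byte sequences,
   for reference only):

       addCharToLexer('A');      // 0x41                 (1 byte,  c < 0x80)
       addCharToLexer(0x00E9);   // 0xC3 0xA9            (2 bytes, c <= 0x7FF)
       addCharToLexer(0x20AC);   // 0xE2 0x82 0xAC       (3 bytes, c <= 0xFFFF)
       addCharToLexer(0x1F600);  // 0xF0 0x9F 0x98 0x80  (4 bytes, c <= 0x1FFFFF)
*/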
0289:
0290: public void addStringToLexer(String str) {
0291: for (int i = 0; i < str.length(); i++) {
0292: addCharToLexer((int) str.charAt(i));
0293: }
0294: }
0295:
0296: /*
0297: No longer attempts to insert missing ';' for unknown
entities unless one was present already, since this
0299: gives unexpected results.
0300:
0301: For example: <a href="something.htm?foo&bar&fred">
0302: was tidied to: <a href="something.htm?foo&bar;&fred;">
0303: rather than: <a href="something.htm?foo&bar&fred">
0304:
My thanks to Maurice Buxton for spotting this.
0306: */
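
/*
   Behaviour sketch for parseEntity (illustrative, based on the code below):

       "&nbsp;"  ->  non-breaking space char (or ' ' when in preformatted mode)
       "&#65"    ->  'A', with a MISSING_SEMICOLON warning
       "&foo"    ->  left as the literal text "&foo", reported as an unknown entity
       "&"       ->  reported as an unescaped ampersand
*/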
0307: public void parseEntity(short mode) {
0308: short map;
0309: int start;
0310: boolean first = true;
0311: boolean semicolon = false;
0312: boolean numeric = false;
0313: int c, ch, startcol;
0314: String str;
0315:
0316: start = this .lexsize - 1; /* to start at "&" */
0317: startcol = this .in.curcol - 1;
0318:
0319: while (true) {
0320: c = this .in.readChar();
0321: if (c == StreamIn.EndOfStream)
0322: break;
0323: if (c == ';') {
0324: semicolon = true;
0325: break;
0326: }
0327:
0328: if (first && c == '#') {
0329: addCharToLexer(c);
0330: first = false;
0331: numeric = true;
0332: continue;
0333: }
0334:
0335: first = false;
0336: map = MAP((char) c);
0337:
0338: /* AQ: Added flag for numeric entities so that numeric entities
0339: with missing semi-colons are recognized.
0340: Eg. "rep..." is recognized as "rep"
0341: */
0342: if (numeric && ((c == 'x') || ((map & DIGIT) != 0))) {
0343: addCharToLexer(c);
0344: continue;
0345: }
0346: if (!numeric && ((map & NAMECHAR) != 0)) {
0347: addCharToLexer(c);
0348: continue;
0349: }
0350:
0351: /* otherwise put it back */
0352:
0353: this .in.ungetChar(c);
0354: break;
0355: }
0356:
0357: str = getString(this .lexbuf, start, this .lexsize - start);
0358: ch = EntityTable.getDefaultEntityTable().entityCode(str);
0359:
0360: /* deal with unrecognized entities */
0361: if (ch <= 0) {
/* set error position just before offending character */
0363: this .lines = this .in.curline;
0364: this .columns = startcol;
0365:
0366: if (this .lexsize > start + 1) {
0367: Report
0368: .entityError(this , Report.UNKNOWN_ENTITY, str,
0369: ch);
0370:
0371: if (semicolon)
0372: addCharToLexer(';');
0373: } else /* naked & */
0374: {
0375: Report.entityError(this , Report.UNESCAPED_AMPERSAND,
0376: str, ch);
0377: }
0378: } else {
0379: if (c != ';') /* issue warning if not terminated by ';' */
0380: {
/* set error position just before offending character */
0382: this .lines = this .in.curline;
0383: this .columns = startcol;
0384: Report.entityError(this , Report.MISSING_SEMICOLON, str,
0385: c);
0386: }
0387:
0388: this .lexsize = start;
0389:
0390: if (ch == 160 && (mode & Preformatted) != 0)
0391: ch = ' ';
0392:
0393: addCharToLexer(ch);
0394:
0395: if (ch == '&' && !this .configuration.QuoteAmpersand) {
0396: addCharToLexer('a');
0397: addCharToLexer('m');
0398: addCharToLexer('p');
0399: addCharToLexer(';');
0400: }
0401: }
0402: }
0403:
0404: public char parseTagName() {
0405: short map;
0406: int c;
0407:
0408: /* fold case of first char in buffer */
0409:
0410: c = this .lexbuf[this .txtstart];
0411: map = MAP((char) c);
0412:
0413: // BEGIN RAVE MODIFICATIONS
0414: boolean wasColon = c == ':';
if (this.configuration.inputJspMode) { // don't change case of tag names
0416: ;
0417: } else
0418: // END RAVE MODIFICATIONS
0419: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0) {
0420: c += (int) ((int) 'a' - (int) 'A');
0421: this .lexbuf[this .txtstart] = (byte) c;
0422: }
0423:
0424: while (true) {
0425: c = this .in.readChar();
0426: if (c == StreamIn.EndOfStream)
0427: break;
0428: map = MAP((char) c);
0429:
0430: if ((map & NAMECHAR) == 0)
0431: break;
0432:
0433: /* fold case of subsequent chars */
0434:
0435: // BEGIN RAVE MODIFICATIONS
0436: if (c == ':') {
0437: wasColon = true;
0438: }
if (this.configuration.inputJspMode) { // don't change case of tag names
0440: ;
0441: } else
0442: // END RAVE MODIFICATIONS
0443: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0)
0444: c += (int) ((int) 'a' - (int) 'A');
0445:
0446: addCharToLexer(c);
0447: }
0448:
0449: this .txtend = this .lexsize;
0450:
0451: // BEGIN RAVE MODIFICATIONS
0452: if (!this .configuration.XmlTags && !wasColon) {
0453: lowercaseBuf();
0454: }
0455: // END RAVE MODIFICATIONS
0456:
0457: return (char) c;
0458: }
0459:
0460: // BEGIN RAVE MODIFICATIONS
/** Force the byte sequence in the buffer to lowercase if applicable.
* This assumes we're only dealing with ASCII chars; as soon as it
* sees anything else in the lex buffer it gives up converting the
* case (since the lex buffer is a UTF-8 encoded byte array, not
* Unicode characters). */
0466: private void lowercaseBuf() {
0467: for (int i = this .txtstart; i < this .txtend; i++) {
byte c = this.lexbuf[i];
/* Java bytes are signed, so UTF-8 bytes >= 0x80 are negative;
   test the sign bit rather than "c >= 128", which a byte can never satisfy. */
if (c < 0)
break; // non-ASCII byte: stop case conversion
0471: short map = MAP((char) c);
0472: if ((map & UPPERCASE) != 0) {
0473: c += (int) ((int) 'a' - (int) 'A');
0474: this .lexbuf[i] = (byte) c;
0475: }
0476: }
0477: }
0478:
0479: // END RAVE MODIFICATIONS
0480:
0481: public void addStringLiteral(String str) {
0482: for (int i = 0; i < str.length(); i++) {
0483: addCharToLexer((int) str.charAt(i));
0484: }
0485: }
0486:
0487: /* choose what version to use for new doctype */
0488: public short HTMLVersion() {
0489: short versions;
0490:
0491: versions = this .versions;
0492:
0493: if ((versions & Dict.VERS_HTML20) != 0)
0494: return Dict.VERS_HTML20;
0495:
0496: if ((versions & Dict.VERS_HTML32) != 0)
0497: return Dict.VERS_HTML32;
0498:
0499: if ((versions & Dict.VERS_HTML40_STRICT) != 0)
0500: return Dict.VERS_HTML40_STRICT;
0501:
0502: if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
0503: return Dict.VERS_HTML40_LOOSE;
0504:
0505: if ((versions & Dict.VERS_FRAMES) != 0)
0506: return Dict.VERS_FRAMES;
0507:
0508: return Dict.VERS_UNKNOWN;
0509: }
0510:
0511: public String HTMLVersionName() {
0512: short guessed;
0513: int j;
0514:
0515: guessed = apparentVersion();
0516:
0517: for (j = 0; j < W3CVersion.length; ++j) {
0518: if (guessed == W3CVersion[j].code) {
0519: if (this .isvoyager)
0520: return W3CVersion[j].voyagerName;
0521:
0522: return W3CVersion[j].name;
0523: }
0524: }
0525:
0526: return null;
0527: }
0528:
0529: /* add meta element for Tidy */
0530: public boolean addGenerator(Node root) {
0531:
0532: // BEGIN RAVE MODIFICATIONS
0533: if (configuration.outputJspMode) {
0534: return false;
0535: }
0536: // END RAVE MODIFICATIONS
0537:
0538: AttVal attval;
0539: Node node;
0540: Node head = root.findHEAD(configuration.tt);
0541:
0542: if (head != null) {
0543: for (node = head.content; node != null; node = node.next) {
0544: if (node.tag == configuration.tt.tagMeta) {
0545: attval = node.getAttrByName("name");
0546:
0547: if (attval != null
0548: && attval.value != null
0549: && Lexer.wstrcasecmp(attval.value,
0550: "generator") == 0) {
0551: attval = node.getAttrByName("content");
0552:
0553: if (attval != null
0554: && attval.value != null
0555: && attval.value.length() >= 9
0556: && Lexer.wstrcasecmp(attval.value
0557: .substring(0, 9), "HTML Tidy") == 0) {
0558: return false;
0559: }
0560: }
0561: }
0562: }
0563:
0564: node = this .inferredTag("meta");
0565: node.addAttribute("content", "HTML Tidy, see www.w3.org");
0566: node.addAttribute("name", "generator");
0567: Node.insertNodeAtStart(head, node);
0568: return true;
0569: }
0570:
0571: return false;
0572: }
0573:
0574: /* return true if substring s is in p and isn't all in upper case */
0575: /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
0576: /* len is how many chars to check in p */
0577: private static boolean findBadSubString(String s, String p, int len) {
0578: int n = s.length();
0579: int i = 0;
0580: String ps;
0581:
0582: while (n < len) {
0583: ps = p.substring(i, i + n);
0584: if (wstrcasecmp(s, ps) == 0)
0585: return (!ps.equals(s.substring(0, n)));
0586:
0587: ++i;
0588: --len;
0589: }
0590:
0591: return false;
0592: }
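
/*
   Example of what findBadSubString flags (illustrative):

       findBadSubString("PUBLIC", "public \"-//W3C//DTD ...", len)
           -> true   ("public" matches case-insensitively but isn't upper case)
       findBadSubString("PUBLIC", "PUBLIC \"-//W3C//DTD ...", len)
           -> false  (already upper case)
*/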
0593:
0594: public boolean checkDocTypeKeyWords(Node doctype) {
0595: int len = doctype.end - doctype.start;
0596: String s = getString(this .lexbuf, doctype.start, len);
0597:
0598: return !(findBadSubString("SYSTEM", s, len)
0599: || findBadSubString("PUBLIC", s, len)
0600: || findBadSubString("//DTD", s, len)
0601: || findBadSubString("//W3C", s, len) || findBadSubString(
0602: "//EN", s, len));
0603: }
0604:
0605: /* examine <!DOCTYPE> to identify version */
0606: public short findGivenVersion(Node doctype) {
0607: String p, s;
0608: int i, j;
0609: int len;
0610: String str1;
0611: String str2;
0612:
0613: /* if root tag for doctype isn't html give up now */
0614: str1 = getString(this .lexbuf, doctype.start, 5);
0615: if (wstrcasecmp(str1, "html ") != 0)
0616: return 0;
0617:
0618: if (!checkDocTypeKeyWords(doctype))
0619: Report.warning(this , doctype, null,
0620: Report.DTYPE_NOT_UPPER_CASE);
0621:
0622: /* give up if all we are given is the system id for the doctype */
0623: str1 = getString(this .lexbuf, doctype.start + 5, 7);
0624: if (wstrcasecmp(str1, "SYSTEM ") == 0) {
0625: /* but at least ensure the case is correct */
0626: if (!str1.substring(0, 6).equals("SYSTEM"))
0627: System.arraycopy(getBytes("SYSTEM"), 0, this .lexbuf,
0628: doctype.start + 5, 6);
0629: return 0; /* unrecognized */
0630: }
0631:
0632: if (wstrcasecmp(str1, "PUBLIC ") == 0) {
0633: if (!str1.substring(0, 6).equals("PUBLIC"))
0634: System.arraycopy(getBytes("PUBLIC "), 0, this .lexbuf,
0635: doctype.start + 5, 6);
0636: } else
0637: this .badDoctype = true;
0638:
0639: for (i = doctype.start; i < doctype.end; ++i) {
0640: if (this .lexbuf[i] == (byte) '"') {
0641: str1 = getString(this .lexbuf, i + 1, 12);
0642: str2 = getString(this .lexbuf, i + 1, 13);
0643: if (str1.equals("-//W3C//DTD ")) {
0644: /* compute length of identifier e.g. "HTML 4.0 Transitional" */
0645: for (j = i + 13; j < doctype.end
0646: && this .lexbuf[j] != (byte) '/'; ++j)
0647: ;
0648: len = j - i - 13;
0649: p = getString(this .lexbuf, i + 13, len);
0650:
0651: for (j = 1; j < W3CVersion.length; ++j) {
0652: s = W3CVersion[j].name;
0653: if (len == s.length() && s.equals(p))
0654: return W3CVersion[j].code;
0655: }
0656:
0657: /* else unrecognized version */
0658: } else if (str2.equals("-//IETF//DTD ")) {
0659: /* compute length of identifier e.g. "HTML 2.0" */
0660: for (j = i + 14; j < doctype.end
0661: && this .lexbuf[j] != (byte) '/'; ++j)
0662: ;
0663: len = j - i - 14;
0664:
0665: p = getString(this .lexbuf, i + 14, len);
0666: s = W3CVersion[0].name;
0667: if (len == s.length() && s.equals(p))
0668: return W3CVersion[0].code;
0669:
0670: /* else unrecognized version */
0671: }
0672: break;
0673: }
0674: }
0675:
0676: return 0;
0677: }
0678:
0679: public void fixHTMLNameSpace(Node root, String profile) {
0680: Node node;
0681: AttVal prev, attr;
0682:
0683: for (node = root.content; node != null
0684: && node.tag != configuration.tt.tagHtml; node = node.next)
0685: ;
0686:
0687: if (node != null) {
0688: prev = null;
0689:
0690: for (attr = node.attributes; attr != null; attr = attr.next) {
0691: if (attr.attribute.equals("xmlns"))
0692: break;
0693:
0694: prev = attr;
0695: }
0696:
0697: if (attr != null) {
0698: if (!attr.value.equals(profile)) {
0699: Report.warning(this , node, null,
0700: Report.INCONSISTENT_NAMESPACE);
0701: attr.value = profile;
0702: }
0703: } else {
0704: attr = new AttVal(node.attributes, null, (int) '"',
0705: "xmlns", profile);
0706: attr.dict = AttributeTable.getDefaultAttributeTable()
0707: .findAttribute(attr);
0708: node.attributes = attr;
0709: }
0710: }
0711: }
0712:
0713: public boolean setXHTMLDocType(Node root) {
0714: String fpi = " ";
0715: String sysid = "";
0716: String namespace = XHTML_NAMESPACE;
0717: Node doctype;
0718:
0719: doctype = root.findDocType();
0720:
0721: // BEGIN RAVE MODIFICATIONS
0722: //if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
0723: if (configuration.outputJspMode
0724: || configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
0725: // END RAVE MODIFICATIONS
0726: {
0727: if (doctype != null)
0728: Node.discardElement(doctype);
0729: return true;
0730: }
0731:
0732: if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
0733: /* see what flavor of XHTML this document matches */
0734: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0) { /* use XHTML strict */
0735: fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
0736: sysid = voyager_strict;
0737: } else if ((this .versions & Dict.VERS_LOOSE) != 0) {
0738: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0739: sysid = voyager_loose;
0740: } else if ((this .versions & Dict.VERS_FRAMES) != 0) { /* use XHTML frames */
0741: fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
0742: sysid = voyager_frameset;
0743: } else /* lets assume XHTML transitional */
0744: {
0745: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0746: sysid = voyager_loose;
0747: }
0748: } else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
0749: fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
0750: sysid = voyager_strict;
0751: } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
0752: fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
0753: sysid = voyager_loose;
0754: }
0755:
0756: fixHTMLNameSpace(root, namespace);
0757:
0758: if (doctype == null) {
0759: doctype = newNode(Node.DocTypeTag, this .lexbuf, 0, 0);
0760: doctype.next = root.content;
0761: doctype.parent = root;
0762: doctype.prev = null;
0763: root.content = doctype;
0764: }
0765:
0766: if (configuration.docTypeMode == Configuration.DOCTYPE_USER
0767: && configuration.docTypeStr != null) {
0768: fpi = configuration.docTypeStr;
0769: sysid = "";
0770: }
0771:
0772: this .txtstart = this .lexsize;
0773: this .txtend = this .lexsize;
0774:
0775: /* add public identifier */
0776: addStringLiteral("html PUBLIC ");
0777:
0778: /* check if the fpi is quoted or not */
0779: if (fpi.charAt(0) == '"')
0780: addStringLiteral(fpi);
0781: else {
0782: addStringLiteral("\"");
0783: addStringLiteral(fpi);
0784: addStringLiteral("\"");
0785: }
0786:
0787: if (sysid.length() + 6 >= this .configuration.wraplen)
0788: addStringLiteral("\n\"");
0789: else
0790: addStringLiteral("\n \"");
0791:
0792: /* add system identifier */
0793: addStringLiteral(sysid);
0794: addStringLiteral("\"");
0795:
0796: this .txtend = this .lexsize;
0797:
0798: doctype.start = this .txtstart;
0799: doctype.end = this .txtend;
0800:
0801: return false;
0802: }
0803:
0804: public short apparentVersion() {
0805: switch (this .doctype) {
0806: case Dict.VERS_UNKNOWN:
0807: return HTMLVersion();
0808:
0809: case Dict.VERS_HTML20:
0810: if ((this .versions & Dict.VERS_HTML20) != 0)
0811: return Dict.VERS_HTML20;
0812:
0813: break;
0814:
0815: case Dict.VERS_HTML32:
0816: if ((this .versions & Dict.VERS_HTML32) != 0)
0817: return Dict.VERS_HTML32;
0818:
0819: break; /* to replace old version by new */
0820:
0821: case Dict.VERS_HTML40_STRICT:
0822: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0)
0823: return Dict.VERS_HTML40_STRICT;
0824:
0825: break;
0826:
0827: case Dict.VERS_HTML40_LOOSE:
0828: if ((this .versions & Dict.VERS_HTML40_LOOSE) != 0)
0829: return Dict.VERS_HTML40_LOOSE;
0830:
0831: break; /* to replace old version by new */
0832:
0833: case Dict.VERS_FRAMES:
0834: if ((this .versions & Dict.VERS_FRAMES) != 0)
0835: return Dict.VERS_FRAMES;
0836:
0837: break;
0838: }
0839:
0840: Report.warning(this , null, null, Report.INCONSISTENT_VERSION);
0841: return this .HTMLVersion();
0842: }
0843:
0844: /* fixup doctype if missing */
0845: public boolean fixDocType(Node root) {
0846: Node doctype;
0847: int guessed = Dict.VERS_HTML40_STRICT, i;
0848:
0849: if (this .badDoctype)
0850: Report.warning(this , null, null, Report.MALFORMED_DOCTYPE);
0851:
0852: if (configuration.XmlOut)
0853: return true;
0854:
0855: doctype = root.findDocType();
0856:
0857: if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
0858: if (doctype != null)
0859: Node.discardElement(doctype);
0860: return true;
0861: }
0862:
0863: if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
0864: Node.discardElement(doctype);
0865: doctype = null;
0866: guessed = Dict.VERS_HTML40_STRICT;
0867: } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
0868: Node.discardElement(doctype);
0869: doctype = null;
0870: guessed = Dict.VERS_HTML40_LOOSE;
0871: } else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
0872: if (doctype != null) {
0873: if (this .doctype == Dict.VERS_UNKNOWN)
0874: return false;
0875:
0876: switch (this .doctype) {
0877: case Dict.VERS_UNKNOWN:
0878: return false;
0879:
0880: case Dict.VERS_HTML20:
0881: if ((this .versions & Dict.VERS_HTML20) != 0)
0882: return true;
0883:
0884: break; /* to replace old version by new */
0885:
0886: case Dict.VERS_HTML32:
0887: if ((this .versions & Dict.VERS_HTML32) != 0)
0888: return true;
0889:
0890: break; /* to replace old version by new */
0891:
0892: case Dict.VERS_HTML40_STRICT:
0893: if ((this .versions & Dict.VERS_HTML40_STRICT) != 0)
0894: return true;
0895:
0896: break; /* to replace old version by new */
0897:
0898: case Dict.VERS_HTML40_LOOSE:
0899: if ((this .versions & Dict.VERS_HTML40_LOOSE) != 0)
0900: return true;
0901:
0902: break; /* to replace old version by new */
0903:
0904: case Dict.VERS_FRAMES:
0905: if ((this .versions & Dict.VERS_FRAMES) != 0)
0906: return true;
0907:
0908: break; /* to replace old version by new */
0909: }
0910:
0911: /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
0912: }
0913:
0914: /* choose new doctype */
0915: guessed = HTMLVersion();
0916: }
0917:
0918: if (guessed == Dict.VERS_UNKNOWN)
0919: return false;
0920:
0921: /* for XML use the Voyager system identifier */
0922: if (this .configuration.XmlOut || this .configuration.XmlTags
0923: || this .isvoyager) {
0924: if (doctype != null)
0925: Node.discardElement(doctype);
0926:
0927: for (i = 0; i < W3CVersion.length; ++i) {
0928: if (guessed == W3CVersion[i].code) {
0929: fixHTMLNameSpace(root, W3CVersion[i].profile);
0930: break;
0931: }
0932: }
0933:
0934: return true;
0935: }
0936:
0937: if (doctype == null) {
0938: doctype = newNode(Node.DocTypeTag, this .lexbuf, 0, 0);
0939: doctype.next = root.content;
0940: doctype.parent = root;
0941: doctype.prev = null;
0942: root.content = doctype;
0943: }
0944:
0945: this .txtstart = this .lexsize;
0946: this .txtend = this .lexsize;
0947:
0948: /* use the appropriate public identifier */
0949: addStringLiteral("html PUBLIC ");
0950:
0951: if (configuration.docTypeMode == Configuration.DOCTYPE_USER
0952: && configuration.docTypeStr != null)
0953: addStringLiteral(configuration.docTypeStr);
0954: else if (guessed == Dict.VERS_HTML20)
0955: addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
0956: else {
0957: addStringLiteral("\"-//W3C//DTD ");
0958:
0959: for (i = 0; i < W3CVersion.length; ++i) {
0960: if (guessed == W3CVersion[i].code) {
0961: addStringLiteral(W3CVersion[i].name);
0962: break;
0963: }
0964: }
0965:
0966: addStringLiteral("//EN\"");
0967: }
0968:
0969: this .txtend = this .lexsize;
0970:
0971: doctype.start = this .txtstart;
0972: doctype.end = this .txtend;
0973:
0974: return true;
0975: }
0976:
/* ensure XML document starts with <?xml version="1.0"?> */
0978: public boolean fixXMLPI(Node root) {
0979: Node xml;
0980: int s;
0981:
0982: if (root.content != null
0983: && root.content.type == Node.ProcInsTag) {
0984: s = root.content.start;
0985:
0986: if (this .lexbuf[s] == (byte) 'x'
0987: && this .lexbuf[s + 1] == (byte) 'm'
0988: && this .lexbuf[s + 2] == (byte) 'l')
0989: return true;
0990: }
0991:
xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
xml.next = root.content;

if (root.content != null)
root.content.prev = xml;
0999:
1000: root.content = xml;
1001:
1002: this .txtstart = this .lexsize;
1003: this .txtend = this .lexsize;
1004: addStringLiteral("xml version=\"1.0\"");
1005: if (this .configuration.CharEncoding == Configuration.LATIN1)
1006: addStringLiteral(" encoding=\"ISO-8859-1\"");
1007: this .txtend = this .lexsize;
1008:
1009: xml.start = this .txtstart;
1010: xml.end = this .txtend;
1011: return false;
1012: }
1013:
1014: public Node inferredTag(String name) {
1015: Node node;
1016:
1017: node = newNode(Node.StartTag, this .lexbuf, this .txtstart,
1018: this .txtend, name);
1019: node.implicit = true;
1020: return node;
1021: }
1022:
1023: public static boolean expectsContent(Node node) {
1024: if (node.type != Node.StartTag)
1025: return false;
1026:
1027: /* unknown element? */
1028: if (node.tag == null)
1029: return true;
1030:
1031: if ((node.tag.model & Dict.CM_EMPTY) != 0)
1032: return false;
1033:
1034: return true;
1035: }
1036:
1037: /*
1038: create a text node for the contents of
1039: a CDATA element like style or script
1040: which ends with </foo> for some foo.
1041: */
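
/*
   Sketch of how a parser might consume script/style content via getCDATA
   (illustrative; "scriptNode" stands for the already-open <script> element):

       Node text = lexer.getCDATA(scriptNode);
       if (text != null) {
           // attach the returned text node as the element's content
       }
*/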
1042: public Node getCDATA(Node container) {
1043: int c, lastc, start, len, i;
1044: String str;
1045: boolean endtag = false;
1046:
1047: this .lines = this .in.curline;
1048: this .columns = this .in.curcol;
1049: this .waswhite = false;
1050: this .txtstart = this .lexsize;
1051: this .txtend = this .lexsize;
1052:
1053: lastc = (int) '\0';
1054: start = -1;
1055:
1056: while (true) {
1057: c = this .in.readChar();
1058: if (c == StreamIn.EndOfStream)
1059: break;
1060: /* treat \r\n as \n and \r as \n */
1061:
1062: if (c == (int) '/' && lastc == (int) '<') {
1063: if (endtag) {
1064: this .lines = this .in.curline;
1065: this .columns = this .in.curcol - 3;
1066:
1067: Report.warning(this , null, null,
1068: Report.BAD_CDATA_CONTENT);
1069: }
1070:
1071: start = this .lexsize + 1; /* to first letter */
1072: endtag = true;
1073: } else if (c == (int) '>' && start >= 0) {
1074: len = this .lexsize - start;
1075: if (len == container.element.length()) {
1076: str = getString(this .lexbuf, start, len);
1077: if (Lexer.wstrcasecmp(str, container.element) == 0) {
1078: this .txtend = start - 2;
1079: break;
1080: }
1081: }
1082:
1083: this .lines = this .in.curline;
1084: this .columns = this .in.curcol - 3;
1085:
1086: Report.warning(this , null, null,
1087: Report.BAD_CDATA_CONTENT);
1088:
1089: /* if javascript insert backslash before / */
1090:
1091: if (ParserImpl.isJavaScript(container)) {
1092: for (i = this .lexsize; i > start - 1; --i)
1093: this .lexbuf[i] = this .lexbuf[i - 1];
1094:
1095: this .lexbuf[start - 1] = (byte) '\\';
1096: this .lexsize++;
1097: }
1098:
1099: start = -1;
1100: } else if (c == (int) '\r') {
1101: c = this .in.readChar();
1102:
1103: if (c != (int) '\n')
1104: this .in.ungetChar(c);
1105:
1106: c = (int) '\n';
1107: }
1108:
1109: addCharToLexer((int) c);
1110: this .txtend = this .lexsize;
1111: lastc = c;
1112: }
1113:
1114: if (c == StreamIn.EndOfStream)
1115: Report.warning(this , container, null,
1116: Report.MISSING_ENDTAG_FOR);
1117:
1118: if (this .txtend > this .txtstart) {
1119: this .txtend = removeCDATAMark(this .lexbuf,//wzw
1120: this .txtstart, this .txtend);
1121:
1122: this .token = newNode(Node.TextNode, this .lexbuf,
1123: this .txtstart, this .txtend);
1124: return this .token;
1125: }
1126:
1127: return null;
1128: }
1129:
1130: private int removeCDATAMark(byte[] lexbuf, int txtstart, int txtend)//wzw
1131: {
1132: String s = new String(lexbuf, txtstart, txtend - txtstart);
1133: s = s.replaceAll("<!\\[CDATA\\[", "");
1134: s = s.replaceAll("\\]\\]>", "");
1135: byte[] b = s.getBytes();
1136: if (b.length < (txtend - txtstart)) {
1137: System.arraycopy(b, 0, lexbuf, txtstart, b.length);
1138:
1139: return txtstart + b.length;
1140: } else {
1141: return txtend;
1142: }
1143: }
1144:
1145: public void ungetToken() {
1146: this .pushed = true;
1147: }
1148:
1149: public static final short IgnoreWhitespace = 0;
1150: public static final short MixedContent = 1;
1151: public static final short Preformatted = 2;
1152: public static final short IgnoreMarkup = 3;
1153:
1154: /*
1155: modes for GetToken()
1156:
1157: MixedContent -- for elements which don't accept PCDATA
1158: Preformatted -- white space preserved as is
1159: IgnoreMarkup -- for CDATA elements such as script, style
1160: */
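
/*
   Sketch of how the modes are typically chosen by callers (illustrative):

       lexer.getToken(Lexer.IgnoreWhitespace); // e.g. between structural elements
       lexer.getToken(Lexer.MixedContent);     // inside <p>, <td>, ... (PCDATA)
       lexer.getToken(Lexer.Preformatted);     // inside <pre>: keep whitespace as is
       lexer.getToken(Lexer.IgnoreMarkup);     // CDATA content of <script>, <style>
*/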
1161:
1162: public Node getToken(short mode) {
1163: short map;
1164: int c = 0;
1165: int lastc;
1166: int badcomment = 0;
1167: MutableBoolean isempty = new MutableBoolean();
1168: AttVal attributes;
1169:
1170: if (this .pushed) {
1171: /* duplicate inlines in preference to pushed text nodes when appropriate */
1172: if (this .token.type != Node.TextNode
1173: || (this .insert == -1 && this .inode == null)) {
1174: this .pushed = false;
1175: return this .token;
1176: }
1177: }
1178:
1179: /* at start of block elements, unclosed inline
1180: elements are inserted into the token stream */
1181:
1182: if (this .insert != -1 || this .inode != null)
1183: return insertedToken();
1184:
1185: this .lines = this .in.curline;
1186: this .columns = this .in.curcol;
1187: this .waswhite = false;
1188:
1189: this .txtstart = this .lexsize;
1190: this .txtend = this .lexsize;
1191:
1192: while (true) {
1193: c = this .in.readChar();
1194: if (c == StreamIn.EndOfStream)
1195: break;
1196: if (this .insertspace && mode != IgnoreWhitespace) {
1197: addCharToLexer(' ');
1198: this .waswhite = true;
1199: this .insertspace = false;
1200: }
1201:
1202: /* treat \r\n as \n and \r as \n */
1203:
1204: if (c == '\r') {
1205: c = this .in.readChar();
1206:
1207: if (c != '\n')
1208: this .in.ungetChar(c);
1209:
1210: c = '\n';
1211: }
1212:
1213: addCharToLexer(c);
1214:
1215: switch (this .state) {
1216: case LEX_CONTENT: /* element content */
1217: map = MAP((char) c);
1218:
1219: /*
Discard white space if appropriate. It's cheaper
1221: to do this here rather than in parser methods
1222: for elements that don't have mixed content.
1223: */
1224: if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
1225: && this .lexsize == this .txtstart + 1) {
1226: --this .lexsize;
1227: this .waswhite = false;
1228: this .lines = this .in.curline;
1229: this .columns = this .in.curcol;
1230: continue;
1231: }
1232:
1233: if (c == '<') {
1234: this .state = LEX_GT;
1235: continue;
1236: }
1237:
1238: if ((map & WHITE) != 0) {
1239: /* was previous char white? */
1240: if (this .waswhite) {
1241: if (mode != Preformatted
1242: && mode != IgnoreMarkup) {
1243: --this .lexsize;
1244: this .lines = this .in.curline;
1245: this .columns = this .in.curcol;
1246: }
1247: } else /* prev char wasn't white */
1248: {
1249: this .waswhite = true;
1250: lastc = c;
1251:
1252: if (mode != Preformatted
1253: && mode != IgnoreMarkup && c != ' ')
1254: changeChar((byte) ' ');
1255: }
1256:
1257: continue;
1258: } else if (c == '&' && mode != IgnoreMarkup)
1259: parseEntity(mode);
1260:
1261: /* this is needed to avoid trimming trailing whitespace */
1262: if (mode == IgnoreWhitespace)
1263: mode = MixedContent;
1264:
1265: this .waswhite = false;
1266: continue;
1267:
1268: case LEX_GT: /* < */
1269:
1270: /* check for endtag */
1271: if (c == '/') {
1272: c = this .in.readChar();
1273: if (c == StreamIn.EndOfStream) {
1274: this .in.ungetChar(c);
1275: continue;
1276: }
1277:
1278: addCharToLexer(c);
1279: map = MAP((char) c);
1280:
1281: if ((map & LETTER) != 0) {
1282: this .lexsize -= 3;
1283: this .txtend = this .lexsize;
1284: this .in.ungetChar(c);
1285: this .state = LEX_ENDTAG;
1286: this .lexbuf[this .lexsize] = (byte) '\0'; /* debug */
1287: this .in.curcol -= 2;
1288:
1289: /* if some text before the </ return it now */
1290: if (this .txtend > this .txtstart) {
1291: /* trim space char before end tag */
1292: if (mode == IgnoreWhitespace
1293: && this .lexbuf[this .lexsize - 1] == (byte) ' ') {
1294: this .lexsize -= 1;
1295: this .txtend = this .lexsize;
1296: }
1297:
1298: this .token = newNode(Node.TextNode,
1299: this .lexbuf, this .txtstart,
1300: this .txtend);
1301: return this .token;
1302: }
1303:
1304: continue; /* no text so keep going */
1305: }
1306:
1307: /* otherwise treat as CDATA */
1308: this .waswhite = false;
1309: this .state = LEX_CONTENT;
1310: continue;
1311: }
1312:
1313: if (mode == IgnoreMarkup) {
1314: /* otherwise treat as CDATA */
1315: this .waswhite = false;
1316: this .state = LEX_CONTENT;
1317: continue;
1318: }
1319:
1320: /*
1321: look out for comments, doctype or marked sections
this isn't quite right, but it's getting there ...
1323: */
1324: if (c == '!') {
1325: c = this .in.readChar();
1326:
1327: if (c == '-') {
1328: c = this .in.readChar();
1329:
1330: if (c == '-') {
1331: this .state = LEX_COMMENT; /* comment */
1332: this .lexsize -= 2;
1333: this .txtend = this .lexsize;
1334:
1335: /* if some text before < return it now */
1336: if (this .txtend > this .txtstart) {
1337: this .token = newNode(Node.TextNode,
1338: this .lexbuf, this .txtstart,
1339: this .txtend);
1340: return this .token;
1341: }
1342:
1343: this .txtstart = this .lexsize;
1344: continue;
1345: }
1346:
1347: Report.warning(this , null, null,
1348: Report.MALFORMED_COMMENT);
1349: } else if (c == 'd' || c == 'D') {
1350: this .state = LEX_DOCTYPE; /* doctype */
1351: this .lexsize -= 2;
1352: this .txtend = this .lexsize;
1353: mode = IgnoreWhitespace;
1354:
1355: /* skip until white space or '>' */
1356:
1357: for (;;) {
1358: c = this .in.readChar();
1359:
1360: if (c == StreamIn.EndOfStream || c == '>') {
1361: this .in.ungetChar(c);
1362: break;
1363: }
1364:
1365: map = MAP((char) c);
1366:
1367: if ((map & WHITE) == 0)
1368: continue;
1369:
1370: /* and skip to end of whitespace */
1371:
1372: for (;;) {
1373: c = this .in.readChar();
1374:
1375: if (c == StreamIn.EndOfStream
1376: || c == '>') {
1377: this .in.ungetChar(c);
1378: break;
1379: }
1380:
1381: map = MAP((char) c);
1382:
1383: if ((map & WHITE) != 0)
1384: continue;
1385:
1386: this .in.ungetChar(c);
1387: break;
1388: }
1389:
1390: break;
1391: }
1392:
1393: /* if some text before < return it now */
1394: if (this .txtend > this .txtstart) {
1395: this .token = newNode(Node.TextNode,
1396: this .lexbuf, this .txtstart,
1397: this .txtend);
1398: return this .token;
1399: }
1400:
1401: this .txtstart = this .lexsize;
1402: continue;
1403: } else if (c == '[') {
1404: /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
1405: this .lexsize -= 2;
1406: this .state = LEX_SECTION;
1407: this .txtend = this .lexsize;
1408:
1409: /* if some text before < return it now */
1410: if (this .txtend > this .txtstart) {
1411: this .token = newNode(Node.TextNode,
1412: this .lexbuf, this .txtstart,
1413: this .txtend);
1414: return this .token;
1415: }
1416:
1417: this .txtstart = this .lexsize;
1418: continue;
1419: }
1420:
1421: /* otherwise swallow chars up to and including next '>' */
1422: while (true) {
1423: c = this .in.readChar();
1424: if (c == '>')
1425: break;
if (c == StreamIn.EndOfStream) {
1427: this .in.ungetChar(c);
1428: break;
1429: }
1430: }
1431:
1432: this .lexsize -= 2;
1433: this .lexbuf[this .lexsize] = (byte) '\0';
1434: this .state = LEX_CONTENT;
1435: continue;
1436: }
1437:
1438: /*
1439: processing instructions
1440: */
1441:
1442: if (c == '?') {
1443: this .lexsize -= 2;
1444: this .state = LEX_PROCINSTR;
1445: this .txtend = this .lexsize;
1446:
1447: /* if some text before < return it now */
1448: if (this .txtend > this .txtstart) {
1449: this .token = newNode(Node.TextNode,
1450: this .lexbuf, this .txtstart, this .txtend);
1451: return this .token;
1452: }
1453:
1454: this .txtstart = this .lexsize;
1455: continue;
1456: }
1457:
/* Microsoft ASP, e.g. <% ... server-code ... %> */
1459: if (c == '%') {
1460: this .lexsize -= 2;
1461: this .state = LEX_ASP;
1462: this .txtend = this .lexsize;
1463:
1464: /* if some text before < return it now */
1465: if (this .txtend > this .txtstart) {
1466: this .token = newNode(Node.TextNode,
1467: this .lexbuf, this .txtstart, this .txtend);
1468: return this .token;
1469: }
1470:
1471: this .txtstart = this .lexsize;
1472: continue;
1473: }
1474:
/* Netscape's JSTE, e.g. <# ... server-code ... #> */
1476: if (c == '#') {
1477: this .lexsize -= 2;
1478: this .state = LEX_JSTE;
1479: this .txtend = this .lexsize;
1480:
1481: /* if some text before < return it now */
1482: if (this .txtend > this .txtstart) {
1483: this .token = newNode(Node.TextNode,
1484: this .lexbuf, this .txtstart, this .txtend);
1485: return this .token;
1486: }
1487:
1488: this .txtstart = this .lexsize;
1489: continue;
1490: }
1491:
1492: map = MAP((char) c);
1493:
1494: /* check for start tag */
1495: if ((map & LETTER) != 0) {
1496: this .in.ungetChar(c); /* push back letter */
1497: this .lexsize -= 2; /* discard "<" + letter */
1498: this .txtend = this .lexsize;
1499: this .state = LEX_STARTTAG; /* ready to read tag name */
1500:
1501: /* if some text before < return it now */
1502: if (this .txtend > this .txtstart) {
1503: this .token = newNode(Node.TextNode,
1504: this .lexbuf, this .txtstart, this .txtend);
1505: return this .token;
1506: }
1507:
1508: continue; /* no text so keep going */
1509: }
1510:
1511: /* otherwise treat as CDATA */
1512: this .state = LEX_CONTENT;
1513: this .waswhite = false;
1514: continue;
1515:
1516: case LEX_ENDTAG: /* </letter */
1517: this .txtstart = this .lexsize - 1;
1518: this .in.curcol += 2;
1519: c = parseTagName();
1520: this .token = newNode(Node.EndTag, /* create endtag token */
1521: this .lexbuf, this .txtstart, this .txtend, getString(
1522: this .lexbuf, this .txtstart, this .txtend
1523: - this .txtstart));
1524: this .lexsize = this .txtstart;
1525: this .txtend = this .txtstart;
1526:
1527: /* skip to '>' */
1528: while (c != '>') {
1529: c = this .in.readChar();
1530:
1531: if (c == StreamIn.EndOfStream)
1532: break;
1533: }
1534:
1535: if (c == StreamIn.EndOfStream) {
1536: this .in.ungetChar(c);
1537: continue;
1538: }
1539:
1540: this .state = LEX_CONTENT;
1541: this .waswhite = false;
1542: return this .token; /* the endtag token */
1543:
1544: case LEX_STARTTAG: /* first letter of tagname */
1545: this .txtstart = this .lexsize - 1; /* set txtstart to first letter */
1546: c = parseTagName();
1547: isempty.value = false;
1548: attributes = null;
1549: this .token = newNode((isempty.value ? Node.StartEndTag
1550: : Node.StartTag), this .lexbuf, this .txtstart,
1551: this .txtend, getString(this .lexbuf,
1552: this .txtstart, this .txtend
1553: - this .txtstart));
1554:
1555: /* parse attributes, consuming closing ">" */
1556: if (c != '>') {
1557: if (c == '/')
1558: this .in.ungetChar(c);
1559:
1560: attributes = parseAttrs(isempty);
1561: }
1562:
1563: if (isempty.value)
1564: this .token.type = Node.StartEndTag;
1565:
1566: this .token.attributes = attributes;
1567: this .lexsize = this .txtstart;
1568: this .txtend = this .txtstart;
1569:
1570: /* swallow newline following start tag */
1571: /* special check needed for CRLF sequence */
1572: /* this doesn't apply to empty elements */
1573:
1574: if (expectsContent(this .token)
1575: || this .token.tag == configuration.tt.tagBr) {
1576:
1577: c = this .in.readChar();
1578:
1579: if (c == '\r') {
1580: c = this .in.readChar();
1581:
1582: if (c != '\n')
1583: this .in.ungetChar(c);
1584: } else if (c != '\n' && c != '\f')
1585: this .in.ungetChar(c);
1586:
1587: this .waswhite = true; /* to swallow leading whitespace */
1588: } else
1589: this .waswhite = false;
1590:
1591: this .state = LEX_CONTENT;
1592:
1593: // BEGIN RAVE MODIFICATIONS
1594: //if (this.token.tag == null)
1595: // Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
1596: if (this .token.tag == null) {
1597: if (configuration.inputJspMode
1598: && this .token.element != null
1599: && this .token.element.indexOf(":") != -1) {
1600: // This is probably a JSP tag. We don't want errors on these. We'll
1601: // just treat them as inline tags.
1602: this .configuration.tt
1603: .defineInlineTag(this .token.element);
1604: this .token.tag = configuration.tt
1605: .lookup(this .token.element);
1606: } else {
1607: Report.error(this , null, this .token,
1608: Report.UNKNOWN_ELEMENT);
1609: }
1610: }
1611: // END RAVE MODIFICATIONS
1612: else if (!this .configuration.XmlTags) {
1613: this .versions &= this .token.tag.versions;
1614:
1615: if ((this .token.tag.versions & Dict.VERS_PROPRIETARY) != 0) {
1616: if (!this .configuration.MakeClean
1617: && (this .token.tag == configuration.tt.tagNobr || this .token.tag == configuration.tt.tagWbr))
1618: Report.warning(this , null, this .token,
1619: Report.PROPRIETARY_ELEMENT);
1620: }
1621:
1622: if (this .token.tag.chkattrs != null) {
1623: this .token.checkUniqueAttributes(this );
1624: this .token.tag.chkattrs.check(this , this .token);
1625: } else
1626: this .token.checkAttributes(this );
1627: }
1628:
1629: return this .token; /* return start tag */
1630:
1631: case LEX_COMMENT: /* seen <!-- so look for --> */
1632:
1633: if (c != '-')
1634: continue;
1635:
1636: c = this .in.readChar();
1637: addCharToLexer(c);
1638:
1639: if (c != '-')
1640: continue;
1641:
1642: end_comment: while (true) {
1643: c = this .in.readChar();
1644:
1645: if (c == '>') {
1646: if (badcomment != 0)
1647: Report.warning(this , null, null,
1648: Report.MALFORMED_COMMENT);
1649:
1650: this .txtend = this .lexsize - 2; // AQ 8Jul2000
1651: this .lexbuf[this .lexsize] = (byte) '\0';
1652: this .state = LEX_CONTENT;
1653: this .waswhite = false;
1654: this .token = newNode(Node.CommentTag,
1655: this .lexbuf, this .txtstart, this .txtend);
1656:
1657: /* now look for a line break */
1658:
1659: c = this .in.readChar();
1660:
1661: if (c == '\r') {
1662: c = this .in.readChar();
1663:
1664: if (c != '\n')
1665: this .token.linebreak = true;
1666: }
1667:
1668: if (c == '\n')
1669: this .token.linebreak = true;
1670: else
1671: this .in.ungetChar(c);
1672:
1673: return this .token;
1674: }
1675:
1676: /* note position of first such error in the comment */
1677: if (badcomment == 0) {
1678: this .lines = this .in.curline;
1679: this .columns = this .in.curcol - 3;
1680: }
1681:
1682: badcomment++;
1683: if (this .configuration.FixComments)
1684: this .lexbuf[this .lexsize - 2] = (byte) '=';
1685:
1686: addCharToLexer(c);
1687:
1688: /* if '-' then look for '>' to end the comment */
1689: if (c != '-')
1690: break end_comment;
1691:
1692: }
1693: /* otherwise continue to look for --> */
1694: this .lexbuf[this .lexsize - 2] = (byte) '=';
1695: continue;
1696:
1697: case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
1698: map = MAP((char) c);
1699:
1700: if ((map & WHITE) != 0) {
1701: if (this .waswhite)
1702: this .lexsize -= 1;
1703:
1704: this .waswhite = true;
1705: } else
1706: this .waswhite = false;
1707:
1708: if (c != '>')
1709: continue;
1710:
1711: this .lexsize -= 1;
1712: this .txtend = this .lexsize;
1713: this .lexbuf[this .lexsize] = (byte) '\0';
1714: this .state = LEX_CONTENT;
1715: this .waswhite = false;
1716: this .token = newNode(Node.DocTypeTag, this .lexbuf,
1717: this .txtstart, this .txtend);
1718: /* make a note of the version named by the doctype */
1719: this .doctype = findGivenVersion(this .token);
1720: return this .token;
1721:
1722: case LEX_PROCINSTR: /* seen <? so look for '>' */
1723: /* check for PHP preprocessor instructions <?php ... ?> */
1724:
1725: if (this .lexsize - this .txtstart == 3) {
1726: if ((getString(this .lexbuf, this .txtstart, 3))
1727: .equals("php")) {
1728: this .state = LEX_PHP;
1729: continue;
1730: }
1731: }
1732:
1733: if (this .configuration.XmlPIs) /* insist on ?> as terminator */
1734: {
1735: if (c != '?')
1736: continue;
1737:
1738: /* now look for '>' */
1739: c = this .in.readChar();
1740:
1741: if (c == StreamIn.EndOfStream) {
1742: Report.warning(this , null, null,
1743: Report.UNEXPECTED_END_OF_FILE);
1744: this .in.ungetChar(c);
1745: continue;
1746: }
1747:
1748: addCharToLexer(c);
1749: }
1750:
1751: if (c != '>')
1752: continue;
1753:
1754: this .lexsize -= 1;
1755: this .txtend = this .lexsize;
1756: this .lexbuf[this .lexsize] = (byte) '\0';
1757: this .state = LEX_CONTENT;
1758: this .waswhite = false;
1759: this .token = newNode(Node.ProcInsTag, this .lexbuf,
1760: this .txtstart, this .txtend);
1761: return this .token;
1762:
1763: case LEX_ASP: /* seen <% so look for "%>" */
1764: if (c != '%')
1765: continue;
1766:
1767: /* now look for '>' */
1768: c = this .in.readChar();
1769:
1770: if (c != '>') {
1771: this .in.ungetChar(c);
1772: continue;
1773: }
1774:
1775: this .lexsize -= 1;
1776: this .txtend = this .lexsize;
1777: this .lexbuf[this .lexsize] = (byte) '\0';
1778: this .state = LEX_CONTENT;
1779: this .waswhite = false;
1780: this .token = newNode(Node.AspTag, this .lexbuf,
1781: this .txtstart, this .txtend);
1782: return this .token;
1783:
1784: case LEX_JSTE: /* seen <# so look for "#>" */
1785: if (c != '#')
1786: continue;
1787:
1788: /* now look for '>' */
1789: c = this .in.readChar();
1790:
1791: if (c != '>') {
1792: this .in.ungetChar(c);
1793: continue;
1794: }
1795:
1796: this .lexsize -= 1;
1797: this .txtend = this .lexsize;
1798: this .lexbuf[this .lexsize] = (byte) '\0';
1799: this .state = LEX_CONTENT;
1800: this .waswhite = false;
1801: this .token = newNode(Node.JsteTag, this .lexbuf,
1802: this .txtstart, this .txtend);
1803: return this .token;
1804:
1805: case LEX_PHP: /* seen "<?php" so look for "?>" */
1806: if (c != '?')
1807: continue;
1808:
1809: /* now look for '>' */
1810: c = this .in.readChar();
1811:
1812: if (c != '>') {
1813: this .in.ungetChar(c);
1814: continue;
1815: }
1816:
1817: this .lexsize -= 1;
1818: this .txtend = this .lexsize;
1819: this .lexbuf[this .lexsize] = (byte) '\0';
1820: this .state = LEX_CONTENT;
1821: this .waswhite = false;
1822: this .token = newNode(Node.PhpTag, this .lexbuf,
1823: this .txtstart, this .txtend);
1824: return this .token;
1825:
1826: case LEX_SECTION: /* seen "<![" so look for "]>" */
1827: if (c == '[') {
1828: if (this .lexsize == (this .txtstart + 6)
1829: && (getString(this .lexbuf, this .txtstart, 6))
1830: .equals("CDATA[")) {
1831: this .state = LEX_CDATA;
1832: this .lexsize -= 6;
1833: continue;
1834: }
1835: }
1836:
1837: if (c != ']')
1838: continue;
1839:
1840: /* now look for '>' */
1841: c = this .in.readChar();
1842:
1843: if (c != '>') {
1844: this .in.ungetChar(c);
1845: continue;
1846: }
1847:
1848: this .lexsize -= 1;
1849: this .txtend = this .lexsize;
1850: this .lexbuf[this .lexsize] = (byte) '\0';
1851: this .state = LEX_CONTENT;
1852: this .waswhite = false;
1853: this .token = newNode(Node.SectionTag, this .lexbuf,
1854: this .txtstart, this .txtend);
1855: return this .token;
1856:
1857: case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
1858: if (c != ']')
1859: continue;
1860:
1861: /* now look for ']' */
1862: c = this .in.readChar();
1863:
1864: if (c != ']') {
1865: this .in.ungetChar(c);
1866: continue;
1867: }
1868:
1869: /* now look for '>' */
1870: c = this .in.readChar();
1871:
1872: if (c != '>') {
1873: this .in.ungetChar(c);
1874: continue;
1875: }
1876:
1877: this .lexsize -= 1;
1878: this .txtend = this .lexsize;
1879: this .lexbuf[this .lexsize] = (byte) '\0';
1880: this .state = LEX_CONTENT;
1881: this .waswhite = false;
1882: this .token = newNode(Node.CDATATag, this .lexbuf,
1883: this .txtstart, this .txtend);
1884: return this .token;
1885: }
1886: }
1887:
1888: if (this .state == LEX_CONTENT) /* text string */
1889: {
1890: this .txtend = this .lexsize;
1891:
1892: if (this .txtend > this .txtstart) {
1893: this .in.ungetChar(c);
1894:
1895: if (this .lexbuf[this .lexsize - 1] == (byte) ' ') {
1896: this .lexsize -= 1;
1897: this .txtend = this .lexsize;
1898: }
1899:
1900: this .token = newNode(Node.TextNode, this .lexbuf,
1901: this .txtstart, this .txtend);
1902: return this .token;
1903: }
1904: } else if (this .state == LEX_COMMENT) /* comment */
1905: {
1906: if (c == StreamIn.EndOfStream)
1907: Report.warning(this , null, null,
1908: Report.MALFORMED_COMMENT);
1909:
1910: this .txtend = this .lexsize;
1911: this .lexbuf[this .lexsize] = (byte) '\0';
1912: this .state = LEX_CONTENT;
1913: this .waswhite = false;
1914: this .token = newNode(Node.CommentTag, this .lexbuf,
1915: this .txtstart, this .txtend);
1916: return this .token;
1917: }
1918:
1919: return null;
1920: }
1921:
1922: /*
1923: parser for ASP within start tags
1924:
Some people use ASP to customize attributes.
Tidy isn't really well suited to dealing with ASP.
This is a workaround for attributes, but won't
deal with the case where the ASP is used to tailor
the attribute value. Here is an example of a
workaround for using ASP in attribute values:
1931:
1932: href="<%=rsSchool.Fields("ID").Value%>"
1933:
1934: where the ASP that generates the attribute value
1935: is masked from Tidy by the quotemarks.
1936:
1937: */
1938:
1939: public Node parseAsp() {
1940: int c;
1941: Node asp = null;
1942:
1943: this .txtstart = this .lexsize;
1944:
1945: for (;;) {
1946: c = this .in.readChar();
1947: addCharToLexer(c);
1948:
1949: if (c != '%')
1950: continue;
1951:
1952: c = this .in.readChar();
1953: addCharToLexer(c);
1954:
1955: if (c == '>')
1956: break;
1957: }
1958:
1959: this .lexsize -= 2;
1960: this .txtend = this .lexsize;
1961:
1962: if (this .txtend > this .txtstart)
1963: asp = newNode(Node.AspTag, this .lexbuf, this .txtstart,
1964: this .txtend);
1965:
1966: this .txtstart = this .txtend;
1967: return asp;
1968: }
1969:
1970: /*
1971: PHP is like ASP but is based upon XML
1972: processing instructions, e.g. <?php ... ?>
1973: */
1974: public Node parsePhp() {
1975: int c;
1976: Node php = null;
1977:
1978: this .txtstart = this .lexsize;
1979:
1980: for (;;) {
1981: c = this .in.readChar();
1982: addCharToLexer(c);
1983:
1984: if (c != '?')
1985: continue;
1986:
1987: c = this .in.readChar();
1988: addCharToLexer(c);
1989:
1990: if (c == '>')
1991: break;
1992: }
1993:
1994: this .lexsize -= 2;
1995: this .txtend = this .lexsize;
1996:
1997: if (this .txtend > this .txtstart)
1998: php = newNode(Node.PhpTag, this .lexbuf, this .txtstart,
1999: this .txtend);
2000:
2001: this .txtstart = this .txtend;
2002: return php;
2003: }
2004:
2005: /* consumes the '>' terminating start tags */
2006: public String parseAttribute(MutableBoolean isempty,
2007: MutableObject asp, MutableObject php) {
2008: int start = 0;
2009: // int len = 0; Removed by BUGFIX for 126265
2010: short map;
2011: String attr;
2012: int c = 0;
2013:
2014: asp.setObject(null); /* clear asp pointer */
2015: php.setObject(null); /* clear php pointer */
2016: /* skip white space before the attribute */
2017:
2018: for (;;) {
2019: c = this .in.readChar();
2020:
2021: if (c == '/') {
2022: c = this .in.readChar();
2023:
2024: if (c == '>') {
2025: isempty.value = true;
2026: return null;
2027: }
2028:
2029: this .in.ungetChar(c);
2030: c = '/';
2031: break;
2032: }
2033:
2034: if (c == '>')
2035: return null;
2036:
2037: if (c == '<') {
2038: c = this .in.readChar();
2039:
2040: if (c == '%') {
2041: asp.setObject(parseAsp());
2042: return null;
2043: } else if (c == '?') {
2044: php.setObject(parsePhp());
2045: return null;
2046: }
2047:
2048: this .in.ungetChar(c);
2049: Report.attrError(this , this .token, null,
2050: Report.UNEXPECTED_GT);
2051: return null;
2052: }
2053:
2054: if (c == '"' || c == '\'') {
2055: Report.attrError(this , this .token, null,
2056: Report.UNEXPECTED_QUOTEMARK);
2057: continue;
2058: }
2059:
2060: if (c == StreamIn.EndOfStream) {
2061: Report.attrError(this , this .token, null,
2062: Report.UNEXPECTED_END_OF_FILE);
2063: this .in.ungetChar(c);
2064: return null;
2065: }
2066:
2067: map = MAP((char) c);
2068:
2069: if ((map & WHITE) == 0)
2070: break;
2071: }
2072:
2073: start = this .lexsize;
2074:
2075: for (;;) {
2076: /* but push back '=' for parseValue() */
2077: if (c == '=' || c == '>') {
2078: this .in.ungetChar(c);
2079: break;
2080: }
2081:
2082: if (c == '<' || c == StreamIn.EndOfStream) {
2083: this .in.ungetChar(c);
2084: break;
2085: }
2086:
2087: map = MAP((char) c);
2088:
2089: if ((map & WHITE) != 0)
2090: break;
2091:
2092: /* what should be done about non-namechar characters? */
2093: /* currently these are incorporated into the attr name */
2094: // BEGIN RAVE MODIFICATIONS
2095: if (this .configuration.inputJspMode) { // don't change case of attributes
2096: ;
2097: } else
2098: // END RAVE MODIFICATIONS
2099: if (!this .configuration.XmlTags && (map & UPPERCASE) != 0)
2100: c += (int) ('a' - 'A');
2101:
2102: // ++len; Removed by BUGFIX for 126265
2103: addCharToLexer(c);
2104:
2105: c = this .in.readChar();
2106: }
2107:
2108: // Following line added by GLP to fix BUG 126265. This is a temporary comment
2109: // and should be removed when Tidy is fixed.
2110: int len = this .lexsize - start;
2111: attr = (len > 0 ? getString(this .lexbuf, start, len) : null);
2112: this .lexsize = start;
2113:
2114: return attr;
2115: }
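/*
Editorial sketch (not in the original source): for a start tag such as

    <td align=center valign="top">

successive parseAttribute() calls return "align", then "valign", then
null once the terminating '>' is reached (isempty.value is set to true
instead when the tag ends in "/>"). Attribute names are folded to
lower case here unless XmlTags or the Rave inputJspMode option is in
effect; the values themselves are read by parseValue() below.
*/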
2116:
2117: /*
invoked when '<' is seen in place of an attribute value;
unless the content is ASP, PHP or Tango, the scan also
terminates on whitespace. This routine recognizes ' and "
quoted strings.
2121: */
2122: public int parseServerInstruction() {
2123: int c, map, delim = '"';
2124: boolean isrule = false;
2125:
2126: c = this .in.readChar();
2127: addCharToLexer(c);
2128:
2129: /* check for ASP, PHP or Tango */
2130: if (c == '%' || c == '?' || c == '@')
2131: isrule = true;
2132:
2133: for (;;) {
2134: c = this .in.readChar();
2135:
2136: if (c == StreamIn.EndOfStream)
2137: break;
2138:
2139: if (c == '>') {
2140: if (isrule)
2141: addCharToLexer(c);
2142: else
2143: this .in.ungetChar(c);
2144:
2145: break;
2146: }
2147:
2148: /* if not recognized as ASP, PHP or Tango */
2149: /* then also finish value on whitespace */
2150: if (!isrule) {
2151: map = MAP((char) c);
2152:
2153: if ((map & WHITE) != 0)
2154: break;
2155: }
2156:
2157: addCharToLexer(c);
2158:
if (c == '"') {
/* copy a double-quoted substring; also stop at end of input
so an unterminated string cannot loop forever */
do {
c = this.in.readChar();
addCharToLexer(c);
} while (c != '"' && c != StreamIn.EndOfStream);
delim = '\'';
continue;
}

if (c == '\'') {
do {
c = this.in.readChar();
addCharToLexer(c);
} while (c != '\'' && c != StreamIn.EndOfStream);
}
2174: }
2175:
2176: return delim;
2177: }
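/*
Editorial sketch (not in the original source): this routine is reached
from parseValue() when an unquoted value starts with '<', e.g.

    src=<%=imageUrl%>

The leading '%' (or '?' or '@') marks it as a server-side rule, so the
text is copied through to the closing '>', quoted substrings included;
for anything else the scan also stops at the first whitespace
character. The return value is the quote character to record for the
value: '"' by default, or '\'' when the instruction itself contained a
double-quoted string, presumably so the value can later be written out
without escaping.
*/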
2178:
2179: /* values start with "=" or " = " etc. */
2180: /* doesn't consume the ">" at end of start tag */
2181:
2182: public String parseValue(String name, boolean foldCase,
2183: MutableBoolean isempty, MutableInteger pdelim) {
2184: int len = 0;
2185: int start;
2186: short map;
2187: boolean seen_gt = false;
2188: boolean munge = true;
2189: int c = 0;
2190: int lastc, delim, quotewarning;
2191: String value;
2192:
2193: delim = 0;
2194: pdelim.value = (int) '"';
2195:
2196: /*
2197: Henry Zrepa reports that some folk are using the
2198: embed element with script attributes where newlines
2199: are significant and must be preserved
2200: */
2201: if (configuration.LiteralAttribs)
2202: munge = false;
2203:
2204: /* skip white space before the '=' */
2205:
2206: for (;;) {
2207: c = this .in.readChar();
2208:
2209: if (c == StreamIn.EndOfStream) {
2210: this .in.ungetChar(c);
2211: break;
2212: }
2213:
2214: map = MAP((char) c);
2215:
2216: if ((map & WHITE) == 0)
2217: break;
2218: }
2219:
2220: /*
2221: c should be '=' if there is a value
2222: other legal possibilities are white
2223: space, '/' and '>'
2224: */
2225:
2226: if (c != '=') {
2227: this .in.ungetChar(c);
2228: return null;
2229: }
2230:
2231: /* skip white space after '=' */
2232:
2233: for (;;) {
2234: c = this .in.readChar();
2235:
2236: if (c == StreamIn.EndOfStream) {
2237: this .in.ungetChar(c);
2238: break;
2239: }
2240:
2241: map = MAP((char) c);
2242:
2243: if ((map & WHITE) == 0)
2244: break;
2245: }
2246:
2247: /* check for quote marks */
2248:
2249: if (c == '"' || c == '\'')
2250: delim = c;
2251: else if (c == '<') {
2252: start = this .lexsize;
2253: addCharToLexer(c);
2254: pdelim.value = parseServerInstruction();
2255: len = this .lexsize - start;
2256: this .lexsize = start;
2257: return (len > 0 ? getString(this .lexbuf, start, len) : null);
2258: } else
2259: this .in.ungetChar(c);
2260:
2261: /*
2262: and read the value string
2263: check for quote mark if needed
2264: */
2265:
2266: quotewarning = 0;
2267: start = this .lexsize;
2268: c = '\0';
2269:
2270: for (;;) {
2271: lastc = c; /* track last character */
2272: c = this .in.readChar();
2273:
2274: if (c == StreamIn.EndOfStream) {
2275: Report.attrError(this , this .token, null,
2276: Report.UNEXPECTED_END_OF_FILE);
2277: this .in.ungetChar(c);
2278: break;
2279: }
2280:
2281: if (delim == (char) 0) {
2282: if (c == '>') {
2283: this .in.ungetChar(c);
2284: break;
2285: }
2286:
2287: if (c == '"' || c == '\'') {
2288: Report.attrError(this , this .token, null,
2289: Report.UNEXPECTED_QUOTEMARK);
2290: break;
2291: }
2292:
2293: if (c == '<') {
2294: /* this.in.ungetChar(c); */
2295: Report.attrError(this , this .token, null,
2296: Report.UNEXPECTED_GT);
2297: /* break; */
2298: }
2299:
2300: /*
For cases like <br clear=all/> we need to avoid treating "/>" as
part of the attribute value; however, care is needed not to treat
<a href=http://www.acme.com/> the same way, which would map the
<a> tag to <a href="http://www.acme.com"/>
2305: */
2306: if (c == '/') {
2307: /* peek ahead in case of /> */
2308: c = this .in.readChar();
2309:
2310: if (c == '>'
2311: && !AttributeTable
2312: .getDefaultAttributeTable().isUrl(
2313: name)) {
2314: isempty.value = true;
2315: this .in.ungetChar(c);
2316: break;
2317: }
2318:
2319: /* unget peeked char */
2320: this .in.ungetChar(c);
2321: c = '/';
2322: }
2323: } else /* delim is '\'' or '"' */
2324: {
2325: if (c == delim)
2326: break;
2327:
2328: /* treat CRLF, CR and LF as single line break */
2329:
2330: if (c == '\r') {
2331: c = this .in.readChar();
2332: if (c != '\n')
2333: this .in.ungetChar(c);
2334:
2335: c = '\n';
2336: }
2337:
2338: if (c == '\n' || c == '<' || c == '>')
2339: ++quotewarning;
2340:
2341: if (c == '>')
2342: seen_gt = true;
2343: }
2344:
2345: if (c == '&') {
2346: addCharToLexer(c);
2347: parseEntity((short) 0);
2348: continue;
2349: }
2350:
2351: /*
2352: kludge for JavaScript attribute values
2353: with line continuations in string literals
2354: */
2355: if (c == '\\') {
2356: c = this .in.readChar();
2357:
2358: if (c != '\n') {
2359: this .in.ungetChar(c);
2360: c = '\\';
2361: }
2362: }
2363:
2364: map = MAP((char) c);
2365:
2366: if ((map & WHITE) != 0) {
2367: if (delim == (char) 0)
2368: break;
2369:
2370: if (munge) {
2371: c = ' ';
2372:
2373: if (lastc == ' ')
2374: continue;
2375: }
2376: } else if (foldCase && (map & UPPERCASE) != 0)
2377: c += (int) ('a' - 'A');
2378:
2379: addCharToLexer(c);
2380: }
2381:
2382: if (quotewarning > 10 && seen_gt && munge) {
2383: /*
there is almost certainly a missing trailing quote mark,
as we have seen too many newlines, < or > characters.
2386:
2387: an exception is made for Javascript attributes and the
2388: javascript URL scheme which may legitimately include < and >
2389: */
2390: if (!AttributeTable.getDefaultAttributeTable().isScript(
2391: name)
2392: && !(AttributeTable.getDefaultAttributeTable()
2393: .isUrl(name) && (getString(this .lexbuf,
2394: start, 11)).equals("javascript:")))
2395: Report.error(this , null, null,
2396: Report.SUSPECTED_MISSING_QUOTE);
2397: }
2398:
2399: len = this .lexsize - start;
2400: this .lexsize = start;
2401:
2402: if (len > 0 || delim != 0)
2403: value = getString(this .lexbuf, start, len);
2404: else
2405: value = null;
2406:
2407: /* note delimiter if given */
2408: if (delim != 0)
2409: pdelim.value = delim;
2410: else
2411: pdelim.value = (int) '"';
2412:
2413: return value;
2414: }
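/*
Editorial sketch (not in the original source): unless LiteralAttribs is
set, parseValue() normalises the text it collects: CRLF and CR become
LF, runs of whitespace inside a quoted value collapse to a single
space, entities are expanded via parseEntity(), and a value with many
newlines, '<' or '>' before the closing quote is reported as a
suspected missing quote mark. For example:

    <img alt="a   &amp;
    b">                    yields the attribute value "a & b"
*/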
2415:
2416: /* attr must be non-null */
2417: public static boolean isValidAttrName(String attr) {
2418: short map;
2419: char c;
2420: int i;
2421:
2422: /* first character should be a letter */
2423: c = attr.charAt(0);
2424: map = MAP(c);
2425:
if ((map & LETTER) == 0)
2427: return false;
2428:
2429: /* remaining characters should be namechars */
2430: for (i = 1; i < attr.length(); i++) {
2431: c = attr.charAt(i);
2432: map = MAP(c);
2433:
2434: if ((map & NAMECHAR) != 0)
2435: continue;
2436:
2437: return false;
2438: }
2439:
2440: return true;
2441: }
2442:
2443: /* swallows closing '>' */
2444:
2445: public AttVal parseAttrs(MutableBoolean isempty) {
2446: AttVal av, list;
2447: String attribute, value;
2448: MutableInteger delim = new MutableInteger();
2449: MutableObject asp = new MutableObject();
2450: MutableObject php = new MutableObject();
2451:
2452: list = null;
2453:
2454: for (; !endOfInput();) {
2455: attribute = parseAttribute(isempty, asp, php);
2456:
2457: if (attribute == null) {
2458: /* check if attributes are created by ASP markup */
2459: if (asp.getObject() != null) {
2460: av = new AttVal(list, null, (Node) asp.getObject(),
2461: null, '\0', null, null);
2462: list = av;
2463: continue;
2464: }
2465:
2466: /* check if attributes are created by PHP markup */
2467: if (php.getObject() != null) {
2468: av = new AttVal(list, null, null, (Node) php
2469: .getObject(), '\0', null, null);
2470: list = av;
2471: continue;
2472: }
2473:
2474: break;
2475: }
2476:
2477: value = parseValue(attribute, false, isempty, delim);
2478:
2479: if (attribute != null && isValidAttrName(attribute)) {
2480: av = new AttVal(list, null, null, null, delim.value,
2481: attribute, value);
2482: av.dict = AttributeTable.getDefaultAttributeTable()
2483: .findAttribute(av);
2484: list = av;
2485: } else {
2486: av = new AttVal(null, null, null, null, 0, attribute,
2487: value);
2488: Report.attrError(this , this .token, value,
2489: Report.BAD_ATTRIBUTE_VALUE);
2490: }
2491: }
2492:
2493: return list;
2494: }
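/*
Editorial sketch (not in the original source): parseAttrs() is called
once a tag name has been read and returns the attributes as a singly
linked AttVal list whose head is the attribute parsed last. Assuming a
hypothetical Lexer "lexer" positioned just after a tag name, and the
usual AttVal fields (next, attribute, value), the result can be walked
like this:

    MutableBoolean isempty = new MutableBoolean();
    for (AttVal av = lexer.parseAttrs(isempty); av != null; av = av.next)
        System.out.println(av.attribute + "=" + av.value);
*/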
2495:
2496: /*
2497: push a copy of an inline node onto stack
2498: but don't push if implicit or OBJECT or APPLET
2499: (implicit tags are ones generated from the istack)
2500:
2501: One issue arises with pushing inlines when
2502: the tag is already pushed. For instance:
2503:
2504: <p><em>text
2505: <p><em>more text
2506:
2507: Shouldn't be mapped to
2508:
2509: <p><em>text</em></p>
2510: <p><em><em>more text</em></em>
2511: */
2512: public void pushInline(Node node) {
2513: IStack is;
2514:
2515: if (node.implicit)
2516: return;
2517:
2518: if (node.tag == null)
2519: return;
2520:
2521: if ((node.tag.model & Dict.CM_INLINE) == 0)
2522: return;
2523:
2524: if ((node.tag.model & Dict.CM_OBJECT) != 0)
2525: return;
2526:
2527: if (node.tag != configuration.tt.tagFont && isPushed(node))
2528: return;
2529:
// push a copy of the node's tag, element name and attributes;
// the Stack grows as needed
2531: is = new IStack();
2532: is.tag = node.tag;
2533: is.element = node.element;
2534: if (node.attributes != null)
2535: is.attributes = cloneAttributes(node.attributes);
2536: this .istack.push(is);
2537: }
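/*
Editorial sketch (not in the original source): for the fragment in the
comment above, the <em> of the first paragraph is pushed onto istack;
when the second "<p><em>" is seen, isPushed() finds the earlier <em>
and pushInline() returns without pushing a duplicate. FONT is the one
exception and is always pushed, presumably because nested font
elements are legitimate and each carries its own attributes.
*/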
2538:
2539: /* pop inline stack */
2540: public void popInline(Node node) {
2541: AttVal av;
2542: IStack is;
2543:
2544: if (node != null) {
2545:
2546: if (node.tag == null)
2547: return;
2548:
2549: if ((node.tag.model & Dict.CM_INLINE) == 0)
2550: return;
2551:
2552: if ((node.tag.model & Dict.CM_OBJECT) != 0)
2553: return;
2554:
2555: // if node is </a> then pop until we find an <a>
2556: if (node.tag == configuration.tt.tagA) {
2557:
2558: while (this .istack.size() > 0) {
2559: is = (IStack) this .istack.pop();
2560: if (is.tag == configuration.tt.tagA) {
2561: break;
2562: }
2563: }
2564:
2565: if (this .insert >= this .istack.size())
2566: this .insert = -1;
2567: return;
2568: }
2569: }
2570:
2571: if (this .istack.size() > 0) {
2572: is = (IStack) this .istack.pop();
2573: if (this .insert >= this .istack.size())
2574: this .insert = -1;
2575: }
2576: }
2577:
2578: public boolean isPushed(Node node) {
2579: int i;
2580: IStack is;
2581:
2582: for (i = this .istack.size() - 1; i >= 0; --i) {
2583: is = (IStack) this .istack.elementAt(i);
2584: if (is.tag == node.tag)
2585: return true;
2586: }
2587:
2588: return false;
2589: }
2590:
2591: /*
2592: This has the effect of inserting "missing" inline
2593: elements around the contents of blocklevel elements
2594: such as P, TD, TH, DIV, PRE etc. This procedure is
2595: called at the start of ParseBlock. when the inline
2596: stack is not empty, as will be the case in:
2597:
2598: <i><h1>italic heading</h1></i>
2599:
2600: which is then treated as equivalent to
2601:
2602: <h1><i>italic heading</i></h1>
2603:
2604: This is implemented by setting the lexer into a mode
2605: where it gets tokens from the inline stack rather than
2606: from the input stream.
2607: */
2608: public int inlineDup(Node node) {
2609: int n;
2610:
2611: n = this .istack.size() - this .istackbase;
2612: if (n > 0) {
2613: this .insert = this .istackbase;
2614: this .inode = node;
2615: }
2616:
2617: return n;
2618: }
2619:
2620: public Node insertedToken() {
2621: Node node;
2622: IStack is;
2623: int n;
2624:
// insert will only be -1 here if inode is non-null
2626: if (this .insert == -1) {
2627: node = this .inode;
2628: this .inode = null;
2629: return node;
2630: }
2631:
// if this is the "latest" node then update
2633: // the position, otherwise use current values
2634:
2635: if (this .inode == null) {
2636: this .lines = this .in.curline;
2637: this .columns = this .in.curcol;
2638: }
2639:
2640: node = newNode(Node.StartTag, this .lexbuf, this .txtstart,
2641: this .txtend); // GLP: Bugfix 126261. Remove when this change
2642: // is fixed in istack.c in the original Tidy
2643: node.implicit = true;
2644: is = (IStack) this .istack.elementAt(this .insert);
2645: node.element = is.element;
2646: node.tag = is.tag;
2647: if (is.attributes != null)
2648: node.attributes = cloneAttributes(is.attributes);
2649:
2650: // advance lexer to next item on the stack
2651: n = this .insert;
2652:
2653: // and recover state if we have reached the end
2654: if (++n < this .istack.size()) {
2655: this .insert = n;
2656: } else {
2657: this .insert = -1;
2658: }
2659:
2660: return node;
2661: }
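/*
Editorial sketch (not in the original source): inlineDup() and
insertedToken() together implement the mode described above. For
<i><h1>italic heading</h1></i> the <i> entry is still on the inline
stack when the H1 block is parsed, so the block parser would do
roughly:

    if (lexer.inlineDup(null) > 0) {
        Node inline = lexer.insertedToken();   // implicit <i> start tag
        // ... the parser then places this node inside the H1 ...
    }

Each insertedToken() call clones one stack entry (tag, element name,
attributes) as an implicit start tag and advances insert; once the
stack entries are exhausted the deferred inode, if any, is returned
one last time.
*/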
2662:
2663: /* AQ: Try this for speed optimization */
2664: public static int wstrcasecmp(String s1, String s2) {
2665: return (s1.equalsIgnoreCase(s2) ? 0 : 1);
2666: }
2667:
2668: public static int wstrcaselexcmp(String s1, String s2) {
2669: char c;
2670: int i = 0;
2671:
2672: while (i < s1.length() && i < s2.length()) {
2673: c = s1.charAt(i);
2674: if (toLower(c) != toLower(s2.charAt(i))) {
2675: break;
2676: }
2677: i += 1;
2678: }
2679: if (i == s1.length() && i == s2.length()) {
2680: return 0;
2681: } else if (i == s1.length()) {
2682: return -1;
2683: } else if (i == s2.length()) {
2684: return 1;
2685: } else {
2686: return (s1.charAt(i) > s2.charAt(i) ? 1 : -1);
2687: }
2688: }
2689:
2690: public static boolean wsubstr(String s1, String s2) {
2691: int i;
2692: int len1 = s1.length();
2693: int len2 = s2.length();
2694:
for (i = 0; i <= len1 - len2; ++i) {
/* compare s2 against the len2-char window at i, not the whole suffix */
if (s1.regionMatches(true, i, s2, 0, len2))
return true;
}
2699:
2700: return false;
2701: }
2702:
2703: public boolean canPrune(Node element) {
2704: if (element.type == Node.TextNode)
2705: return true;
2706:
2707: if (element.content != null)
2708: return false;
2709:
2710: if (element.tag == configuration.tt.tagA
2711: && element.attributes != null)
2712: return false;
2713:
2714: if (element.tag == configuration.tt.tagP
2715: && !this .configuration.DropEmptyParas)
2716: return false;
2717:
2718: if (element.tag == null)
2719: return false;
2720:
2721: if ((element.tag.model & Dict.CM_ROW) != 0)
2722: return false;
2723:
2724: if (element.tag == configuration.tt.tagApplet)
2725: return false;
2726:
2727: if (element.tag == configuration.tt.tagObject)
2728: return false;
2729:
2730: if (element.attributes != null
2731: && (element.getAttrByName("id") != null || element
2732: .getAttrByName("name") != null))
2733: return false;
2734:
2735: return true;
2736: }
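/*
Editorial sketch (not in the original source): examples of how the
tests above decide for empty elements:

    <p></p>              prunable only when DropEmptyParas is set
    <a name="x"></a>     kept: an anchor with attributes
    <span id="y"></span> kept: id/name values may be link targets
    <unknown></unknown>  kept: no tag entry in the dictionary
    <b></b>              prunable
*/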
2737:
2738: /* duplicate name attribute as an id */
2739: public void fixId(Node node) {
2740: AttVal name = node.getAttrByName("name");
2741: AttVal id = node.getAttrByName("id");
2742:
2743: if (name != null) {
2744: if (id != null) {
2745: if (!id.value.equals(name.value))
2746: Report.attrError(this , node, "name",
2747: Report.ID_NAME_MISMATCH);
2748: } else if (this .configuration.XmlOut)
2749: node.addAttribute("id", name.value);
2750: }
2751: }
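/*
Editorial sketch (not in the original source): with XmlOut enabled this
turns

    <a name="top">    into    <a name="top" id="top">

so the anchor stays addressable where fragment identifiers are defined
by id rather than name; if both attributes are already present with
different values, an ID_NAME_MISMATCH warning is reported instead.
*/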
2752:
2753: /*
2754: defer duplicates when entering a table or other
2755: element where the inlines shouldn't be duplicated
2756: */
2757: public void deferDup() {
2758: this .insert = -1;
2759: this .inode = null;
2760: }
2761:
2762: /* Private methods and fields */
2763:
2764: /* lexer char types */
2765: private static final short DIGIT = 1;
2766: private static final short LETTER = 2;
2767: private static final short NAMECHAR = 4;
2768: private static final short WHITE = 8;
2769: private static final short NEWLINE = 16;
2770: private static final short LOWERCASE = 32;
2771: private static final short UPPERCASE = 64;
2772:
2773: /* lexer GetToken states */
2774:
2775: private static final short LEX_CONTENT = 0;
2776: private static final short LEX_GT = 1;
2777: private static final short LEX_ENDTAG = 2;
2778: private static final short LEX_STARTTAG = 3;
2779: private static final short LEX_COMMENT = 4;
2780: private static final short LEX_DOCTYPE = 5;
2781: private static final short LEX_PROCINSTR = 6;
2782: private static final short LEX_ENDCOMMENT = 7;
2783: private static final short LEX_CDATA = 8;
2784: private static final short LEX_SECTION = 9;
2785: private static final short LEX_ASP = 10;
2786: private static final short LEX_JSTE = 11;
2787: private static final short LEX_PHP = 12;
2788:
2789: /* used to classify chars for lexical purposes */
2790: private static short[] lexmap = new short[128];
2791:
2792: private static void mapStr(String str, short code) {
2793: int j;
2794:
2795: for (int i = 0; i < str.length(); i++) {
2796: j = (int) str.charAt(i);
2797: lexmap[j] |= code;
2798: }
2799: }
2800:
2801: static {
2802: mapStr("\r\n\f", (short) (NEWLINE | WHITE));
2803: mapStr(" \t", WHITE);
2804: mapStr("-.:_", NAMECHAR);
2805: mapStr("0123456789", (short) (DIGIT | NAMECHAR));
2806: mapStr("abcdefghijklmnopqrstuvwxyz", (short) (LOWERCASE
2807: | LETTER | NAMECHAR));
2808: mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE
2809: | LETTER | NAMECHAR));
2810: }
2811:
2812: private static short MAP(char c) {
2813: return ((int) c < 128 ? lexmap[(int) c] : 0);
2814: }
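/*
Editorial sketch (not in the original source): lexmap packs the
character classes above into one bit set per ASCII code point, so a
single lookup answers several questions at once:

    MAP('Z') == (UPPERCASE | LETTER | NAMECHAR)   // true
    MAP('-') == NAMECHAR                          // true
    (MAP('\t') & WHITE) != 0                      // true
    MAP('~') == 0                                 // unmapped ASCII
    MAP('\u00e9') == 0                            // non-ASCII maps to 0
*/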
2815:
2816: private static boolean isWhite(char c) {
2817: short m = MAP(c);
2818:
2819: return (m & WHITE) != 0;
2820: }
2821:
2822: private static boolean isDigit(char c) {
2823: short m;
2824:
2825: m = MAP(c);
2826:
2827: return (m & DIGIT) != 0;
2828: }
2829:
2830: private static boolean isLetter(char c) {
2831: short m;
2832:
2833: m = MAP(c);
2834:
2835: return (m & LETTER) != 0;
2836: }
2837:
2838: private static char toLower(char c) {
2839: short m = MAP(c);
2840:
2841: if ((m & UPPERCASE) != 0)
2842: c = (char) ((int) c + (int) 'a' - (int) 'A');
2843:
2844: return c;
2845: }
2846:
2847: private static char toUpper(char c) {
2848: short m = MAP(c);
2849:
2850: if ((m & LOWERCASE) != 0)
2851: c = (char) ((int) c + (int) 'A' - (int) 'a');
2852:
2853: return c;
2854: }
2855:
2856: public static char foldCase(char c, boolean tocaps, boolean xmlTags) {
2857: short m;
2858:
2859: if (!xmlTags) {
2860: m = MAP(c);
2861:
2862: if (tocaps) {
2863: if ((m & LOWERCASE) != 0)
2864: c = (char) ((int) c + (int) 'A' - (int) 'a');
2865: } else /* force to lower case */
2866: {
2867: if ((m & UPPERCASE) != 0)
2868: c = (char) ((int) c + (int) 'a' - (int) 'A');
2869: }
2870: }
2871:
2872: return c;
2873: }
2874:
2875: private static class W3CVersionInfo {
2876: String name;
2877: String voyagerName;
2878: String profile;
2879: short code;
2880:
2881: public W3CVersionInfo(String name, String voyagerName,
2882: String profile, short code) {
2883: this .name = name;
2884: this .voyagerName = voyagerName;
2885: this .profile = profile;
2886: this .code = code;
2887: }
2888: }
2889:
2890: /* the 3 URIs for the XHTML 1.0 DTDs */
2891: private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
2892: private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
2893: private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
2894:
2895: private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
2896:
2897: private static Lexer.W3CVersionInfo[] W3CVersion = {
2898: new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict",
2899: voyager_strict, Dict.VERS_HTML40_STRICT),
2900: new W3CVersionInfo("HTML 4.01 Transitional",
2901: "XHTML 1.0 Transitional", voyager_loose,
2902: Dict.VERS_HTML40_LOOSE),
2903: new W3CVersionInfo("HTML 4.01 Frameset",
2904: "XHTML 1.0 Frameset", voyager_frameset,
2905: Dict.VERS_FRAMES),
2906: new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict",
2907: voyager_strict, Dict.VERS_HTML40_STRICT),
2908: new W3CVersionInfo("HTML 4.0 Transitional",
2909: "XHTML 1.0 Transitional", voyager_loose,
2910: Dict.VERS_HTML40_LOOSE),
2911: new W3CVersionInfo("HTML 4.0 Frameset",
2912: "XHTML 1.0 Frameset", voyager_frameset,
2913: Dict.VERS_FRAMES),
2914: new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional",
2915: voyager_loose, Dict.VERS_HTML32),
2916: new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict",
2917: voyager_strict, Dict.VERS_HTML20) };
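/*
Editorial sketch (not in the original source): the table above pairs
each HTML version bit with its display name, its XHTML 1.0 ("voyager")
equivalent and the corresponding DTD URI. A lookup by version code
could be written roughly as follows (versionName is a hypothetical
helper, not part of this class):

    private static String versionName(short code, boolean isvoyager) {
        for (int i = 0; i < W3CVersion.length; i++)
            if (W3CVersion[i].code == code)
                return isvoyager ? W3CVersion[i].voyagerName
                                 : W3CVersion[i].name;
        return null;
    }

The Lexer's doctype reporting presumably consults this array in much
the same way.
*/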
2918:
2919: }
|