Source Code Cross Referenced for Lexer.java in » HTML-Parser » JTidy » org » w3c » tidy » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » HTML Parser » JTidy » org.w3c.tidy
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        /*
0002:         *  Java HTML Tidy - JTidy
0003:         *  HTML parser and pretty printer
0004:         *
0005:         *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0006:         *  Institute of Technology, Institut National de Recherche en
0007:         *  Informatique et en Automatique, Keio University). All Rights
0008:         *  Reserved.
0009:         *
0010:         *  Contributing Author(s):
0011:         *
0012:         *     Dave Raggett <dsr@w3.org>
0013:         *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0014:         *     Gary L Peskin <garyp@firstech.com> (Java development)
0015:         *     Sami Lempinen <sami@lempinen.net> (release management)
0016:         *     Fabrizio Giustina <fgiust at users.sourceforge.net>
0017:         *
0018:         *  The contributing author(s) would like to thank all those who
0019:         *  helped with testing, bug fixes, and patience.  This wouldn't
0020:         *  have been possible without all of you.
0021:         *
0022:         *  COPYRIGHT NOTICE:
0023:         *
0024:         *  This software and documentation is provided "as is," and
0025:         *  the copyright holders and contributing author(s) make no
0026:         *  representations or warranties, express or implied, including
0027:         *  but not limited to, warranties of merchantability or fitness
0028:         *  for any particular purpose or that the use of the software or
0029:         *  documentation will not infringe any third party patents,
0030:         *  copyrights, trademarks or other rights.
0031:         *
0032:         *  The copyright holders and contributing author(s) will not be
0033:         *  liable for any direct, indirect, special or consequential damages
0034:         *  arising out of any use of the software or documentation, even if
0035:         *  advised of the possibility of such damage.
0036:         *
0037:         *  Permission is hereby granted to use, copy, modify, and distribute
0038:         *  this source code, or portions hereof, documentation and executables,
0039:         *  for any purpose, without fee, subject to the following restrictions:
0040:         *
0041:         *  1. The origin of this source code must not be misrepresented.
0042:         *  2. Altered versions must be plainly marked as such and must
0043:         *     not be misrepresented as being the original source.
0044:         *  3. This Copyright notice may not be removed or altered from any
0045:         *     source or altered source distribution.
0046:         *
0047:         *  The copyright holders and contributing author(s) specifically
0048:         *  permit, without fee, and encourage the use of this source code
0049:         *  as a component for supporting the Hypertext Markup Language in
0050:         *  commercial products. If you use this source code in a product,
0051:         *  acknowledgment is not required but would be appreciated.
0052:         *
0053:         */
0054:        package org.w3c.tidy;
0055:
0056:        import java.io.PrintWriter;
0057:        import java.util.List;
0058:        import java.util.Stack;
0059:        import java.util.Vector;
0060:
0061:        /**
0062:         * Lexer for html parser.
0063:         * <p>
0064:         * Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token UngetToken(fp) provides one
0065:         * level undo The tags include an attribute list: - linked list of attribute/value nodes - each node has 2
0066:         * null-terminated strings. - entities are replaced in attribute values white space is compacted if not in preformatted
0067:         * mode If not in preformatted mode then leading white space is discarded and subsequent white space sequences compacted
0068:         * to single space chars. If XmlTags is no then Tag names are folded to upper case and attribute names to lower case.
0069:         * Not yet done: - Doctype subset and marked sections
0070:         * </p>
0071:         * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
0072:         * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
0073:         * @author Fabrizio Giustina
0074:         * @version $Revision: 1.93 $ ($Author: fgiust $)
0075:         */
0076:        public class Lexer {
0077:
0078:            /**
0079:             * state: ignore whitespace.
0080:             */
0081:            public static final short IGNORE_WHITESPACE = 0;
0082:
0083:            /**
0084:             * state: mixed content.
0085:             */
0086:            public static final short MIXED_CONTENT = 1;
0087:
0088:            /**
0089:             * state: preformatted.
0090:             */
0091:            public static final short PREFORMATTED = 2;
0092:
0093:            /**
0094:             * state: ignore markup.
0095:             */
0096:            public static final short IGNORE_MARKUP = 3;
0097:
0098:            /**
0099:             * URI for XHTML 1.0 transitional DTD.
0100:             */
0101:            private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
0102:
0103:            /**
0104:             * URI for XHTML 1.0 strict DTD.
0105:             */
0106:            private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
0107:
0108:            /**
0109:             * URI for XHTML 1.0 frameset DTD.
0110:             */
0111:            private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
0112:
0113:            /**
0114:             * URI for XHTML 1.1.
0115:             */
0116:            private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
0117:
0118:            /**
0119:             * URI for XHTML Basic 1.0.
0120:             */
0121:            // private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";
0122:            /**
0123:             * xhtml namespace.
0124:             */
0125:            private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
0126:
0127:            /**
0128:             * lists all the known versions.
0129:             */
0130:            private static final Lexer.W3CVersionInfo[] W3CVERSION = {
0131:                    new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict",
0132:                            VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
0133:                    new W3CVersionInfo("HTML 4.01 Transitional",
0134:                            "XHTML 1.0 Transitional", VOYAGER_LOOSE,
0135:                            Dict.VERS_HTML40_LOOSE),
0136:                    new W3CVersionInfo("HTML 4.01 Frameset",
0137:                            "XHTML 1.0 Frameset", VOYAGER_FRAMESET,
0138:                            Dict.VERS_FRAMESET),
0139:                    new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict",
0140:                            VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
0141:                    new W3CVersionInfo("HTML 4.0 Transitional",
0142:                            "XHTML 1.0 Transitional", VOYAGER_LOOSE,
0143:                            Dict.VERS_HTML40_LOOSE),
0144:                    new W3CVersionInfo("HTML 4.0 Frameset",
0145:                            "XHTML 1.0 Frameset", VOYAGER_FRAMESET,
0146:                            Dict.VERS_FRAMESET),
0147:                    new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional",
0148:                            VOYAGER_LOOSE, Dict.VERS_HTML32),
0149:                    new W3CVersionInfo("HTML 3.2 Final",
0150:                            "XHTML 1.0 Transitional", VOYAGER_LOOSE,
0151:                            Dict.VERS_HTML32),
0152:                    new W3CVersionInfo("HTML 3.2 Draft",
0153:                            "XHTML 1.0 Transitional", VOYAGER_LOOSE,
0154:                            Dict.VERS_HTML32),
0155:                    new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict",
0156:                            VOYAGER_STRICT, Dict.VERS_HTML20),
0157:                    new W3CVersionInfo("HTML 4.01", "XHTML 1.1",
0158:                            VOYAGER_STRICT, Dict.VERS_XHTML11) };
0159:
0160:            /**
0161:             * getToken state: content.
0162:             */
0163:            private static final short LEX_CONTENT = 0;
0164:
0165:            /**
0166:             * getToken state: gt.
0167:             */
0168:            private static final short LEX_GT = 1;
0169:
0170:            /**
0171:             * getToken state: endtag.
0172:             */
0173:            private static final short LEX_ENDTAG = 2;
0174:
0175:            /**
0176:             * getToken state: start tag.
0177:             */
0178:            private static final short LEX_STARTTAG = 3;
0179:
0180:            /**
0181:             * getToken state: comment.
0182:             */
0183:            private static final short LEX_COMMENT = 4;
0184:
0185:            /**
0186:             * getToken state: doctype.
0187:             */
0188:            private static final short LEX_DOCTYPE = 5;
0189:
0190:            /**
0191:             * getToken state: procinstr.
0192:             */
0193:            private static final short LEX_PROCINSTR = 6;
0194:
0195:            /**
0196:             * getToken state: cdata.
0197:             */
0198:            private static final short LEX_CDATA = 8;
0199:
0200:            /**
0201:             * getToken state: section.
0202:             */
0203:            private static final short LEX_SECTION = 9;
0204:
0205:            /**
0206:             * getToken state: asp.
0207:             */
0208:            private static final short LEX_ASP = 10;
0209:
0210:            /**
0211:             * getToken state: jste.
0212:             */
0213:            private static final short LEX_JSTE = 11;
0214:
0215:            /**
0216:             * getToken state: php.
0217:             */
0218:            private static final short LEX_PHP = 12;
0219:
0220:            /**
0221:             * getToken state: xml declaration.
0222:             */
0223:            private static final short LEX_XMLDECL = 13;
0224:
0225:            /**
0226:             * file stream.
0227:             */
0228:            protected StreamIn in;
0229:
0230:            /**
0231:             * error output stream.
0232:             */
0233:            protected PrintWriter errout;
0234:
0235:            /**
0236:             * for accessibility errors.
0237:             */
0238:            protected short badAccess;
0239:
0240:            /**
0241:             * for bad style errors.
0242:             */
0243:            protected short badLayout;
0244:
0245:            /**
0246:             * for bad char encodings.
0247:             */
0248:            protected short badChars;
0249:
0250:            /**
0251:             * for mismatched/mispositioned form tags.
0252:             */
0253:            protected short badForm;
0254:
0255:            /**
0256:             * count of warnings in this document.
0257:             */
0258:            protected short warnings;
0259:
0260:            /**
0261:             * count of errors.
0262:             */
0263:            protected short errors;
0264:
0265:            /**
0266:             * lines seen.
0267:             */
0268:            protected int lines;
0269:
0270:            /**
0271:             * at start of current token.
0272:             */
0273:            protected int columns;
0274:
0275:            /**
0276:             * used to collapse contiguous white space.
0277:             */
0278:            protected boolean waswhite;
0279:
0280:            /**
0281:             * true after token has been pushed back.
0282:             */
0283:            protected boolean pushed;
0284:
0285:            /**
0286:             * when space is moved after end tag.
0287:             */
0288:            protected boolean insertspace;
0289:
0290:            /**
0291:             * Netscape compatibility.
0292:             */
0293:            protected boolean excludeBlocks;
0294:
0295:            /**
0296:             * true if moved out of table.
0297:             */
0298:            protected boolean exiled;
0299:
0300:            /**
0301:             * true if xmlns attribute on html element.
0302:             */
0303:            protected boolean isvoyager;
0304:
0305:            /**
0306:             * bit vector of HTML versions.
0307:             */
0308:            protected short versions;
0309:
0310:            /**
0311:             * version as given by doctype (if any).
0312:             */
0313:            protected int doctype;
0314:
0315:            /**
0316:             * set if html or PUBLIC is missing.
0317:             */
0318:            protected boolean badDoctype;
0319:
0320:            /**
0321:             * start of current node.
0322:             */
0323:            protected int txtstart;
0324:
0325:            /**
0326:             * end of current node.
0327:             */
0328:            protected int txtend;
0329:
0330:            /**
0331:             * state of lexer's finite state machine.
0332:             */
0333:            protected short state;
0334:
0335:            /**
0336:             * current node.
0337:             */
0338:            protected Node token;
0339:
0340:            /**
0341:             * Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of
0342:             * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars.
0343:             */
0344:            protected byte[] lexbuf;
0345:
0346:            /**
0347:             * allocated.
0348:             */
0349:            protected int lexlength;
0350:
0351:            /**
0352:             * used.
0353:             */
0354:            protected int lexsize;
0355:
0356:            /**
0357:             * Inline stack for compatibility with Mosaic. For deferring text node.
0358:             */
0359:            protected Node inode;
0360:
0361:            /**
0362:             * for inferring inline tags.
0363:             */
0364:            protected int insert;
0365:
0366:            /**
0367:             * stack.
0368:             */
0369:            protected Stack istack;
0370:
0371:            /**
0372:             * start of frame.
0373:             */
0374:            protected int istackbase;
0375:
0376:            /**
0377:             * used for cleaning up presentation markup.
0378:             */
0379:            protected Style styles;
0380:
0381:            /**
0382:             * configuration.
0383:             */
0384:            protected Configuration configuration;
0385:
0386:            /**
0387:             * already seen end body tag?
0388:             */
0389:            protected boolean seenEndBody;
0390:
0391:            /**
0392:             * already seen end html tag?
0393:             */
0394:            protected boolean seenEndHtml;
0395:
0396:            /**
0397:             * report.
0398:             */
0399:            protected Report report;
0400:
0401:            /**
0402:             * Root node is saved here.
0403:             */
0404:            protected Node root;
0405:
0406:            /**
0407:             * node list.
0408:             */
0409:            private List nodeList;
0410:
0411:            /**
0412:             * Instantiates a new Lexer.
0413:             * @param in StreamIn
0414:             * @param configuration configuation instance
0415:             * @param report report instance, for reporting errors
0416:             */
0417:            public Lexer(StreamIn in, Configuration configuration, Report report) {
0418:                this .report = report;
0419:                this .in = in;
0420:                this .lines = 1;
0421:                this .columns = 1;
0422:                this .state = LEX_CONTENT;
0423:                this .versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);
0424:                this .doctype = Dict.VERS_UNKNOWN;
0425:                this .insert = -1;
0426:                this .istack = new Stack();
0427:                this .configuration = configuration;
0428:                this .nodeList = new Vector();
0429:            }
0430:
0431:            /**
0432:             * Creates a new node and add it to nodelist.
0433:             * @return Node
0434:             */
0435:            public Node newNode() {
0436:                Node node = new Node();
0437:                this .nodeList.add(node);
0438:                return node;
0439:            }
0440:
0441:            /**
0442:             * Creates a new node and add it to nodelist.
0443:             * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
0444:             * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
0445:             * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
0446:             * @param textarray array of bytes contained in the Node
0447:             * @param start start position
0448:             * @param end end position
0449:             * @return Node
0450:             */
0451:            public Node newNode(short type, byte[] textarray, int start, int end) {
0452:                Node node = new Node(type, textarray, start, end);
0453:                this .nodeList.add(node);
0454:                return node;
0455:            }
0456:
0457:            /**
0458:             * Creates a new node and add it to nodelist.
0459:             * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
0460:             * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
0461:             * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
0462:             * @param textarray array of bytes contained in the Node
0463:             * @param start start position
0464:             * @param end end position
0465:             * @param element tag name
0466:             * @return Node
0467:             */
0468:            public Node newNode(short type, byte[] textarray, int start,
0469:                    int end, String element) {
0470:                Node node = new Node(type, textarray, start, end, element,
0471:                        this .configuration.tt);
0472:                this .nodeList.add(node);
0473:                return node;
0474:            }
0475:
0476:            /**
0477:             * Clones a node and add it to node list.
0478:             * @param node Node
0479:             * @return cloned Node
0480:             */
0481:            public Node cloneNode(Node node) {
0482:                Node cnode = (Node) node.clone();
0483:                this .nodeList.add(cnode);
0484:                for (AttVal att = cnode.attributes; att != null; att = att.next) {
0485:                    if (att.asp != null) {
0486:                        this .nodeList.add(att.asp);
0487:                    }
0488:                    if (att.php != null) {
0489:                        this .nodeList.add(att.php);
0490:                    }
0491:                }
0492:                return cnode;
0493:            }
0494:
0495:            /**
0496:             * Clones an attribute value and add eventual asp or php node to node list.
0497:             * @param attrs original AttVal
0498:             * @return cloned AttVal
0499:             */
0500:            public AttVal cloneAttributes(AttVal attrs) {
0501:                AttVal cattrs = (AttVal) attrs.clone();
0502:                for (AttVal att = cattrs; att != null; att = att.next) {
0503:                    if (att.asp != null) {
0504:                        this .nodeList.add(att.asp);
0505:                    }
0506:                    if (att.php != null) {
0507:                        this .nodeList.add(att.php);
0508:                    }
0509:                }
0510:                return cattrs;
0511:            }
0512:
0513:            /**
0514:             * Update <code>oldtextarray</code> in the current nodes.
0515:             * @param oldtextarray previous text array
0516:             * @param newtextarray new text array
0517:             */
0518:            protected void updateNodeTextArrays(byte[] oldtextarray,
0519:                    byte[] newtextarray) {
0520:                Node node;
0521:                for (int i = 0; i < this .nodeList.size(); i++) {
0522:                    node = (Node) (this .nodeList.get(i));
0523:                    if (node.textarray == oldtextarray) {
0524:                        node.textarray = newtextarray;
0525:                    }
0526:                }
0527:            }
0528:
0529:            /**
0530:             * Adds a new line node. Used for creating preformatted text from Word2000.
0531:             * @return new line node
0532:             */
0533:            public Node newLineNode() {
0534:                Node node = newNode();
0535:
0536:                node.textarray = this .lexbuf;
0537:                node.start = this .lexsize;
0538:                addCharToLexer('\n');
0539:                node.end = this .lexsize;
0540:                return node;
0541:            }
0542:
0543:            /**
0544:             * Has end of input stream been reached?
0545:             * @return <code>true</code> if end of input stream been reached
0546:             */
0547:            public boolean endOfInput() {
0548:                return this .in.isEndOfStream();
0549:            }
0550:
0551:            /**
0552:             * Adds a byte to lexer buffer.
0553:             * @param c byte to add
0554:             */
0555:            public void addByte(int c) {
0556:                if (this .lexsize + 1 >= this .lexlength) {
0557:                    while (this .lexsize + 1 >= this .lexlength) {
0558:                        if (this .lexlength == 0) {
0559:                            this .lexlength = 8192;
0560:                        } else {
0561:                            this .lexlength = this .lexlength * 2;
0562:                        }
0563:                    }
0564:
0565:                    byte[] temp = this .lexbuf;
0566:                    this .lexbuf = new byte[this .lexlength];
0567:                    if (temp != null) {
0568:                        System.arraycopy(temp, 0, this .lexbuf, 0, temp.length);
0569:                        updateNodeTextArrays(temp, this .lexbuf);
0570:                    }
0571:                }
0572:
0573:                this .lexbuf[this .lexsize++] = (byte) c;
0574:                this .lexbuf[this .lexsize] = (byte) '\0'; // debug
0575:            }
0576:
0577:            /**
0578:             * Substitute the last char in buffer.
0579:             * @param c new char
0580:             */
0581:            public void changeChar(byte c) {
0582:                if (this .lexsize > 0) {
0583:                    this .lexbuf[this .lexsize - 1] = c;
0584:                }
0585:            }
0586:
0587:            /**
0588:             * Store char c as UTF-8 encoded byte stream.
0589:             * @param c char to store
0590:             */
0591:            public void addCharToLexer(int c) {
0592:                // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
0593:                // Fix by Pablo Mayrgundter 17-08-2004
0594:
0595:                if ((this .configuration.xmlOut || this .configuration.xHTML) // only for xml output
0596:                        && !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first.
0597:                                || c == 0x9 || c == 0xA || c == 0xD // Then white-space.
0598:                                || (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode.
0599:                        || (c >= 0x10000 && c <= 0x10FFFF))) {
0600:                    return;
0601:                }
0602:
0603:                int i = 0;
0604:                int[] count = new int[] { 0 };
0605:                byte[] buf = new byte[10]; // unsigned char
0606:
0607:                boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null,
0608:                        count);
0609:                if (err) {
0610:                    // replacement char 0xFFFD encoded as UTF-8
0611:                    buf[0] = (byte) 0xEF;
0612:                    buf[1] = (byte) 0xBF;
0613:                    buf[2] = (byte) 0xBD;
0614:                    count[0] = 3;
0615:                }
0616:
0617:                for (i = 0; i < count[0]; i++) {
0618:                    addByte(buf[i]); // uint
0619:                }
0620:
0621:            }
0622:
0623:            /**
0624:             * Adds a string to lexer buffer.
0625:             * @param str String to add
0626:             */
0627:            public void addStringToLexer(String str) {
0628:                for (int i = 0; i < str.length(); i++) {
0629:                    addCharToLexer(str.charAt(i));
0630:                }
0631:            }
0632:
0633:            /**
0634:             * Parse an html entity.
0635:             * @param mode mode
0636:             */
0637:            public void parseEntity(short mode) {
0638:                // No longer attempts to insert missing ';' for unknown
0639:                // entities unless one was present already, since this
0640:                // gives unexpected results.
0641:                // 
0642:                // For example: <a href="something.htm?foo&bar&fred">
0643:                // was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
0644:                // rather than: <a href="something.htm?foo&amp;bar&amp;fred">
0645:                // 
0646:                // My thanks for Maurice Buxton for spotting this.
0647:                // 
0648:                // Also Randy Waki pointed out the following case for the
0649:                // 04 Aug 00 version (bug #433012):
0650:                // 
0651:                // For example: <a href="something.htm?id=1&lang=en">
0652:                // was tidied to: <a href="something.htm?id=1&lang;=en">
0653:                // rather than: <a href="something.htm?id=1&amp;lang=en">
0654:                //
0655:                // where "lang" is a known entity (#9001), but browsers would
0656:                // misinterpret "&lang;" because it had a value > 256.
0657:                //
0658:                // So the case of an apparently known entity with a value > 256 and
0659:                // missing a semicolon is handled specially.
0660:                //
0661:                // "ParseEntity" is also a bit of a misnomer - it handles entities and
0662:                // numeric character references. Invalid NCR's are now reported.
0663:
0664:                int start;
0665:                boolean first = true;
0666:                boolean semicolon = false;
0667:                int c, ch, startcol;
0668:                String str;
0669:
0670:                start = this .lexsize - 1; // to start at "&"
0671:                startcol = this .in.getCurcol() - 1;
0672:
0673:                while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
0674:                    if (c == ';') {
0675:                        semicolon = true;
0676:                        break;
0677:                    }
0678:
0679:                    if (first && c == '#') {
0680:                        // #431953 - start RJ
0681:                        if (!this .configuration.ncr
0682:                                || this .configuration.getInCharEncoding() == Configuration.BIG5
0683:                                || this .configuration.getInCharEncoding() == Configuration.SHIFTJIS) {
0684:                            this .in.ungetChar(c);
0685:                            return;
0686:                        }
0687:                        // #431953 - end RJ
0688:
0689:                        addCharToLexer(c);
0690:                        first = false;
0691:                        continue;
0692:                    }
0693:
0694:                    first = false;
0695:
0696:                    if (TidyUtils.isNamechar((char) c)) {
0697:                        addCharToLexer(c);
0698:                        continue;
0699:                    }
0700:
0701:                    // otherwise put it back
0702:                    this .in.ungetChar(c);
0703:                    break;
0704:                }
0705:
0706:                str = TidyUtils.getString(this .lexbuf, start, this .lexsize
0707:                        - start);
0708:
0709:                if ("&apos".equals(str) && !configuration.xmlOut
0710:                        && !this .isvoyager && !configuration.xHTML) {
0711:                    report.entityError(this , Report.APOS_UNDEFINED, str, 39);
0712:                }
0713:
0714:                ch = EntityTable.getDefaultEntityTable().entityCode(str);
0715:
0716:                // drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
0717:                // if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
0718:                // && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
0719:                // || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
0720:                // || (ch >= 0xE000 && ch <= 0xFFFD)))
0721:                // {
0722:                // this.lexsize = start;
0723:                // return;
0724:                // }
0725:
0726:                // deal with unrecognized or invalid entities
0727:                // #433012 - fix by Randy Waki 17 Feb 01
0728:                // report invalid NCR's - Terry Teague 01 Sep 01
0729:                if (ch <= 0 || (ch >= 256 && c != ';')) {
0730:                    // set error position just before offending character
0731:                    this .lines = this .in.getCurline();
0732:                    this .columns = startcol;
0733:
0734:                    if (this .lexsize > start + 1) {
0735:                        if (ch >= 128 && ch <= 159) {
0736:                            // invalid numeric character reference
0737:                            int c1 = 0;
0738:
0739:                            if (configuration.replacementCharEncoding == Configuration.WIN1252) {
0740:                                c1 = EncodingUtils.decodeWin1252(ch);
0741:                            } else if (configuration.replacementCharEncoding == Configuration.MACROMAN) {
0742:                                c1 = EncodingUtils.decodeMacRoman(ch);
0743:                            }
0744:
0745:                            // "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
0746:
0747:                            int replaceMode = c1 != 0 ? Report.REPLACED_CHAR
0748:                                    : Report.DISCARDED_CHAR;
0749:
0750:                            if (c != ';') /* issue warning if not terminated by ';' */
0751:                            {
0752:                                report.entityError(this ,
0753:                                        Report.MISSING_SEMICOLON_NCR, str, c);
0754:                            }
0755:
0756:                            report.encodingError(this ,
0757:                                    (short) (Report.INVALID_NCR | replaceMode),
0758:                                    ch);
0759:
0760:                            if (c1 != 0) {
0761:                                // make the replacement
0762:                                this .lexsize = start;
0763:                                addCharToLexer(c1);
0764:                                semicolon = false;
0765:                            } else {
0766:                                /* discard */
0767:                                this .lexsize = start;
0768:                                semicolon = false;
0769:                            }
0770:
0771:                        } else {
0772:                            report.entityError(this , Report.UNKNOWN_ENTITY,
0773:                                    str, ch);
0774:                        }
0775:
0776:                        if (semicolon) {
0777:                            addCharToLexer(';');
0778:                        }
0779:                    } else {
0780:                        // naked &
0781:                        report.entityError(this , Report.UNESCAPED_AMPERSAND,
0782:                                str, ch);
0783:                    }
0784:                } else {
0785:                    // issue warning if not terminated by ';'
0786:                    if (c != ';') {
0787:                        // set error position just before offending character
0788:                        this .lines = this .in.getCurline();
0789:                        this .columns = startcol;
0790:                        report.entityError(this , Report.MISSING_SEMICOLON, str,
0791:                                c);
0792:                    }
0793:
0794:                    this .lexsize = start;
0795:
0796:                    if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED)) {
0797:                        ch = ' ';
0798:                    }
0799:
0800:                    addCharToLexer(ch);
0801:
0802:                    if (ch == '&' && !this .configuration.quoteAmpersand) {
0803:                        addCharToLexer('a');
0804:                        addCharToLexer('m');
0805:                        addCharToLexer('p');
0806:                        addCharToLexer(';');
0807:                    }
0808:                }
0809:            }
0810:
0811:            /**
0812:             * Parses a tag name.
0813:             * @return first char after the tag name
0814:             */
0815:            public char parseTagName() {
0816:                int c;
0817:
0818:                // fold case of first char in buffer
0819:                c = this .lexbuf[this .txtstart];
0820:
0821:                if (!this .configuration.xmlTags && TidyUtils.isUpper((char) c)) {
0822:                    c = TidyUtils.toLower((char) c);
0823:                    this .lexbuf[this .txtstart] = (byte) c;
0824:                }
0825:
0826:                while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
0827:                    if (!TidyUtils.isNamechar((char) c)) {
0828:                        break;
0829:                    }
0830:
0831:                    // fold case of subsequent chars
0832:                    if (!this .configuration.xmlTags
0833:                            && TidyUtils.isUpper((char) c)) {
0834:                        c = TidyUtils.toLower((char) c);
0835:                    }
0836:
0837:                    addCharToLexer(c);
0838:                }
0839:
0840:                this .txtend = this .lexsize;
0841:                return (char) c;
0842:            }
0843:
0844:            /**
0845:             * calls addCharToLexer for any char in the string.
0846:             * @param str input String
0847:             */
0848:            public void addStringLiteral(String str) {
0849:                int len = str.length();
0850:                for (int i = 0; i < len; i++) {
0851:                    addCharToLexer(str.charAt(i));
0852:                }
0853:            }
0854:
0855:            /**
0856:             * calls addCharToLexer for any char in the string till len is reached.
0857:             * @param str input String
0858:             * @param len length of the substring to be added
0859:             */
0860:            void addStringLiteralLen(String str, int len) {
0861:                int strlen = str.length();
0862:                if (strlen < len) {
0863:                    len = strlen;
0864:                }
0865:                for (int i = 0; i < len; i++) {
0866:                    addCharToLexer(str.charAt(i));
0867:                }
0868:            }
0869:
0870:            /**
0871:             * Choose what version to use for new doctype.
0872:             * @return html version constant
0873:             */
0874:            public short htmlVersion() {
0875:                if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20)) {
0876:                    return Dict.VERS_HTML20;
0877:                }
0878:
0879:                if (!(this .configuration.xmlOut | this .configuration.xmlTags | this .isvoyager)
0880:                        && TidyUtils.toBoolean(versions & Dict.VERS_HTML32)) {
0881:                    return Dict.VERS_HTML32;
0882:                }
0883:                if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11)) {
0884:                    return Dict.VERS_XHTML11;
0885:                }
0886:                if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT)) {
0887:                    return Dict.VERS_HTML40_STRICT;
0888:                }
0889:
0890:                if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE)) {
0891:                    return Dict.VERS_HTML40_LOOSE;
0892:                }
0893:
0894:                if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET)) {
0895:                    return Dict.VERS_FRAMESET;
0896:                }
0897:
0898:                return Dict.VERS_UNKNOWN;
0899:            }
0900:
0901:            /**
0902:             * Choose what version to use for new doctype.
0903:             * @return html version name
0904:             */
0905:            public String htmlVersionName() {
0906:                short guessed;
0907:                int j;
0908:
0909:                guessed = apparentVersion();
0910:
0911:                for (j = 0; j < W3CVERSION.length; ++j) {
0912:                    if (guessed == W3CVERSION[j].code) {
0913:                        if (this .isvoyager) {
0914:                            return W3CVERSION[j].voyagerName;
0915:                        }
0916:
0917:                        return W3CVERSION[j].name;
0918:                    }
0919:                }
0920:
0921:                return null;
0922:            }
0923:
0924:            /**
0925:             * Add meta element for Tidy. If the meta tag is already present, update release date.
0926:             * @param root root node
0927:             * @return <code>true</code> if the tag has been added
0928:             */
0929:            public boolean addGenerator(Node root) {
0930:                AttVal attval;
0931:                Node node;
0932:                Node head = root.findHEAD(this .configuration.tt);
0933:
0934:                if (head != null) {
0935:                    String meta = "HTML Tidy for Java (vers. "
0936:                            + Report.RELEASE_DATE_STRING + "), see www.w3.org";
0937:
0938:                    for (node = head.content; node != null; node = node.next) {
0939:                        if (node.tag == this .configuration.tt.tagMeta) {
0940:                            attval = node.getAttrByName("name");
0941:
0942:                            if (attval != null
0943:                                    && attval.value != null
0944:                                    && "generator"
0945:                                            .equalsIgnoreCase(attval.value)) {
0946:                                attval = node.getAttrByName("content");
0947:
0948:                                if (attval != null
0949:                                        && attval.value != null
0950:                                        && attval.value.length() >= 9
0951:                                        && "HTML Tidy"
0952:                                                .equalsIgnoreCase(attval.value
0953:                                                        .substring(0, 9))) {
0954:                                    attval.value = meta;
0955:                                    return false;
0956:                                }
0957:                            }
0958:                        }
0959:                    }
0960:
0961:                    node = this .inferredTag("meta");
0962:                    node.addAttribute("content", meta);
0963:                    node.addAttribute("name", "generator");
0964:                    head.insertNodeAtStart(node);
0965:                    return true;
0966:                }
0967:
0968:                return false;
0969:            }
0970:
0971:            /**
0972:             * Check system keywords (keywords should be uppercase).
0973:             * @param doctype doctype node
0974:             * @return true if doctype keywords are all uppercase
0975:             */
0976:            public boolean checkDocTypeKeyWords(Node doctype) {
0977:                int len = doctype.end - doctype.start;
0978:                String s = TidyUtils.getString(this .lexbuf, doctype.start, len);
0979:
0980:                return !(TidyUtils.findBadSubString("SYSTEM", s, len)
0981:                        || TidyUtils.findBadSubString("PUBLIC", s, len)
0982:                        || TidyUtils.findBadSubString("//DTD", s, len)
0983:                        || TidyUtils.findBadSubString("//W3C", s, len) || TidyUtils
0984:                        .findBadSubString("//EN", s, len));
0985:            }
0986:
0987:            /**
0988:             * Examine DOCTYPE to identify version.
0989:             * @param doctype doctype node
0990:             * @return version code
0991:             */
0992:            public short findGivenVersion(Node doctype) {
0993:                String p, s;
0994:                int i, j;
0995:                int len;
0996:                String str1;
0997:                String str2;
0998:
0999:                // if root tag for doctype isn't html give up now
1000:                str1 = TidyUtils.getString(this .lexbuf, doctype.start, 5);
1001:                if (!"html ".equalsIgnoreCase(str1)) {
1002:                    return 0;
1003:                }
1004:
1005:                if (!checkDocTypeKeyWords(doctype)) {
1006:                    report.warning(this , doctype, null,
1007:                            Report.DTYPE_NOT_UPPER_CASE);
1008:                }
1009:
1010:                // give up if all we are given is the system id for the doctype
1011:                str1 = TidyUtils.getString(this .lexbuf, doctype.start + 5, 7);
1012:                if ("SYSTEM ".equalsIgnoreCase(str1)) {
1013:                    // but at least ensure the case is correct
1014:                    if (!str1.substring(0, 6).equals("SYSTEM")) {
1015:                        System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0,
1016:                                this .lexbuf, doctype.start + 5, 6);
1017:                    }
1018:                    return 0; // unrecognized
1019:                }
1020:
1021:                if ("PUBLIC ".equalsIgnoreCase(str1)) {
1022:                    if (!str1.substring(0, 6).equals("PUBLIC")) {
1023:                        System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0,
1024:                                this .lexbuf, doctype.start + 5, 6);
1025:                    }
1026:                } else {
1027:                    this .badDoctype = true;
1028:                }
1029:
1030:                for (i = doctype.start; i < doctype.end; ++i) {
1031:                    if (this .lexbuf[i] == (byte) '"') {
1032:                        str1 = TidyUtils.getString(this .lexbuf, i + 1, 12);
1033:                        str2 = TidyUtils.getString(this .lexbuf, i + 1, 13);
1034:                        if (str1.equals("-//W3C//DTD ")) {
1035:                            // compute length of identifier e.g. "HTML 4.0 Transitional"
1036:                            for (j = i + 13; j < doctype.end
1037:                                    && this .lexbuf[j] != (byte) '/'; ++j) {
1038:                                //
1039:                            }
1040:                            len = j - i - 13;
1041:                            p = TidyUtils.getString(this .lexbuf, i + 13, len);
1042:
1043:                            for (j = 1; j < W3CVERSION.length; ++j) {
1044:                                s = W3CVERSION[j].name;
1045:                                if (len == s.length() && s.equals(p)) {
1046:                                    return W3CVERSION[j].code;
1047:                                }
1048:                            }
1049:
1050:                            // else unrecognized version
1051:                        } else if (str2.equals("-//IETF//DTD ")) {
1052:                            // compute length of identifier e.g. "HTML 2.0"
1053:                            for (j = i + 14; j < doctype.end
1054:                                    && this .lexbuf[j] != (byte) '/'; ++j) {
1055:                                //
1056:                            }
1057:                            len = j - i - 14;
1058:
1059:                            p = TidyUtils.getString(this .lexbuf, i + 14, len);
1060:                            s = W3CVERSION[0].name;
1061:                            if (len == s.length() && s.equals(p)) {
1062:                                return W3CVERSION[0].code;
1063:                            }
1064:
1065:                            // else unrecognized version
1066:                        }
1067:                        break;
1068:                    }
1069:                }
1070:
1071:                return 0;
1072:            }
1073:
1074:            /**
1075:             * Fix xhtml namespace.
1076:             * @param root root Node
1077:             * @param profile current profile
1078:             */
1079:            public void fixHTMLNameSpace(Node root, String profile) {
1080:                Node node;
1081:                AttVal attr;
1082:
1083:                node = root.content;
1084:                while (node != null
1085:                        && node.tag != this .configuration.tt.tagHtml) {
1086:                    node = node.next;
1087:                }
1088:
1089:                if (node != null) {
1090:
1091:                    for (attr = node.attributes; attr != null; attr = attr.next) {
1092:                        if (attr.attribute.equals("xmlns")) {
1093:                            break;
1094:                        }
1095:
1096:                    }
1097:
1098:                    if (attr != null) {
1099:                        if (!attr.value.equals(profile)) {
1100:                            report.warning(this , node, null,
1101:                                    Report.INCONSISTENT_NAMESPACE);
1102:                            attr.value = profile;
1103:                        }
1104:                    } else {
1105:                        attr = new AttVal(node.attributes, null, '"', "xmlns",
1106:                                profile);
1107:                        attr.dict = AttributeTable.getDefaultAttributeTable()
1108:                                .findAttribute(attr);
1109:                        node.attributes = attr;
1110:                    }
1111:                }
1112:            }
1113:
1114:            /**
1115:             * Put DOCTYPE declaration between the &lt:?xml version "1.0" ... ?&gt; declaration, if any, and the
1116:             * <code>html</code> tag. Should also work for any comments, etc. that may precede the <code>html</code> tag.
1117:             * @param root root node
1118:             * @return new doctype node
1119:             */
1120:            Node newXhtmlDocTypeNode(Node root) {
1121:                Node html = root.findHTML(this .configuration.tt);
1122:                if (html == null) {
1123:                    return null;
1124:                }
1125:
1126:                Node newdoctype = newNode();
1127:                newdoctype.setType(Node.DOCTYPE_TAG);
1128:                newdoctype.next = html;
1129:                newdoctype.parent = root;
1130:                newdoctype.prev = null;
1131:
1132:                if (html == root.content) {
1133:                    // No <?xml ... ?> declaration.
1134:                    root.content.prev = newdoctype;
1135:                    root.content = newdoctype;
1136:                    newdoctype.prev = null;
1137:                } else {
1138:                    // we have an <?xml ... ?> declaration.
1139:                    newdoctype.prev = html.prev;
1140:                    newdoctype.prev.next = newdoctype;
1141:                }
1142:                html.prev = newdoctype;
1143:                return newdoctype;
1144:            }
1145:
1146:            /**
1147:             * Adds a new xhtml doctype to the document.
1148:             * @param root root node
1149:             * @return <code>true</code> if a doctype has been added
1150:             */
1151:            public boolean setXHTMLDocType(Node root) {
1152:                String fpi = " ";
1153:                String sysid = "";
1154:                String namespace = XHTML_NAMESPACE;
1155:                String dtdsub = null;
1156:                Node doctype;
1157:                int dtdlen = 0;
1158:
1159:                doctype = root.findDocType();
1160:
1161:                fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00
1162:
1163:                if (this .configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
1164:                    if (doctype != null) {
1165:                        Node.discardElement(doctype);
1166:                    }
1167:                    return true;
1168:                }
1169:
1170:                if (this .configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
1171:                    // see what flavor of XHTML this document matches
1172:                    if (TidyUtils.toBoolean(this .versions
1173:                            & Dict.VERS_HTML40_STRICT)) {
1174:                        // use XHTML strict
1175:                        fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1176:                        sysid = VOYAGER_STRICT;
1177:                    } else if (TidyUtils.toBoolean(this .versions
1178:                            & Dict.VERS_FRAMESET)) {
1179:                        // use XHTML frames
1180:                        fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
1181:                        sysid = VOYAGER_FRAMESET;
1182:                    } else if (TidyUtils.toBoolean(this .versions
1183:                            & Dict.VERS_LOOSE)) {
1184:                        fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1185:                        sysid = VOYAGER_LOOSE;
1186:                    } else if (TidyUtils.toBoolean(this .versions
1187:                            & Dict.VERS_XHTML11)) {
1188:                        // use XHTML 1.1
1189:                        fpi = "-//W3C//DTD XHTML 1.1//EN";
1190:                        sysid = VOYAGER_11;
1191:                    } else {
1192:                        // proprietary
1193:                        fpi = null;
1194:                        sysid = "";
1195:                        if (doctype != null)// #473490 - fix by Bj�rn H�hrmann 10 Oct 01
1196:                        {
1197:                            Node.discardElement(doctype);
1198:                        }
1199:                    }
1200:                } else if (this .configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
1201:                    fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1202:                    sysid = VOYAGER_STRICT;
1203:                } else if (this .configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
1204:                    fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1205:                    sysid = VOYAGER_LOOSE;
1206:                }
1207:
1208:                if (this .configuration.docTypeMode == Configuration.DOCTYPE_USER
1209:                        && this .configuration.docTypeStr != null) {
1210:                    fpi = this .configuration.docTypeStr;
1211:                    sysid = "";
1212:                }
1213:
1214:                if (fpi == null) {
1215:                    return false;
1216:                }
1217:
1218:                if (doctype != null) {
1219:                    // Look for internal DTD subset
1220:                    if (configuration.xHTML || configuration.xmlOut) {
1221:
1222:                        int len = doctype.end - doctype.start + 1;
1223:                        String start = TidyUtils.getString(this .lexbuf,
1224:                                doctype.start, len);
1225:
1226:                        int dtdbeg = start.indexOf('[');
1227:                        if (dtdbeg >= 0) {
1228:                            int dtdend = start.substring(dtdbeg).indexOf(']');
1229:                            if (dtdend >= 0) {
1230:                                dtdlen = dtdend + 1;
1231:                                dtdsub = start.substring(dtdbeg);
1232:                            }
1233:                        }
1234:                    }
1235:                } else {
1236:                    if ((doctype = newXhtmlDocTypeNode(root)) == null) {
1237:                        return false;
1238:                    }
1239:                }
1240:
1241:                this .txtstart = this .lexsize;
1242:                this .txtend = this .lexsize;
1243:
1244:                // add public identifier
1245:                addStringLiteral("html PUBLIC ");
1246:
1247:                // check if the fpi is quoted or not
1248:                if (fpi.charAt(0) == '"') {
1249:                    addStringLiteral(fpi);
1250:                } else {
1251:                    addStringLiteral("\"");
1252:                    addStringLiteral(fpi);
1253:                    addStringLiteral("\"");
1254:                }
1255:
1256:                if (this .configuration.wraplen != 0
1257:                        && sysid.length() + 6 >= this .configuration.wraplen) {
1258:                    addStringLiteral("\n\"");
1259:                } else {
1260:                    // FG: don't wrap
1261:                    addStringLiteral(" \"");
1262:                }
1263:
1264:                // add system identifier
1265:                addStringLiteral(sysid);
1266:                addStringLiteral("\"");
1267:
1268:                if (dtdlen > 0 && dtdsub != null) {
1269:                    addCharToLexer(' ');
1270:                    addStringLiteralLen(dtdsub, dtdlen);
1271:                }
1272:
1273:                this .txtend = this .lexsize;
1274:
1275:                int length = this .txtend - this .txtstart;
1276:                doctype.textarray = new byte[length];
1277:
1278:                System.arraycopy(this .lexbuf, this .txtstart, doctype.textarray,
1279:                        0, length);
1280:                doctype.start = 0;
1281:                doctype.end = length;
1282:
1283:                return false;
1284:            }
1285:
1286:            /**
1287:             * Return the html version used in document.
1288:             * @return version code
1289:             */
1290:            public short apparentVersion() {
1291:                switch (this .doctype) {
1292:                case Dict.VERS_UNKNOWN:
1293:                    return htmlVersion();
1294:
1295:                case Dict.VERS_HTML20:
1296:                    if (TidyUtils.toBoolean(this .versions & Dict.VERS_HTML20)) {
1297:                        return Dict.VERS_HTML20;
1298:                    }
1299:
1300:                    break;
1301:
1302:                case Dict.VERS_HTML32:
1303:                    if (TidyUtils.toBoolean(this .versions & Dict.VERS_HTML32)) {
1304:                        return Dict.VERS_HTML32;
1305:                    }
1306:
1307:                    break; // to replace old version by new
1308:
1309:                case Dict.VERS_HTML40_STRICT:
1310:                    if (TidyUtils.toBoolean(this .versions
1311:                            & Dict.VERS_HTML40_STRICT)) {
1312:                        return Dict.VERS_HTML40_STRICT;
1313:                    }
1314:
1315:                    break;
1316:
1317:                case Dict.VERS_HTML40_LOOSE:
1318:                    if (TidyUtils.toBoolean(this .versions
1319:                            & Dict.VERS_HTML40_LOOSE)) {
1320:                        return Dict.VERS_HTML40_LOOSE;
1321:                    }
1322:
1323:                    break; // to replace old version by new
1324:
1325:                case Dict.VERS_FRAMESET:
1326:                    if (TidyUtils.toBoolean(this .versions & Dict.VERS_FRAMESET)) {
1327:                        return Dict.VERS_FRAMESET;
1328:                    }
1329:
1330:                    break;
1331:
1332:                case Dict.VERS_XHTML11:
1333:                    if (TidyUtils.toBoolean(this .versions & Dict.VERS_XHTML11)) {
1334:                        return Dict.VERS_XHTML11;
1335:                    }
1336:
1337:                    break;
1338:                default:
1339:                    // should never reach here
1340:                    break;
1341:                }
1342:
1343:                // kludge to avoid error appearing at end of file
1344:                // it would be better to note the actual position
1345:                // when first encountering the doctype declaration
1346:
1347:                this .lines = 1;
1348:                this .columns = 1;
1349:
1350:                report.warning(this , null, null, Report.INCONSISTENT_VERSION);
1351:                return this .htmlVersion();
1352:            }
1353:
1354:            /**
1355:             * Fixup doctype if missing.
1356:             * @param root root node
1357:             * @return <code>false</code> if current version has not been identified
1358:             */
1359:            public boolean fixDocType(Node root) {
1360:                Node doctype;
1361:                int guessed = Dict.VERS_HTML40_STRICT, i;
1362:
1363:                if (this .badDoctype) {
1364:                    report.warning(this , null, null, Report.MALFORMED_DOCTYPE);
1365:                }
1366:
1367:                doctype = root.findDocType();
1368:
1369:                if (this .configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
1370:                    if (doctype != null) {
1371:                        Node.discardElement(doctype);
1372:                    }
1373:                    return true;
1374:                }
1375:
1376:                if (this .configuration.xmlOut) {
1377:                    return true;
1378:                }
1379:
1380:                if (this .configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
1381:                    Node.discardElement(doctype);
1382:                    doctype = null;
1383:                    guessed = Dict.VERS_HTML40_STRICT;
1384:                } else if (this .configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
1385:                    Node.discardElement(doctype);
1386:                    doctype = null;
1387:                    guessed = Dict.VERS_HTML40_LOOSE;
1388:                } else if (this .configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
1389:                    if (doctype != null) {
1390:                        if (this .doctype == Dict.VERS_UNKNOWN) {
1391:                            return false;
1392:                        }
1393:
1394:                        switch (this .doctype) {
1395:                        case Dict.VERS_UNKNOWN:
1396:                            return false;
1397:
1398:                        case Dict.VERS_HTML20:
1399:                            if (TidyUtils.toBoolean(this .versions
1400:                                    & Dict.VERS_HTML20)) {
1401:                                return true;
1402:                            }
1403:
1404:                            break; // to replace old version by new
1405:
1406:                        case Dict.VERS_HTML32:
1407:                            if (TidyUtils.toBoolean(this .versions
1408:                                    & Dict.VERS_HTML32)) {
1409:                                return true;
1410:                            }
1411:
1412:                            break; // to replace old version by new
1413:
1414:                        case Dict.VERS_HTML40_STRICT:
1415:                            if (TidyUtils.toBoolean(this .versions
1416:                                    & Dict.VERS_HTML40_STRICT)) {
1417:                                return true;
1418:                            }
1419:
1420:                            break; // to replace old version by new
1421:
1422:                        case Dict.VERS_HTML40_LOOSE:
1423:                            if (TidyUtils.toBoolean(this .versions
1424:                                    & Dict.VERS_HTML40_LOOSE)) {
1425:                                return true;
1426:                            }
1427:
1428:                            break; // to replace old version by new
1429:
1430:                        case Dict.VERS_FRAMESET:
1431:                            if (TidyUtils.toBoolean(this .versions
1432:                                    & Dict.VERS_FRAMESET)) {
1433:                                return true;
1434:                            }
1435:
1436:                            break; // to replace old version by new
1437:
1438:                        case Dict.VERS_XHTML11:
1439:                            if (TidyUtils.toBoolean(this .versions
1440:                                    & Dict.VERS_XHTML11)) {
1441:                                return true;
1442:                            }
1443:
1444:                            break; // to replace old version by new
1445:                        default:
1446:                            // should never reach here
1447:                            break;
1448:                        }
1449:
1450:                        // INCONSISTENT_VERSION warning is now issued by ApparentVersion()
1451:                    }
1452:
1453:                    // choose new doctype
1454:                    guessed = htmlVersion();
1455:                }
1456:
1457:                if (guessed == Dict.VERS_UNKNOWN) {
1458:                    return false;
1459:                }
1460:
1461:                // for XML use the Voyager system identifier
1462:                if (this .configuration.xmlOut || this .configuration.xmlTags
1463:                        || this .isvoyager) {
1464:                    if (doctype != null) {
1465:                        Node.discardElement(doctype);
1466:                    }
1467:
1468:                    fixHTMLNameSpace(root, XHTML_NAMESPACE);
1469:
1470:                    // Namespace is the same for all XHTML variants
1471:                    // Also, don't return yet. Still need to add DOCTYPE declaration.
1472:                    //
1473:                    // for (i = 0; i < W3CVersion.length; ++i)
1474:                    // {
1475:                    // if (guessed == W3CVersion[i].code)
1476:                    // {
1477:                    // fixHTMLNameSpace(root, W3CVersion[i].profile);
1478:                    // break;
1479:                    // }
1480:                    // }
1481:                    // return true;
1482:                }
1483:
1484:                if (doctype == null) {
1485:                    if ((doctype = newXhtmlDocTypeNode(root)) == null) {
1486:                        return false;
1487:                    }
1488:                }
1489:
1490:                this .txtstart = this .lexsize;
1491:                this .txtend = this .lexsize;
1492:
1493:                // use the appropriate public identifier
1494:                addStringLiteral("html PUBLIC ");
1495:
1496:                if (this .configuration.docTypeMode == Configuration.DOCTYPE_USER
1497:                        && this .configuration.docTypeStr != null
1498:                        && this .configuration.docTypeStr.length() > 0) {
1499:                    // check if the fpi is quoted or not
1500:                    if (this .configuration.docTypeStr.charAt(0) == '"') {
1501:                        addStringLiteral(this .configuration.docTypeStr);
1502:                    } else {
1503:                        addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1504:                        addStringLiteral(this .configuration.docTypeStr);
1505:                        addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1506:                    }
1507:                } else if (guessed == Dict.VERS_HTML20) {
1508:                    addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
1509:                } else {
1510:                    addStringLiteral("\"-//W3C//DTD ");
1511:
1512:                    for (i = 0; i < W3CVERSION.length; ++i) {
1513:                        if (guessed == W3CVERSION[i].code) {
1514:                            addStringLiteral(W3CVERSION[i].name);
1515:                            break;
1516:                        }
1517:                    }
1518:
1519:                    addStringLiteral("//EN\"");
1520:                }
1521:
1522:                this .txtend = this .lexsize;
1523:
1524:                int length = this .txtend - this .txtstart;
1525:                doctype.textarray = new byte[length];
1526:
1527:                System.arraycopy(this .lexbuf, this .txtstart, doctype.textarray,
1528:                        0, length);
1529:                doctype.start = 0;
1530:                doctype.end = length;
1531:
1532:                return true;
1533:            }
1534:
1535:            /**
1536:             * Ensure XML document starts with <code>&lt;?XML version="1.0"?&gt;</code>. Add encoding attribute if not using
1537:             * ASCII or UTF-8 output.
1538:             * @param root root node
1539:             * @return always true
1540:             */
1541:            public boolean fixXmlDecl(Node root) {
1542:                Node xml;
1543:                AttVal version;
1544:                AttVal encoding;
1545:
1546:                if (root.content != null && root.content.type == Node.XML_DECL) {
1547:                    xml = root.content;
1548:                } else {
1549:                    xml = newNode(Node.XML_DECL, this .lexbuf, 0, 0);
1550:                    xml.next = root.content;
1551:
1552:                    if (root.content != null) {
1553:                        root.content.prev = xml;
1554:                        xml.next = root.content;
1555:                    }
1556:
1557:                    root.content = xml;
1558:                }
1559:
1560:                version = xml.getAttrByName("version");
1561:                encoding = xml.getAttrByName("encoding");
1562:
1563:                // We need to insert a check if declared encoding and output encoding mismatch
1564:                // and fix the Xml declaration accordingly!!!
1565:                if (encoding == null
1566:                        && this .configuration.getOutCharEncoding() != Configuration.UTF8) {
1567:                    if (this .configuration.getOutCharEncoding() == Configuration.LATIN1) {
1568:                        xml.addAttribute("encoding", "iso-8859-1");
1569:                    }
1570:                    if (this .configuration.getOutCharEncoding() == Configuration.ISO2022) {
1571:                        xml.addAttribute("encoding", "iso-2022");
1572:                    }
1573:                }
1574:
1575:                if (version == null) {
1576:                    xml.addAttribute("version", "1.0");
1577:                }
1578:
1579:                return true;
1580:            }
1581:
1582:            /**
1583:             * Generates and inserts a new node.
1584:             * @param name tag name
1585:             * @return generated node
1586:             */
1587:            public Node inferredTag(String name) {
1588:                Node node;
1589:
1590:                node = newNode(Node.START_TAG, this .lexbuf, this .txtstart,
1591:                        this .txtend, name);
1592:                node.implicit = true;
1593:                return node;
1594:            }
1595:
1596:            /**
1597:             * Create a text node for the contents of a CDATA element like style or script which ends with &lt;/foo> for some
1598:             * foo.
1599:             * @param container container node
1600:             * @return cdata node
1601:             */
1602:            public Node getCDATA(Node container) {
1603:                int c, lastc, start, len, i;
1604:                int qt = 0;
1605:                int esc = 0;
1606:                String str;
1607:                boolean endtag = false;
1608:                boolean begtag = false;
1609:
1610:                if (container.isJavaScript()) {
1611:                    esc = '\\';
1612:                }
1613:
1614:                this .lines = this .in.getCurline();
1615:                this .columns = this .in.getCurcol();
1616:                this .waswhite = false;
1617:                this .txtstart = this .lexsize;
1618:                this .txtend = this .lexsize;
1619:
1620:                lastc = '\0';
1621:                start = -1;
1622:
1623:                while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
1624:                    // treat \r\n as \n and \r as \n
1625:                    if (qt > 0) {
1626:                        // #598860 script parsing fails with quote chars
1627:                        // A quoted string is ended by the quotation character, or end of line
1628:                        if ((c == '\r' || c == '\n' || c == qt)
1629:                                && (!TidyUtils.toBoolean(esc) || lastc != esc)) {
1630:                            qt = 0;
1631:                        } else if (c == '/' && lastc == '<') {
1632:                            start = this .lexsize + 1; // to first letter
1633:                        }
1634:
1635:                        else if (c == '>' && start >= 0) {
1636:                            len = this .lexsize - start;
1637:
1638:                            this .lines = this .in.getCurline();
1639:                            this .columns = this .in.getCurcol() - 3;
1640:
1641:                            report.warning(this , null, null,
1642:                                    Report.BAD_CDATA_CONTENT);
1643:
1644:                            // if javascript insert backslash before /
1645:                            if (TidyUtils.toBoolean(esc)) {
1646:                                for (i = this .lexsize; i > start - 1; --i) {
1647:                                    this .lexbuf[i] = this .lexbuf[i - 1];
1648:                                }
1649:
1650:                                this .lexbuf[start - 1] = (byte) esc;
1651:                                this .lexsize++;
1652:                            }
1653:
1654:                            start = -1;
1655:                        }
1656:                    } else if (TidyUtils.isQuote(c)
1657:                            && (!TidyUtils.toBoolean(esc) || lastc != esc)) {
1658:                        qt = c;
1659:                    } else if (c == '<') {
1660:                        start = this .lexsize + 1; // to first letter
1661:                        endtag = false;
1662:                        begtag = true;
1663:                    } else if (c == '!' && lastc == '<') // Cancel start tag
1664:                    {
1665:                        start = -1;
1666:                        endtag = false;
1667:                        begtag = false;
1668:                    } else if (c == '/' && lastc == '<') {
1669:                        start = this .lexsize + 1; // to first letter
1670:                        endtag = true;
1671:                        begtag = false;
1672:                    } else if (c == '>' && start >= 0) // End of begin or end tag
1673:                    {
1674:                        int decr = 2;
1675:
1676:                        if (endtag
1677:                                && ((len = this .lexsize - start) == container.element
1678:                                        .length())) {
1679:
1680:                            str = TidyUtils.getString(this .lexbuf, start, len);
1681:                            if (container.element.equalsIgnoreCase(str)) {
1682:                                this .txtend = start - decr;
1683:                                this .lexsize = start - decr; // #433857 - fix by Huajun Zeng 26 Apr 01
1684:                                break;
1685:                            }
1686:                        }
1687:
1688:                        // Unquoted markup will end SCRIPT or STYLE elements
1689:
1690:                        this .lines = this .in.getCurline();
1691:                        this .columns = this .in.getCurcol() - 3;
1692:
1693:                        report.warning(this , null, null,
1694:                                Report.BAD_CDATA_CONTENT);
1695:                        if (begtag) {
1696:                            decr = 1;
1697:                        }
1698:                        this .txtend = start - decr;
1699:                        this .lexsize = start - decr;
1700:                        break;
1701:                    }
1702:                    // #427844 - fix by Markus Hoenicka 21 Oct 00
1703:                    else if (c == '\r') {
1704:                        if (begtag || endtag) {
1705:                            continue; // discard whitespace in endtag
1706:                        }
1707:
1708:                        c = this .in.readChar();
1709:
1710:                        if (c != '\n') {
1711:                            this .in.ungetChar(c);
1712:                        }
1713:
1714:                        c = '\n';
1715:
1716:                    } else if ((c == '\n' || c == '\t' || c == ' ')
1717:                            && (begtag || endtag)) {
1718:                        continue; // discard whitespace in endtag
1719:                    }
1720:
1721:                    addCharToLexer(c);
1722:                    this .txtend = this .lexsize;
1723:                    lastc = c;
1724:                }
1725:
1726:                if (c == StreamIn.END_OF_STREAM) {
1727:                    report.warning(this , container, null,
1728:                            Report.MISSING_ENDTAG_FOR);
1729:                }
1730:
1731:                if (this .txtend > this .txtstart) {
1732:                    this .token = newNode(Node.TEXT_NODE, this .lexbuf,
1733:                            this .txtstart, this .txtend);
1734:                    return this .token;
1735:                }
1736:
1737:                return null;
1738:            }
1739:
1740:            /**
1741:             * 
1742:             *
1743:             */
1744:            public void ungetToken() {
1745:                this .pushed = true;
1746:            }
1747:
1748:            /**
1749:             * Gets a token.
1750:             * @param mode one of the following:
1751:             * <ul>
1752:             * <li><code>MixedContent</code>-- for elements which don't accept PCDATA</li>
1753:             * <li><code>Preformatted</code>-- white spacepreserved as is</li>
1754:             * <li><code>IgnoreMarkup</code>-- for CDATA elements such as script, style</li>
1755:             * </ul>
1756:             * @return next Node
1757:             */
1758:            public Node getToken(short mode) {
1759:                int c = 0;
1760:                int badcomment = 0;
1761:                // pass by reference
1762:                boolean[] isempty = new boolean[1];
1763:                boolean inDTDSubset = false;
1764:                AttVal attributes = null;
1765:
1766:                if (this .pushed) {
1767:                    // duplicate inlines in preference to pushed text nodes when appropriate
1768:                    if (this .token.type != Node.TEXT_NODE
1769:                            || (this .insert == -1 && this .inode == null)) {
1770:                        this .pushed = false;
1771:                        return this .token;
1772:                    }
1773:                }
1774:
1775:                // at start of block elements, unclosed inline
1776:                if (this .insert != -1 || this .inode != null) {
1777:                    return insertedToken();
1778:                }
1779:
1780:                this .lines = this .in.getCurline();
1781:                this .columns = this .in.getCurcol();
1782:                this .waswhite = false;
1783:
1784:                this .txtstart = this .lexsize;
1785:                this .txtend = this .lexsize;
1786:
1787:                while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
1788:                    // FG fix for [427846] different from tidy
1789:                    // if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1790:                    if (this .insertspace && mode != IGNORE_WHITESPACE) {
1791:                        addCharToLexer(' ');
1792:                    }
1793:                    if (this .insertspace
1794:                            && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE))) {
1795:                        this .waswhite = true;
1796:                        this .insertspace = false;
1797:                    }
1798:
1799:                    // treat \r\n as \n and \r as \n
1800:                    if (c == '\r') {
1801:                        c = this .in.readChar();
1802:
1803:                        if (c != '\n') {
1804:                            this .in.ungetChar(c);
1805:                        }
1806:
1807:                        c = '\n';
1808:                    }
1809:
1810:                    addCharToLexer(c);
1811:
1812:                    switch (this .state) {
1813:                    case LEX_CONTENT:
1814:                        // element content
1815:
1816:                        // Discard white space if appropriate.
1817:                        // Its cheaper to do this here rather than in parser methods for elements that
1818:                        // don't have mixed content.
1819:                        if (TidyUtils.isWhite((char) c)
1820:                                && (mode == IGNORE_WHITESPACE)
1821:                                && this .lexsize == this .txtstart + 1) {
1822:                            --this .lexsize;
1823:                            this .waswhite = false;
1824:                            this .lines = this .in.getCurline();
1825:                            this .columns = this .in.getCurcol();
1826:                            continue;
1827:                        }
1828:
1829:                        if (c == '<') {
1830:                            this .state = LEX_GT;
1831:                            continue;
1832:                        }
1833:
1834:                        if (TidyUtils.isWhite((char) c)) {
1835:                            // was previous char white?
1836:                            if (this .waswhite) {
1837:                                if (mode != PREFORMATTED
1838:                                        && mode != IGNORE_MARKUP) {
1839:                                    --this .lexsize;
1840:                                    this .lines = this .in.getCurline();
1841:                                    this .columns = this .in.getCurcol();
1842:                                }
1843:                            } else {
1844:                                // prev char wasn't white
1845:                                this .waswhite = true;
1846:
1847:                                if (mode != PREFORMATTED
1848:                                        && mode != IGNORE_MARKUP && c != ' ') {
1849:                                    changeChar((byte) ' ');
1850:                                }
1851:                            }
1852:
1853:                            continue;
1854:                        } else if (c == '&' && mode != IGNORE_MARKUP) {
1855:                            parseEntity(mode);
1856:                        }
1857:
1858:                        // this is needed to avoid trimming trailing whitespace
1859:                        if (mode == IGNORE_WHITESPACE) {
1860:                            mode = MIXED_CONTENT;
1861:                        }
1862:
1863:                        this .waswhite = false;
1864:                        continue;
1865:
1866:                    case LEX_GT:
1867:                        // <
1868:
1869:                        // check for endtag
1870:                        if (c == '/') {
1871:                            c = this .in.readChar();
1872:                            if (c == StreamIn.END_OF_STREAM) {
1873:                                this .in.ungetChar(c);
1874:                                continue;
1875:                            }
1876:
1877:                            addCharToLexer(c);
1878:
1879:                            if (TidyUtils.isLetter((char) c)) {
1880:                                this .lexsize -= 3;
1881:                                this .txtend = this .lexsize;
1882:                                this .in.ungetChar(c);
1883:                                this .state = LEX_ENDTAG;
1884:                                this .lexbuf[this .lexsize] = (byte) '\0'; // debug
1885:
1886:                                // changed from
1887:                                // this.in.curcol -= 2;
1888:                                this .columns -= 2;
1889:
1890:                                // if some text before the </ return it now
1891:                                if (this .txtend > this .txtstart) {
1892:                                    // trim space char before end tag
1893:                                    if (mode == IGNORE_WHITESPACE
1894:                                            && this .lexbuf[this .lexsize - 1] == (byte) ' ') {
1895:                                        this .lexsize -= 1;
1896:                                        this .txtend = this .lexsize;
1897:                                    }
1898:
1899:                                    this .token = newNode(Node.TEXT_NODE,
1900:                                            this .lexbuf, this .txtstart,
1901:                                            this .txtend);
1902:                                    return this .token;
1903:                                }
1904:
1905:                                continue; // no text so keep going
1906:                            }
1907:
1908:                            // otherwise treat as CDATA
1909:                            this .waswhite = false;
1910:                            this .state = LEX_CONTENT;
1911:                            continue;
1912:                        }
1913:
1914:                        if (mode == IGNORE_MARKUP) {
1915:                            // otherwise treat as CDATA
1916:                            this .waswhite = false;
1917:                            this .state = LEX_CONTENT;
1918:                            continue;
1919:                        }
1920:
1921:                        // look out for comments, doctype or marked sections this isn't quite right, but its getting there
1922:                        if (c == '!') {
1923:                            c = this .in.readChar();
1924:
1925:                            if (c == '-') {
1926:                                c = this .in.readChar();
1927:
1928:                                if (c == '-') {
1929:                                    this .state = LEX_COMMENT; // comment
1930:                                    this .lexsize -= 2;
1931:                                    this .txtend = this .lexsize;
1932:
1933:                                    // if some text before < return it now
1934:                                    if (this .txtend > this .txtstart) {
1935:                                        this .token = newNode(Node.TEXT_NODE,
1936:                                                this .lexbuf, this .txtstart,
1937:                                                this .txtend);
1938:                                        return this .token;
1939:                                    }
1940:
1941:                                    this .txtstart = this .lexsize;
1942:                                    continue;
1943:                                }
1944:
1945:                                report.warning(this , null, null,
1946:                                        Report.MALFORMED_COMMENT);
1947:                            } else if (c == 'd' || c == 'D') {
1948:                                this .state = LEX_DOCTYPE; // doctype
1949:                                this .lexsize -= 2;
1950:                                this .txtend = this .lexsize;
1951:                                mode = IGNORE_WHITESPACE;
1952:
1953:                                // skip until white space or '>'
1954:
1955:                                for (;;) {
1956:                                    c = this .in.readChar();
1957:
1958:                                    if (c == StreamIn.END_OF_STREAM || c == '>') {
1959:                                        this .in.ungetChar(c);
1960:                                        break;
1961:                                    }
1962:
1963:                                    if (!TidyUtils.isWhite((char) c)) {
1964:                                        continue;
1965:                                    }
1966:
1967:                                    // and skip to end of whitespace
1968:
1969:                                    for (;;) {
1970:                                        c = this .in.readChar();
1971:
1972:                                        if (c == StreamIn.END_OF_STREAM
1973:                                                || c == '>') {
1974:                                            this .in.ungetChar(c);
1975:                                            break;
1976:                                        }
1977:
1978:                                        if (TidyUtils.isWhite((char) c)) {
1979:                                            continue;
1980:                                        }
1981:
1982:                                        this .in.ungetChar(c);
1983:                                        break;
1984:                                    }
1985:
1986:                                    break;
1987:                                }
1988:
1989:                                // if some text before < return it now
1990:                                if (this .txtend > this .txtstart) {
1991:                                    this .token = newNode(Node.TEXT_NODE,
1992:                                            this .lexbuf, this .txtstart,
1993:                                            this .txtend);
1994:                                    return this .token;
1995:                                }
1996:
1997:                                this .txtstart = this .lexsize;
1998:                                continue;
1999:                            } else if (c == '[') {
2000:                                // Word 2000 embeds <![if ...]> ... <![endif]> sequences
2001:                                this .lexsize -= 2;
2002:                                this .state = LEX_SECTION;
2003:                                this .txtend = this .lexsize;
2004:
2005:                                // if some text before < return it now
2006:                                if (this .txtend > this .txtstart) {
2007:                                    this .token = newNode(Node.TEXT_NODE,
2008:                                            this .lexbuf, this .txtstart,
2009:                                            this .txtend);
2010:                                    return this .token;
2011:                                }
2012:
2013:                                this .txtstart = this .lexsize;
2014:                                continue;
2015:                            }
2016:
2017:                            // otherwise swallow chars up to and including next '>'
2018:                            while (true) {
2019:                                c = this .in.readChar();
2020:                                if (c == '>') {
2021:                                    break;
2022:                                }
2023:                                if (c == -1) {
2024:                                    this .in.ungetChar(c);
2025:                                    break;
2026:                                }
2027:                            }
2028:
2029:                            this .lexsize -= 2;
2030:                            this .lexbuf[this .lexsize] = (byte) '\0';
2031:                            this .state = LEX_CONTENT;
2032:                            continue;
2033:                        }
2034:
2035:                        // processing instructions
2036:
2037:                        if (c == '?') {
2038:                            this .lexsize -= 2;
2039:                            this .state = LEX_PROCINSTR;
2040:                            this .txtend = this .lexsize;
2041:
2042:                            // if some text before < return it now
2043:                            if (this .txtend > this .txtstart) {
2044:                                this .token = newNode(Node.TEXT_NODE,
2045:                                        this .lexbuf, this .txtstart, this .txtend);
2046:                                return this .token;
2047:                            }
2048:
2049:                            this .txtstart = this .lexsize;
2050:                            continue;
2051:                        }
2052:
2053:                        // Microsoft ASP's e.g. <% ... server-code ... %>
2054:                        if (c == '%') {
2055:                            this .lexsize -= 2;
2056:                            this .state = LEX_ASP;
2057:                            this .txtend = this .lexsize;
2058:
2059:                            // if some text before < return it now
2060:                            if (this .txtend > this .txtstart) {
2061:                                this .token = newNode(Node.TEXT_NODE,
2062:                                        this .lexbuf, this .txtstart, this .txtend);
2063:                                return this .token;
2064:                            }
2065:
2066:                            this .txtstart = this .lexsize;
2067:                            continue;
2068:                        }
2069:
2070:                        // Netscapes JSTE e.g. <# ... server-code ... #>
2071:                        if (c == '#') {
2072:                            this .lexsize -= 2;
2073:                            this .state = LEX_JSTE;
2074:                            this .txtend = this .lexsize;
2075:
2076:                            // if some text before < return it now
2077:                            if (this .txtend > this .txtstart) {
2078:                                this .token = newNode(Node.TEXT_NODE,
2079:                                        this .lexbuf, this .txtstart, this .txtend);
2080:                                return this .token;
2081:                            }
2082:
2083:                            this .txtstart = this .lexsize;
2084:                            continue;
2085:                        }
2086:
2087:                        // check for start tag
2088:                        if (TidyUtils.isLetter((char) c)) {
2089:                            this .in.ungetChar(c); // push back letter
2090:                            this .lexsize -= 2; // discard " <" + letter
2091:                            this .txtend = this .lexsize;
2092:                            this .state = LEX_STARTTAG; // ready to read tag name
2093:
2094:                            // if some text before < return it now
2095:                            if (this .txtend > this .txtstart) {
2096:                                this .token = newNode(Node.TEXT_NODE,
2097:                                        this .lexbuf, this .txtstart, this .txtend);
2098:                                return this .token;
2099:                            }
2100:
2101:                            continue; // no text so keep going
2102:                        }
2103:
2104:                        // otherwise treat as CDATA
2105:                        this .state = LEX_CONTENT;
2106:                        this .waswhite = false;
2107:                        continue;
2108:
2109:                    case LEX_ENDTAG:
2110:                        // </letter
2111:                        this .txtstart = this .lexsize - 1;
2112:
2113:                        // changed from
2114:                        // this.in.curcol -= 2;
2115:                        this .columns -= 2;
2116:
2117:                        c = parseTagName();
2118:                        this .token = newNode(
2119:                                Node.END_TAG, // create endtag token
2120:                                this .lexbuf, this .txtstart, this .txtend,
2121:                                TidyUtils.getString(this .lexbuf, this .txtstart,
2122:                                        this .txtend - this .txtstart));
2123:                        this .lexsize = this .txtstart;
2124:                        this .txtend = this .txtstart;
2125:
2126:                        // skip to '>'
2127:                        while (c != '>') {
2128:                            c = this .in.readChar();
2129:
2130:                            if (c == StreamIn.END_OF_STREAM) {
2131:                                break;
2132:                            }
2133:                        }
2134:
2135:                        if (c == StreamIn.END_OF_STREAM) {
2136:                            this .in.ungetChar(c);
2137:                            continue;
2138:                        }
2139:
2140:                        this .state = LEX_CONTENT;
2141:                        this .waswhite = false;
2142:                        return this .token; // the endtag token
2143:
2144:                    case LEX_STARTTAG:
2145:                        // first letter of tagname
2146:                        this .txtstart = this .lexsize - 1; // set txtstart to first letter
2147:                        c = parseTagName();
2148:                        isempty[0] = false;
2149:                        attributes = null;
2150:                        this .token = newNode((isempty[0] ? Node.START_END_TAG
2151:                                : Node.START_TAG), this .lexbuf, this .txtstart,
2152:                                this .txtend, TidyUtils.getString(this .lexbuf,
2153:                                        this .txtstart, this .txtend
2154:                                                - this .txtstart));
2155:
2156:                        // parse attributes, consuming closing ">"
2157:                        if (c != '>') {
2158:                            if (c == '/') {
2159:                                this .in.ungetChar(c);
2160:                            }
2161:
2162:                            attributes = parseAttrs(isempty);
2163:                        }
2164:
2165:                        if (isempty[0]) {
2166:                            this .token.type = Node.START_END_TAG;
2167:                        }
2168:
2169:                        this .token.attributes = attributes;
2170:                        this .lexsize = this .txtstart;
2171:                        this .txtend = this .txtstart;
2172:
2173:                        // swallow newline following start tag
2174:                        // special check needed for CRLF sequence
2175:                        // this doesn't apply to empty elements
2176:                        // nor to preformatted content that needs escaping
2177:
2178:                        if (
2179:
2180:                        (mode != PREFORMATTED || preContent(this .token))
2181:                                && (this .token.expectsContent() || this .token.tag == this .configuration.tt.tagBr)) {
2182:
2183:                            c = this .in.readChar();
2184:
2185:                            if (c == '\r') {
2186:                                c = this .in.readChar();
2187:
2188:                                if (c != '\n') {
2189:                                    this .in.ungetChar(c);
2190:                                }
2191:                            } else if (c != '\n' && c != '\f') {
2192:                                this .in.ungetChar(c);
2193:                            }
2194:
2195:                            this .waswhite = true; // to swallow leading whitespace
2196:                        } else {
2197:                            this .waswhite = false;
2198:                        }
2199:
2200:                        this .state = LEX_CONTENT;
2201:
2202:                        if (this .token.tag == null) {
2203:                            report.error(this , null, this .token,
2204:                                    Report.UNKNOWN_ELEMENT);
2205:                        } else if (!this .configuration.xmlTags) {
2206:                            constrainVersion(this .token.tag.versions);
2207:
2208:                            if (TidyUtils.toBoolean(this .token.tag.versions
2209:                                    & Dict.VERS_PROPRIETARY)) {
2210:                                // #427810 - fix by Gary Deschaines 24 May 00
2211:                                if (this .configuration.makeClean
2212:                                        && (this .token.tag != this .configuration.tt.tagNobr && //
2213:                                        this .token.tag != this .configuration.tt.tagWbr)) {
2214:                                    report.warning(this , null, this .token,
2215:                                            Report.PROPRIETARY_ELEMENT);
2216:                                }
2217:                                // #427810 - fix by Terry Teague 2 Jul 01
2218:                                else if (!this .configuration.makeClean) {
2219:                                    report.warning(this , null, this .token,
2220:                                            Report.PROPRIETARY_ELEMENT);
2221:                                }
2222:                            }
2223:
2224:                            if (this .token.tag.getChkattrs() != null) {
2225:                                this .token.tag.getChkattrs().check(this ,
2226:                                        this .token);
2227:                            } else {
2228:                                this .token.checkAttributes(this );
2229:                            }
2230:
2231:                            // should this be called before attribute checks?
2232:                            this .token.repairDuplicateAttributes(this );
2233:
2234:                        }
2235:
2236:                        return this .token; // return start tag
2237:
2238:                    case LEX_COMMENT:
2239:                        // seen <!-- so look for -->
2240:
2241:                        if (c != '-') {
2242:                            continue;
2243:                        }
2244:
2245:                        c = this .in.readChar();
2246:                        addCharToLexer(c);
2247:
2248:                        if (c != '-') {
2249:                            continue;
2250:                        }
2251:
2252:                        end_comment: while (true) {
2253:                            c = this .in.readChar();
2254:
2255:                            if (c == '>') {
2256:                                if (badcomment != 0) {
2257:                                    report.warning(this , null, null,
2258:                                            Report.MALFORMED_COMMENT);
2259:                                }
2260:
2261:                                this .txtend = this .lexsize - 2; // AQ 8Jul2000
2262:                                this .lexbuf[this .lexsize] = (byte) '\0';
2263:                                this .state = LEX_CONTENT;
2264:                                this .waswhite = false;
2265:                                this .token = newNode(Node.COMMENT_TAG,
2266:                                        this .lexbuf, this .txtstart, this .txtend);
2267:
2268:                                // now look for a line break
2269:
2270:                                c = this .in.readChar();
2271:
2272:                                if (c == '\r') {
2273:                                    c = this .in.readChar();
2274:
2275:                                    if (c != '\n') {
2276:                                        this .token.linebreak = true;
2277:                                    }
2278:                                }
2279:
2280:                                if (c == '\n') {
2281:                                    this .token.linebreak = true;
2282:                                } else {
2283:                                    this .in.ungetChar(c);
2284:                                }
2285:
2286:                                return this .token;
2287:                            }
2288:
2289:                            // note position of first such error in the comment
2290:                            if (badcomment == 0) {
2291:                                this .lines = this .in.getCurline();
2292:                                this .columns = this .in.getCurcol() - 3;
2293:                            }
2294:
2295:                            badcomment++;
2296:                            if (this .configuration.fixComments) {
2297:                                this .lexbuf[this .lexsize - 2] = (byte) '=';
2298:                            }
2299:
2300:                            addCharToLexer(c);
2301:
2302:                            // if '-' then look for '>' to end the comment
2303:                            if (c != '-') {
2304:                                break end_comment;
2305:                            }
2306:
2307:                        }
2308:                        // otherwise continue to look for -->
2309:                        this .lexbuf[this .lexsize - 2] = (byte) '=';
2310:                        continue;
2311:
2312:                    case LEX_DOCTYPE:
2313:                        // seen <!d so look for '> ' munging whitespace
2314:
2315:                        if (TidyUtils.isWhite((char) c)) {
2316:                            if (this .waswhite) {
2317:                                this .lexsize -= 1;
2318:                            }
2319:
2320:                            this .waswhite = true;
2321:                        } else {
2322:                            this .waswhite = false;
2323:                        }
2324:
2325:                        if (inDTDSubset) {
2326:                            if (c == ']') {
2327:                                inDTDSubset = false;
2328:                            }
2329:                        } else if (c == '[') {
2330:                            inDTDSubset = true;
2331:                        }
2332:                        if (inDTDSubset || c != '>') {
2333:                            continue;
2334:                        }
2335:
2336:                        this .lexsize -= 1;
2337:                        this .txtend = this .lexsize;
2338:                        this .lexbuf[this .lexsize] = (byte) '\0';
2339:                        this .state = LEX_CONTENT;
2340:                        this .waswhite = false;
2341:                        this .token = newNode(Node.DOCTYPE_TAG, this .lexbuf,
2342:                                this .txtstart, this .txtend);
2343:                        // make a note of the version named by the doctype
2344:                        this .doctype = findGivenVersion(this .token);
2345:                        return this .token;
2346:
2347:                    case LEX_PROCINSTR:
2348:                        // seen <? so look for '> '
2349:                        // check for PHP preprocessor instructions <?php ... ?>
2350:
2351:                        if (this .lexsize - this .txtstart == 3) {
2352:                            if ((TidyUtils.getString(this .lexbuf,
2353:                                    this .txtstart, 3)).equals("php")) {
2354:                                this .state = LEX_PHP;
2355:                                continue;
2356:                            }
2357:                        }
2358:
2359:                        if (this .lexsize - this .txtstart == 4) {
2360:                            if ((TidyUtils.getString(this .lexbuf,
2361:                                    this .txtstart, 3)).equals("xml")
2362:                                    && TidyUtils
2363:                                            .isWhite((char) this .lexbuf[this .txtstart + 3])) {
2364:                                this .state = LEX_XMLDECL;
2365:                                attributes = null;
2366:                                continue;
2367:                            }
2368:                        }
2369:
2370:                        if (this .configuration.xmlPIs) // insist on ?> as terminator
2371:                        {
2372:                            if (c != '?') {
2373:                                continue;
2374:                            }
2375:
2376:                            // now look for '>'
2377:                            c = this .in.readChar();
2378:
2379:                            if (c == StreamIn.END_OF_STREAM) {
2380:                                report.warning(this , null, null,
2381:                                        Report.UNEXPECTED_END_OF_FILE);
2382:                                this .in.ungetChar(c);
2383:                                continue;
2384:                            }
2385:
2386:                            addCharToLexer(c);
2387:                        }
2388:
2389:                        if (c != '>') {
2390:                            continue;
2391:                        }
2392:
2393:                        this .lexsize -= 1;
2394:                        this .txtend = this .lexsize;
2395:                        this .lexbuf[this .lexsize] = (byte) '\0';
2396:                        this .state = LEX_CONTENT;
2397:                        this .waswhite = false;
2398:                        this .token = newNode(Node.PROC_INS_TAG, this .lexbuf,
2399:                                this .txtstart, this .txtend);
2400:                        return this .token;
2401:
2402:                    case LEX_ASP:
2403:                        // seen <% so look for "%> "
2404:                        if (c != '%') {
2405:                            continue;
2406:                        }
2407:
2408:                        // now look for '>'
2409:                        c = this .in.readChar();
2410:
2411:                        if (c != '>') {
2412:                            this .in.ungetChar(c);
2413:                            continue;
2414:                        }
2415:
2416:                        this .lexsize -= 1;
2417:                        this .txtend = this .lexsize;
2418:                        this .lexbuf[this .lexsize] = (byte) '\0';
2419:                        this .state = LEX_CONTENT;
2420:                        this .waswhite = false;
2421:                        this .token = newNode(Node.ASP_TAG, this .lexbuf,
2422:                                this .txtstart, this .txtend);
2423:                        return this .token;
2424:
2425:                    case LEX_JSTE:
2426:                        // seen <# so look for "#> "
2427:                        if (c != '#') {
2428:                            continue;
2429:                        }
2430:
2431:                        // now look for '>'
2432:                        c = this .in.readChar();
2433:
2434:                        if (c != '>') {
2435:                            this .in.ungetChar(c);
2436:                            continue;
2437:                        }
2438:
2439:                        this .lexsize -= 1;
2440:                        this .txtend = this .lexsize;
2441:                        this .lexbuf[this .lexsize] = (byte) '\0';
2442:                        this .state = LEX_CONTENT;
2443:                        this .waswhite = false;
2444:                        this .token = newNode(Node.JSTE_TAG, this .lexbuf,
2445:                                this .txtstart, this .txtend);
2446:                        return this .token;
2447:
2448:                    case LEX_PHP:
2449:                        // seen " <?php" so look for "?> "
2450:                        if (c != '?') {
2451:                            continue;
2452:                        }
2453:
2454:                        // now look for '>'
2455:                        c = this .in.readChar();
2456:
2457:                        if (c != '>') {
2458:                            this .in.ungetChar(c);
2459:                            continue;
2460:                        }
2461:
2462:                        this .lexsize -= 1;
2463:                        this .txtend = this .lexsize;
2464:                        this .lexbuf[this .lexsize] = (byte) '\0';
2465:                        this .state = LEX_CONTENT;
2466:                        this .waswhite = false;
2467:                        this .token = newNode(Node.PHP_TAG, this .lexbuf,
2468:                                this .txtstart, this .txtend);
2469:                        return this .token;
2470:
2471:                    case LEX_XMLDECL: // seen "<?xml" so look for "?>"
2472:
2473:                        if (TidyUtils.isWhite((char) c) && c != '?') {
2474:                            continue;
2475:                        }
2476:
2477:                        // get pseudo-attribute
2478:                        if (c != '?') {
2479:                            String name;
2480:                            Node[] asp = new Node[1];
2481:                            Node[] php = new Node[1];
2482:                            AttVal av = new AttVal();
2483:                            int[] pdelim = new int[1];
2484:                            isempty[0] = false;
2485:
2486:                            this .in.ungetChar(c);
2487:
2488:                            name = this .parseAttribute(isempty, asp, php);
2489:                            av.attribute = name;
2490:
2491:                            av.value = this .parseValue(name, true, isempty,
2492:                                    pdelim);
2493:                            av.delim = pdelim[0];
2494:                            av.next = attributes;
2495:
2496:                            attributes = av;
2497:                            // continue;
2498:                        }
2499:
2500:                        // now look for '>'
2501:                        c = this .in.readChar();
2502:
2503:                        if (c != '>') {
2504:                            this .in.ungetChar(c);
2505:                            continue;
2506:                        }
2507:                        this .lexsize -= 1;
2508:                        this .txtend = this .txtstart;
2509:                        this .lexbuf[this .txtend] = '\0';
2510:                        this .state = LEX_CONTENT;
2511:                        this .waswhite = false;
2512:                        this .token = newNode(Node.XML_DECL, this .lexbuf,
2513:                                this .txtstart, this .txtend);
2514:                        this .token.attributes = attributes;
2515:                        return this .token;
2516:
2517:                    case LEX_SECTION:
2518:                        // seen " <![" so look for "]> "
2519:                        if (c == '[') {
2520:                            if (this .lexsize == (this .txtstart + 6)
2521:                                    && (TidyUtils.getString(this .lexbuf,
2522:                                            this .txtstart, 6)).equals("CDATA[")) {
2523:                                this .state = LEX_CDATA;
2524:                                this .lexsize -= 6;
2525:                                continue;
2526:                            }
2527:                        }
2528:
2529:                        if (c != ']') {
2530:                            continue;
2531:                        }
2532:
2533:                        // now look for '>'
2534:                        c = this .in.readChar();
2535:
2536:                        if (c != '>') {
2537:                            this .in.ungetChar(c);
2538:                            continue;
2539:                        }
2540:
2541:                        this .lexsize -= 1;
2542:                        this .txtend = this .lexsize;
2543:                        this .lexbuf[this .lexsize] = (byte) '\0';
2544:                        this .state = LEX_CONTENT;
2545:                        this .waswhite = false;
2546:                        this .token = newNode(Node.SECTION_TAG, this .lexbuf,
2547:                                this .txtstart, this .txtend);
2548:                        return this .token;
2549:
2550:                    case LEX_CDATA:
2551:                        // seen " <![CDATA[" so look for "]]> "
2552:                        if (c != ']') {
2553:                            continue;
2554:                        }
2555:
2556:                        // now look for ']'
2557:                        c = this .in.readChar();
2558:
2559:                        if (c != ']') {
2560:                            this .in.ungetChar(c);
2561:                            continue;
2562:                        }
2563:
2564:                        // now look for '>'
2565:                        c = this .in.readChar();
2566:
2567:                        if (c != '>') {
2568:                            this .in.ungetChar(c);
2569:                            continue;
2570:                        }
2571:
2572:                        this .lexsize -= 1;
2573:                        this .txtend = this .lexsize;
2574:                        this .lexbuf[this .lexsize] = (byte) '\0';
2575:                        this .state = LEX_CONTENT;
2576:                        this .waswhite = false;
2577:                        this .token = newNode(Node.CDATA_TAG, this .lexbuf,
2578:                                this .txtstart, this .txtend);
2579:                        return this .token;
2580:
2581:                    default:
2582:                        // should never reach here
2583:                        break;
2584:                    }
2585:                }
2586:
2587:                if (this .state == LEX_CONTENT) // text string
2588:                {
2589:                    this .txtend = this .lexsize;
2590:
2591:                    if (this .txtend > this .txtstart) {
2592:                        this .in.ungetChar(c);
2593:
2594:                        if (this .lexbuf[this .lexsize - 1] == (byte) ' ') {
2595:                            this .lexsize -= 1;
2596:                            this .txtend = this .lexsize;
2597:                        }
2598:
2599:                        this .token = newNode(Node.TEXT_NODE, this .lexbuf,
2600:                                this .txtstart, this .txtend);
2601:                        return this .token;
2602:                    }
2603:                } else if (this .state == LEX_COMMENT) // comment
2604:                {
2605:                    if (c == StreamIn.END_OF_STREAM) {
2606:                        report.warning(this , null, null,
2607:                                Report.MALFORMED_COMMENT);
2608:                    }
2609:
2610:                    this .txtend = this .lexsize;
2611:                    this .lexbuf[this .lexsize] = (byte) '\0';
2612:                    this .state = LEX_CONTENT;
2613:                    this .waswhite = false;
2614:                    this .token = newNode(Node.COMMENT_TAG, this .lexbuf,
2615:                            this .txtstart, this .txtend);
2616:                    return this .token;
2617:                }
2618:
2619:                return null;
2620:            }
2621:
2622:            /**
2623:             * parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to
2624:             * dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to
2625:             * tailor the attribute value. Here is an example of a work around for using ASP in attribute values:
2626:             * <code>href='<%=rsSchool.Fields("ID").Value%>'</code> where the ASP that generates the attribute value is
2627:             * masked from Tidy by the quotemarks.
2628:             * @return parsed Node
2629:             */
2630:            public Node parseAsp() {
2631:                int c;
2632:                Node asp = null;
2633:
2634:                this .txtstart = this .lexsize;
2635:
2636:                while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
2637:
2638:                    addCharToLexer(c);
2639:
2640:                    if (c != '%') {
2641:                        continue;
2642:                    }
2643:
2644:                    if ((c = this .in.readChar()) == StreamIn.END_OF_STREAM) {
2645:                        break;
2646:                    }
2647:                    addCharToLexer(c);
2648:
2649:                    if (c == '>') {
2650:                        break;
2651:                    }
2652:                }
2653:
2654:                this .lexsize -= 2;
2655:                this .txtend = this .lexsize;
2656:
2657:                if (this .txtend > this .txtstart) {
2658:                    asp = newNode(Node.ASP_TAG, this .lexbuf, this .txtstart,
2659:                            this .txtend);
2660:                }
2661:
2662:                this .txtstart = this .txtend;
2663:                return asp;
2664:            }
2665:
2666:            /**
2667:             * PHP is like ASP but is based upon XML processing instructions, e.g. <code>&lt;?php ... ?&gt;</code>.
2668:             * @return parsed Node
2669:             */
2670:            public Node parsePhp() {
2671:                int c;
2672:                Node php = null;
2673:
2674:                this .txtstart = this .lexsize;
2675:
2676:                while ((c = this .in.readChar()) != StreamIn.END_OF_STREAM) {
2677:                    addCharToLexer(c);
2678:
2679:                    if (c != '?') {
2680:                        continue;
2681:                    }
2682:
2683:                    if ((c = this .in.readChar()) == StreamIn.END_OF_STREAM) {
2684:                        break;
2685:                    }
2686:                    addCharToLexer(c);
2687:
2688:                    if (c == '>') {
2689:                        break;
2690:                    }
2691:                }
2692:
2693:                this .lexsize -= 2;
2694:                this .txtend = this .lexsize;
2695:
2696:                if (this .txtend > this .txtstart) {
2697:                    php = newNode(Node.PHP_TAG, this .lexbuf, this .txtstart,
2698:                            this .txtend);
2699:                }
2700:
2701:                this .txtstart = this .txtend;
2702:                return php;
2703:            }
2704:
2705:            /**
2706:             * consumes the '>' terminating start tags.
2707:             * @param isempty flag is passed as array so it can be modified
2708:             * @param asp asp Node, passed as array so it can be modified
2709:             * @param php php Node, passed as array so it can be modified
2710:             * @return parsed attribute
2711:             */
2712:            public String parseAttribute(boolean[] isempty, Node[] asp,
2713:                    Node[] php) {
2714:                int start = 0;
2715:                String attr;
2716:                int c = 0;
2717:                int lastc = 0;
2718:
2719:                asp[0] = null; // clear asp pointer
2720:                php[0] = null; // clear php pointer
2721:                // skip white space before the attribute
2722:
2723:                for (;;) {
2724:                    c = this .in.readChar();
2725:
2726:                    if (c == '/') {
2727:                        c = this .in.readChar();
2728:
2729:                        if (c == '>') {
2730:                            isempty[0] = true;
2731:                            return null;
2732:                        }
2733:
2734:                        this .in.ungetChar(c);
2735:                        c = '/';
2736:                        break;
2737:                    }
2738:
2739:                    if (c == '>') {
2740:                        return null;
2741:                    }
2742:
2743:                    if (c == '<') {
2744:                        c = this .in.readChar();
2745:
2746:                        if (c == '%') {
2747:                            asp[0] = parseAsp();
2748:                            return null;
2749:                        } else if (c == '?') {
2750:                            php[0] = parsePhp();
2751:                            return null;
2752:                        }
2753:
2754:                        this .in.ungetChar(c);
2755:                        if (this .state != LEX_XMLDECL) // FG fix for 532535
2756:                        {
2757:                            this .in.ungetChar('<'); // fix for 433360
2758:                        }
2759:                        report.attrError(this , this .token, null,
2760:                                Report.UNEXPECTED_GT);
2761:                        return null;
2762:                    }
2763:
2764:                    if (c == '=') {
2765:                        report.attrError(this , this .token, null,
2766:                                Report.UNEXPECTED_EQUALSIGN);
2767:                        continue;
2768:                    }
2769:
2770:                    if (c == '"' || c == '\'') {
2771:                        report.attrError(this , this .token, null,
2772:                                Report.UNEXPECTED_QUOTEMARK);
2773:                        continue;
2774:                    }
2775:
2776:                    if (c == StreamIn.END_OF_STREAM) {
2777:                        report.attrError(this , this .token, null,
2778:                                Report.UNEXPECTED_END_OF_FILE);
2779:                        this .in.ungetChar(c);
2780:                        return null;
2781:                    }
2782:
2783:                    if (!TidyUtils.isWhite((char) c)) {
2784:                        break;
2785:                    }
2786:                }
2787:
2788:                start = this .lexsize;
2789:                lastc = c;
2790:
2791:                for (;;) {
2792:                    // but push back '=' for parseValue()
2793:                    if (c == '=' || c == '>') {
2794:                        this .in.ungetChar(c);
2795:                        break;
2796:                    }
2797:
2798:                    if (c == '<' || c == StreamIn.END_OF_STREAM) {
2799:                        this .in.ungetChar(c);
2800:                        break;
2801:                    }
2802:                    if (lastc == '-' && (c == '"' || c == '\'')) {
2803:                        this .lexsize--;
2804:                        this .in.ungetChar(c);
2805:                        break;
2806:                    }
2807:                    if (TidyUtils.isWhite((char) c)) {
2808:                        break;
2809:                    }
2810:
2811:                    // what should be done about non-namechar characters?
2812:                    // currently these are incorporated into the attr name
2813:
2814:                    if (!this .configuration.xmlTags
2815:                            && TidyUtils.isUpper((char) c)) {
2816:                        c = TidyUtils.toLower((char) c);
2817:                    }
2818:
2819:                    // ++len; #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
2820:                    addCharToLexer(c);
2821:
2822:                    lastc = c;
2823:                    c = this .in.readChar();
2824:                }
2825:
2826:                // #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
2827:                int len = this .lexsize - start;
2828:                attr = (len > 0 ? TidyUtils.getString(this .lexbuf, start, len)
2829:                        : null);
2830:                this .lexsize = start;
2831:
2832:                return attr;
2833:            }
2834:
2835:            /**
2836:             * Invoked when &lt; is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this
2837:             * routine recognizes ' and " quoted strings.
2838:             * @return delimiter
2839:             */
2840:            public int parseServerInstruction() {
2841:                int c, delim = '"';
2842:                boolean isrule = false;
2843:
2844:                c = this .in.readChar();
2845:                addCharToLexer(c);
2846:
2847:                // check for ASP, PHP or Tango
2848:                if (c == '%' || c == '?' || c == '@') {
2849:                    isrule = true;
2850:                }
2851:
2852:                for (;;) {
2853:                    c = this .in.readChar();
2854:
2855:                    if (c == StreamIn.END_OF_STREAM) {
2856:                        break;
2857:                    }
2858:
2859:                    if (c == '>') {
2860:                        if (isrule) {
2861:                            addCharToLexer(c);
2862:                        } else {
2863:                            this .in.ungetChar(c);
2864:                        }
2865:
2866:                        break;
2867:                    }
2868:
2869:                    // if not recognized as ASP, PHP or Tango
2870:                    // then also finish value on whitespace
2871:                    if (!isrule) {
2872:                        if (TidyUtils.isWhite((char) c)) {
2873:                            break;
2874:                        }
2875:                    }
2876:
2877:                    addCharToLexer(c);
2878:
2879:                    if (c == '"') {
2880:                        do {
2881:                            c = this .in.readChar();
2882:
2883:                            if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
2884:                            {
2885:                                report.attrError(this , this .token, null,
2886:                                        Report.UNEXPECTED_END_OF_FILE);
2887:                                this .in.ungetChar(c);
2888:                                return 0;
2889:                            }
2890:                            if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
2891:                            {
2892:                                this .in.ungetChar(c);
2893:                                report.attrError(this , this .token, null,
2894:                                        Report.UNEXPECTED_GT);
2895:                                return 0;
2896:                            }
2897:
2898:                            addCharToLexer(c);
2899:                        } while (c != '"');
2900:                        delim = '\'';
2901:                        continue;
2902:                    }
2903:
2904:                    if (c == '\'') {
2905:                        do {
2906:                            c = this .in.readChar();
2907:
2908:                            if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
2909:                            {
2910:                                report.attrError(this , this .token, null,
2911:                                        Report.UNEXPECTED_END_OF_FILE);
2912:                                this .in.ungetChar(c);
2913:                                return 0;
2914:                            }
2915:                            if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
2916:                            {
2917:                                this .in.ungetChar(c);
2918:                                report.attrError(this , this .token, null,
2919:                                        Report.UNEXPECTED_GT);
2920:                                return 0;
2921:                            }
2922:
2923:                            addCharToLexer(c);
2924:                        } while (c != '\'');
2925:                    }
2926:                }
2927:
2928:                return delim;
2929:            }
2930:
2931:            /**
2932:             * Parse an attribute value.
2933:             * @param name attribute name
2934:             * @param foldCase fold case?
2935:             * @param isempty is attribute empty? Passed as an array reference to allow modification
2936:             * @param pdelim delimiter, passed as an array reference to allow modification
2937:             * @return parsed value
2938:             */
2939:            public String parseValue(String name, boolean foldCase,
2940:                    boolean[] isempty, int[] pdelim) {
2941:                // values start with "=" or " = " etc.
2942:                // doesn't consume the ">" at end of start tag
2943:
2944:                int len = 0;
2945:                int start;
2946:                boolean seenGt = false;
2947:                boolean munge = true;
2948:                int c = 0;
2949:                int lastc, delim, quotewarning;
2950:                String value;
2951:
2952:                delim = 0;
2953:                pdelim[0] = '"';
2954:
2955:                // Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are
2956:                // significant and must be preserved
2957:
2958:                if (this .configuration.literalAttribs) {
2959:                    munge = false;
2960:                }
2961:
2962:                // skip white space before the '='
2963:                while (true) {
2964:                    c = this .in.readChar();
2965:
2966:                    if (c == StreamIn.END_OF_STREAM) {
2967:                        this .in.ungetChar(c);
2968:                        break;
2969:                    }
2970:
2971:                    if (!TidyUtils.isWhite((char) c)) {
2972:                        break;
2973:                    }
2974:                }
2975:
2976:                // c should be '=' if there is a value other legal possibilities are white space, '/' and '>'
2977:
2978:                if (c != '=' && c != '"' && c != '\'') {
2979:                    this .in.ungetChar(c);
2980:                    return null;
2981:                }
2982:
2983:                // skip white space after '='
2984:
2985:                while (true) {
2986:                    c = this .in.readChar();
2987:
2988:                    if (c == StreamIn.END_OF_STREAM) {
2989:                        this .in.ungetChar(c);
2990:                        break;
2991:                    }
2992:
2993:                    if (!TidyUtils.isWhite((char) c)) {
2994:                        break;
2995:                    }
2996:                }
2997:
2998:                // check for quote marks
2999:
3000:                if (c == '"' || c == '\'') {
3001:                    delim = c;
3002:                } else if (c == '<') {
3003:                    start = this .lexsize;
3004:                    addCharToLexer(c);
3005:                    pdelim[0] = parseServerInstruction();
3006:                    len = this .lexsize - start;
3007:                    this .lexsize = start;
3008:                    return (len > 0 ? TidyUtils.getString(this .lexbuf, start,
3009:                            len) : null);
3010:                } else {
3011:                    this .in.ungetChar(c);
3012:                }
3013:
3014:                // and read the value string check for quote mark if needed
3015:
3016:                quotewarning = 0;
3017:                start = this .lexsize;
3018:                c = '\0';
3019:
3020:                while (true) {
3021:                    lastc = c; // track last character
3022:                    c = this .in.readChar();
3023:
3024:                    if (c == StreamIn.END_OF_STREAM) {
3025:                        report.attrError(this , this .token, null,
3026:                                Report.UNEXPECTED_END_OF_FILE);
3027:                        this .in.ungetChar(c);
3028:                        break;
3029:                    }
3030:
3031:                    if (delim == (char) 0) {
3032:                        if (c == '>') {
3033:                            this .in.ungetChar(c);
3034:                            break;
3035:                        }
3036:
3037:                        if (c == '"' || c == '\'') {
3038:                            report.attrError(this , this .token, null,
3039:                                    Report.UNEXPECTED_QUOTEMARK);
3040:                            break;
3041:                        }
3042:
3043:                        if (c == '<') {
3044:                            this .in.ungetChar(c); // fix for 433360
3045:                            c = '>';
3046:                            this .in.ungetChar(c);
3047:                            report.attrError(this , this .token, null,
3048:                                    Report.UNEXPECTED_GT);
3049:                            break;
3050:                        }
3051:
3052:                        // For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however
3053:                        // care is needed to avoid so treating <a href=http://www.acme.com /> in this way, which would map the
3054:                        // <a> tag to <a href="http://www.acme.com"/>
3055:
3056:                        if (c == '/') {
3057:                            // peek ahead in case of />
3058:                            c = this .in.readChar();
3059:
3060:                            if (c == '>'
3061:                                    && !AttributeTable
3062:                                            .getDefaultAttributeTable().isUrl(
3063:                                                    name)) {
3064:                                isempty[0] = true;
3065:                                this .in.ungetChar(c);
3066:                                break;
3067:                            }
3068:
3069:                            // unget peeked char
3070:                            this .in.ungetChar(c);
3071:                            c = '/';
3072:                        }
3073:                    } else {
3074:                        // delim is '\'' or '"'
3075:                        if (c == delim) {
3076:                            break;
3077:                        }
3078:
3079:                        // treat CRLF, CR and LF as single line break
3080:
3081:                        if (c == '\r') {
3082:                            c = this .in.readChar();
3083:                            if (c != '\n') {
3084:                                this .in.ungetChar(c);
3085:                            }
3086:
3087:                            c = '\n';
3088:                        }
3089:
3090:                        if (c == '\n' || c == '<' || c == '>') {
3091:                            ++quotewarning;
3092:                        }
3093:
3094:                        if (c == '>') {
3095:                            seenGt = true;
3096:                        }
3097:                    }
3098:
3099:                    if (c == '&') {
3100:                        // no entities in ID attributes
3101:                        if ("id".equalsIgnoreCase(name)) {
3102:                            report.attrError(this , null, null,
3103:                                    Report.ENTITY_IN_ID);
3104:                            continue;
3105:                        }
3106:
3107:                        addCharToLexer(c);
3108:                        parseEntity((short) 0);
3109:                        continue;
3110:
3111:                    }
3112:
3113:                    // kludge for JavaScript attribute values with line continuations in string literals
3114:
3115:                    if (c == '\\') {
3116:                        c = this .in.readChar();
3117:
3118:                        if (c != '\n') {
3119:                            this .in.ungetChar(c);
3120:                            c = '\\';
3121:                        }
3122:                    }
3123:
3124:                    if (TidyUtils.isWhite((char) c)) {
3125:                        if (delim == (char) 0) {
3126:                            break;
3127:                        }
3128:
3129:                        if (munge) {
3130:                            // discard line breaks in quoted URLs
3131:                            // #438650 - fix by Randy Waki
3132:                            if (c == '\n'
3133:                                    && AttributeTable
3134:                                            .getDefaultAttributeTable().isUrl(
3135:                                                    name)) {
3136:                                // warn that we discard this newline
3137:                                report.attrError(this , this .token, null,
3138:                                        Report.NEWLINE_IN_URI);
3139:                                continue;
3140:                            }
3141:
3142:                            c = ' ';
3143:
3144:                            if (lastc == ' ') {
3145:                                continue;
3146:                            }
3147:                        }
3148:                    } else if (foldCase && TidyUtils.isUpper((char) c)) {
3149:                        c = TidyUtils.toLower((char) c);
3150:                    }
3151:
3152:                    addCharToLexer(c);
3153:                }
3154:
3155:                if (quotewarning > 10 && seenGt && munge) {
3156:                    // there is almost certainly a missing trailing quote mark as we have see too many newlines, < or >
3157:                    // characters. an exception is made for Javascript attributes and the javascript URL scheme which may
3158:                    // legitimately include < and >, and for attributes starting with "<xml " as generated by Microsoft Office.
3159:
3160:                    if (!AttributeTable.getDefaultAttributeTable().isScript(
3161:                            name)
3162:                            && !(AttributeTable.getDefaultAttributeTable()
3163:                                    .isUrl(name) && "javascript:"
3164:                                    .equals(TidyUtils.getString(this .lexbuf,
3165:                                            start, 11)))
3166:                            && !"<xml ".equals(TidyUtils.getString(this .lexbuf,
3167:                                    start, 5))) // #500236 - fix by Klaus Johannes Rusch
3168:                    // 06 Jan 02
3169:                    {
3170:                        report.error(this , null, null,
3171:                                Report.SUSPECTED_MISSING_QUOTE);
3172:                    }
3173:                }
3174:
3175:                len = this .lexsize - start;
3176:                this .lexsize = start;
3177:
3178:                if (len > 0 || delim != 0) {
3179:                    // ignore leading and trailing white space for all but title, alt, value and prompts attributes unless
3180:                    // --literal-attributes is set to yes
3181:                    // #994841 - Whitespace is removed from value attributes
3182:
3183:                    if (munge
3184:                            && !TidyUtils.isInValuesIgnoreCase(new String[] {
3185:                                    "alt", "title", "value", "prompt" }, name)) {
3186:                        while (TidyUtils.isWhite((char) this .lexbuf[start + len
3187:                                - 1])) {
3188:                            --len;
3189:                        }
3190:
3191:                        while (TidyUtils.isWhite((char) this .lexbuf[start])
3192:                                && start < len) {
3193:                            ++start;
3194:                            --len;
3195:                        }
3196:                    }
3197:
3198:                    value = TidyUtils.getString(this .lexbuf, start, len);
3199:                } else {
3200:                    value = null;
3201:                }
3202:
3203:                // note delimiter if given
3204:                if (delim != 0) {
3205:                    pdelim[0] = delim;
3206:                } else {
3207:                    pdelim[0] = '"';
3208:                }
3209:
3210:                return value;
3211:            }
3212:
3213:            /**
3214:             * Check if attr is a valid name.
3215:             * @param attr String to check, must be non-null
3216:             * @return <code>true</code> if attr is a valid name.
3217:             */
3218:            public static boolean isValidAttrName(String attr) {
3219:                char c;
3220:                int i;
3221:
3222:                // first character should be a letter
3223:                c = attr.charAt(0);
3224:
3225:                if (!TidyUtils.isLetter(c)) {
3226:                    return false;
3227:                }
3228:
3229:                // remaining characters should be namechars
3230:                for (i = 1; i < attr.length(); i++) {
3231:                    c = attr.charAt(i);
3232:
3233:                    if (TidyUtils.isNamechar(c)) {
3234:                        continue;
3235:                    }
3236:
3237:                    return false;
3238:                }
3239:
3240:                return true;
3241:            }
3242:
3243:            /**
3244:             * In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they
3245:             * cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a
3246:             * numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the
3247:             * Unicode character with that number. Any character except a hexadecimal digit can be escaped to remove its special
3248:             * meaning, by putting a backslash in front.
3249:             * @param buf css selector name
3250:             * @return <code>true</code> if the given string is a valid css1 selector name
3251:             */
3252:            public static boolean isCSS1Selector(String buf) {
3253:                if (buf == null) {
3254:                    return false;
3255:                }
3256:
3257:                // #508936 - CSS class naming for -clean option
3258:                boolean valid = true;
3259:                int esclen = 0;
3260:                char c;
3261:                int pos;
3262:
3263:                for (pos = 0; valid && pos < buf.length(); ++pos) {
3264:                    c = buf.charAt(pos);
3265:                    if (c == '\\') {
3266:                        esclen = 1; // ab\555\444 is 4 chars {'a', 'b', \555, \444}
3267:                    } else if (Character.isDigit(c)) {
3268:                        // Digit not 1st, unless escaped (Max length "\112F")
3269:                        if (esclen > 0) {
3270:                            valid = (++esclen < 6);
3271:                        }
3272:                        if (valid) {
3273:                            valid = (pos > 0 || esclen > 0);
3274:                        }
3275:                    } else {
3276:                        valid = (esclen > 0 // Escaped? Anything goes.
3277:                                || (pos > 0 && c == '-') // Dash cannot be 1st char
3278:                                || Character.isLetter(c) // a-z, A-Z anywhere
3279:                        || (c >= 161 && c <= 255)); // Unicode 161-255 anywhere
3280:                        esclen = 0;
3281:                    }
3282:                }
3283:                return valid;
3284:            }
3285:
3286:            /**
3287:             * Parse tag attributes.
3288:             * @param isempty is tag empty?
3289:             * @return parsed attribute/value list
3290:             */
3291:            public AttVal parseAttrs(boolean[] isempty) {
3292:                AttVal av, list;
3293:                String attribute, value;
3294:                int[] delim = new int[1];
3295:                Node[] asp = new Node[1];
3296:                Node[] php = new Node[1];
3297:
3298:                list = null;
3299:
3300:                while (!endOfInput()) {
3301:                    attribute = parseAttribute(isempty, asp, php);
3302:
3303:                    if (attribute == null) {
3304:                        // check if attributes are created by ASP markup
3305:                        if (asp[0] != null) {
3306:                            av = new AttVal(list, null, asp[0], null, '\0',
3307:                                    null, null);
3308:                            list = av;
3309:                            continue;
3310:                        }
3311:
3312:                        // check if attributes are created by PHP markup
3313:                        if (php[0] != null) {
3314:                            av = new AttVal(list, null, null, php[0], '\0',
3315:                                    null, null);
3316:                            list = av;
3317:                            continue;
3318:                        }
3319:
3320:                        break;
3321:                    }
3322:
3323:                    value = parseValue(attribute, false, isempty, delim);
3324:
3325:                    if (attribute != null && isValidAttrName(attribute)) {
3326:                        av = new AttVal(list, null, null, null, delim[0],
3327:                                attribute, value);
3328:                        av.dict = AttributeTable.getDefaultAttributeTable()
3329:                                .findAttribute(av);
3330:                        list = av;
3331:                    } else {
3332:                        av = new AttVal(null, null, null, null, 0, attribute,
3333:                                value);
3334:
3335:                        // #427664 - fix by Gary Peskin 04 Aug 00; other fixes by Dave Raggett
3336:                        if (value != null) {
3337:                            report.attrError(this , this .token, av,
3338:                                    Report.BAD_ATTRIBUTE_VALUE);
3339:                        } else if (TidyUtils.lastChar(attribute) == '"') {
3340:                            report.attrError(this , this .token, av,
3341:                                    Report.MISSING_QUOTEMARK);
3342:                        } else {
3343:                            report.attrError(this , this .token, av,
3344:                                    Report.UNKNOWN_ATTRIBUTE);
3345:                        }
3346:                    }
3347:                }
3348:
3349:                return list;
3350:            }
3351:
3352:            /**
3353:             * Push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones
3354:             * generated from the istack) One issue arises with pushing inlines when the tag is already pushed. For instance:
3355:             * <code>&lt;p>&lt;em> text &lt;p>&lt;em> more text</code> Shouldn't be mapped to
3356:             * <code>&lt;p>&lt;em> text &lt;/em>&lt;/p>&lt;p>&lt;em>&lt;em> more text &lt;/em>&lt;/em></code>
3357:             * @param node Node to be pushed
3358:             */
3359:            public void pushInline(Node node) {
3360:                IStack is;
3361:
3362:                if (node.implicit) {
3363:                    return;
3364:                }
3365:
3366:                if (node.tag == null) {
3367:                    return;
3368:                }
3369:
3370:                if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) {
3371:                    return;
3372:                }
3373:
3374:                if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT)) {
3375:                    return;
3376:                }
3377:
3378:                if (node.tag != this .configuration.tt.tagFont && isPushed(node)) {
3379:                    return;
3380:                }
3381:
3382:                // make sure there is enough space for the stack
3383:                is = new IStack();
3384:                is.tag = node.tag;
3385:                is.element = node.element;
3386:                if (node.attributes != null) {
3387:                    is.attributes = cloneAttributes(node.attributes);
3388:                }
3389:                this .istack.push(is);
3390:            }
3391:
3392:            /**
3393:             * Pop a copy of an inline node from the stack.
3394:             * @param node Node to be popped
3395:             */
3396:            public void popInline(Node node) {
3397:                IStack is;
3398:
3399:                if (node != null) {
3400:
3401:                    if (node.tag == null) {
3402:                        return;
3403:                    }
3404:
3405:                    if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) {
3406:                        return;
3407:                    }
3408:
3409:                    if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT)) {
3410:                        return;
3411:                    }
3412:
3413:                    // if node is </a> then pop until we find an <a>
3414:                    if (node.tag == this .configuration.tt.tagA) {
3415:
3416:                        while (this .istack.size() > 0) {
3417:                            is = (IStack) this .istack.pop();
3418:                            if (is.tag == this .configuration.tt.tagA) {
3419:                                break;
3420:                            }
3421:                        }
3422:
3423:                        if (this .insert >= this .istack.size()) {
3424:                            this .insert = -1;
3425:                        }
3426:                        return;
3427:                    }
3428:                }
3429:
3430:                if (this .istack.size() > 0) {
3431:                    is = (IStack) this .istack.pop();
3432:                    if (this .insert >= this .istack.size()) {
3433:                        this .insert = -1;
3434:                    }
3435:                }
3436:            }
3437:
3438:            /**
3439:             * Is the node in the stack?
3440:             * @param node Node
3441:             * @return <code>true</code> is the node is found in the stack
3442:             */
3443:            public boolean isPushed(Node node) {
3444:                int i;
3445:                IStack is;
3446:
3447:                for (i = this .istack.size() - 1; i >= 0; --i) {
3448:                    is = (IStack) this .istack.elementAt(i);
3449:                    if (is.tag == node.tag) {
3450:                        return true;
3451:                    }
3452:                }
3453:
3454:                return false;
3455:            }
3456:
3457:            /**
3458:             * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P,
3459:             * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as
3460:             * will be the case in: <code>&lt;i>&lt;h1>italic heading&lt;/h1>&lt;/i></code> which is then treated as
3461:             * equivalent to <code>&lt;h1>&lt;i>italic heading&lt;/i>&lt;/h1></code> This is implemented by setting the lexer
3462:             * into a mode where it gets tokens from the inline stack rather than from the input stream.
3463:             * @param node original node
3464:             * @return stack size
3465:             */
3466:            public int inlineDup(Node node) {
3467:                int n;
3468:
3469:                n = this .istack.size() - this .istackbase;
3470:                if (n > 0) {
3471:                    this .insert = this .istackbase;
3472:                    this .inode = node;
3473:                }
3474:
3475:                return n;
3476:            }
3477:
3478:            /**
3479:             * @return
3480:             */
3481:            public Node insertedToken() {
3482:                Node node;
3483:                IStack is;
3484:                int n;
3485:
3486:                // this will only be null if inode != null
3487:                if (this .insert == -1) {
3488:                    node = this .inode;
3489:                    this .inode = null;
3490:                    return node;
3491:                }
3492:
3493:                // is this is the "latest" node then update the position, otherwise use current values
3494:                if (this .inode == null) {
3495:                    this .lines = this .in.getCurline();
3496:                    this .columns = this .in.getCurcol();
3497:                }
3498:
3499:                node = newNode(Node.START_TAG, this .lexbuf, this .txtstart,
3500:                        this .txtend);
3501:
3502:                // GLP: Bugfix 126261. Remove when this change is fixed in istack.c in the original Tidy
3503:                node.implicit = true;
3504:                is = (IStack) this .istack.elementAt(this .insert);
3505:                node.element = is.element;
3506:                node.tag = is.tag;
3507:                if (is.attributes != null) {
3508:                    node.attributes = cloneAttributes(is.attributes);
3509:                }
3510:
3511:                // advance lexer to next item on the stack
3512:                n = this .insert;
3513:
3514:                // and recover state if we have reached the end
3515:                if (++n < this .istack.size()) {
3516:                    this .insert = n;
3517:                } else {
3518:                    this .insert = -1;
3519:                }
3520:
3521:                return node;
3522:            }
3523:
3524:            /**
3525:             * Can the given element be removed?
3526:             * @param element node
3527:             * @return <code>true</code> if he element can be removed
3528:             */
3529:            public boolean canPrune(Node element) {
3530:                if (element.type == Node.TEXT_NODE) {
3531:                    return true;
3532:                }
3533:
3534:                if (element.content != null) {
3535:                    return false;
3536:                }
3537:
3538:                if (element.tag == this .configuration.tt.tagA
3539:                        && element.attributes != null) {
3540:                    return false;
3541:                }
3542:
3543:                if (element.tag == this .configuration.tt.tagP
3544:                        && !this .configuration.dropEmptyParas) {
3545:                    return false;
3546:                }
3547:
3548:                if (element.tag == null) {
3549:                    return false;
3550:                }
3551:
3552:                if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW)) {
3553:                    return false;
3554:                }
3555:
3556:                if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY)) {
3557:                    return false;
3558:                }
3559:
3560:                if (element.tag == this .configuration.tt.tagApplet) {
3561:                    return false;
3562:                }
3563:
3564:                if (element.tag == this .configuration.tt.tagObject) {
3565:                    return false;
3566:                }
3567:
3568:                if (element.tag == this .configuration.tt.tagScript
3569:                        && element.getAttrByName("src") != null) {
3570:                    return false;
3571:                }
3572:
3573:                // #540555 Empty title tag is trimmed
3574:                if (element.tag == this .configuration.tt.tagTitle) {
3575:                    return false;
3576:                }
3577:
3578:                // #433359 - fix by Randy Waki 12 Mar 01 - Empty iframe is trimmed
3579:                if (element.tag == this .configuration.tt.tagIframe) {
3580:                    return false;
3581:                }
3582:
3583:                if (element.getAttrByName("id") != null
3584:                        || element.getAttrByName("name") != null) {
3585:                    return false;
3586:                }
3587:
3588:                return true;
3589:            }
3590:
3591:            /**
3592:             * duplicate name attribute as an id and check if id and name match.
3593:             * @param node Node to check for name/it attributes
3594:             */
3595:            public void fixId(Node node) {
3596:                AttVal name = node.getAttrByName("name");
3597:                AttVal id = node.getAttrByName("id");
3598:
3599:                if (name != null) {
3600:                    if (id != null) {
3601:                        if (id.value != null && !id.value.equals(name.value)) {
3602:                            report.attrError(this , node, name,
3603:                                    Report.ID_NAME_MISMATCH);
3604:                        }
3605:                    } else if (this .configuration.xmlOut) {
3606:                        node.addAttribute("id", name.value);
3607:                    }
3608:                }
3609:            }
3610:
3611:            /**
3612:             * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.
3613:             */
3614:            public void deferDup() {
3615:                this .insert = -1;
3616:                this .inode = null;
3617:            }
3618:
3619:            /**
3620:             * Constraint the html version in the document to the given one. Everything is allowed in proprietary version of
3621:             * HTML this is handled here rather than in the tag/attr dicts.
3622:             * @param vers html version code
3623:             */
3624:            void constrainVersion(int vers) {
3625:                this .versions &= (vers | Dict.VERS_PROPRIETARY);
3626:            }
3627:
3628:            /**
3629:             * Is content acceptable for pre elements?
3630:             * @param node content
3631:             * @return <code>true</code> if node is acceptable in pre elements
3632:             */
3633:            protected boolean preContent(Node node) {
3634:                // p is coerced to br's
3635:                if (node.tag == this .configuration.tt.tagP) {
3636:                    return true;
3637:                }
3638:
3639:                if (node.tag == null
3640:                        || node.tag == this .configuration.tt.tagP
3641:                        || !TidyUtils.toBoolean(node.tag.model
3642:                                & (Dict.CM_INLINE | Dict.CM_NEW))) {
3643:                    return false;
3644:                }
3645:                return true;
3646:            }
3647:
3648:            /**
3649:             * document type.
3650:             */
3651:            private static class W3CVersionInfo {
3652:
3653:                /**
3654:                 * name.
3655:                 */
3656:                String name;
3657:
3658:                /**
3659:                 * voyager name.
3660:                 */
3661:                String voyagerName;
3662:
3663:                /**
3664:                 * profile.
3665:                 */
3666:                String profile;
3667:
3668:                /**
3669:                 * code.
3670:                 */
3671:                short code;
3672:
3673:                /**
3674:                 * Instantiates a new W3CVersionInfo.
3675:                 * @param name version name
3676:                 * @param voyagerName voyager (xhtml) name
3677:                 * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET
3678:                 * @param code unique code for this version info
3679:                 */
3680:                public W3CVersionInfo(String name, String voyagerName,
3681:                        String profile, short code) {
3682:                    this.name = name;
3683:                    this.voyagerName = voyagerName;
3684:                    this.profile = profile;
3685:                    this.code = code;
3686:                }
3687:            }
3688:
3689:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.