Source Code Cross Referenced for HTMLScanner.java in  » HTML-Parser » nekohtml » org » cyberneko » html » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » HTML Parser » nekohtml » org.cyberneko.html 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /* 
0002:         * Copyright 2002-2008 Andy Clark
0003:         * 
0004:         * Licensed under the Apache License, Version 2.0 (the "License");
0005:         * you may not use this file except in compliance with the License.
0006:         * You may obtain a copy of the License at
0007:         *
0008:         *     http://www.apache.org/licenses/LICENSE-2.0
0009:         *
0010:         * Unless required by applicable law or agreed to in writing, software
0011:         * distributed under the License is distributed on an "AS IS" BASIS,
0012:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0013:         * See the License for the specific language governing permissions and
0014:         * limitations under the License.
0015:         */
0016:
0017:        package org.cyberneko.html;
0018:
0019:        import java.io.EOFException;
0020:        import java.io.FilterInputStream;
0021:        import java.io.IOException;
0022:        import java.io.InputStream;
0023:        import java.io.InputStreamReader;
0024:        import java.io.Reader;
0025:        import java.io.UnsupportedEncodingException;
0026:        import java.lang.reflect.InvocationTargetException;
0027:        import java.lang.reflect.Method;
0028:        import java.net.URL;
0029:        import java.util.Stack;
0030:
0031:        import org.apache.xerces.util.EncodingMap;
0032:        import org.apache.xerces.util.NamespaceSupport;
0033:        import org.apache.xerces.util.URI;
0034:        import org.apache.xerces.util.XMLAttributesImpl;
0035:        import org.apache.xerces.util.XMLResourceIdentifierImpl;
0036:        import org.apache.xerces.util.XMLStringBuffer;
0037:        import org.apache.xerces.xni.Augmentations;
0038:        import org.apache.xerces.xni.NamespaceContext;
0039:        import org.apache.xerces.xni.QName;
0040:        import org.apache.xerces.xni.XMLAttributes;
0041:        import org.apache.xerces.xni.XMLDocumentHandler;
0042:        import org.apache.xerces.xni.XMLLocator;
0043:        import org.apache.xerces.xni.XMLResourceIdentifier;
0044:        import org.apache.xerces.xni.XMLString;
0045:        import org.apache.xerces.xni.XNIException;
0046:        import org.apache.xerces.xni.parser.XMLComponentManager;
0047:        import org.apache.xerces.xni.parser.XMLConfigurationException;
0048:        import org.apache.xerces.xni.parser.XMLDocumentScanner;
0049:        import org.apache.xerces.xni.parser.XMLInputSource;
0050:
0051:        /**
0052:         * A simple HTML scanner. This scanner makes no attempt to balance tags
0053:         * or fix other problems in the source document — it just scans what 
0054:         * it can and generates XNI document "events", ignoring errors of all 
0055:         * kinds.
0056:         * <p>
0057:         * This component recognizes the following features:
0058:         * <ul>
0059:         * <li>http://cyberneko.org/html/features/augmentations
0060:         * <li>http://cyberneko.org/html/features/report-errors
0061:         * <li>http://apache.org/xml/features/scanner/notify-char-refs
0062:         * <li>http://apache.org/xml/features/scanner/notify-builtin-refs
0063:         * <li>http://cyberneko.org/html/features/scanner/notify-builtin-refs
0064:         * <li>http://cyberneko.org/html/features/scanner/fix-mswindows-refs
0065:         * <li>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
0066:         * <li>http://cyberneko.org/html/features/scanner/script/strip-comment-delims
0067:         * <li>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
0068:         * <li>http://cyberneko.org/html/features/scanner/style/strip-comment-delims
0069:         * <li>http://cyberneko.org/html/features/scanner/ignore-specified-charset
0070:         * <li>http://cyberneko.org/html/features/scanner/cdata-sections
0071:         * <li>http://cyberneko.org/html/features/override-doctype
0072:         * <li>http://cyberneko.org/html/features/insert-doctype
0073:         * </ul>
0074:         * <p>
0075:         * This component recognizes the following properties:
0076:         * <ul>
0077:         * <li>http://cyberneko.org/html/properties/names/elems
0078:         * <li>http://cyberneko.org/html/properties/names/attrs
0079:         * <li>http://cyberneko.org/html/properties/default-encoding
0080:         * <li>http://cyberneko.org/html/properties/error-reporter
0081:         * <li>http://cyberneko.org/html/properties/doctype/pubid
0082:         * <li>http://cyberneko.org/html/properties/doctype/sysid
0083:         * </ul>
0084:         *
0085:         * @see HTMLElements
0086:         * @see HTMLEntities
0087:         *
0088:         * @author Andy Clark
0089:         * @author Ahmed Ashour
0090:         *
0091:         * @version $Id: HTMLScanner.java,v 1.19 2005/06/14 05:52:37 andyc Exp $
0092:         */
0093:        public class HTMLScanner implements  XMLDocumentScanner, XMLLocator,
0094:                HTMLComponent {
0095:
0096:            //
0097:            // Constants
0098:            //
0099:
0100:            // doctype info: HTML 4.01 strict
0101:
0102:            /** HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). */
0103:            public static final String HTML_4_01_STRICT_PUBID = "-//W3C//DTD HTML 4.01//EN";
0104:
0105:            /** HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd"). */
0106:            public static final String HTML_4_01_STRICT_SYSID = "http://www.w3.org/TR/html4/strict.dtd";
0107:
0108:            // doctype info: HTML 4.01 loose
0109:
0110:            /** HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 Transitional//EN"). */
0111:            public static final String HTML_4_01_TRANSITIONAL_PUBID = "-//W3C//DTD HTML 4.01 Transitional//EN";
0112:
0113:            /** HTML 4.01 transitional system identifier ("http://www.w3.org/TR/html4/loose.dtd"). */
0114:            public static final String HTML_4_01_TRANSITIONAL_SYSID = "http://www.w3.org/TR/html4/loose.dtd";
0115:
0116:            // doctype info: HTML 4.01 frameset
0117:
0118:            /** HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN"). */
0119:            public static final String HTML_4_01_FRAMESET_PUBID = "-//W3C//DTD HTML 4.01 Frameset//EN";
0120:
0121:            /** HTML 4.01 frameset system identifier ("http://www.w3.org/TR/html4/frameset.dtd"). */
0122:            public static final String HTML_4_01_FRAMESET_SYSID = "http://www.w3.org/TR/html4/frameset.dtd";
0123:
0124:            // features
0125:
0126:            /** Include infoset augmentations. */
0127:            protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
0128:
0129:            /** Report errors. */
0130:            protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
0131:
0132:            /** Notify character entity references (e.g. &amp;#32;, &amp;#x20;, etc). */
0133:            public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";
0134:
0135:            /** 
0136:             * Notify handler of built-in entity references (e.g. &amp;amp;, 
0137:             * &amp;lt;, etc).
0138:             * <p>
0139:             * <strong>Note:</strong>
0140:             * This only applies to the five pre-defined XML general entities.
0141:             * Specifically, "amp", "lt", "gt", "quot", and "apos". This is done 
0142:             * for compatibility with the Xerces feature.
0143:             * <p>
0144:             * To be notified of the built-in entity references in HTML, set the 
0145:             * <code>http://cyberneko.org/html/features/scanner/notify-builtin-refs</code> 
0146:             * feature to <code>true</code>.
0147:             */
0148:            public static final String NOTIFY_XML_BUILTIN_REFS = "http://apache.org/xml/features/scanner/notify-builtin-refs";
0149:
0150:            /** 
0151:             * Notify handler of built-in entity references (e.g. &amp;nbsp;, 
0152:             * &amp;copy;, etc).
0153:             * <p>
0154:             * <strong>Note:</strong>
0155:             * This <em>includes</em> the five pre-defined XML general entities.
0156:             */
0157:            public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";
0158:
0159:            /** Fix Microsoft Windows&reg; character entity references. */
0160:            public static final String FIX_MSWINDOWS_REFS = "http://cyberneko.org/html/features/scanner/fix-mswindows-refs";
0161:
0162:            /** 
0163:             * Strip HTML comment delimiters ("&lt;!&minus;&minus;" and 
0164:             * "&minus;&minus;&gt;") from SCRIPT tag contents.
0165:             */
0166:            public static final String SCRIPT_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-comment-delims";
0167:
0168:            /** 
0169:             * Strip XHTML CDATA delimiters ("&lt;![CDATA[" and "]]&gt;") from 
0170:             * SCRIPT tag contents.
0171:             */
0172:            public static final String SCRIPT_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims";
0173:
0174:            /** 
0175:             * Strip HTML comment delimiters ("&lt;!&minus;&minus;" and 
0176:             * "&minus;&minus;&gt;") from STYLE tag contents.
0177:             */
0178:            public static final String STYLE_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-comment-delims";
0179:
0180:            /** 
0181:             * Strip XHTML CDATA delimiters ("&lt;![CDATA[" and "]]&gt;") from 
0182:             * STYLE tag contents.
0183:             */
0184:            public static final String STYLE_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims";
0185:
0186:            /**
0187:             * Ignore specified charset found in the &lt;meta equiv='Content-Type'
0188:             * content='text/html;charset=&hellip;'&gt; tag.
0189:             */
0190:            public static final String IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset";
0191:
0192:            /** Scan CDATA sections. */
0193:            public static final String CDATA_SECTIONS = "http://cyberneko.org/html/features/scanner/cdata-sections";
0194:
0195:            /** Override doctype declaration public and system identifiers. */
0196:            public static final String OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype";
0197:
0198:            /** Insert document type declaration. */
0199:            public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype";
0200:
0201:            /** Normalize attribute values. */
0202:            protected static final String NORMALIZE_ATTRIBUTES = "http://cyberneko.org/html/features/scanner/normalize-attrs";
0203:
0204:            /** Recognized features. */
0205:            private static final String[] RECOGNIZED_FEATURES = {
0206:                    AUGMENTATIONS, REPORT_ERRORS, NOTIFY_CHAR_REFS,
0207:                    NOTIFY_XML_BUILTIN_REFS, NOTIFY_HTML_BUILTIN_REFS,
0208:                    FIX_MSWINDOWS_REFS, SCRIPT_STRIP_CDATA_DELIMS,
0209:                    SCRIPT_STRIP_COMMENT_DELIMS, STYLE_STRIP_CDATA_DELIMS,
0210:                    STYLE_STRIP_COMMENT_DELIMS, IGNORE_SPECIFIED_CHARSET,
0211:                    CDATA_SECTIONS, OVERRIDE_DOCTYPE, INSERT_DOCTYPE,
0212:                    NORMALIZE_ATTRIBUTES, };
0213:
0214:            /** Recognized features defaults. */
0215:            private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
0216:                    null, null, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE,
0217:                    Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE,
0218:                    Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE,
0219:                    Boolean.FALSE, Boolean.FALSE, };
0220:
0221:            // properties
0222:
0223:            /** Modify HTML element names: { "upper", "lower", "default" }. */
0224:            protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
0225:
0226:            /** Modify HTML attribute names: { "upper", "lower", "default" }. */
0227:            protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
0228:
0229:            /** Default encoding. */
0230:            protected static final String DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";
0231:
0232:            /** Error reporter. */
0233:            protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
0234:
0235:            /** Doctype declaration public identifier. */
0236:            protected static final String DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid";
0237:
0238:            /** Doctype declaration system identifier. */
0239:            protected static final String DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid";
0240:
0241:            /** Recognized properties. */
0242:            private static final String[] RECOGNIZED_PROPERTIES = {
0243:                    NAMES_ELEMS, NAMES_ATTRS, DEFAULT_ENCODING, ERROR_REPORTER,
0244:                    DOCTYPE_PUBID, DOCTYPE_SYSID, };
0245:
0246:            /** Recognized properties defaults. */
0247:            private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
0248:                    null, null, "Windows-1252", null,
0249:                    HTML_4_01_TRANSITIONAL_PUBID, HTML_4_01_TRANSITIONAL_SYSID, };
0250:
0251:            // states
0252:
0253:            /** State: content. */
0254:            protected static final short STATE_CONTENT = 0;
0255:
0256:            /** State: markup bracket. */
0257:            protected static final short STATE_MARKUP_BRACKET = 1;
0258:
0259:            /** State: start document. */
0260:            protected static final short STATE_START_DOCUMENT = 10;
0261:
0262:            /** State: end document. */
0263:            protected static final short STATE_END_DOCUMENT = 11;
0264:
0265:            // modify HTML names
0266:
0267:            /** Don't modify HTML names. */
0268:            protected static final short NAMES_NO_CHANGE = 0;
0269:
0270:            /** Uppercase HTML names. */
0271:            protected static final short NAMES_UPPERCASE = 1;
0272:
0273:            /** Lowercase HTML names. */
0274:            protected static final short NAMES_LOWERCASE = 2;
0275:
0276:            // defaults
0277:
0278:            /** Default buffer size. */
0279:            protected static final int DEFAULT_BUFFER_SIZE = 2048;
0280:
0281:            // debugging
0282:
0283:            /** Set to true to debug changes in the scanner. */
0284:            private static final boolean DEBUG_SCANNER = false;
0285:
0286:            /** Set to true to debug changes in the scanner state. */
0287:            private static final boolean DEBUG_SCANNER_STATE = false;
0288:
0289:            /** Set to true to debug the buffer. */
0290:            private static final boolean DEBUG_BUFFER = false;
0291:
0292:            /** Set to true to debug character encoding handling. */
0293:            private static final boolean DEBUG_CHARSET = false;
0294:
0295:            /** Set to true to debug callbacks. */
0296:            protected static final boolean DEBUG_CALLBACKS = false;
0297:
0298:            // static vars
0299:
0300:            /** Synthesized event info item. */
0301:            protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();
0302:
0303:            //
0304:            // Data
0305:            //
0306:
0307:            // features
0308:
0309:            /** Augmentations. */
0310:            protected boolean fAugmentations;
0311:
0312:            /** Report errors. */
0313:            protected boolean fReportErrors;
0314:
0315:            /** Notify character entity references. */
0316:            protected boolean fNotifyCharRefs;
0317:
0318:            /** Notify XML built-in general entity references. */
0319:            protected boolean fNotifyXmlBuiltinRefs;
0320:
0321:            /** Notify HTML built-in general entity references. */
0322:            protected boolean fNotifyHtmlBuiltinRefs;
0323:
0324:            /** Fix Microsoft Windows&reg; character entity references. */
0325:            protected boolean fFixWindowsCharRefs;
0326:
0327:            /** Strip CDATA delimiters from SCRIPT tags. */
0328:            protected boolean fScriptStripCDATADelims;
0329:
0330:            /** Strip comment delimiters from SCRIPT tags. */
0331:            protected boolean fScriptStripCommentDelims;
0332:
0333:            /** Strip CDATA delimiters from STYLE tags. */
0334:            protected boolean fStyleStripCDATADelims;
0335:
0336:            /** Strip comment delimiters from STYLE tags. */
0337:            protected boolean fStyleStripCommentDelims;
0338:
0339:            /** Ignore specified character set. */
0340:            protected boolean fIgnoreSpecifiedCharset;
0341:
0342:            /** CDATA sections. */
0343:            protected boolean fCDATASections;
0344:
0345:            /** Override doctype declaration public and system identifiers. */
0346:            protected boolean fOverrideDoctype;
0347:
0348:            /** Insert document type declaration. */
0349:            protected boolean fInsertDoctype;
0350:
0351:            /** Normalize attribute values. */
0352:            protected boolean fNormalizeAttributes;
0353:
0354:            // properties
0355:
0356:            /** Modify HTML element names. */
0357:            protected short fNamesElems;
0358:
0359:            /** Modify HTML attribute names. */
0360:            protected short fNamesAttrs;
0361:
0362:            /** Default encoding. */
0363:            protected String fDefaultIANAEncoding;
0364:
0365:            /** Error reporter. */
0366:            protected HTMLErrorReporter fErrorReporter;
0367:
0368:            /** Doctype declaration public identifier. */
0369:            protected String fDoctypePubid;
0370:
0371:            /** Doctype declaration system identifier. */
0372:            protected String fDoctypeSysid;
0373:
0374:            // boundary locator information
0375:
0376:            /** Beginning line number. */
0377:            protected int fBeginLineNumber;
0378:
0379:            /** Beginning column number. */
0380:            protected int fBeginColumnNumber;
0381:
0382:            /** Ending line number. */
0383:            protected int fEndLineNumber;
0384:
0385:            /** Ending column number. */
0386:            protected int fEndColumnNumber;
0387:
0388:            // state
0389:
0390:            /** The playback byte stream. */
0391:            protected PlaybackInputStream fByteStream;
0392:
0393:            /** Current entity. */
0394:            protected CurrentEntity fCurrentEntity;
0395:
0396:            /** The current entity stack. */
0397:            protected final Stack fCurrentEntityStack = new Stack();
0398:
0399:            /** The current scanner. */
0400:            protected Scanner fScanner;
0401:
0402:            /** The current scanner state. */
0403:            protected short fScannerState;
0404:
0405:            /** The document handler. */
0406:            protected XMLDocumentHandler fDocumentHandler;
0407:
0408:            /** Auto-detected IANA encoding. */
0409:            protected String fIANAEncoding;
0410:
0411:            /** Auto-detected Java encoding. */
0412:            protected String fJavaEncoding;
0413:
0414:            /** True if the encoding matches "ISO-8859-*". */
0415:            protected boolean fIso8859Encoding;
0416:
0417:            /** Element count. */
0418:            protected int fElementCount;
0419:
0420:            /** Element depth. */
0421:            protected int fElementDepth;
0422:
0423:            // scanners
0424:
0425:            /** Content scanner. */
0426:            protected Scanner fContentScanner = new ContentScanner();
0427:
0428:            /** 
0429:             * Special scanner used for elements whose content needs to be scanned 
0430:             * as plain text, ignoring markup such as elements and entity references.
0431:             * For example: &lt;SCRIPT&gt; and &lt;COMMENT&gt;.
0432:             */
0433:            protected SpecialScanner fSpecialScanner = new SpecialScanner();
0434:
0435:            // temp vars
0436:
0437:            /** String. */
0438:            protected final XMLString fString = new XMLString();
0439:
0440:            /** String buffer. */
0441:            protected final XMLStringBuffer fStringBuffer = new XMLStringBuffer(
0442:                    1024);
0443:
0444:            /** String buffer. */
0445:            private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer(
0446:                    1024);
0447:
0448:            /** Non-normalized attribute string buffer. */
0449:            private final XMLStringBuffer fNonNormAttr = new XMLStringBuffer(
0450:                    128);
0451:
0452:            /** Augmentations. */
0453:            private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
0454:
0455:            /** Location infoset item. */
0456:            private final LocationItem fLocationItem = new LocationItem();
0457:
0458:            /** Single boolean array. */
0459:            private final boolean[] fSingleBoolean = { false };
0460:
0461:            /** Resource identifier. */
0462:            private final XMLResourceIdentifierImpl fResourceId = new XMLResourceIdentifierImpl();
0463:
0464:            //
0465:            // Public methods
0466:            //
0467:
0468:            /** 
0469:             * Pushes an input source onto the current entity stack. This 
0470:             * enables the scanner to transparently scan new content (e.g. 
0471:             * the output written by an embedded script). At the end of the
0472:             * current entity, the scanner returns where it left off at the
0473:             * time this entity source was pushed.
0474:             * <p>
0475:             * <strong>Note:</strong>
0476:             * This functionality is experimental at this time and is
0477:             * subject to change in future releases of NekoHTML.
0478:             *
0479:             * @param inputSource The new input source to start scanning.
0480:             * @see #evaluateInputSource(XMLInputSource)
0481:             */
0482:            public void pushInputSource(XMLInputSource inputSource) {
0483:                final Reader reader = getReader(inputSource);
0484:
0485:                fCurrentEntityStack.push(fCurrentEntity);
0486:                String encoding = inputSource.getEncoding();
0487:                String publicId = inputSource.getPublicId();
0488:                String baseSystemId = inputSource.getBaseSystemId();
0489:                String literalSystemId = inputSource.getSystemId();
0490:                String expandedSystemId = expandSystemId(literalSystemId,
0491:                        baseSystemId);
0492:                fCurrentEntity = new CurrentEntity(reader, encoding, publicId,
0493:                        baseSystemId, literalSystemId, expandedSystemId);
0494:            } // pushInputSource(XMLInputSource)
0495:
0496:            private Reader getReader(final XMLInputSource inputSource) {
0497:                Reader reader = inputSource.getCharacterStream();
0498:                if (reader == null) {
0499:                    try {
0500:                        return new InputStreamReader(inputSource
0501:                                .getByteStream(), fJavaEncoding);
0502:                    } catch (final UnsupportedEncodingException e) {
0503:                        // should not happen as this encoding is already used to parse the "main" source
0504:                    }
0505:                }
0506:                return reader;
0507:            }
0508:
0509:            /** 
0510:             * Immediately evaluates an input source and add the new content (e.g. 
0511:             * the output written by an embedded script).
0512:             *
0513:             * @param inputSource The new input source to start evaluating.
0514:             * @see #pushInputSource(XMLInputSource)
0515:             */
0516:            public void evaluateInputSource(XMLInputSource inputSource) {
0517:                final Reader reader = getReader(inputSource);
0518:
0519:                String encoding = inputSource.getEncoding();
0520:                String publicId = inputSource.getPublicId();
0521:                String baseSystemId = inputSource.getBaseSystemId();
0522:                String literalSystemId = inputSource.getSystemId();
0523:                String expandedSystemId = expandSystemId(literalSystemId,
0524:                        baseSystemId);
0525:                fCurrentEntity = new CurrentEntity(reader, encoding, publicId,
0526:                        baseSystemId, literalSystemId, expandedSystemId);
0527:                setScanner(fContentScanner);
0528:                setScannerState(STATE_CONTENT);
0529:                try {
0530:                    fScanner.scan(false);
0531:                } catch (final IOException e) {
0532:                    // ignore
0533:                }
0534:            } // evaluateInputSource(XMLInputSource)
0535:
0536:            /**
0537:             * Cleans up used resources. For example, if scanning is terminated
0538:             * early, then this method ensures all remaining open streams are
0539:             * closed.
0540:             *
0541:             * @param closeall Close all streams, including the original.
0542:             *                 This is used in cases when the application has
0543:             *                 opened the original document stream and should
0544:             *                 be responsible for closing it.
0545:             */
0546:            public void cleanup(boolean closeall) {
0547:                int size = fCurrentEntityStack.size();
0548:                if (size > 0) {
0549:                    // current entity is not the original, so close it
0550:                    if (fCurrentEntity != null) {
0551:                        try {
0552:                            fCurrentEntity.stream.close();
0553:                        } catch (IOException e) {
0554:                            // ignore
0555:                        }
0556:                    }
0557:                    // close remaining streams
0558:                    for (int i = closeall ? 0 : 1; i < size; i++) {
0559:                        fCurrentEntity = (CurrentEntity) fCurrentEntityStack
0560:                                .pop();
0561:                        try {
0562:                            fCurrentEntity.stream.close();
0563:                        } catch (IOException e) {
0564:                            // ignore
0565:                        }
0566:                    }
0567:                } else if (closeall && fCurrentEntity != null) {
0568:                    try {
0569:                        fCurrentEntity.stream.close();
0570:                    } catch (IOException e) {
0571:                        // ignore
0572:                    }
0573:                }
0574:            } // cleanup(boolean)
0575:
0576:            //
0577:            // XMLLocator methods
0578:            //
0579:
0580:            /** Returns the encoding. */
0581:            public String getEncoding() {
0582:                return fCurrentEntity != null ? fCurrentEntity.encoding : null;
0583:            } // getEncoding():String
0584:
0585:            /** Returns the public identifier. */
0586:            public String getPublicId() {
0587:                return fCurrentEntity != null ? fCurrentEntity.publicId : null;
0588:            } // getPublicId():String
0589:
0590:            /** Returns the base system identifier. */
0591:            public String getBaseSystemId() {
0592:                return fCurrentEntity != null ? fCurrentEntity.baseSystemId
0593:                        : null;
0594:            } // getBaseSystemId():String
0595:
0596:            /** Returns the literal system identifier. */
0597:            public String getLiteralSystemId() {
0598:                return fCurrentEntity != null ? fCurrentEntity.literalSystemId
0599:                        : null;
0600:            } // getLiteralSystemId():String
0601:
0602:            /** Returns the expanded system identifier. */
0603:            public String getExpandedSystemId() {
0604:                return fCurrentEntity != null ? fCurrentEntity.expandedSystemId
0605:                        : null;
0606:            } // getExpandedSystemId():String
0607:
0608:            /** Returns the current line number. */
0609:            public int getLineNumber() {
0610:                return fCurrentEntity != null ? fCurrentEntity.lineNumber : -1;
0611:            } // getLineNumber():int
0612:
0613:            /** Returns the current column number. */
0614:            public int getColumnNumber() {
0615:                return fCurrentEntity != null ? fCurrentEntity.columnNumber
0616:                        : -1;
0617:            } // getColumnNumber():int
0618:
0619:            /** Returns the XML version. */
0620:            public String getXMLVersion() {
0621:                return fCurrentEntity != null ? fCurrentEntity.version : null;
0622:            } // getXMLVersion():String
0623:
0624:            /** Returns the character offset. */
0625:            public int getCharacterOffset() {
0626:                return fCurrentEntity != null ? fCurrentEntity.charOffset : -1;
0627:            } // getCharacterOffset():int
0628:
0629:            //
0630:            // HTMLComponent methods
0631:            //
0632:
0633:            /** Returns the default state for a feature. */
0634:            public Boolean getFeatureDefault(String featureId) {
0635:                int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length
0636:                        : 0;
0637:                for (int i = 0; i < length; i++) {
0638:                    if (RECOGNIZED_FEATURES[i].equals(featureId)) {
0639:                        return RECOGNIZED_FEATURES_DEFAULTS[i];
0640:                    }
0641:                }
0642:                return null;
0643:            } // getFeatureDefault(String):Boolean
0644:
0645:            /** Returns the default state for a property. */
0646:            public Object getPropertyDefault(String propertyId) {
0647:                int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length
0648:                        : 0;
0649:                for (int i = 0; i < length; i++) {
0650:                    if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
0651:                        return RECOGNIZED_PROPERTIES_DEFAULTS[i];
0652:                    }
0653:                }
0654:                return null;
0655:            } // getPropertyDefault(String):Object
0656:
0657:            //
0658:            // XMLComponent methods
0659:            //
0660:
0661:            /** Returns recognized features. */
0662:            public String[] getRecognizedFeatures() {
0663:                return RECOGNIZED_FEATURES;
0664:            } // getRecognizedFeatures():String[]
0665:
0666:            /** Returns recognized properties. */
0667:            public String[] getRecognizedProperties() {
0668:                return RECOGNIZED_PROPERTIES;
0669:            } // getRecognizedProperties():String[]
0670:
0671:            /** Resets the component. */
0672:            public void reset(XMLComponentManager manager)
0673:                    throws XMLConfigurationException {
0674:
0675:                // get features
0676:                fAugmentations = manager.getFeature(AUGMENTATIONS);
0677:                fReportErrors = manager.getFeature(REPORT_ERRORS);
0678:                fNotifyCharRefs = manager.getFeature(NOTIFY_CHAR_REFS);
0679:                fNotifyXmlBuiltinRefs = manager
0680:                        .getFeature(NOTIFY_XML_BUILTIN_REFS);
0681:                fNotifyHtmlBuiltinRefs = manager
0682:                        .getFeature(NOTIFY_HTML_BUILTIN_REFS);
0683:                fFixWindowsCharRefs = manager.getFeature(FIX_MSWINDOWS_REFS);
0684:                fScriptStripCDATADelims = manager
0685:                        .getFeature(SCRIPT_STRIP_CDATA_DELIMS);
0686:                fScriptStripCommentDelims = manager
0687:                        .getFeature(SCRIPT_STRIP_COMMENT_DELIMS);
0688:                fStyleStripCDATADelims = manager
0689:                        .getFeature(STYLE_STRIP_CDATA_DELIMS);
0690:                fStyleStripCommentDelims = manager
0691:                        .getFeature(STYLE_STRIP_COMMENT_DELIMS);
0692:                fIgnoreSpecifiedCharset = manager
0693:                        .getFeature(IGNORE_SPECIFIED_CHARSET);
0694:                fCDATASections = manager.getFeature(CDATA_SECTIONS);
0695:                fOverrideDoctype = manager.getFeature(OVERRIDE_DOCTYPE);
0696:                fInsertDoctype = manager.getFeature(INSERT_DOCTYPE);
0697:                fNormalizeAttributes = manager.getFeature(NORMALIZE_ATTRIBUTES);
0698:
0699:                // get properties
0700:                fNamesElems = getNamesValue(String.valueOf(manager
0701:                        .getProperty(NAMES_ELEMS)));
0702:                fNamesAttrs = getNamesValue(String.valueOf(manager
0703:                        .getProperty(NAMES_ATTRS)));
0704:                fDefaultIANAEncoding = String.valueOf(manager
0705:                        .getProperty(DEFAULT_ENCODING));
0706:                fErrorReporter = (HTMLErrorReporter) manager
0707:                        .getProperty(ERROR_REPORTER);
0708:                fDoctypePubid = String.valueOf(manager
0709:                        .getProperty(DOCTYPE_PUBID));
0710:                fDoctypeSysid = String.valueOf(manager
0711:                        .getProperty(DOCTYPE_SYSID));
0712:
0713:            } // reset(XMLComponentManager)
0714:
0715:            /** Sets a feature. */
0716:            public void setFeature(String featureId, boolean state)
0717:                    throws XMLConfigurationException {
0718:
0719:                if (featureId.equals(AUGMENTATIONS)) {
0720:                    fAugmentations = state;
0721:                } else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) {
0722:                    fIgnoreSpecifiedCharset = state;
0723:                } else if (featureId.equals(NOTIFY_CHAR_REFS)) {
0724:                    fNotifyCharRefs = state;
0725:                } else if (featureId.equals(NOTIFY_XML_BUILTIN_REFS)) {
0726:                    fNotifyXmlBuiltinRefs = state;
0727:                } else if (featureId.equals(NOTIFY_HTML_BUILTIN_REFS)) {
0728:                    fNotifyHtmlBuiltinRefs = state;
0729:                } else if (featureId.equals(FIX_MSWINDOWS_REFS)) {
0730:                    fFixWindowsCharRefs = state;
0731:                } else if (featureId.equals(SCRIPT_STRIP_CDATA_DELIMS)) {
0732:                    fScriptStripCDATADelims = state;
0733:                } else if (featureId.equals(SCRIPT_STRIP_COMMENT_DELIMS)) {
0734:                    fScriptStripCommentDelims = state;
0735:                } else if (featureId.equals(STYLE_STRIP_CDATA_DELIMS)) {
0736:                    fStyleStripCDATADelims = state;
0737:                } else if (featureId.equals(STYLE_STRIP_COMMENT_DELIMS)) {
0738:                    fStyleStripCommentDelims = state;
0739:                } else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) {
0740:                    fIgnoreSpecifiedCharset = state;
0741:                }
0742:
0743:            } // setFeature(String,boolean)
0744:
0745:            /** Sets a property. */
0746:            public void setProperty(String propertyId, Object value)
0747:                    throws XMLConfigurationException {
0748:
0749:                if (propertyId.equals(NAMES_ELEMS)) {
0750:                    fNamesElems = getNamesValue(String.valueOf(value));
0751:                    return;
0752:                }
0753:
0754:                if (propertyId.equals(NAMES_ATTRS)) {
0755:                    fNamesAttrs = getNamesValue(String.valueOf(value));
0756:                    return;
0757:                }
0758:
0759:                if (propertyId.equals(DEFAULT_ENCODING)) {
0760:                    fDefaultIANAEncoding = String.valueOf(value);
0761:                    return;
0762:                }
0763:
0764:            } // setProperty(String,Object)
0765:
0766:            //
0767:            // XMLDocumentScanner methods
0768:            //
0769:
0770:            /** Sets the input source. */
0771:            public void setInputSource(XMLInputSource source)
0772:                    throws IOException {
0773:
0774:                // reset state
0775:                fElementCount = 0;
0776:                fElementDepth = -1;
0777:                fByteStream = null;
0778:                fCurrentEntityStack.removeAllElements();
0779:
0780:                fBeginLineNumber = 1;
0781:                fBeginColumnNumber = 1;
0782:                fEndLineNumber = fBeginLineNumber;
0783:                fEndColumnNumber = fBeginColumnNumber;
0784:
0785:                // reset encoding information
0786:                fIANAEncoding = fDefaultIANAEncoding;
0787:                fJavaEncoding = fIANAEncoding;
0788:
0789:                // get location information
0790:                String encoding = source.getEncoding();
0791:                String publicId = source.getPublicId();
0792:                String baseSystemId = source.getBaseSystemId();
0793:                String literalSystemId = source.getSystemId();
0794:                String expandedSystemId = expandSystemId(literalSystemId,
0795:                        baseSystemId);
0796:
0797:                // open stream
0798:                Reader reader = source.getCharacterStream();
0799:                if (reader == null) {
0800:                    InputStream inputStream = source.getByteStream();
0801:                    if (inputStream == null) {
0802:                        URL url = new URL(expandedSystemId);
0803:                        inputStream = url.openStream();
0804:                    }
0805:                    fByteStream = new PlaybackInputStream(inputStream);
0806:                    String[] encodings = new String[2];
0807:                    if (encoding == null) {
0808:                        fByteStream.detectEncoding(encodings);
0809:                    } else {
0810:                        encodings[0] = encoding;
0811:                    }
0812:                    if (encodings[0] == null) {
0813:                        encodings[0] = fDefaultIANAEncoding;
0814:                        if (fReportErrors) {
0815:                            fErrorReporter.reportWarning("HTML1000", null);
0816:                        }
0817:                    }
0818:                    if (encodings[1] == null) {
0819:                        encodings[1] = EncodingMap
0820:                                .getIANA2JavaMapping(encodings[0].toUpperCase());
0821:                        if (encodings[1] == null) {
0822:                            encodings[1] = encodings[0];
0823:                            if (fReportErrors) {
0824:                                fErrorReporter.reportWarning("HTML1001",
0825:                                        new Object[] { encodings[0] });
0826:                            }
0827:                        }
0828:                    }
0829:                    fIANAEncoding = encodings[0];
0830:                    fJavaEncoding = encodings[1];
0831:                    /* PATCH: Asgeir Asgeirsson */
0832:                    fIso8859Encoding = fIANAEncoding == null
0833:                            || fIANAEncoding.toUpperCase().startsWith(
0834:                                    "ISO-8859")
0835:                            || fIANAEncoding
0836:                                    .equalsIgnoreCase(fDefaultIANAEncoding);
0837:                    encoding = fIANAEncoding;
0838:                    reader = new InputStreamReader(fByteStream, fJavaEncoding);
0839:                }
0840:                fCurrentEntity = new CurrentEntity(reader, encoding, publicId,
0841:                        baseSystemId, literalSystemId, expandedSystemId);
0842:
0843:                // set scanner and state
0844:                setScanner(fContentScanner);
0845:                setScannerState(STATE_START_DOCUMENT);
0846:
0847:            } // setInputSource(XMLInputSource)
0848:
0849:            /** Scans the document. */
0850:            public boolean scanDocument(boolean complete) throws XNIException,
0851:                    IOException {
0852:                do {
0853:                    if (!fScanner.scan(complete)) {
0854:                        return false;
0855:                    }
0856:                } while (complete);
0857:                return true;
0858:            } // scanDocument(boolean):boolean
0859:
0860:            /** Sets the document handler. */
0861:            public void setDocumentHandler(XMLDocumentHandler handler) {
0862:                fDocumentHandler = handler;
0863:            } // setDocumentHandler(XMLDocumentHandler)
0864:
0865:            // @since Xerces 2.1.0
0866:
0867:            /** Returns the document handler. */
0868:            public XMLDocumentHandler getDocumentHandler() {
0869:                return fDocumentHandler;
0870:            } // getDocumentHandler():XMLDocumentHandler
0871:
0872:            //
0873:            // Protected static methods
0874:            //
0875:
0876:            /** Returns the value of the specified attribute, ignoring case. */
0877:            protected static String getValue(XMLAttributes attrs, String aname) {
0878:                int length = attrs != null ? attrs.getLength() : 0;
0879:                for (int i = 0; i < length; i++) {
0880:                    if (attrs.getQName(i).equalsIgnoreCase(aname)) {
0881:                        return attrs.getValue(i);
0882:                    }
0883:                }
0884:                return null;
0885:            } // getValue(XMLAttributes,String):String
0886:
0887:            /**
0888:             * Expands a system id and returns the system id as a URI, if
0889:             * it can be expanded. A return value of null means that the
0890:             * identifier is already expanded. An exception thrown
0891:             * indicates a failure to expand the id.
0892:             *
0893:             * @param systemId The systemId to be expanded.
0894:             *
0895:             * @return Returns the URI string representing the expanded system
0896:             *         identifier. A null value indicates that the given
0897:             *         system identifier is already expanded.
0898:             *
0899:             */
0900:            public static String expandSystemId(String systemId,
0901:                    String baseSystemId) {
0902:
0903:                // check for bad parameters id
0904:                if (systemId == null || systemId.length() == 0) {
0905:                    return systemId;
0906:                }
0907:                // if id already expanded, return
0908:                try {
0909:                    URI uri = new URI(systemId);
0910:                    if (uri != null) {
0911:                        return systemId;
0912:                    }
0913:                } catch (URI.MalformedURIException e) {
0914:                    // continue on...
0915:                }
0916:                // normalize id
0917:                String id = fixURI(systemId);
0918:
0919:                // normalize base
0920:                URI base = null;
0921:                URI uri = null;
0922:                try {
0923:                    if (baseSystemId == null || baseSystemId.length() == 0
0924:                            || baseSystemId.equals(systemId)) {
0925:                        String dir;
0926:                        try {
0927:                            dir = fixURI(System.getProperty("user.dir"));
0928:                        } catch (SecurityException se) {
0929:                            dir = "";
0930:                        }
0931:                        if (!dir.endsWith("/")) {
0932:                            dir = dir + "/";
0933:                        }
0934:                        base = new URI("file", "", dir, null, null);
0935:                    } else {
0936:                        try {
0937:                            base = new URI(fixURI(baseSystemId));
0938:                        } catch (URI.MalformedURIException e) {
0939:                            String dir;
0940:                            try {
0941:                                dir = fixURI(System.getProperty("user.dir"));
0942:                            } catch (SecurityException se) {
0943:                                dir = "";
0944:                            }
0945:                            if (baseSystemId.indexOf(':') != -1) {
0946:                                // for xml schemas we might have baseURI with
0947:                                // a specified drive
0948:                                base = new URI("file", "",
0949:                                        fixURI(baseSystemId), null, null);
0950:                            } else {
0951:                                if (!dir.endsWith("/")) {
0952:                                    dir = dir + "/";
0953:                                }
0954:                                dir = dir + fixURI(baseSystemId);
0955:                                base = new URI("file", "", dir, null, null);
0956:                            }
0957:                        }
0958:                    }
0959:                    // expand id
0960:                    uri = new URI(base, id);
0961:                } catch (URI.MalformedURIException e) {
0962:                    // let it go through
0963:                }
0964:
0965:                if (uri == null) {
0966:                    return systemId;
0967:                }
0968:                return uri.toString();
0969:
0970:            } // expandSystemId(String,String):String
0971:
0972:            /**
0973:             * Fixes a platform dependent filename to standard URI form.
0974:             *
0975:             * @param str The string to fix.
0976:             *
0977:             * @return Returns the fixed URI string.
0978:             */
0979:            protected static String fixURI(String str) {
0980:
0981:                // handle platform dependent strings
0982:                str = str.replace(java.io.File.separatorChar, '/');
0983:
0984:                // Windows fix
0985:                if (str.length() >= 2) {
0986:                    char ch1 = str.charAt(1);
0987:                    // change "C:blah" to "/C:blah"
0988:                    if (ch1 == ':') {
0989:                        char ch0 = Character.toUpperCase(str.charAt(0));
0990:                        if (ch0 >= 'A' && ch0 <= 'Z') {
0991:                            str = "/" + str;
0992:                        }
0993:                    }
0994:                    // change "//blah" to "file://blah"
0995:                    else if (ch1 == '/' && str.charAt(0) == '/') {
0996:                        str = "file:" + str;
0997:                    }
0998:                }
0999:
1000:                // done
1001:                return str;
1002:
1003:            } // fixURI(String):String
1004:
1005:            /** Modifies the given name based on the specified mode. */
1006:            protected static final String modifyName(String name, short mode) {
1007:                switch (mode) {
1008:                case NAMES_UPPERCASE:
1009:                    return name.toUpperCase();
1010:                case NAMES_LOWERCASE:
1011:                    return name.toLowerCase();
1012:                }
1013:                return name;
1014:            } // modifyName(String,short):String
1015:
1016:            /**
1017:             * Converts HTML names string value to constant value. 
1018:             *
1019:             * @see #NAMES_NO_CHANGE
1020:             * @see #NAMES_LOWERCASE
1021:             * @see #NAMES_UPPERCASE
1022:             */
1023:            protected static final short getNamesValue(String value) {
1024:                if (value.equals("lower")) {
1025:                    return NAMES_LOWERCASE;
1026:                }
1027:                if (value.equals("upper")) {
1028:                    return NAMES_UPPERCASE;
1029:                }
1030:                return NAMES_NO_CHANGE;
1031:            } // getNamesValue(String):short
1032:
1033:            /**
1034:             * Fixes Microsoft Windows&reg; specific characters.
1035:             * <p>
1036:             * Details about this common problem can be found at 
1037:             * <a href='http://www.cs.tut.fi/~jkorpela/www/windows-chars.html'>http://www.cs.tut.fi/~jkorpela/www/windows-chars.html</a>
1038:             */
1039:            protected int fixWindowsCharacter(int origChar) {
1040:                /* PATCH: Asgeir Asgeirsson */
1041:                switch (origChar) {
1042:                case 130:
1043:                    return 8218;
1044:                case 131:
1045:                    return 402;
1046:                case 132:
1047:                    return 8222;
1048:                case 133:
1049:                    return 8230;
1050:                case 134:
1051:                    return 8224;
1052:                case 135:
1053:                    return 8225;
1054:                case 136:
1055:                    return 710;
1056:                case 137:
1057:                    return 8240;
1058:                case 138:
1059:                    return 352;
1060:                case 139:
1061:                    return 8249;
1062:                case 140:
1063:                    return 338;
1064:                case 145:
1065:                    return 8216;
1066:                case 146:
1067:                    return 8217;
1068:                case 147:
1069:                    return 8220;
1070:                case 148:
1071:                    return 8221;
1072:                case 149:
1073:                    return 8226;
1074:                case 150:
1075:                    return 8211;
1076:                case 151:
1077:                    return 8212;
1078:                case 152:
1079:                    return 732;
1080:                case 153:
1081:                    return 8482;
1082:                case 154:
1083:                    return 353;
1084:                case 155:
1085:                    return 8250;
1086:                case 156:
1087:                    return 339;
1088:                case 159:
1089:                    return 376;
1090:                }
1091:                return origChar;
1092:            } // fixWindowsCharacter(int):int
1093:
1094:            //
1095:            // Protected methods
1096:            //
1097:
1098:            // i/o
1099:
1100:            /** Reads a single character. */
1101:            protected int read() throws IOException {
1102:                if (DEBUG_BUFFER) {
1103:                    System.out.print("(read: ");
1104:                    printBuffer();
1105:                    System.out.println();
1106:                }
1107:                if (fCurrentEntity.offset == fCurrentEntity.length) {
1108:                    if (load(0) == -1) {
1109:                        if (DEBUG_BUFFER) {
1110:                            System.out.println(")read: -> -1");
1111:                        }
1112:                        return -1;
1113:                    }
1114:                }
1115:                int c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1116:                fCurrentEntity.columnNumber++;
1117:                if (DEBUG_BUFFER) {
1118:                    System.out.print(")read: ");
1119:                    printBuffer();
1120:                    System.out.print(" -> ");
1121:                    System.out.print(c);
1122:                    System.out.println();
1123:                }
1124:                return c;
1125:            } // read():int
1126:
1127:            /** 
1128:             * Loads a new chunk of data into the buffer and returns the number of
1129:             * characters loaded or -1 if no additional characters were loaded.
1130:             *
1131:             * @param offset The offset at which new characters should be loaded.
1132:             */
1133:            protected int load(int offset) throws IOException {
1134:                if (DEBUG_BUFFER) {
1135:                    System.out.print("(load: ");
1136:                    printBuffer();
1137:                    System.out.println();
1138:                }
1139:                // resize buffer, if needed
1140:                if (offset == fCurrentEntity.buffer.length) {
1141:                    int adjust = fCurrentEntity.buffer.length / 4;
1142:                    char[] array = new char[fCurrentEntity.buffer.length
1143:                            + adjust];
1144:                    System.arraycopy(fCurrentEntity.buffer, 0, array, 0,
1145:                            fCurrentEntity.length);
1146:                    fCurrentEntity.buffer = array;
1147:                }
1148:                // read a block of characters
1149:                int count = fCurrentEntity.stream.read(fCurrentEntity.buffer,
1150:                        offset, fCurrentEntity.buffer.length - offset);
1151:                fCurrentEntity.length = count != -1 ? count + offset : offset;
1152:                fCurrentEntity.offset = offset;
1153:                if (DEBUG_BUFFER) {
1154:                    System.out.print(")load: ");
1155:                    printBuffer();
1156:                    System.out.print(" -> ");
1157:                    System.out.print(count);
1158:                    System.out.println();
1159:                }
1160:                return count;
1161:            } // load():int
1162:
1163:            // debugging
1164:
1165:            /** Sets the scanner. */
1166:            protected void setScanner(Scanner scanner) {
1167:                fScanner = scanner;
1168:                if (DEBUG_SCANNER) {
1169:                    System.out.print("$$$ setScanner(");
1170:                    System.out.print(scanner != null ? scanner.getClass()
1171:                            .getName() : "null");
1172:                    System.out.println(");");
1173:                }
1174:            } // setScanner(Scanner)
1175:
1176:            /** Sets the scanner state. */
1177:            protected void setScannerState(short state) {
1178:                fScannerState = state;
1179:                if (DEBUG_SCANNER_STATE) {
1180:                    System.out.print("$$$ setScannerState(");
1181:                    switch (fScannerState) {
1182:                    case STATE_CONTENT: {
1183:                        System.out.print("STATE_CONTENT");
1184:                        break;
1185:                    }
1186:                    case STATE_MARKUP_BRACKET: {
1187:                        System.out.print("STATE_MARKUP_BRACKET");
1188:                        break;
1189:                    }
1190:                    case STATE_START_DOCUMENT: {
1191:                        System.out.print("STATE_START_DOCUMENT");
1192:                        break;
1193:                    }
1194:                    case STATE_END_DOCUMENT: {
1195:                        System.out.print("STATE_END_DOCUMENT");
1196:                        break;
1197:                    }
1198:                    }
1199:                    System.out.println(");");
1200:                }
1201:            } // setScannerState(short)
1202:
1203:            // scanning
1204:
1205:            /** Scans a DOCTYPE line. */
1206:            protected void scanDoctype() throws IOException {
1207:                String root = null;
1208:                String pubid = null;
1209:                String sysid = null;
1210:
1211:                if (skipSpaces()) {
1212:                    root = scanName();
1213:                    if (root == null) {
1214:                        if (fReportErrors) {
1215:                            fErrorReporter.reportError("HTML1014", null);
1216:                        }
1217:                    } else {
1218:                        root = modifyName(root, fNamesElems);
1219:                    }
1220:                    if (skipSpaces()) {
1221:                        if (skip("PUBLIC", false)) {
1222:                            skipSpaces();
1223:                            pubid = scanLiteral();
1224:                            if (skipSpaces()) {
1225:                                sysid = scanLiteral();
1226:                            }
1227:                        } else if (skip("SYSTEM", false)) {
1228:                            skipSpaces();
1229:                            sysid = scanLiteral();
1230:                        }
1231:                    }
1232:                }
1233:                int c;
1234:                while ((c = read()) != -1) {
1235:                    if (c == '<') {
1236:                        fCurrentEntity.offset--;
1237:                        fCurrentEntity.columnNumber--;
1238:                        break;
1239:                    }
1240:                    if (c == '>') {
1241:                        break;
1242:                    }
1243:                    if (c == '[') {
1244:                        skipMarkup(true);
1245:                        break;
1246:                    }
1247:                }
1248:
1249:                if (fDocumentHandler != null) {
1250:                    if (fOverrideDoctype) {
1251:                        pubid = fDoctypePubid;
1252:                        sysid = fDoctypeSysid;
1253:                    }
1254:                    fEndLineNumber = fCurrentEntity.lineNumber;
1255:                    fEndColumnNumber = fCurrentEntity.columnNumber;
1256:                    fDocumentHandler.doctypeDecl(root, pubid, sysid,
1257:                            locationAugs());
1258:                }
1259:
1260:            } // scanDoctype()
1261:
1262:            /** Scans a quoted literal. */
1263:            protected String scanLiteral() throws IOException {
1264:                int quote = read();
1265:                if (quote == '\'' || quote == '"') {
1266:                    StringBuffer str = new StringBuffer();
1267:                    int c;
1268:                    while ((c = read()) != -1) {
1269:                        if (c == quote) {
1270:                            break;
1271:                        }
1272:                        if (c == '\r' || c == '\n') {
1273:                            fCurrentEntity.offset--;
1274:                            fCurrentEntity.columnNumber--;
1275:                            // NOTE: This collapses newlines to a single space.
1276:                            //       [Q] Is this the right thing to do here? -Ac
1277:                            skipNewlines();
1278:                            str.append(' ');
1279:                        } else if (c == '<') {
1280:                            fCurrentEntity.offset--;
1281:                            fCurrentEntity.columnNumber--;
1282:                            break;
1283:                        } else {
1284:                            str.append((char) c);
1285:                        }
1286:                    }
1287:                    if (c == -1) {
1288:                        if (fReportErrors) {
1289:                            fErrorReporter.reportError("HTML1007", null);
1290:                        }
1291:                        throw new EOFException();
1292:                    }
1293:                    return str.toString();
1294:                } else {
1295:                    fCurrentEntity.offset--;
1296:                    fCurrentEntity.columnNumber--;
1297:                }
1298:                return null;
1299:            } // scanLiteral():String
1300:
1301:            /** Scans a name. */
1302:            protected String scanName() throws IOException {
1303:                if (DEBUG_BUFFER) {
1304:                    System.out.print("(scanName: ");
1305:                    printBuffer();
1306:                    System.out.println();
1307:                }
1308:                if (fCurrentEntity.offset == fCurrentEntity.length) {
1309:                    if (load(0) == -1) {
1310:                        if (DEBUG_BUFFER) {
1311:                            System.out.print(")scanName: ");
1312:                            printBuffer();
1313:                            System.out.println(" -> null");
1314:                        }
1315:                        return null;
1316:                    }
1317:                }
1318:                int offset = fCurrentEntity.offset;
1319:                while (true) {
1320:                    while (fCurrentEntity.offset < fCurrentEntity.length) {
1321:                        char c = fCurrentEntity.buffer[fCurrentEntity.offset];
1322:                        if (!Character.isLetterOrDigit(c)
1323:                                && !(c == '-' || c == '.' || c == ':' || c == '_')) {
1324:                            break;
1325:                        }
1326:                        fCurrentEntity.offset++;
1327:                        fCurrentEntity.columnNumber++;
1328:                    }
1329:                    if (fCurrentEntity.offset == fCurrentEntity.length) {
1330:                        int length = fCurrentEntity.length - offset;
1331:                        System.arraycopy(fCurrentEntity.buffer, offset,
1332:                                fCurrentEntity.buffer, 0, length);
1333:                        int count = load(length);
1334:                        offset = 0;
1335:                        if (count == -1) {
1336:                            break;
1337:                        }
1338:                    } else {
1339:                        break;
1340:                    }
1341:                }
1342:                int length = fCurrentEntity.offset - offset;
1343:                String name = length > 0 ? new String(fCurrentEntity.buffer,
1344:                        offset, length) : null;
1345:                if (DEBUG_BUFFER) {
1346:                    System.out.print(")scanName: ");
1347:                    printBuffer();
1348:                    System.out.print(" -> \"");
1349:                    System.out.print(name);
1350:                    System.out.println('"');
1351:                }
1352:                return name;
1353:            } // scanName():String
1354:
1355:            /** Scans an entity reference. */
1356:            protected int scanEntityRef(XMLStringBuffer str, boolean content)
1357:                    throws IOException {
1358:                str.clear();
1359:                str.append('&');
1360:                while (true) {
1361:                    int c = read();
1362:                    if (c == ';') {
1363:                        str.append(';');
1364:                        break;
1365:                    }
1366:                    if (c == -1) {
1367:                        if (fReportErrors) {
1368:                            fErrorReporter.reportWarning("HTML1004", null);
1369:                        }
1370:                        if (content && fDocumentHandler != null
1371:                                && fElementCount >= fElementDepth) {
1372:                            fEndLineNumber = fCurrentEntity.lineNumber;
1373:                            fEndColumnNumber = fCurrentEntity.columnNumber;
1374:                            fDocumentHandler.characters(str, locationAugs());
1375:                        }
1376:                        return -1;
1377:                    }
1378:                    if (!Character.isLetterOrDigit((char) c) && c != '#') {
1379:                        if (fReportErrors) {
1380:                            fErrorReporter.reportWarning("HTML1004", null);
1381:                        }
1382:                        fCurrentEntity.offset--;
1383:                        fCurrentEntity.columnNumber--;
1384:                        if (content && fDocumentHandler != null
1385:                                && fElementCount >= fElementDepth) {
1386:                            fEndLineNumber = fCurrentEntity.lineNumber;
1387:                            fEndColumnNumber = fCurrentEntity.columnNumber;
1388:                            fDocumentHandler.characters(str, locationAugs());
1389:                        }
1390:                        return -1;
1391:                    }
1392:                    str.append((char) c);
1393:                }
1394:                if (str.length == 1) {
1395:                    if (content && fDocumentHandler != null
1396:                            && fElementCount >= fElementDepth) {
1397:                        fEndLineNumber = fCurrentEntity.lineNumber;
1398:                        fEndColumnNumber = fCurrentEntity.columnNumber;
1399:                        fDocumentHandler.characters(str, locationAugs());
1400:                    }
1401:                    return -1;
1402:                }
1403:
1404:                String name = str.toString().substring(1, str.length - 1);
1405:                if (name.startsWith("#")) {
1406:                    int value = -1;
1407:                    try {
1408:                        if (name.startsWith("#x")) {
1409:                            value = Integer.parseInt(name.substring(2), 16);
1410:                        } else {
1411:                            value = Integer.parseInt(name.substring(1));
1412:                        }
1413:                        /* PATCH: Asgeir Asgeirsson */
1414:                        if (fFixWindowsCharRefs && fIso8859Encoding) {
1415:                            value = fixWindowsCharacter(value);
1416:                        }
1417:                        if (content && fDocumentHandler != null
1418:                                && fElementCount >= fElementDepth) {
1419:                            fEndLineNumber = fCurrentEntity.lineNumber;
1420:                            fEndColumnNumber = fCurrentEntity.columnNumber;
1421:                            if (fNotifyCharRefs) {
1422:                                XMLResourceIdentifier id = resourceId();
1423:                                String encoding = null;
1424:                                fDocumentHandler.startGeneralEntity(name, id,
1425:                                        encoding, locationAugs());
1426:                            }
1427:                            str.clear();
1428:                            str.append((char) value);
1429:                            fDocumentHandler.characters(str, locationAugs());
1430:                            if (fNotifyCharRefs) {
1431:                                fDocumentHandler.endGeneralEntity(name,
1432:                                        locationAugs());
1433:                            }
1434:                        }
1435:                    } catch (NumberFormatException e) {
1436:                        if (fReportErrors) {
1437:                            fErrorReporter.reportError("HTML1005",
1438:                                    new Object[] { name });
1439:                        }
1440:                        if (content && fDocumentHandler != null
1441:                                && fElementCount >= fElementDepth) {
1442:                            fEndLineNumber = fCurrentEntity.lineNumber;
1443:                            fEndColumnNumber = fCurrentEntity.columnNumber;
1444:                            fDocumentHandler.characters(str, locationAugs());
1445:                        }
1446:                    }
1447:                    return value;
1448:                }
1449:
1450:                int c = HTMLEntities.get(name);
1451:                if (c == -1) {
1452:                    if (fReportErrors) {
1453:                        fErrorReporter.reportWarning("HTML1006",
1454:                                new Object[] { name });
1455:                    }
1456:                    if (content && fDocumentHandler != null
1457:                            && fElementCount >= fElementDepth) {
1458:                        fEndLineNumber = fCurrentEntity.lineNumber;
1459:                        fEndColumnNumber = fCurrentEntity.columnNumber;
1460:                        fDocumentHandler.characters(str, locationAugs());
1461:                    }
1462:                    return -1;
1463:                }
1464:                if (content && fDocumentHandler != null
1465:                        && fElementCount >= fElementDepth) {
1466:                    fEndLineNumber = fCurrentEntity.lineNumber;
1467:                    fEndColumnNumber = fCurrentEntity.columnNumber;
1468:                    boolean notify = fNotifyHtmlBuiltinRefs
1469:                            || (fNotifyXmlBuiltinRefs && builtinXmlRef(name));
1470:                    if (notify) {
1471:                        XMLResourceIdentifier id = resourceId();
1472:                        String encoding = null;
1473:                        fDocumentHandler.startGeneralEntity(name, id, encoding,
1474:                                locationAugs());
1475:                    }
1476:                    str.clear();
1477:                    str.append((char) c);
1478:                    fDocumentHandler.characters(str, locationAugs());
1479:                    if (notify) {
1480:                        fDocumentHandler.endGeneralEntity(name, locationAugs());
1481:                    }
1482:                }
1483:                return c;
1484:
1485:            } // scanEntityRef(XMLStringBuffer,boolean):int
1486:
1487:            /** Returns true if the specified text is present and is skipped. */
1488:            protected boolean skip(String s, boolean caseSensitive)
1489:                    throws IOException {
1490:                int length = s != null ? s.length() : 0;
1491:                for (int i = 0; i < length; i++) {
1492:                    if (fCurrentEntity.offset == fCurrentEntity.length) {
1493:                        System.arraycopy(fCurrentEntity.buffer,
1494:                                fCurrentEntity.offset - i,
1495:                                fCurrentEntity.buffer, 0, i);
1496:                        if (load(i) == -1) {
1497:                            fCurrentEntity.offset = 0;
1498:                            return false;
1499:                        }
1500:                    }
1501:                    char c0 = s.charAt(i);
1502:                    char c1 = fCurrentEntity.buffer[fCurrentEntity.offset++];
1503:                    fCurrentEntity.columnNumber++;
1504:                    if (!caseSensitive) {
1505:                        c0 = Character.toUpperCase(c0);
1506:                        c1 = Character.toUpperCase(c1);
1507:                    }
1508:                    if (c0 != c1) {
1509:                        fCurrentEntity.offset -= i + 1;
1510:                        return false;
1511:                    }
1512:                }
1513:                return true;
1514:            } // skip(String):boolean
1515:
1516:            /** Skips markup. */
1517:            protected boolean skipMarkup(boolean balance) throws IOException {
1518:                if (DEBUG_BUFFER) {
1519:                    System.out.print("(skipMarkup: ");
1520:                    printBuffer();
1521:                    System.out.println();
1522:                }
1523:                int depth = 1;
1524:                boolean slashgt = false;
1525:                OUTER: while (true) {
1526:                    if (fCurrentEntity.offset == fCurrentEntity.length) {
1527:                        if (load(0) == -1) {
1528:                            break OUTER;
1529:                        }
1530:                    }
1531:                    while (fCurrentEntity.offset < fCurrentEntity.length) {
1532:                        char c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1533:                        fCurrentEntity.columnNumber++;
1534:                        if (balance && c == '<') {
1535:                            depth++;
1536:                        } else if (c == '>') {
1537:                            depth--;
1538:                            if (depth == 0) {
1539:                                break OUTER;
1540:                            }
1541:                        } else if (c == '/') {
1542:                            if (fCurrentEntity.offset == fCurrentEntity.length) {
1543:                                if (load(0) == -1) {
1544:                                    break OUTER;
1545:                                }
1546:                            }
1547:                            c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1548:                            fCurrentEntity.columnNumber++;
1549:                            if (c == '>') {
1550:                                slashgt = true;
1551:                                depth--;
1552:                                if (depth == 0) {
1553:                                    break OUTER;
1554:                                }
1555:                            } else {
1556:                                fCurrentEntity.offset--;
1557:                                fCurrentEntity.columnNumber--;
1558:                            }
1559:                        } else if (c == '\r' || c == '\n') {
1560:                            skipNewlines();
1561:                        }
1562:                    }
1563:                }
1564:                if (DEBUG_BUFFER) {
1565:                    System.out.print(")skipMarkup: ");
1566:                    printBuffer();
1567:                    System.out.print(" -> " + slashgt);
1568:                    System.out.println();
1569:                }
1570:                return slashgt;
1571:            } // skipMarkup():boolean
1572:
1573:            /** Skips whitespace. */
1574:            protected boolean skipSpaces() throws IOException {
1575:                if (DEBUG_BUFFER) {
1576:                    System.out.print("(skipSpaces: ");
1577:                    printBuffer();
1578:                    System.out.println();
1579:                }
1580:                boolean spaces = false;
1581:                while (true) {
1582:                    if (fCurrentEntity.offset == fCurrentEntity.length) {
1583:                        if (load(0) == -1) {
1584:                            break;
1585:                        }
1586:                    }
1587:                    char c = fCurrentEntity.buffer[fCurrentEntity.offset];
1588:                    if (!Character.isSpace(c)) {
1589:                        break;
1590:                    }
1591:                    spaces = true;
1592:                    if (c == '\r' || c == '\n') {
1593:                        skipNewlines();
1594:                        continue;
1595:                    }
1596:                    fCurrentEntity.offset++;
1597:                    fCurrentEntity.columnNumber++;
1598:                }
1599:                if (DEBUG_BUFFER) {
1600:                    System.out.print(")skipSpaces: ");
1601:                    printBuffer();
1602:                    System.out.print(" -> ");
1603:                    System.out.print(spaces);
1604:                    System.out.println();
1605:                }
1606:                return spaces;
1607:            } // skipSpaces()
1608:
1609:            /** Skips newlines and returns the number of newlines skipped. */
1610:            protected int skipNewlines() throws IOException {
1611:                return skipNewlines(Integer.MAX_VALUE);
1612:            } // skipNewlines():int
1613:
1614:            /** Skips newlines and returns the number of newlines skipped. */
1615:            protected int skipNewlines(int maxlines) throws IOException {
1616:                if (DEBUG_BUFFER) {
1617:                    System.out.print("(skipNewlines: ");
1618:                    printBuffer();
1619:                    System.out.println();
1620:                }
1621:                if (fCurrentEntity.offset == fCurrentEntity.length) {
1622:                    if (load(0) == -1) {
1623:                        if (DEBUG_BUFFER) {
1624:                            System.out.print(")skipNewlines: ");
1625:                            printBuffer();
1626:                            System.out.println();
1627:                        }
1628:                        return 0;
1629:                    }
1630:                }
1631:                char c = fCurrentEntity.buffer[fCurrentEntity.offset];
1632:                int newlines = 0;
1633:                int offset = fCurrentEntity.offset;
1634:                if (c == '\n' || c == '\r') {
1635:                    do {
1636:                        c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1637:                        if (c == '\r') {
1638:                            newlines++;
1639:                            if (fCurrentEntity.offset == fCurrentEntity.length) {
1640:                                offset = 0;
1641:                                fCurrentEntity.offset = newlines;
1642:                                if (load(newlines) == -1) {
1643:                                    break;
1644:                                }
1645:                            }
1646:                            if (fCurrentEntity.buffer[fCurrentEntity.offset] == '\n') {
1647:                                fCurrentEntity.offset++;
1648:                                offset++;
1649:                            }
1650:                        } else if (c == '\n') {
1651:                            newlines++;
1652:                            if (fCurrentEntity.offset == fCurrentEntity.length) {
1653:                                offset = 0;
1654:                                fCurrentEntity.offset = newlines;
1655:                                if (load(newlines) == -1) {
1656:                                    break;
1657:                                }
1658:                            }
1659:                        } else {
1660:                            fCurrentEntity.offset--;
1661:                            break;
1662:                        }
1663:                    } while (newlines < maxlines
1664:                            && fCurrentEntity.offset < fCurrentEntity.length - 1);
1665:                    fCurrentEntity.lineNumber += newlines;
1666:                    fCurrentEntity.columnNumber = 1;
1667:                }
1668:                if (DEBUG_BUFFER) {
1669:                    System.out.print(")skipNewlines: ");
1670:                    printBuffer();
1671:                    System.out.print(" -> ");
1672:                    System.out.print(newlines);
1673:                    System.out.println();
1674:                }
1675:                return newlines;
1676:            } // skipNewlines(int):int
1677:
1678:            // infoset utility methods
1679:
1680:            /** Returns an augmentations object with a location item added. */
1681:            protected final Augmentations locationAugs() {
1682:                HTMLAugmentations augs = null;
1683:                if (fAugmentations) {
1684:                    fLocationItem.setValues(fBeginLineNumber,
1685:                            fBeginColumnNumber, fEndLineNumber,
1686:                            fEndColumnNumber);
1687:                    augs = fInfosetAugs;
1688:                    augs.removeAllItems();
1689:                    augs.putItem(AUGMENTATIONS, fLocationItem);
1690:                }
1691:                return augs;
1692:            } // locationAugs():Augmentations
1693:
1694:            /** Returns an augmentations object with a synthesized item added. */
1695:            protected final Augmentations synthesizedAugs() {
1696:                HTMLAugmentations augs = null;
1697:                if (fAugmentations) {
1698:                    augs = fInfosetAugs;
1699:                    augs.removeAllItems();
1700:                    augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
1701:                }
1702:                return augs;
1703:            } // synthesizedAugs():Augmentations
1704:
1705:            /** Returns an empty resource identifier. */
1706:            protected final XMLResourceIdentifier resourceId() {
1707:                /***/
1708:                fResourceId.clear();
1709:                return fResourceId;
1710:                /***
1711:                // NOTE: Unfortunately, the Xerces DOM parser classes expect a
1712:                //       non-null resource identifier object to be passed to
1713:                //       startGeneralEntity. -Ac
1714:                return null;
1715:                /***/
1716:            } // resourceId():XMLResourceIdentifier
1717:
1718:            //
1719:            // Protected static methods
1720:            //
1721:
1722:            /** Returns true if the name is a built-in XML general entity reference. */
1723:            protected static boolean builtinXmlRef(String name) {
1724:                return name.equals("amp") || name.equals("lt")
1725:                        || name.equals("gt") || name.equals("quot")
1726:                        || name.equals("apos");
1727:            } // builtinXmlRef(String):boolean
1728:
1729:            //
1730:            // Private methods
1731:            //
1732:
1733:            /** Prints the contents of the character buffer to standard out. */
1734:            private void printBuffer() {
1735:                if (DEBUG_BUFFER) {
1736:                    System.out.print('[');
1737:                    System.out.print(fCurrentEntity.length);
1738:                    System.out.print(' ');
1739:                    System.out.print(fCurrentEntity.offset);
1740:                    if (fCurrentEntity.length > 0) {
1741:                        System.out.print(" \"");
1742:                        for (int i = 0; i < fCurrentEntity.length; i++) {
1743:                            if (i == fCurrentEntity.offset) {
1744:                                System.out.print('^');
1745:                            }
1746:                            char c = fCurrentEntity.buffer[i];
1747:                            switch (c) {
1748:                            case '\r': {
1749:                                System.out.print("\\r");
1750:                                break;
1751:                            }
1752:                            case '\n': {
1753:                                System.out.print("\\n");
1754:                                break;
1755:                            }
1756:                            case '\t': {
1757:                                System.out.print("\\t");
1758:                                break;
1759:                            }
1760:                            case '"': {
1761:                                System.out.print("\\\"");
1762:                                break;
1763:                            }
1764:                            default: {
1765:                                System.out.print(c);
1766:                            }
1767:                            }
1768:                        }
1769:                        if (fCurrentEntity.offset == fCurrentEntity.length) {
1770:                            System.out.print('^');
1771:                        }
1772:                        System.out.print('"');
1773:                    }
1774:                    System.out.print(']');
1775:                }
1776:            } // printBuffer()
1777:
1778:            //
1779:            // Interfaces
1780:            //
1781:
1782:            /**
1783:             * Basic scanner interface.
1784:             *
1785:             * @author Andy Clark
1786:             */
1787:            public interface Scanner {
1788:
1789:                //
1790:                // Scanner methods
1791:                //
1792:
1793:                /** 
1794:                 * Scans part of the document. This interface allows scanning to
1795:                 * be performed in a pulling manner.
1796:                 *
1797:                 * @param complete True if the scanner should not return until
1798:                 *                 scanning is complete.
1799:                 *
1800:                 * @return True if additional scanning is required.
1801:                 *
1802:                 * @throws IOException Thrown if I/O error occurs.
1803:                 */
1804:                public boolean scan(boolean complete) throws IOException;
1805:
1806:            } // interface Scanner
1807:
1808:            //
1809:            // Classes
1810:            //
1811:
1812:            /**
1813:             * Current entity.
1814:             *
1815:             * @author Andy Clark
1816:             */
1817:            public static class CurrentEntity {
1818:
1819:                //
1820:                // Data
1821:                //
1822:
1823:                /** Character stream. */
1824:                public Reader stream;
1825:
1826:                /** Encoding. */
1827:                public String encoding;
1828:
1829:                /** Public identifier. */
1830:                public String publicId;
1831:
1832:                /** Base system identifier. */
1833:                public String baseSystemId;
1834:
1835:                /** Literal system identifier. */
1836:                public String literalSystemId;
1837:
1838:                /** Expanded system identifier. */
1839:                public String expandedSystemId;
1840:
1841:                /** XML version. */
1842:                public String version = "1.0";
1843:
1844:                /** Line number. */
1845:                public int lineNumber = 1;
1846:
1847:                /** Column number. */
1848:                public int columnNumber = 1;
1849:
1850:                /** Character offset. */
1851:                public int charOffset = -1;
1852:
1853:                // buffer
1854:
1855:                /** Character buffer. */
1856:                public char[] buffer = new char[DEFAULT_BUFFER_SIZE];
1857:
1858:                /** Offset into character buffer. */
1859:                public int offset = 0;
1860:
1861:                /** Length of characters read into character buffer. */
1862:                public int length = 0;
1863:
1864:                //
1865:                // Constructors
1866:                //
1867:
1868:                /** Constructs an entity from the specified stream. */
1869:                public CurrentEntity(Reader stream, String encoding,
1870:                        String publicId, String baseSystemId,
1871:                        String literalSystemId, String expandedSystemId) {
1872:                    this .stream = stream;
1873:                    this .encoding = encoding;
1874:                    this .publicId = publicId;
1875:                    this .baseSystemId = baseSystemId;
1876:                    this .literalSystemId = literalSystemId;
1877:                    this .expandedSystemId = expandedSystemId;
1878:                } // <init>(Reader,String,String,String,String)
1879:
1880:            } // class CurrentEntity
1881:
1882:            /**
1883:             * The primary HTML document scanner.
1884:             *
1885:             * @author Andy Clark
1886:             */
1887:            public class ContentScanner implements  Scanner {
1888:
1889:                //
1890:                // Data
1891:                //
1892:
1893:                // temp vars
1894:
1895:                /** A qualified name. */
1896:                private final QName fQName = new QName();
1897:
1898:                /** Attributes. */
1899:                private final XMLAttributesImpl fAttributes = new XMLAttributesImpl();
1900:
1901:                //
1902:                // Scanner methods
1903:                //
1904:
1905:                /** Scan. */
1906:                public boolean scan(boolean complete) throws IOException {
1907:                    boolean next;
1908:                    do {
1909:                        try {
1910:                            next = false;
1911:                            switch (fScannerState) {
1912:                            case STATE_CONTENT: {
1913:                                fBeginLineNumber = fCurrentEntity.lineNumber;
1914:                                fBeginColumnNumber = fCurrentEntity.columnNumber;
1915:                                int c = read();
1916:                                if (c == '<') {
1917:                                    setScannerState(STATE_MARKUP_BRACKET);
1918:                                    next = true;
1919:                                } else if (c == '&') {
1920:                                    scanEntityRef(fStringBuffer, true);
1921:                                } else if (c == -1) {
1922:                                    throw new EOFException();
1923:                                } else {
1924:                                    fCurrentEntity.offset--;
1925:                                    fCurrentEntity.columnNumber--;
1926:                                    scanCharacters();
1927:                                }
1928:                                break;
1929:                            }
1930:                            case STATE_MARKUP_BRACKET: {
1931:                                int c = read();
1932:                                if (c == '!') {
1933:                                    if (skip("--", false)) {
1934:                                        scanComment();
1935:                                    } else if (skip("[CDATA[", false)) {
1936:                                        scanCDATA();
1937:                                    } else if (skip("DOCTYPE", false)) {
1938:                                        scanDoctype();
1939:                                    } else {
1940:                                        if (fReportErrors) {
1941:                                            fErrorReporter.reportError(
1942:                                                    "HTML1002", null);
1943:                                        }
1944:                                        skipMarkup(true);
1945:                                    }
1946:                                } else if (c == '?') {
1947:                                    scanPI();
1948:                                } else if (c == '/') {
1949:                                    scanEndElement();
1950:                                } else if (c == -1) {
1951:                                    if (fReportErrors) {
1952:                                        fErrorReporter.reportError("HTML1003",
1953:                                                null);
1954:                                    }
1955:                                    if (fDocumentHandler != null
1956:                                            && fElementCount >= fElementDepth) {
1957:                                        fStringBuffer.clear();
1958:                                        fStringBuffer.append('<');
1959:                                        fDocumentHandler.characters(
1960:                                                fStringBuffer, null);
1961:                                    }
1962:                                    throw new EOFException();
1963:                                } else {
1964:                                    fCurrentEntity.offset--;
1965:                                    fCurrentEntity.columnNumber--;
1966:                                    fElementCount++;
1967:                                    fSingleBoolean[0] = false;
1968:                                    String ename = scanStartElement(fSingleBoolean);
1969:                                    if (ename != null
1970:                                            && !fSingleBoolean[0]
1971:                                            && HTMLElements.getElement(ename)
1972:                                                    .isSpecial()) {
1973:                                        setScanner(fSpecialScanner
1974:                                                .setElementName(ename));
1975:                                        setScannerState(STATE_CONTENT);
1976:                                        return true;
1977:                                    }
1978:                                }
1979:                                setScannerState(STATE_CONTENT);
1980:                                break;
1981:                            }
1982:                            case STATE_START_DOCUMENT: {
1983:                                if (fDocumentHandler != null
1984:                                        && fElementCount >= fElementDepth) {
1985:                                    if (DEBUG_CALLBACKS) {
1986:                                        System.out.println("startDocument()");
1987:                                    }
1988:                                    XMLLocator locator = HTMLScanner.this ;
1989:                                    String encoding = fIANAEncoding;
1990:                                    Augmentations augs = locationAugs();
1991:                                    try {
1992:                                        // NOTE: Hack to allow the default filter to work with
1993:                                        //       old and new versions of the XNI document handler
1994:                                        //       interface. -Ac
1995:                                        Class cls = fDocumentHandler.getClass();
1996:                                        Class[] types = { XMLLocator.class,
1997:                                                String.class,
1998:                                                NamespaceContext.class,
1999:                                                Augmentations.class };
2000:                                        Method method = cls.getMethod(
2001:                                                "startDocument", types);
2002:                                        NamespaceContext nscontext = new NamespaceSupport();
2003:                                        Object[] params = { locator, encoding,
2004:                                                nscontext, augs };
2005:                                        method.invoke(fDocumentHandler, params);
2006:                                    } catch (IllegalAccessException e) {
2007:                                        throw new XNIException(e);
2008:                                    } catch (InvocationTargetException e) {
2009:                                        throw new XNIException(e);
2010:                                    } catch (NoSuchMethodException e) {
2011:                                        try {
2012:                                            // NOTE: Hack to allow the default filter to work with
2013:                                            //       old and new versions of the XNI document handler
2014:                                            //       interface. -Ac
2015:                                            Class cls = fDocumentHandler
2016:                                                    .getClass();
2017:                                            Class[] types = { XMLLocator.class,
2018:                                                    String.class,
2019:                                                    Augmentations.class };
2020:                                            Method method = cls.getMethod(
2021:                                                    "startDocument", types);
2022:                                            Object[] params = { locator,
2023:                                                    encoding, augs };
2024:                                            method.invoke(fDocumentHandler,
2025:                                                    params);
2026:                                        } catch (IllegalAccessException ex) {
2027:                                            // NOTE: Should never reach here!
2028:                                            throw new XNIException(ex);
2029:                                        } catch (InvocationTargetException ex) {
2030:                                            // NOTE: Should never reach here!
2031:                                            throw new XNIException(ex);
2032:                                        } catch (NoSuchMethodException ex) {
2033:                                            // NOTE: Should never reach here!
2034:                                            throw new XNIException(ex);
2035:                                        }
2036:                                    }
2037:                                }
2038:                                if (fInsertDoctype && fDocumentHandler != null) {
2039:                                    String root = HTMLElements
2040:                                            .getElement(HTMLElements.HTML).name;
2041:                                    root = modifyName(root, fNamesElems);
2042:                                    String pubid = fDoctypePubid;
2043:                                    String sysid = fDoctypeSysid;
2044:                                    fDocumentHandler.doctypeDecl(root, pubid,
2045:                                            sysid, synthesizedAugs());
2046:                                }
2047:                                setScannerState(STATE_CONTENT);
2048:                                break;
2049:                            }
2050:                            case STATE_END_DOCUMENT: {
2051:                                if (fDocumentHandler != null
2052:                                        && fElementCount >= fElementDepth) {
2053:                                    if (DEBUG_CALLBACKS) {
2054:                                        System.out.println("endDocument()");
2055:                                    }
2056:                                    fEndLineNumber = fCurrentEntity.lineNumber;
2057:                                    fEndColumnNumber = fCurrentEntity.columnNumber;
2058:                                    fDocumentHandler
2059:                                            .endDocument(locationAugs());
2060:                                }
2061:                                return false;
2062:                            }
2063:                            default: {
2064:                                throw new RuntimeException(
2065:                                        "unknown scanner state: "
2066:                                                + fScannerState);
2067:                            }
2068:                            }
2069:                        } catch (EOFException e) {
2070:                            if (fCurrentEntityStack.empty()) {
2071:                                setScannerState(STATE_END_DOCUMENT);
2072:                            } else {
2073:                                fCurrentEntity = (CurrentEntity) fCurrentEntityStack
2074:                                        .pop();
2075:                            }
2076:                            next = true;
2077:                        }
2078:                    } while (next || complete);
2079:                    return true;
2080:                } // scan(boolean):boolean
2081:
2082:                //
2083:                // Protected methods
2084:                //
2085:
2086:                /** Scans characters. */
2087:                protected void scanCharacters() throws IOException {
2088:                    if (DEBUG_BUFFER) {
2089:                        System.out.print("(scanCharacters: ");
2090:                        printBuffer();
2091:                        System.out.println();
2092:                    }
2093:                    int newlines = skipNewlines();
2094:                    if (newlines == 0
2095:                            && fCurrentEntity.offset == fCurrentEntity.length) {
2096:                        if (DEBUG_BUFFER) {
2097:                            System.out.print(")scanCharacters: ");
2098:                            printBuffer();
2099:                            System.out.println();
2100:                        }
2101:                        return;
2102:                    }
2103:                    char c;
2104:                    int offset = fCurrentEntity.offset - newlines;
2105:                    for (int i = offset; i < fCurrentEntity.offset; i++) {
2106:                        fCurrentEntity.buffer[i] = '\n';
2107:                    }
2108:                    while (fCurrentEntity.offset < fCurrentEntity.length) {
2109:                        c = fCurrentEntity.buffer[fCurrentEntity.offset];
2110:                        if (c == '<' || c == '&' || c == '\n' || c == '\r') {
2111:                            break;
2112:                        }
2113:                        fCurrentEntity.offset++;
2114:                        fCurrentEntity.columnNumber++;
2115:                    }
2116:                    if (fCurrentEntity.offset > offset
2117:                            && fDocumentHandler != null
2118:                            && fElementCount >= fElementDepth) {
2119:                        fString.setValues(fCurrentEntity.buffer, offset,
2120:                                fCurrentEntity.offset - offset);
2121:                        if (DEBUG_CALLBACKS) {
2122:                            System.out.println("characters(" + fString + ")");
2123:                        }
2124:                        fEndLineNumber = fCurrentEntity.lineNumber;
2125:                        fEndColumnNumber = fCurrentEntity.columnNumber;
2126:                        fDocumentHandler.characters(fString, locationAugs());
2127:                    }
2128:                    if (DEBUG_BUFFER) {
2129:                        System.out.print(")scanCharacters: ");
2130:                        printBuffer();
2131:                        System.out.println();
2132:                    }
2133:                } // scanCharacters()
2134:
2135:                /** Scans a CDATA section. */
2136:                protected void scanCDATA() throws IOException {
2137:                    if (DEBUG_BUFFER) {
2138:                        System.out.print("(scanCDATA: ");
2139:                        printBuffer();
2140:                        System.out.println();
2141:                    }
2142:                    fStringBuffer.clear();
2143:                    if (fCDATASections) {
2144:                        if (fDocumentHandler != null
2145:                                && fElementCount >= fElementDepth) {
2146:                            fEndLineNumber = fCurrentEntity.lineNumber;
2147:                            fEndColumnNumber = fCurrentEntity.columnNumber;
2148:                            if (DEBUG_CALLBACKS) {
2149:                                System.out.println("startCDATA()");
2150:                            }
2151:                            fDocumentHandler.startCDATA(locationAugs());
2152:                        }
2153:                    } else {
2154:                        fStringBuffer.append("[CDATA[");
2155:                    }
2156:                    boolean eof = scanMarkupContent(fStringBuffer, ']');
2157:                    if (!fCDATASections) {
2158:                        fStringBuffer.append("]]");
2159:                    }
2160:                    if (fDocumentHandler != null
2161:                            && fElementCount >= fElementDepth) {
2162:                        fEndLineNumber = fCurrentEntity.lineNumber;
2163:                        fEndColumnNumber = fCurrentEntity.columnNumber;
2164:                        if (fCDATASections) {
2165:                            if (DEBUG_CALLBACKS) {
2166:                                System.out.println("characters("
2167:                                        + fStringBuffer + ")");
2168:                            }
2169:                            fDocumentHandler.characters(fStringBuffer,
2170:                                    locationAugs());
2171:                            if (DEBUG_CALLBACKS) {
2172:                                System.out.println("endCDATA()");
2173:                            }
2174:                            fDocumentHandler.endCDATA(locationAugs());
2175:                        } else {
2176:                            if (DEBUG_CALLBACKS) {
2177:                                System.out.println("comment(" + fStringBuffer
2178:                                        + ")");
2179:                            }
2180:                            fDocumentHandler.comment(fStringBuffer,
2181:                                    locationAugs());
2182:                        }
2183:                    }
2184:                    if (DEBUG_BUFFER) {
2185:                        System.out.print(")scanCDATA: ");
2186:                        printBuffer();
2187:                        System.out.println();
2188:                    }
2189:                    if (eof) {
2190:                        throw new EOFException();
2191:                    }
2192:                } // scanCDATA()
2193:
2194:                /** Scans a comment. */
2195:                protected void scanComment() throws IOException {
2196:                    if (DEBUG_BUFFER) {
2197:                        System.out.print("(scanComment: ");
2198:                        printBuffer();
2199:                        System.out.println();
2200:                    }
2201:                    fStringBuffer.clear();
2202:                    boolean eof = scanMarkupContent(fStringBuffer, '-');
2203:                    if (fDocumentHandler != null
2204:                            && fElementCount >= fElementDepth) {
2205:                        if (DEBUG_CALLBACKS) {
2206:                            System.out
2207:                                    .println("comment(" + fStringBuffer + ")");
2208:                        }
2209:                        fEndLineNumber = fCurrentEntity.lineNumber;
2210:                        fEndColumnNumber = fCurrentEntity.columnNumber;
2211:                        fDocumentHandler.comment(fStringBuffer, locationAugs());
2212:                    }
2213:                    if (DEBUG_BUFFER) {
2214:                        System.out.print(")scanComment: ");
2215:                        printBuffer();
2216:                        System.out.println();
2217:                    }
2218:                    if (eof) {
2219:                        throw new EOFException();
2220:                    }
2221:                } // scanComment()
2222:
2223:                /** Scans markup content. */
2224:                protected boolean scanMarkupContent(XMLStringBuffer buffer,
2225:                        char cend) throws IOException {
2226:                    int c = -1;
2227:                    OUTER: while (true) {
2228:                        c = read();
2229:                        if (c == cend) {
2230:                            int count = 1;
2231:                            while (true) {
2232:                                c = read();
2233:                                if (c == cend) {
2234:                                    count++;
2235:                                    continue;
2236:                                }
2237:                                break;
2238:                            }
2239:                            if (c == -1) {
2240:                                if (fReportErrors) {
2241:                                    fErrorReporter
2242:                                            .reportError("HTML1007", null);
2243:                                }
2244:                                break OUTER;
2245:                            }
2246:                            if (count < 2) {
2247:                                buffer.append(cend);
2248:                                //if (c != -1) {
2249:                                fCurrentEntity.offset--;
2250:                                fCurrentEntity.columnNumber--;
2251:                                //}
2252:                                continue;
2253:                            }
2254:                            if (c != '>') {
2255:                                for (int i = 0; i < count; i++) {
2256:                                    buffer.append(cend);
2257:                                }
2258:                                fCurrentEntity.offset--;
2259:                                fCurrentEntity.columnNumber--;
2260:                                continue;
2261:                            }
2262:                            for (int i = 0; i < count - 2; i++) {
2263:                                buffer.append(cend);
2264:                            }
2265:                            break;
2266:                        } else if (c == '\n' || c == '\r') {
2267:                            fCurrentEntity.offset--;
2268:                            fCurrentEntity.columnNumber--;
2269:                            int newlines = skipNewlines();
2270:                            for (int i = 0; i < newlines; i++) {
2271:                                buffer.append('\n');
2272:                            }
2273:                            continue;
2274:                        } else if (c == -1) {
2275:                            if (fReportErrors) {
2276:                                fErrorReporter.reportError("HTML1007", null);
2277:                            }
2278:                            break;
2279:                        }
2280:                        buffer.append((char) c);
2281:                    }
2282:                    return c == -1;
2283:                } // scanMarkupContent(XMLStringBuffer,char):boolean
2284:
2285:                /** Scans a processing instruction. */
2286:                protected void scanPI() throws IOException {
2287:                    if (DEBUG_BUFFER) {
2288:                        System.out.print("(scanPI: ");
2289:                        printBuffer();
2290:                        System.out.println();
2291:                    }
2292:                    if (fReportErrors) {
2293:                        fErrorReporter.reportWarning("HTML1008", null);
2294:                    }
2295:
2296:                    // scan processing instruction
2297:                    String target = scanName();
2298:                    if (target != null && !target.equalsIgnoreCase("xml")) {
2299:                        while (true) {
2300:                            int c = read();
2301:                            if (c == '\r' || c == '\n') {
2302:                                fCurrentEntity.lineNumber++;
2303:                                fCurrentEntity.columnNumber = 1;
2304:                                if (c == '\r') {
2305:                                    c = read();
2306:                                    if (c != '\n') {
2307:                                        fCurrentEntity.offset--;
2308:                                    }
2309:                                }
2310:                                continue;
2311:                            }
2312:                            if (c == -1) {
2313:                                break;
2314:                            }
2315:                            if (c != ' ' && c != '\t') {
2316:                                fCurrentEntity.offset--;
2317:                                fCurrentEntity.columnNumber--;
2318:                                break;
2319:                            }
2320:                        }
2321:                        fStringBuffer.clear();
2322:                        while (true) {
2323:                            int c = read();
2324:                            if (c == '?' || c == '/') {
2325:                                char c0 = (char) c;
2326:                                c = read();
2327:                                if (c == '>') {
2328:                                    break;
2329:                                } else {
2330:                                    fStringBuffer.append(c0);
2331:                                    fCurrentEntity.offset--;
2332:                                    fCurrentEntity.columnNumber--;
2333:                                    continue;
2334:                                }
2335:                            } else if (c == '\r' || c == '\n') {
2336:                                fStringBuffer.append('\n');
2337:                                fCurrentEntity.lineNumber++;
2338:                                fCurrentEntity.columnNumber = 1;
2339:                                if (c == '\r') {
2340:                                    c = read();
2341:                                    if (c != '\n') {
2342:                                        fCurrentEntity.offset--;
2343:                                    }
2344:                                }
2345:                                continue;
2346:                            } else if (c == -1) {
2347:                                break;
2348:                            } else {
2349:                                fStringBuffer.append((char) c);
2350:                            }
2351:                        }
2352:                        XMLString data = fStringBuffer;
2353:                        if (fDocumentHandler != null) {
2354:                            fEndLineNumber = fCurrentEntity.lineNumber;
2355:                            fEndColumnNumber = fCurrentEntity.columnNumber;
2356:                            fDocumentHandler.processingInstruction(target,
2357:                                    data, locationAugs());
2358:                        }
2359:                    }
2360:
2361:                    // scan xml/text declaration
2362:                    else {
2363:                        int beginLineNumber = fBeginLineNumber;
2364:                        int beginColumnNumber = fBeginColumnNumber;
2365:                        fAttributes.removeAllAttributes();
2366:                        int aindex = 0;
2367:                        while (scanPseudoAttribute(fAttributes)) {
2368:                            fAttributes.getName(aindex, fQName);
2369:                            fQName.rawname = fQName.rawname.toLowerCase();
2370:                            fAttributes.setName(aindex, fQName);
2371:                            aindex++;
2372:                        }
2373:                        if (fDocumentHandler != null) {
2374:                            String version = fAttributes.getValue("version");
2375:                            String encoding = fAttributes.getValue("encoding");
2376:                            String standalone = fAttributes
2377:                                    .getValue("standalone");
2378:
2379:                            fBeginLineNumber = beginLineNumber;
2380:                            fBeginColumnNumber = beginColumnNumber;
2381:                            fEndLineNumber = fCurrentEntity.lineNumber;
2382:                            fEndColumnNumber = fCurrentEntity.columnNumber;
2383:                            fDocumentHandler.xmlDecl(version, encoding,
2384:                                    standalone, locationAugs());
2385:                        }
2386:                    }
2387:
2388:                    if (DEBUG_BUFFER) {
2389:                        System.out.print(")scanPI: ");
2390:                        printBuffer();
2391:                        System.out.println();
2392:                    }
2393:                } // scanPI()
2394:
2395:                /** 
2396:                 * Scans a start element. 
2397:                 *
2398:                 * @param empty Is used for a second return value to indicate whether
2399:                 *              the start element tag is empty (e.g. "/&gt;").
2400:                 */
2401:                protected String scanStartElement(boolean[] empty)
2402:                        throws IOException {
2403:                    String ename = scanName();
2404:                    int length = ename != null ? ename.length() : 0;
2405:                    int c = length > 0 ? ename.charAt(0) : -1;
2406:                    if (length == 0
2407:                            || !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
2408:                        if (fReportErrors) {
2409:                            fErrorReporter.reportError("HTML1009", null);
2410:                        }
2411:                        if (fDocumentHandler != null
2412:                                && fElementCount >= fElementDepth) {
2413:                            fStringBuffer.clear();
2414:                            fStringBuffer.append('<');
2415:                            if (length > 0) {
2416:                                fStringBuffer.append(ename);
2417:                            }
2418:                            fDocumentHandler.characters(fStringBuffer, null);
2419:                        }
2420:                        return null;
2421:                    }
2422:                    ename = modifyName(ename, fNamesElems);
2423:                    fAttributes.removeAllAttributes();
2424:                    int beginLineNumber = fBeginLineNumber;
2425:                    int beginColumnNumber = fBeginColumnNumber;
2426:                    while (scanAttribute(fAttributes, empty)) {
2427:                        // do nothing
2428:                    }
2429:                    fBeginLineNumber = beginLineNumber;
2430:                    fBeginColumnNumber = beginColumnNumber;
2431:                    if (fByteStream != null && fElementDepth == -1) {
2432:                        if (ename.equalsIgnoreCase("META")) {
2433:                            if (DEBUG_CHARSET) {
2434:                                System.out.println("+++ <META>");
2435:                            }
2436:                            String httpEquiv = getValue(fAttributes,
2437:                                    "http-equiv");
2438:                            if (httpEquiv != null
2439:                                    && httpEquiv
2440:                                            .equalsIgnoreCase("content-type")) {
2441:                                if (DEBUG_CHARSET) {
2442:                                    System.out.println("+++ @content-type: \""
2443:                                            + httpEquiv + '"');
2444:                                }
2445:                                String content = getValue(fAttributes,
2446:                                        "content");
2447:                                int index1 = content != null ? content
2448:                                        .toLowerCase().indexOf("charset=") : -1;
2449:                                if (index1 != -1 && !fIgnoreSpecifiedCharset) {
2450:                                    int index2 = content.indexOf(';', index1);
2451:                                    String charset = index2 != -1 ? content
2452:                                            .substring(index1 + 8, index2)
2453:                                            : content.substring(index1 + 8);
2454:                                    try {
2455:                                        String ianaEncoding = charset;
2456:                                        String javaEncoding = EncodingMap
2457:                                                .getIANA2JavaMapping(ianaEncoding
2458:                                                        .toUpperCase());
2459:                                        if (DEBUG_CHARSET) {
2460:                                            System.out
2461:                                                    .println("+++ ianaEncoding: "
2462:                                                            + ianaEncoding);
2463:                                            System.out
2464:                                                    .println("+++ javaEncoding: "
2465:                                                            + javaEncoding);
2466:                                        }
2467:                                        if (javaEncoding == null) {
2468:                                            javaEncoding = ianaEncoding;
2469:                                            if (fReportErrors) {
2470:                                                fErrorReporter
2471:                                                        .reportError(
2472:                                                                "HTML1001",
2473:                                                                new Object[] { ianaEncoding });
2474:                                            }
2475:                                        }
2476:                                        // patch: Marc Guillemot
2477:                                        if (!javaEncoding.equals(fJavaEncoding)) {
2478:                                            if (!isEncodingCompatible(
2479:                                                    javaEncoding, fJavaEncoding)) {
2480:                                                if (fReportErrors) {
2481:                                                    fErrorReporter
2482:                                                            .reportError(
2483:                                                                    "HTML1015",
2484:                                                                    new Object[] {
2485:                                                                            javaEncoding,
2486:                                                                            fJavaEncoding });
2487:                                                }
2488:                                            }
2489:                                            // change the charset
2490:                                            else {
2491:                                                fIso8859Encoding = ianaEncoding == null
2492:                                                        || ianaEncoding
2493:                                                                .toUpperCase()
2494:                                                                .startsWith(
2495:                                                                        "ISO-8859")
2496:                                                        || ianaEncoding
2497:                                                                .equalsIgnoreCase(fDefaultIANAEncoding);
2498:                                                fCurrentEntity.stream = new InputStreamReader(
2499:                                                        fByteStream,
2500:                                                        javaEncoding);
2501:                                                fByteStream.playback();
2502:                                                fElementDepth = fElementCount;
2503:                                                fElementCount = 0;
2504:                                                fCurrentEntity.offset = fCurrentEntity.length = 0;
2505:                                                fCurrentEntity.lineNumber = 1;
2506:                                                fCurrentEntity.columnNumber = 1;
2507:                                            }
2508:                                        }
2509:                                    } catch (UnsupportedEncodingException e) {
2510:                                        if (fReportErrors) {
2511:                                            fErrorReporter.reportError(
2512:                                                    "HTML1010",
2513:                                                    new Object[] { charset });
2514:                                        }
2515:                                        // NOTE: If the encoding change doesn't work, 
2516:                                        //       then there's no point in continuing to 
2517:                                        //       buffer the input stream.
2518:                                        fByteStream.clear();
2519:                                        fByteStream = null;
2520:                                    }
2521:                                }
2522:                            }
2523:                        } else if (ename.equalsIgnoreCase("BODY")) {
2524:                            fByteStream.clear();
2525:                            fByteStream = null;
2526:                        } else {
2527:                            HTMLElements.Element element = HTMLElements
2528:                                    .getElement(ename);
2529:                            if (element.parent != null
2530:                                    && element.parent.length > 0) {
2531:                                if (element.parent[0].code == HTMLElements.BODY) {
2532:                                    fByteStream.clear();
2533:                                    fByteStream = null;
2534:                                }
2535:                            }
2536:                        }
2537:                    }
2538:                    if (fDocumentHandler != null
2539:                            && fElementCount >= fElementDepth) {
2540:                        fQName.setValues(null, ename, ename, null);
2541:                        if (DEBUG_CALLBACKS) {
2542:                            System.out.println("startElement(" + fQName + ','
2543:                                    + fAttributes + ")");
2544:                        }
2545:                        fEndLineNumber = fCurrentEntity.lineNumber;
2546:                        fEndColumnNumber = fCurrentEntity.columnNumber;
2547:                        if (empty[0]) {
2548:                            fDocumentHandler.emptyElement(fQName, fAttributes,
2549:                                    locationAugs());
2550:                        } else {
2551:                            fDocumentHandler.startElement(fQName, fAttributes,
2552:                                    locationAugs());
2553:                        }
2554:                    }
2555:                    return ename;
2556:                } // scanStartElement():ename
2557:
2558:                /** 
2559:                 * Scans a real attribute. 
2560:                 *
2561:                 * @param attributes The list of attributes.
2562:                 * @param empty      Is used for a second return value to indicate 
2563:                 *                   whether the start element tag is empty 
2564:                 *                   (e.g. "/&gt;").
2565:                 */
2566:                protected boolean scanAttribute(XMLAttributesImpl attributes,
2567:                        boolean[] empty) throws IOException {
2568:                    return scanAttribute(attributes, empty, '/');
2569:                } // scanAttribute(XMLAttributesImpl,boolean[]):boolean
2570:
2571:                /** 
2572:                 * Scans a pseudo attribute. 
2573:                 *
2574:                 * @param attributes The list of attributes.
2575:                 */
2576:                protected boolean scanPseudoAttribute(
2577:                        XMLAttributesImpl attributes) throws IOException {
2578:                    return scanAttribute(attributes, fSingleBoolean, '?');
2579:                } // scanPseudoAttribute(XMLAttributesImpl):boolean
2580:
2581:                /** 
2582:                 * Scans an attribute, pseudo or real. 
2583:                 *
2584:                 * @param attributes The list of attributes.
2585:                 * @param empty      Is used for a second return value to indicate 
2586:                 *                   whether the start element tag is empty 
2587:                 *                   (e.g. "/&gt;").
2588:                 * @param endc       The end character that appears before the
2589:                 *                   closing angle bracket ('>').
2590:                 */
2591:                protected boolean scanAttribute(XMLAttributesImpl attributes,
2592:                        boolean[] empty, char endc) throws IOException {
2593:                    boolean skippedSpaces = skipSpaces();
2594:                    fBeginLineNumber = fCurrentEntity.lineNumber;
2595:                    fBeginColumnNumber = fCurrentEntity.columnNumber;
2596:                    int c = read();
2597:                    if (c == -1) {
2598:                        if (fReportErrors) {
2599:                            fErrorReporter.reportError("HTML1007", null);
2600:                        }
2601:                        throw new EOFException();
2602:                    }
2603:                    if (c == '>') {
2604:                        return false;
2605:                    }
2606:                    fCurrentEntity.offset--;
2607:                    fCurrentEntity.columnNumber--;
2608:                    String aname = scanName();
2609:                    if (aname == null) {
2610:                        if (fReportErrors) {
2611:                            fErrorReporter.reportError("HTML1011", null);
2612:                        }
2613:                        empty[0] = skipMarkup(false);
2614:                        return false;
2615:                    }
2616:                    if (!skippedSpaces && fReportErrors) {
2617:                        fErrorReporter.reportError("HTML1013",
2618:                                new Object[] { aname });
2619:                    }
2620:                    aname = modifyName(aname, fNamesAttrs);
2621:                    skipSpaces();
2622:                    c = read();
2623:                    if (c == -1) {
2624:                        if (fReportErrors) {
2625:                            fErrorReporter.reportError("HTML1007", null);
2626:                        }
2627:                        throw new EOFException();
2628:                    }
2629:                    if (c == '/' || c == '>') {
2630:                        fQName.setValues(null, aname, aname, null);
2631:                        attributes.addAttribute(fQName, "CDATA", "");
2632:                        attributes.setSpecified(attributes.getLength() - 1,
2633:                                true);
2634:                        if (fAugmentations) {
2635:                            addLocationItem(attributes,
2636:                                    attributes.getLength() - 1);
2637:                        }
2638:                        if (c == '/') {
2639:                            fCurrentEntity.offset--;
2640:                            fCurrentEntity.columnNumber--;
2641:                            empty[0] = skipMarkup(false);
2642:                        }
2643:                        return false;
2644:                    }
2645:                    /***
2646:                    // REVISIT: [Q] Why is this still here? -Ac
2647:                    if (c == '/' || c == '>') {
2648:                        if (c == '/') {
2649:                            fCurrentEntity.offset--;
2650:                            fCurrentEntity.columnNumber--;
2651:                            empty[0] = skipMarkup(false);
2652:                        }
2653:                        fQName.setValues(null, aname, aname, null);
2654:                        attributes.addAttribute(fQName, "CDATA", "");
2655:                        attributes.setSpecified(attributes.getLength()-1, true);
2656:                        if (fAugmentations) {
2657:                            addLocationItem(attributes, attributes.getLength() - 1);
2658:                        }
2659:                        return false;
2660:                    }
2661:                    /***/
2662:                    if (c == '=') {
2663:                        skipSpaces();
2664:                        c = read();
2665:                        if (c == -1) {
2666:                            if (fReportErrors) {
2667:                                fErrorReporter.reportError("HTML1007", null);
2668:                            }
2669:                            throw new EOFException();
2670:                        }
2671:                        // Xiaowei/Ac: Fix for <a href=/cgi-bin/myscript>...</a>
2672:                        if (c == '>') {
2673:                            fQName.setValues(null, aname, aname, null);
2674:                            attributes.addAttribute(fQName, "CDATA", "");
2675:                            attributes.setSpecified(attributes.getLength() - 1,
2676:                                    true);
2677:                            if (fAugmentations) {
2678:                                addLocationItem(attributes, attributes
2679:                                        .getLength() - 1);
2680:                            }
2681:                            return false;
2682:                        }
2683:                        fStringBuffer.clear();
2684:                        fNonNormAttr.clear();
2685:                        if (c != '\'' && c != '"') {
2686:                            fCurrentEntity.offset--;
2687:                            fCurrentEntity.columnNumber--;
2688:                            while (true) {
2689:                                c = read();
2690:                                // Xiaowei/Ac: Fix for <a href=/broken/>...</a>
2691:                                if (Character.isSpace((char) c) || c == '>') {
2692:                                    //fCharOffset--;
2693:                                    fCurrentEntity.offset--;
2694:                                    fCurrentEntity.columnNumber--;
2695:                                    break;
2696:                                }
2697:                                if (c == -1) {
2698:                                    if (fReportErrors) {
2699:                                        fErrorReporter.reportError("HTML1007",
2700:                                                null);
2701:                                    }
2702:                                    throw new EOFException();
2703:                                }
2704:                                if (c == '&') {
2705:                                    int ce = scanEntityRef(fStringBuffer2,
2706:                                            false);
2707:                                    if (ce != -1) {
2708:                                        fStringBuffer.append((char) ce);
2709:                                    } else {
2710:                                        fStringBuffer.append(fStringBuffer2);
2711:                                    }
2712:                                    fNonNormAttr.append(fStringBuffer2);
2713:                                } else {
2714:                                    fStringBuffer.append((char) c);
2715:                                    fNonNormAttr.append((char) c);
2716:                                }
2717:                            }
2718:                            fQName.setValues(null, aname, aname, null);
2719:                            String avalue = fStringBuffer.toString();
2720:                            attributes.addAttribute(fQName, "CDATA", avalue);
2721:
2722:                            int lastattr = attributes.getLength() - 1;
2723:                            attributes.setSpecified(lastattr, true);
2724:                            attributes.setNonNormalizedValue(lastattr,
2725:                                    fNonNormAttr.toString());
2726:                            if (fAugmentations) {
2727:                                addLocationItem(attributes, attributes
2728:                                        .getLength() - 1);
2729:                            }
2730:                            return true;
2731:                        }
2732:                        char quote = (char) c;
2733:                        boolean isStart = true;
2734:                        boolean prevSpace = false;
2735:                        do {
2736:                            boolean acceptSpace = !fNormalizeAttributes
2737:                                    || (!isStart && !prevSpace);
2738:                            c = read();
2739:                            if (c == -1) {
2740:                                if (fReportErrors) {
2741:                                    fErrorReporter
2742:                                            .reportError("HTML1007", null);
2743:                                }
2744:                                throw new EOFException();
2745:                            }
2746:                            if (c == '&') {
2747:                                isStart = false;
2748:                                int ce = scanEntityRef(fStringBuffer2, false);
2749:                                if (ce != -1) {
2750:                                    fStringBuffer.append((char) ce);
2751:                                } else {
2752:                                    fStringBuffer.append(fStringBuffer2);
2753:                                }
2754:                                fNonNormAttr.append(fStringBuffer2);
2755:                            } else if (c == ' ' || c == '\t') {
2756:                                if (acceptSpace) {
2757:                                    fStringBuffer
2758:                                            .append(fNormalizeAttributes ? ' '
2759:                                                    : (char) c);
2760:                                }
2761:                                fNonNormAttr.append((char) c);
2762:                            } else if (c == '\r' || c == '\n') {
2763:                                fCurrentEntity.lineNumber++;
2764:                                fCurrentEntity.columnNumber = 0;
2765:                                if (c == '\r') {
2766:                                    int c2 = read();
2767:                                    if (c2 != '\n') {
2768:                                        fCurrentEntity.offset--;
2769:                                        fCurrentEntity.columnNumber--;
2770:                                    } else {
2771:                                        fNonNormAttr.append('\r');
2772:                                        c = c2;
2773:                                    }
2774:                                }
2775:                                if (acceptSpace) {
2776:                                    fStringBuffer
2777:                                            .append(fNormalizeAttributes ? ' '
2778:                                                    : '\n');
2779:                                }
2780:                                fNonNormAttr.append((char) c);
2781:                            } else if (c != quote) {
2782:                                isStart = false;
2783:                                fStringBuffer.append((char) c);
2784:                                fNonNormAttr.append((char) c);
2785:                            }
2786:                            prevSpace = c == ' ' || c == '\t' || c == '\r'
2787:                                    || c == '\n';
2788:                            isStart = isStart && prevSpace;
2789:                        } while (c != quote);
2790:
2791:                        if (fNormalizeAttributes) {
2792:                            // trailing whitespace already normalized to single space
2793:                            if (fStringBuffer.ch[fStringBuffer.length - 1] == ' ') {
2794:                                fStringBuffer.length--;
2795:                            }
2796:                        }
2797:
2798:                        fQName.setValues(null, aname, aname, null);
2799:                        String avalue = fStringBuffer.toString();
2800:                        attributes.addAttribute(fQName, "CDATA", avalue);
2801:
2802:                        int lastattr = attributes.getLength() - 1;
2803:                        attributes.setSpecified(lastattr, true);
2804:                        attributes.setNonNormalizedValue(lastattr, fNonNormAttr
2805:                                .toString());
2806:                        if (fAugmentations) {
2807:                            addLocationItem(attributes,
2808:                                    attributes.getLength() - 1);
2809:                        }
2810:                    } else {
2811:                        fQName.setValues(null, aname, aname, null);
2812:                        attributes.addAttribute(fQName, "CDATA", "");
2813:                        attributes.setSpecified(attributes.getLength() - 1,
2814:                                true);
2815:                        fCurrentEntity.offset--;
2816:                        fCurrentEntity.columnNumber--;
2817:                        if (fAugmentations) {
2818:                            addLocationItem(attributes,
2819:                                    attributes.getLength() - 1);
2820:                        }
2821:                    }
2822:                    return true;
2823:                } // scanAttribute(XMLAttributesImpl):boolean
2824:
2825:                /** Adds location augmentations to the specified attribute. */
2826:                protected void addLocationItem(XMLAttributes attributes,
2827:                        int index) {
2828:                    fEndLineNumber = fCurrentEntity.lineNumber;
2829:                    fEndColumnNumber = fCurrentEntity.columnNumber;
2830:                    LocationItem locationItem = new LocationItem();
2831:                    locationItem.setValues(fBeginLineNumber,
2832:                            fBeginColumnNumber, fEndLineNumber,
2833:                            fEndColumnNumber);
2834:                    Augmentations augs = attributes.getAugmentations(index);
2835:                    augs.putItem(AUGMENTATIONS, locationItem);
2836:                } // addLocationItem(XMLAttributes,int)
2837:
2838:                /** Scans an end element. */
2839:                protected void scanEndElement() throws IOException {
2840:                    String ename = scanName();
2841:                    if (fReportErrors && ename == null) {
2842:                        fErrorReporter.reportError("HTML1012", null);
2843:                    }
2844:                    skipMarkup(false);
2845:                    if (ename != null) {
2846:                        ename = modifyName(ename, fNamesElems);
2847:                        if (fDocumentHandler != null
2848:                                && fElementCount >= fElementDepth) {
2849:                            fQName.setValues(null, ename, ename, null);
2850:                            if (DEBUG_CALLBACKS) {
2851:                                System.out
2852:                                        .println("endElement(" + fQName + ")");
2853:                            }
2854:                            fEndLineNumber = fCurrentEntity.lineNumber;
2855:                            fEndColumnNumber = fCurrentEntity.columnNumber;
2856:                            fDocumentHandler.endElement(fQName, locationAugs());
2857:                        }
2858:                    }
2859:                } // scanEndElement()
2860:            } // class ContentScanner
2861:
2862:            /**
2863:             * Special scanner used for elements whose content needs to be scanned 
2864:             * as plain text, ignoring markup such as elements and entity references.
2865:             * For example: &lt;SCRIPT&gt; and &lt;COMMENT&gt;.
2866:             *
2867:             * @author Andy Clark
2868:             */
2869:            public class SpecialScanner implements  Scanner {
2870:
2871:                //
2872:                // Data
2873:                //
2874:
2875:                /** Name of element whose content needs to be scanned as text; assigned by setElementName. */
2876:                protected String fElementName;
2877:
2878:                /** True if &lt;script&gt; element, i.e. fElementName equals "SCRIPT" ignoring case. */
2879:                protected boolean fScript;
2880:
2881:                /** True if &lt;style&gt; element, i.e. fElementName equals "STYLE" ignoring case. */
2882:                protected boolean fStyle;
2883:
2884:                /** True if &lt;textarea&gt; element, i.e. fElementName equals "TEXTAREA" ignoring case. */
2885:                protected boolean fTextarea;
2886:
2887:                /** True if &lt;title&gt; element, i.e. fElementName equals "TITLE" ignoring case. */
2888:                protected boolean fTitle;
2889:
2890:                // temp vars
2891:
2892:                /** A qualified name, reused for the endElement callback issued by scan. */
2893:                private final QName fQName = new QName();
2894:
2895:                /** A string buffer, reused by scan for the text it passes to scanCharacters and scanEntityRef. */
2896:                private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
2897:
2898:                //
2899:                // Public methods
2900:                //
2901:
2902:                /** Stores the element name, derives the four element-kind flags from it (ignoring case) and returns this scanner. */
2903:                public Scanner setElementName(String ename) {
2904:                    this.fElementName = ename;
2905:                    this.fScript = ename.equalsIgnoreCase("SCRIPT");
2906:                    this.fStyle = ename.equalsIgnoreCase("STYLE");
2907:                    this.fTextarea = ename.equalsIgnoreCase("TEXTAREA");
2908:                    this.fTitle = ename.equalsIgnoreCase("TITLE");
2909:                    return this;
2910:                } // setElementName(String):Scanner
2911:
2912:                //
2913:                // Scanner methods
2914:                //
2915:
2916:                /** Scans plain-text element content; always returns true. If complete is false, at most one loop pass runs. */
2917:                public boolean scan(boolean complete) throws IOException {
2918:                    boolean next;
2919:                    do {
2920:                        try {
2921:                            next = false; // never set to true in this method, so only 'complete' keeps the loop going
2922:                            switch (fScannerState) {
2923:                            case STATE_CONTENT: {
2924:                                fBeginLineNumber = fCurrentEntity.lineNumber;
2925:                                fBeginColumnNumber = fCurrentEntity.columnNumber;
2926:                                int c = read();
2927:                                if (c == '<') {
2928:                                    setScannerState(STATE_MARKUP_BRACKET);
2929:                                    continue; // jumps to the while (next || complete) test at the bottom of the loop
2930:                                }
2931:                                if (c == '&') {
2932:                                    if (fTextarea || fTitle) { // entity references are only scanned inside textarea/title
2933:                                        scanEntityRef(fStringBuffer, true);
2934:                                        continue;
2935:                                    }
2936:                                    fStringBuffer.clear();
2937:                                    fStringBuffer.append('&'); // elsewhere the '&' is kept as literal text
2938:                                } else if (c == -1) {
2939:                                    if (fReportErrors) {
2940:                                        fErrorReporter.reportError("HTML1007",
2941:                                                null);
2942:                                    }
2943:                                    throw new EOFException();
2944:                                } else {
2945:                                    fCurrentEntity.offset--; // un-read c so that scanCharacters sees it
2946:                                    fCurrentEntity.columnNumber--;
2947:                                    fStringBuffer.clear();
2948:                                }
2949:                                scanCharacters(fStringBuffer, -1); // delimiter -1: scanning stops at '<' or '&'
2950:                                break;
2951:                            } // case STATE_CONTENT
2952:                            case STATE_MARKUP_BRACKET: {
2953:                                int delimiter = -1; // becomes '-' or ']' when comment/CDATA delimiters are stripped
2954:                                int c = read();
2955:                                if (c == '!') {
2956:                                    if (skip("--", false)) {
2957:                                        fStringBuffer.clear();
2958:                                        boolean strip = (fScript && fScriptStripCommentDelims)
2959:                                                || (fStyle && fStyleStripCommentDelims);
2960:                                        if (strip) {
2961:                                            do { // discard the rest of the "<!--" line, stopping before the line break
2962:                                                c = read();
2963:                                                if (c == '\r' || c == '\n') {
2964:                                                    fCurrentEntity.columnNumber--;
2965:                                                    fCurrentEntity.offset--;
2966:                                                    break;
2967:                                                }
2968:                                            } while (c != -1);
2969:                                            skipNewlines(1); // presumably consumes that one line break; confirm in skipNewlines
2970:                                            delimiter = '-';
2971:                                        } else {
2972:                                            fStringBuffer.append("<!--");
2973:                                        }
2974:                                    } else if (skip("[CDATA[", false)) {
2975:                                        fStringBuffer.clear();
2976:                                        boolean strip = (fScript && fScriptStripCDATADelims)
2977:                                                || (fStyle && fStyleStripCDATADelims);
2978:                                        if (strip) {
2979:                                            do { // discard the rest of the "<![CDATA[" line, stopping before the line break
2980:                                                c = read();
2981:                                                if (c == '\r' || c == '\n') {
2982:                                                    fCurrentEntity.columnNumber--;
2983:                                                    fCurrentEntity.offset--;
2984:                                                    break;
2985:                                                }
2986:                                            } while (c != -1);
2987:                                            skipNewlines(1); // presumably consumes that one line break; confirm in skipNewlines
2988:                                            delimiter = ']';
2989:                                        } else {
2990:                                            fStringBuffer.append("<![CDATA[");
2991:                                        }
2992:                                    }
2993:                                } else if (c == '/') {
2994:                                    String ename = scanName();
2995:                                    if (ename != null) {
2996:                                        if (ename
2997:                                                .equalsIgnoreCase(fElementName)) {
2998:                                            if (read() == '>') { // only "</name" followed at once by '>' ends the special content
2999:                                                ename = modifyName(ename,
3000:                                                        fNamesElems);
3001:                                                if (fDocumentHandler != null
3002:                                                        && fElementCount >= fElementDepth) {
3003:                                                    fQName.setValues(null,
3004:                                                            ename, ename, null);
3005:                                                    if (DEBUG_CALLBACKS) {
3006:                                                        System.out
3007:                                                                .println("endElement("
3008:                                                                        + fQName
3009:                                                                        + ")");
3010:                                                    }
3011:                                                    fEndLineNumber = fCurrentEntity.lineNumber;
3012:                                                    fEndColumnNumber = fCurrentEntity.columnNumber;
3013:                                                    fDocumentHandler
3014:                                                            .endElement(
3015:                                                                    fQName,
3016:                                                                    locationAugs());
3017:                                                }
3018:                                                setScanner(fContentScanner); // hand control back to the regular content scanner
3019:                                                setScannerState(STATE_CONTENT);
3020:                                                return true;
3021:                                            } else {
3022:                                                fCurrentEntity.offset--; // un-read the non-'>' character; NOTE(review): also runs when read() returned -1
3023:                                                fCurrentEntity.columnNumber--;
3024:                                            }
3025:                                        }
3026:                                        fStringBuffer.clear(); // not the closing tag: "</name" is treated as text
3027:                                        fStringBuffer.append("</");
3028:                                        fStringBuffer.append(ename);
3029:                                    } else {
3030:                                        fStringBuffer.clear();
3031:                                        fStringBuffer.append("</");
3032:                                    }
3033:                                } else {
3034:                                    fStringBuffer.clear();
3035:                                    fStringBuffer.append('<');
3036:                                    fStringBuffer.append((char) c); // NOTE(review): c is not checked for -1 (EOF right after '<')
3037:                                }
3038:                                scanCharacters(fStringBuffer, delimiter);
3039:                                setScannerState(STATE_CONTENT);
3040:                                break;
3041:                            } // case STATE_MARKUP_BRACKET
3042:                            } // switch
3043:                        } // try
3044:                        catch (EOFException e) { // EOF: return to the content scanner and pop an enclosing entity, or end the document
3045:                            setScanner(fContentScanner);
3046:                            if (fCurrentEntityStack.empty()) {
3047:                                setScannerState(STATE_END_DOCUMENT);
3048:                            } else {
3049:                                fCurrentEntity = (CurrentEntity) fCurrentEntityStack
3050:                                        .pop();
3051:                                setScannerState(STATE_CONTENT);
3052:                            }
3053:                            return true;
3054:                        }
3055:                    } // do
3056:                    while (next || complete);
3057:                    return true;
3058:                } // scan(boolean):boolean
3059:
3060:                //
3061:                // Protected methods
3062:                //
3063:
3064:                /** Scan characters. */
3065:                protected void scanCharacters(XMLStringBuffer buffer,
3066:                        int delimiter) throws IOException {
3067:                    if (DEBUG_BUFFER) {
3068:                        System.out.print("(scanCharacters, delimiter="
3069:                                + delimiter + ": ");
3070:                        printBuffer();
3071:                        System.out.println();
3072:                    }
3073:                    boolean strip = (fScript && fScriptStripCommentDelims)
3074:                            || (fScript && fScriptStripCDATADelims)
3075:                            || (fStyle && fStyleStripCommentDelims)
3076:                            || (fStyle && fStyleStripCDATADelims);
3077:                    while (true) {
3078:                        int c = read();
3079:                        if (c == -1
3080:                                || (delimiter == -1 && (c == '<' || c == '&'))) {
3081:                            if (c != -1) {
3082:                                fCurrentEntity.offset--;
3083:                                fCurrentEntity.columnNumber--;
3084:                            }
3085:                            break;
3086:                        }
3087:                        // Patch supplied by Jonathan Baxter
3088:                        else if (c == '\r' || c == '\n') {
3089:                            fCurrentEntity.offset--;
3090:                            fCurrentEntity.columnNumber--;
3091:                            int newlines = skipNewlines();
3092:                            for (int i = 0; i < newlines; i++) {
3093:                                buffer.append('\n');
3094:                            }
3095:                        } else if (c == '\'' || c == '"') {
3096:                            buffer.append((char) c);
3097:                            final int stringChar = c;
3098:                            while (true) {
3099:                                c = read();
3100:                                if (c == '\\') {
3101:                                    buffer.append((char) c);
3102:                                    //always consume next character
3103:                                    buffer.append((char) read());
3104:                                } else if (c == stringChar) {
3105:                                    buffer.append((char) c);
3106:                                    break;
3107:                                } else if (c == '\r' || c == '\n') {
3108:                                    fCurrentEntity.offset--;
3109:                                    fCurrentEntity.columnNumber--;
3110:                                    int newlines = skipNewlines();
3111:                                    for (int i = 0; i < newlines; i++) {
3112:                                        buffer.append('\n');
3113:                                    }
3114:                                    break;
3115:                                } else {
3116:                                    buffer.append((char) c);
3117:                                }
3118:                            }
3119:                        } else if (delimiter != -1 && c == (char) delimiter) {
3120:                            int count = 0;
3121:                            do {
3122:                                count++;
3123:                                c = read();
3124:                            } while (c == (char) delimiter);
3125:                            for (int i = strip && c == '>' ? 2 : 0; i < count; i++) {
3126:                                buffer.append((char) delimiter);
3127:                            }
3128:                            if (c == -1 || (count >= 2 && c == '>')) {
3129:                                if (!strip) {
3130:                                    buffer.append((char) c);
3131:                                }
3132:                                break;
3133:                            }
3134:                            fCurrentEntity.offset--;
3135:                            fCurrentEntity.columnNumber--;
3136:                        } else {
3137:                            buffer.append((char) c);
3138:                            if (c == '\n') {
3139:                                fCurrentEntity.columnNumber = 1;
3140:                                fCurrentEntity.lineNumber++;
3141:                            }
3142:                        }
3143:                    }
3144:                    if (buffer.length > 0 && fDocumentHandler != null
3145:                            && fElementCount >= fElementDepth) {
3146:                        if (DEBUG_CALLBACKS) {
3147:                            System.out.println("characters(" + buffer + ")");
3148:                        }
3149:                        fEndLineNumber = fCurrentEntity.lineNumber;
3150:                        fEndColumnNumber = fCurrentEntity.columnNumber;
3151:                        fDocumentHandler.characters(buffer, locationAugs());
3152:                    }
3153:                    if (DEBUG_BUFFER) {
3154:                        System.out.print(")scanCharacters: ");
3155:                        printBuffer();
3156:                        System.out.println();
3157:                    }
3158:                } // scanCharacters(StringBuffer)
3159:
3160:            } // class SpecialScanner
3161:
3162:            /**
3163:             * A playback input stream. This class has the ability to save the bytes
3164:             * read from the underlying input stream and play the bytes back later.
3165:             * This class is used by the HTML scanner to switch encodings when a 
3166:             * &lt;meta&gt; tag is detected that specifies a different encoding. 
3167:             * <p>
3168:             * If the encoding is changed, then the scanner calls the 
3169:             * <code>playback</code> method and re-scans the beginning of the HTML
3170:             * document again. This should not be too much of a performance problem
3171:             * because the &lt;meta&gt; tag appears at the beginning of the document.
3172:             * <p>
3173:             * If the &lt;body&gt; tag is reached without playing back the bytes,
3174:             * then the buffer can be cleared by calling the <code>clear</code>
3175:             * method. This stops the buffering of bytes and allows the memory used
3176:             * by the buffer to be reclaimed. 
3177:             * <p>
3178:             * <strong>Note:</strong> 
3179:             * If the buffer is never played back or cleared, this input stream
3180:             * will continue to buffer the entire stream. Therefore, it is very
3181:             * important to use this stream correctly.
3182:             *
3183:             * @author Andy Clark
3184:             */
3185:            public static class PlaybackInputStream extends FilterInputStream {
3186:
3187:                //
3188:                // Constants
3189:                //
3190:
3191:                /** Set to true to debug playback. */
3192:                private static final boolean DEBUG_PLAYBACK = false;
3193:
3194:                //
3195:                // Data
3196:                //
3197:
3198:                // state
3199:
3200:                /** Playback mode. */
3201:                protected boolean fPlayback = false;
3202:
3203:                /** Buffer cleared. */
3204:                protected boolean fCleared = false;
3205:
3206:                /** Encoding detected. */
3207:                protected boolean fDetected = false;
3208:
3209:                // buffer info
3210:
3211:                /** Byte buffer. */
3212:                protected byte[] fByteBuffer = new byte[1024];
3213:
3214:                /** Offset into byte buffer during playback. */
3215:                protected int fByteOffset = 0;
3216:
3217:                /** Length of bytes read into byte buffer. */
3218:                protected int fByteLength = 0;
3219:
3220:                /** Pushback offset. */
3221:                public int fPushbackOffset = 0;
3222:
3223:                /** Pushback length. */
3224:                public int fPushbackLength = 0;
3225:
3226:                //
3227:                // Constructors
3228:                //
3229:
3230:                /** Constructor. */
3231:                public PlaybackInputStream(InputStream in) {
3232:                    super (in);
3233:                } // <init>(InputStream)
3234:
3235:                //
3236:                // Public methods
3237:                //
3238:
3239:                /** Detect encoding. */
3240:                public void detectEncoding(String[] encodings)
3241:                        throws IOException {
3242:                    if (fDetected) {
3243:                        throw new IOException(
3244:                                "Should not detect encoding twice.");
3245:                    }
3246:                    fDetected = true;
3247:                    int b1 = read();
3248:                    if (b1 == -1) {
3249:                        return;
3250:                    }
3251:                    int b2 = read();
3252:                    if (b2 == -1) {
3253:                        fPushbackLength = 1;
3254:                        return;
3255:                    }
3256:                    // UTF-8 BOM: 0xEFBBBF
3257:                    if (b1 == 0xEF && b2 == 0xBB) {
3258:                        int b3 = read();
3259:                        if (b3 == 0xBF) {
3260:                            fPushbackOffset = 3;
3261:                            encodings[0] = "UTF-8";
3262:                            encodings[1] = "UTF8";
3263:                            return;
3264:                        }
3265:                        fPushbackLength = 3;
3266:                    }
3267:                    // UTF-16 LE BOM: 0xFFFE
3268:                    if (b1 == 0xFF && b2 == 0xFE) {
3269:                        encodings[0] = "UTF-16";
3270:                        encodings[1] = "UnicodeLittleUnmarked";
3271:                        return;
3272:                    }
3273:                    // UTF-16 BE BOM: 0xFEFF
3274:                    else if (b1 == 0xFE && b2 == 0xFF) {
3275:                        encodings[0] = "UTF-16";
3276:                        encodings[1] = "UnicodeBigUnmarked";
3277:                        return;
3278:                    }
3279:                    // unknown
3280:                    fPushbackLength = 2;
3281:                } // detectEncoding()
3282:
3283:                /** Playback buffer contents. */
3284:                public void playback() {
3285:                    fPlayback = true;
3286:                } // playback()
3287:
3288:                /** 
3289:                 * Clears the buffer.
3290:                 * <p>
3291:                 * <strong>Note:</strong>
3292:                 * The buffer cannot be cleared during playback. Therefore, calling
3293:                 * this method during playback will not do anything. However, the
3294:                 * buffer will be cleared automatically at the end of playback.
3295:                 */
3296:                public void clear() {
3297:                    if (!fPlayback) {
3298:                        fCleared = true;
3299:                        fByteBuffer = null;
3300:                    }
3301:                } // clear()
3302:
3303:                //
3304:                // InputStream methods
3305:                //
3306:
3307:                /** Read a byte. */
3308:                public int read() throws IOException {
3309:                    if (DEBUG_PLAYBACK) {
3310:                        System.out.println("(read");
3311:                    }
3312:                    if (fPushbackOffset < fPushbackLength) {
3313:                        return fByteBuffer[fPushbackOffset++];
3314:                    }
3315:                    if (fCleared) {
3316:                        return in.read();
3317:                    }
3318:                    if (fPlayback) {
3319:                        int c = fByteBuffer[fByteOffset++];
3320:                        if (fByteOffset == fByteLength) {
3321:                            fCleared = true;
3322:                            fByteBuffer = null;
3323:                        }
3324:                        if (DEBUG_PLAYBACK) {
3325:                            System.out.println(")read -> " + (char) c);
3326:                        }
3327:                        return c;
3328:                    }
3329:                    int c = in.read();
3330:                    if (c != -1) {
3331:                        if (fByteLength == fByteBuffer.length) {
3332:                            byte[] newarray = new byte[fByteLength + 1024];
3333:                            System.arraycopy(fByteBuffer, 0, newarray, 0,
3334:                                    fByteLength);
3335:                            fByteBuffer = newarray;
3336:                        }
3337:                        fByteBuffer[fByteLength++] = (byte) c;
3338:                    }
3339:                    if (DEBUG_PLAYBACK) {
3340:                        System.out.println(")read -> " + (char) c);
3341:                    }
3342:                    return c;
3343:                } // read():int
3344:
3345:                /** Read an array of bytes. */
3346:                public int read(byte[] array) throws IOException {
3347:                    return read(array, 0, array.length);
3348:                } // read(byte[]):int
3349:
3350:                /** Read an array of bytes. */
3351:                public int read(byte[] array, int offset, int length)
3352:                        throws IOException {
3353:                    if (DEBUG_PLAYBACK) {
3354:                        System.out.println(")read(" + offset + ',' + length
3355:                                + ')');
3356:                    }
3357:                    if (fPushbackOffset < fPushbackLength) {
3358:                        int count = fPushbackLength - fPushbackOffset;
3359:                        if (count > length) {
3360:                            count = length;
3361:                        }
3362:                        System.arraycopy(fByteBuffer, fPushbackOffset, array,
3363:                                offset, count);
3364:                        fPushbackOffset += count;
3365:                        return count;
3366:                    }
3367:                    if (fCleared) {
3368:                        return in.read(array, offset, length);
3369:                    }
3370:                    if (fPlayback) {
3371:                        if (fByteOffset + length > fByteLength) {
3372:                            length = fByteLength - fByteOffset;
3373:                        }
3374:                        System.arraycopy(fByteBuffer, fByteOffset, array,
3375:                                offset, length);
3376:                        fByteOffset += length;
3377:                        if (fByteOffset == fByteLength) {
3378:                            fCleared = true;
3379:                            fByteBuffer = null;
3380:                        }
3381:                        return length;
3382:                    }
3383:                    int count = in.read(array, offset, length);
3384:                    if (count != -1) {
3385:                        if (fByteLength + count > fByteBuffer.length) {
3386:                            byte[] newarray = new byte[fByteLength + count
3387:                                    + 512];
3388:                            System.arraycopy(fByteBuffer, 0, newarray, 0,
3389:                                    fByteLength);
3390:                            fByteBuffer = newarray;
3391:                        }
3392:                        System.arraycopy(array, offset, fByteBuffer,
3393:                                fByteLength, count);
3394:                        fByteLength += count;
3395:                    }
3396:                    if (DEBUG_PLAYBACK) {
3397:                        System.out.println(")read(" + offset + ',' + length
3398:                                + ") -> " + count);
3399:                    }
3400:                    return count;
3401:                } // read(byte[]):int
3402:
3403:            } // class PlaybackInputStream
3404:
3405:            /**
3406:             * Location infoset item. 
3407:             *
3408:             * @author Andy Clark
3409:             */
3410:            protected static class LocationItem implements  HTMLEventInfo {
3411:
3412:                //
3413:                // Data
3414:                //
3415:
3416:                /** Beginning line number. */
3417:                protected int fBeginLineNumber;
3418:
3419:                /** Beginning column number. */
3420:                protected int fBeginColumnNumber;
3421:
3422:                /** Ending line number. */
3423:                protected int fEndLineNumber;
3424:
3425:                /** Ending column number. */
3426:                protected int fEndColumnNumber;
3427:
3428:                //
3429:                // Public methods
3430:                //
3431:
3432:                /** Sets the values of this item. */
3433:                public void setValues(int beginLine, int beginColumn,
3434:                        int endLine, int endColumn) {
3435:                    fBeginLineNumber = beginLine;
3436:                    fBeginColumnNumber = beginColumn;
3437:                    fEndLineNumber = endLine;
3438:                    fEndColumnNumber = endColumn;
3439:                } // setValues(int,int,int,int)
3440:
3441:                //
3442:                // HTMLEventInfo methods
3443:                //
3444:
3445:                // location information
3446:
3447:                /** Returns the line number of the beginning of this event.*/
3448:                public int getBeginLineNumber() {
3449:                    return fBeginLineNumber;
3450:                } // getBeginLineNumber():int
3451:
3452:                /** Returns the column number of the beginning of this event.*/
3453:                public int getBeginColumnNumber() {
3454:                    return fBeginColumnNumber;
3455:                } // getBeginColumnNumber():int
3456:
3457:                /** Returns the line number of the end of this event.*/
3458:                public int getEndLineNumber() {
3459:                    return fEndLineNumber;
3460:                } // getEndLineNumber():int
3461:
3462:                /** Returns the column number of the end of this event.*/
3463:                public int getEndColumnNumber() {
3464:                    return fEndColumnNumber;
3465:                } // getEndColumnNumber():int
3466:
3467:                // other information
3468:
3469:                /** Returns true if this corresponding event was synthesized. */
3470:                public boolean isSynthesized() {
3471:                    return false;
3472:                } // isSynthesize():boolean
3473:
3474:                //
3475:                // Object methods
3476:                //
3477:
3478:                /** Returns a string representation of this object. */
3479:                public String toString() {
3480:                    StringBuffer str = new StringBuffer();
3481:                    str.append(fBeginLineNumber);
3482:                    str.append(':');
3483:                    str.append(fBeginColumnNumber);
3484:                    str.append(':');
3485:                    str.append(fEndLineNumber);
3486:                    str.append(':');
3487:                    str.append(fEndColumnNumber);
3488:                    return str.toString();
3489:                } // toString():String
3490:
3491:            } // class LocationItem
3492:
3493:            /**
3494:             * To detect if 2 encoding are compatible, both must be able to read the meta tag specifying
3495:             * the new encoding. This means that the byte representation of some minimal html markup must
3496:             * be the same in both encodings
3497:             */
3498:            boolean isEncodingCompatible(final String encoding1,
3499:                    final String encoding2) {
3500:                final String reference = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=";
3501:                try {
3502:                    final byte[] bytesEncoding1 = reference.getBytes(encoding1);
3503:                    final byte[] bytesEncoding2 = reference.getBytes(encoding2);
3504:                    if (bytesEncoding1.length != bytesEncoding2.length) {
3505:                        return false;
3506:                    } else {
3507:                        for (int i = 0; i < bytesEncoding1.length; ++i) {
3508:                            if (bytesEncoding1[i] != bytesEncoding2[i]) {
3509:                                return false;
3510:                            }
3511:                        }
3512:                    }
3513:
3514:                    return true;
3515:                } catch (final UnsupportedEncodingException e) {
3516:                    return false;
3517:                }
3518:            }
3519:        } // class HTMLScanner
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.