0001: /*
0002: * Copyright 2002-2008 Andy Clark
0003: *
0004: * Licensed under the Apache License, Version 2.0 (the "License");
0005: * you may not use this file except in compliance with the License.
0006: * You may obtain a copy of the License at
0007: *
0008: * http://www.apache.org/licenses/LICENSE-2.0
0009: *
0010: * Unless required by applicable law or agreed to in writing, software
0011: * distributed under the License is distributed on an "AS IS" BASIS,
0012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0013: * See the License for the specific language governing permissions and
0014: * limitations under the License.
0015: */
0016:
0017: package org.cyberneko.html;
0018:
0019: import java.io.EOFException;
0020: import java.io.FilterInputStream;
0021: import java.io.IOException;
0022: import java.io.InputStream;
0023: import java.io.InputStreamReader;
0024: import java.io.Reader;
0025: import java.io.UnsupportedEncodingException;
0026: import java.lang.reflect.InvocationTargetException;
0027: import java.lang.reflect.Method;
0028: import java.net.URL;
0029: import java.util.Stack;
0030:
0031: import org.apache.xerces.util.EncodingMap;
0032: import org.apache.xerces.util.NamespaceSupport;
0033: import org.apache.xerces.util.URI;
0034: import org.apache.xerces.util.XMLAttributesImpl;
0035: import org.apache.xerces.util.XMLResourceIdentifierImpl;
0036: import org.apache.xerces.util.XMLStringBuffer;
0037: import org.apache.xerces.xni.Augmentations;
0038: import org.apache.xerces.xni.NamespaceContext;
0039: import org.apache.xerces.xni.QName;
0040: import org.apache.xerces.xni.XMLAttributes;
0041: import org.apache.xerces.xni.XMLDocumentHandler;
0042: import org.apache.xerces.xni.XMLLocator;
0043: import org.apache.xerces.xni.XMLResourceIdentifier;
0044: import org.apache.xerces.xni.XMLString;
0045: import org.apache.xerces.xni.XNIException;
0046: import org.apache.xerces.xni.parser.XMLComponentManager;
0047: import org.apache.xerces.xni.parser.XMLConfigurationException;
0048: import org.apache.xerces.xni.parser.XMLDocumentScanner;
0049: import org.apache.xerces.xni.parser.XMLInputSource;
0050:
0051: /**
0052: * A simple HTML scanner. This scanner makes no attempt to balance tags
0053: * or fix other problems in the source document — it just scans what
0054: * it can and generates XNI document "events", ignoring errors of all
0055: * kinds.
0056: * <p>
0057: * This component recognizes the following features:
0058: * <ul>
0059: * <li>http://cyberneko.org/html/features/augmentations
0060: * <li>http://cyberneko.org/html/features/report-errors
0061: * <li>http://apache.org/xml/features/scanner/notify-char-refs
0062: * <li>http://apache.org/xml/features/scanner/notify-builtin-refs
0063: * <li>http://cyberneko.org/html/features/scanner/notify-builtin-refs
0064: * <li>http://cyberneko.org/html/features/scanner/fix-mswindows-refs
0065: * <li>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
0066: * <li>http://cyberneko.org/html/features/scanner/script/strip-comment-delims
0067: * <li>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
0068: * <li>http://cyberneko.org/html/features/scanner/style/strip-comment-delims
0069: * <li>http://cyberneko.org/html/features/scanner/ignore-specified-charset
0070: * <li>http://cyberneko.org/html/features/scanner/cdata-sections
0071: * <li>http://cyberneko.org/html/features/override-doctype
 * <li>http://cyberneko.org/html/features/insert-doctype
 * <li>http://cyberneko.org/html/features/scanner/normalize-attrs
0073: * </ul>
0074: * <p>
0075: * This component recognizes the following properties:
0076: * <ul>
0077: * <li>http://cyberneko.org/html/properties/names/elems
0078: * <li>http://cyberneko.org/html/properties/names/attrs
0079: * <li>http://cyberneko.org/html/properties/default-encoding
0080: * <li>http://cyberneko.org/html/properties/error-reporter
0081: * <li>http://cyberneko.org/html/properties/doctype/pubid
0082: * <li>http://cyberneko.org/html/properties/doctype/sysid
0083: * </ul>
0084: *
0085: * @see HTMLElements
0086: * @see HTMLEntities
0087: *
0088: * @author Andy Clark
0089: * @author Ahmed Ashour
0090: *
0091: * @version $Id: HTMLScanner.java,v 1.19 2005/06/14 05:52:37 andyc Exp $
0092: */
0093: public class HTMLScanner implements XMLDocumentScanner, XMLLocator,
0094: HTMLComponent {
0095:
0096: //
0097: // Constants
0098: //
0099:
0100: // doctype info: HTML 4.01 strict
0101:
0102: /** HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). */
0103: public static final String HTML_4_01_STRICT_PUBID = "-//W3C//DTD HTML 4.01//EN";
0104:
0105: /** HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd"). */
0106: public static final String HTML_4_01_STRICT_SYSID = "http://www.w3.org/TR/html4/strict.dtd";
0107:
0108: // doctype info: HTML 4.01 loose
0109:
0110: /** HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 Transitional//EN"). */
0111: public static final String HTML_4_01_TRANSITIONAL_PUBID = "-//W3C//DTD HTML 4.01 Transitional//EN";
0112:
0113: /** HTML 4.01 transitional system identifier ("http://www.w3.org/TR/html4/loose.dtd"). */
0114: public static final String HTML_4_01_TRANSITIONAL_SYSID = "http://www.w3.org/TR/html4/loose.dtd";
0115:
0116: // doctype info: HTML 4.01 frameset
0117:
0118: /** HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN"). */
0119: public static final String HTML_4_01_FRAMESET_PUBID = "-//W3C//DTD HTML 4.01 Frameset//EN";
0120:
0121: /** HTML 4.01 frameset system identifier ("http://www.w3.org/TR/html4/frameset.dtd"). */
0122: public static final String HTML_4_01_FRAMESET_SYSID = "http://www.w3.org/TR/html4/frameset.dtd";
0123:
0124: // features
0125:
0126: /** Include infoset augmentations. */
0127: protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
0128:
0129: /** Report errors. */
0130: protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
0131:
0132: /** Notify character entity references (e.g. &#32;, &#x20;, etc). */
0133: public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";
0134:
0135: /**
0136: * Notify handler of built-in entity references (e.g. &amp;,
0137: * &lt;, etc).
0138: * <p>
0139: * <strong>Note:</strong>
0140: * This only applies to the five pre-defined XML general entities.
0141: * Specifically, "amp", "lt", "gt", "quot", and "apos". This is done
0142: * for compatibility with the Xerces feature.
0143: * <p>
0144: * To be notified of the built-in entity references in HTML, set the
0145: * <code>http://cyberneko.org/html/features/scanner/notify-builtin-refs</code>
0146: * feature to <code>true</code>.
0147: */
0148: public static final String NOTIFY_XML_BUILTIN_REFS = "http://apache.org/xml/features/scanner/notify-builtin-refs";
0149:
0150: /**
0151: * Notify handler of built-in entity references (e.g. &nobr;,
0152: * &copy;, etc).
0153: * <p>
0154: * <strong>Note:</strong>
0155: * This <em>includes</em> the five pre-defined XML general entities.
0156: */
0157: public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";
0158:
0159: /** Fix Microsoft Windows® character entity references. */
0160: public static final String FIX_MSWINDOWS_REFS = "http://cyberneko.org/html/features/scanner/fix-mswindows-refs";
0161:
0162: /**
 * Strip HTML comment delimiters ("&lt;!--" and
 * "--&gt;") from SCRIPT tag contents.
0165: */
0166: public static final String SCRIPT_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-comment-delims";
0167:
0168: /**
0169: * Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from
0170: * SCRIPT tag contents.
0171: */
0172: public static final String SCRIPT_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims";
0173:
0174: /**
 * Strip HTML comment delimiters ("&lt;!--" and
 * "--&gt;") from STYLE tag contents.
0177: */
0178: public static final String STYLE_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-comment-delims";
0179:
0180: /**
0181: * Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from
0182: * STYLE tag contents.
0183: */
0184: public static final String STYLE_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims";
0185:
0186: /**
 * Ignore the charset specified in the
 * &lt;meta http-equiv='Content-Type' content='text/html;charset=&hellip;'&gt; tag.
0189: */
0190: public static final String IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset";
0191:
0192: /** Scan CDATA sections. */
0193: public static final String CDATA_SECTIONS = "http://cyberneko.org/html/features/scanner/cdata-sections";
0194:
0195: /** Override doctype declaration public and system identifiers. */
0196: public static final String OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype";
0197:
0198: /** Insert document type declaration. */
0199: public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype";
0200:
0201: /** Normalize attribute values. */
0202: protected static final String NORMALIZE_ATTRIBUTES = "http://cyberneko.org/html/features/scanner/normalize-attrs";
0203:
0204: /** Recognized features. */
0205: private static final String[] RECOGNIZED_FEATURES = {
0206: AUGMENTATIONS, REPORT_ERRORS, NOTIFY_CHAR_REFS,
0207: NOTIFY_XML_BUILTIN_REFS, NOTIFY_HTML_BUILTIN_REFS,
0208: FIX_MSWINDOWS_REFS, SCRIPT_STRIP_CDATA_DELIMS,
0209: SCRIPT_STRIP_COMMENT_DELIMS, STYLE_STRIP_CDATA_DELIMS,
0210: STYLE_STRIP_COMMENT_DELIMS, IGNORE_SPECIFIED_CHARSET,
0211: CDATA_SECTIONS, OVERRIDE_DOCTYPE, INSERT_DOCTYPE,
0212: NORMALIZE_ATTRIBUTES, };
0213:
0214: /** Recognized features defaults. */
0215: private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
0216: null, null, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE,
0217: Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE,
0218: Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE,
0219: Boolean.FALSE, Boolean.FALSE, };
0220:
0221: // properties
0222:
0223: /** Modify HTML element names: { "upper", "lower", "default" }. */
0224: protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
0225:
0226: /** Modify HTML attribute names: { "upper", "lower", "default" }. */
0227: protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
0228:
0229: /** Default encoding. */
0230: protected static final String DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";
0231:
0232: /** Error reporter. */
0233: protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
0234:
0235: /** Doctype declaration public identifier. */
0236: protected static final String DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid";
0237:
0238: /** Doctype declaration system identifier. */
0239: protected static final String DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid";
0240:
0241: /** Recognized properties. */
0242: private static final String[] RECOGNIZED_PROPERTIES = {
0243: NAMES_ELEMS, NAMES_ATTRS, DEFAULT_ENCODING, ERROR_REPORTER,
0244: DOCTYPE_PUBID, DOCTYPE_SYSID, };
0245:
0246: /** Recognized properties defaults. */
0247: private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
0248: null, null, "Windows-1252", null,
0249: HTML_4_01_TRANSITIONAL_PUBID, HTML_4_01_TRANSITIONAL_SYSID, };
0250:
0251: // states
0252:
0253: /** State: content. */
0254: protected static final short STATE_CONTENT = 0;
0255:
0256: /** State: markup bracket. */
0257: protected static final short STATE_MARKUP_BRACKET = 1;
0258:
0259: /** State: start document. */
0260: protected static final short STATE_START_DOCUMENT = 10;
0261:
0262: /** State: end document. */
0263: protected static final short STATE_END_DOCUMENT = 11;
0264:
0265: // modify HTML names
0266:
0267: /** Don't modify HTML names. */
0268: protected static final short NAMES_NO_CHANGE = 0;
0269:
0270: /** Uppercase HTML names. */
0271: protected static final short NAMES_UPPERCASE = 1;
0272:
0273: /** Lowercase HTML names. */
0274: protected static final short NAMES_LOWERCASE = 2;
0275:
0276: // defaults
0277:
0278: /** Default buffer size. */
0279: protected static final int DEFAULT_BUFFER_SIZE = 2048;
0280:
0281: // debugging
0282:
0283: /** Set to true to debug changes in the scanner. */
0284: private static final boolean DEBUG_SCANNER = false;
0285:
0286: /** Set to true to debug changes in the scanner state. */
0287: private static final boolean DEBUG_SCANNER_STATE = false;
0288:
0289: /** Set to true to debug the buffer. */
0290: private static final boolean DEBUG_BUFFER = false;
0291:
0292: /** Set to true to debug character encoding handling. */
0293: private static final boolean DEBUG_CHARSET = false;
0294:
0295: /** Set to true to debug callbacks. */
0296: protected static final boolean DEBUG_CALLBACKS = false;
0297:
0298: // static vars
0299:
0300: /** Synthesized event info item. */
0301: protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();
0302:
0303: //
0304: // Data
0305: //
0306:
0307: // features
0308:
0309: /** Augmentations. */
0310: protected boolean fAugmentations;
0311:
0312: /** Report errors. */
0313: protected boolean fReportErrors;
0314:
0315: /** Notify character entity references. */
0316: protected boolean fNotifyCharRefs;
0317:
0318: /** Notify XML built-in general entity references. */
0319: protected boolean fNotifyXmlBuiltinRefs;
0320:
0321: /** Notify HTML built-in general entity references. */
0322: protected boolean fNotifyHtmlBuiltinRefs;
0323:
0324: /** Fix Microsoft Windows® character entity references. */
0325: protected boolean fFixWindowsCharRefs;
0326:
0327: /** Strip CDATA delimiters from SCRIPT tags. */
0328: protected boolean fScriptStripCDATADelims;
0329:
0330: /** Strip comment delimiters from SCRIPT tags. */
0331: protected boolean fScriptStripCommentDelims;
0332:
0333: /** Strip CDATA delimiters from STYLE tags. */
0334: protected boolean fStyleStripCDATADelims;
0335:
0336: /** Strip comment delimiters from STYLE tags. */
0337: protected boolean fStyleStripCommentDelims;
0338:
0339: /** Ignore specified character set. */
0340: protected boolean fIgnoreSpecifiedCharset;
0341:
0342: /** CDATA sections. */
0343: protected boolean fCDATASections;
0344:
0345: /** Override doctype declaration public and system identifiers. */
0346: protected boolean fOverrideDoctype;
0347:
0348: /** Insert document type declaration. */
0349: protected boolean fInsertDoctype;
0350:
0351: /** Normalize attribute values. */
0352: protected boolean fNormalizeAttributes;
0353:
0354: // properties
0355:
0356: /** Modify HTML element names. */
0357: protected short fNamesElems;
0358:
0359: /** Modify HTML attribute names. */
0360: protected short fNamesAttrs;
0361:
0362: /** Default encoding. */
0363: protected String fDefaultIANAEncoding;
0364:
0365: /** Error reporter. */
0366: protected HTMLErrorReporter fErrorReporter;
0367:
0368: /** Doctype declaration public identifier. */
0369: protected String fDoctypePubid;
0370:
0371: /** Doctype declaration system identifier. */
0372: protected String fDoctypeSysid;
0373:
0374: // boundary locator information
0375:
0376: /** Beginning line number. */
0377: protected int fBeginLineNumber;
0378:
0379: /** Beginning column number. */
0380: protected int fBeginColumnNumber;
0381:
0382: /** Ending line number. */
0383: protected int fEndLineNumber;
0384:
0385: /** Ending column number. */
0386: protected int fEndColumnNumber;
0387:
0388: // state
0389:
0390: /** The playback byte stream. */
0391: protected PlaybackInputStream fByteStream;
0392:
0393: /** Current entity. */
0394: protected CurrentEntity fCurrentEntity;
0395:
0396: /** The current entity stack. */
0397: protected final Stack fCurrentEntityStack = new Stack();
0398:
0399: /** The current scanner. */
0400: protected Scanner fScanner;
0401:
0402: /** The current scanner state. */
0403: protected short fScannerState;
0404:
0405: /** The document handler. */
0406: protected XMLDocumentHandler fDocumentHandler;
0407:
0408: /** Auto-detected IANA encoding. */
0409: protected String fIANAEncoding;
0410:
0411: /** Auto-detected Java encoding. */
0412: protected String fJavaEncoding;
0413:
0414: /** True if the encoding matches "ISO-8859-*". */
0415: protected boolean fIso8859Encoding;
0416:
0417: /** Element count. */
0418: protected int fElementCount;
0419:
0420: /** Element depth. */
0421: protected int fElementDepth;
0422:
0423: // scanners
0424:
0425: /** Content scanner. */
0426: protected Scanner fContentScanner = new ContentScanner();
0427:
0428: /**
0429: * Special scanner used for elements whose content needs to be scanned
0430: * as plain text, ignoring markup such as elements and entity references.
0431: * For example: <SCRIPT> and <COMMENT>.
0432: */
0433: protected SpecialScanner fSpecialScanner = new SpecialScanner();
0434:
0435: // temp vars
0436:
0437: /** String. */
0438: protected final XMLString fString = new XMLString();
0439:
0440: /** String buffer. */
0441: protected final XMLStringBuffer fStringBuffer = new XMLStringBuffer(
0442: 1024);
0443:
0444: /** String buffer. */
0445: private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer(
0446: 1024);
0447:
0448: /** Non-normalized attribute string buffer. */
0449: private final XMLStringBuffer fNonNormAttr = new XMLStringBuffer(
0450: 128);
0451:
0452: /** Augmentations. */
0453: private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
0454:
0455: /** Location infoset item. */
0456: private final LocationItem fLocationItem = new LocationItem();
0457:
0458: /** Single boolean array. */
0459: private final boolean[] fSingleBoolean = { false };
0460:
0461: /** Resource identifier. */
0462: private final XMLResourceIdentifierImpl fResourceId = new XMLResourceIdentifierImpl();
0463:
0464: //
0465: // Public methods
0466: //
0467:
0468: /**
0469: * Pushes an input source onto the current entity stack. This
0470: * enables the scanner to transparently scan new content (e.g.
0471: * the output written by an embedded script). At the end of the
0472: * current entity, the scanner returns where it left off at the
0473: * time this entity source was pushed.
0474: * <p>
0475: * <strong>Note:</strong>
0476: * This functionality is experimental at this time and is
0477: * subject to change in future releases of NekoHTML.
0478: *
0479: * @param inputSource The new input source to start scanning.
0480: * @see #evaluateInputSource(XMLInputSource)
0481: */
0482: public void pushInputSource(XMLInputSource inputSource) {
0483: final Reader reader = getReader(inputSource);
0484:
0485: fCurrentEntityStack.push(fCurrentEntity);
0486: String encoding = inputSource.getEncoding();
0487: String publicId = inputSource.getPublicId();
0488: String baseSystemId = inputSource.getBaseSystemId();
0489: String literalSystemId = inputSource.getSystemId();
0490: String expandedSystemId = expandSystemId(literalSystemId,
0491: baseSystemId);
0492: fCurrentEntity = new CurrentEntity(reader, encoding, publicId,
0493: baseSystemId, literalSystemId, expandedSystemId);
0494: } // pushInputSource(XMLInputSource)
0495:
0496: private Reader getReader(final XMLInputSource inputSource) {
0497: Reader reader = inputSource.getCharacterStream();
0498: if (reader == null) {
0499: try {
0500: return new InputStreamReader(inputSource
0501: .getByteStream(), fJavaEncoding);
0502: } catch (final UnsupportedEncodingException e) {
0503: // should not happen as this encoding is already used to parse the "main" source
0504: }
0505: }
0506: return reader;
0507: }
0508:
0509: /**
0510: * Immediately evaluates an input source and add the new content (e.g.
0511: * the output written by an embedded script).
0512: *
0513: * @param inputSource The new input source to start evaluating.
0514: * @see #pushInputSource(XMLInputSource)
0515: */
0516: public void evaluateInputSource(XMLInputSource inputSource) {
0517: final Reader reader = getReader(inputSource);
0518:
0519: String encoding = inputSource.getEncoding();
0520: String publicId = inputSource.getPublicId();
0521: String baseSystemId = inputSource.getBaseSystemId();
0522: String literalSystemId = inputSource.getSystemId();
0523: String expandedSystemId = expandSystemId(literalSystemId,
0524: baseSystemId);
0525: fCurrentEntity = new CurrentEntity(reader, encoding, publicId,
0526: baseSystemId, literalSystemId, expandedSystemId);
0527: setScanner(fContentScanner);
0528: setScannerState(STATE_CONTENT);
0529: try {
0530: fScanner.scan(false);
0531: } catch (final IOException e) {
0532: // ignore
0533: }
0534: } // evaluateInputSource(XMLInputSource)
0535:
0536: /**
0537: * Cleans up used resources. For example, if scanning is terminated
0538: * early, then this method ensures all remaining open streams are
0539: * closed.
0540: *
 * @param closeall If true, close all streams, including the original
 *                 document stream. Pass false when the application
 *                 opened the original document stream itself and
 *                 remains responsible for closing it.
0545: */
0546: public void cleanup(boolean closeall) {
0547: int size = fCurrentEntityStack.size();
0548: if (size > 0) {
0549: // current entity is not the original, so close it
0550: if (fCurrentEntity != null) {
0551: try {
0552: fCurrentEntity.stream.close();
0553: } catch (IOException e) {
0554: // ignore
0555: }
0556: }
0557: // close remaining streams
0558: for (int i = closeall ? 0 : 1; i < size; i++) {
0559: fCurrentEntity = (CurrentEntity) fCurrentEntityStack
0560: .pop();
0561: try {
0562: fCurrentEntity.stream.close();
0563: } catch (IOException e) {
0564: // ignore
0565: }
0566: }
0567: } else if (closeall && fCurrentEntity != null) {
0568: try {
0569: fCurrentEntity.stream.close();
0570: } catch (IOException e) {
0571: // ignore
0572: }
0573: }
0574: } // cleanup(boolean)
0575:
0576: //
0577: // XMLLocator methods
0578: //
0579:
0580: /** Returns the encoding. */
0581: public String getEncoding() {
0582: return fCurrentEntity != null ? fCurrentEntity.encoding : null;
0583: } // getEncoding():String
0584:
0585: /** Returns the public identifier. */
0586: public String getPublicId() {
0587: return fCurrentEntity != null ? fCurrentEntity.publicId : null;
0588: } // getPublicId():String
0589:
0590: /** Returns the base system identifier. */
0591: public String getBaseSystemId() {
0592: return fCurrentEntity != null ? fCurrentEntity.baseSystemId
0593: : null;
0594: } // getBaseSystemId():String
0595:
0596: /** Returns the literal system identifier. */
0597: public String getLiteralSystemId() {
0598: return fCurrentEntity != null ? fCurrentEntity.literalSystemId
0599: : null;
0600: } // getLiteralSystemId():String
0601:
0602: /** Returns the expanded system identifier. */
0603: public String getExpandedSystemId() {
0604: return fCurrentEntity != null ? fCurrentEntity.expandedSystemId
0605: : null;
0606: } // getExpandedSystemId():String
0607:
0608: /** Returns the current line number. */
0609: public int getLineNumber() {
0610: return fCurrentEntity != null ? fCurrentEntity.lineNumber : -1;
0611: } // getLineNumber():int
0612:
0613: /** Returns the current column number. */
0614: public int getColumnNumber() {
0615: return fCurrentEntity != null ? fCurrentEntity.columnNumber
0616: : -1;
0617: } // getColumnNumber():int
0618:
0619: /** Returns the XML version. */
0620: public String getXMLVersion() {
0621: return fCurrentEntity != null ? fCurrentEntity.version : null;
0622: } // getXMLVersion():String
0623:
0624: /** Returns the character offset. */
0625: public int getCharacterOffset() {
0626: return fCurrentEntity != null ? fCurrentEntity.charOffset : -1;
0627: } // getCharacterOffset():int
0628:
0629: //
0630: // HTMLComponent methods
0631: //
0632:
0633: /** Returns the default state for a feature. */
0634: public Boolean getFeatureDefault(String featureId) {
0635: int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length
0636: : 0;
0637: for (int i = 0; i < length; i++) {
0638: if (RECOGNIZED_FEATURES[i].equals(featureId)) {
0639: return RECOGNIZED_FEATURES_DEFAULTS[i];
0640: }
0641: }
0642: return null;
0643: } // getFeatureDefault(String):Boolean
0644:
0645: /** Returns the default state for a property. */
0646: public Object getPropertyDefault(String propertyId) {
0647: int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length
0648: : 0;
0649: for (int i = 0; i < length; i++) {
0650: if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
0651: return RECOGNIZED_PROPERTIES_DEFAULTS[i];
0652: }
0653: }
0654: return null;
0655: } // getPropertyDefault(String):Object
0656:
0657: //
0658: // XMLComponent methods
0659: //
0660:
0661: /** Returns recognized features. */
0662: public String[] getRecognizedFeatures() {
0663: return RECOGNIZED_FEATURES;
0664: } // getRecognizedFeatures():String[]
0665:
0666: /** Returns recognized properties. */
0667: public String[] getRecognizedProperties() {
0668: return RECOGNIZED_PROPERTIES;
0669: } // getRecognizedProperties():String[]
0670:
0671: /** Resets the component. */
0672: public void reset(XMLComponentManager manager)
0673: throws XMLConfigurationException {
0674:
0675: // get features
0676: fAugmentations = manager.getFeature(AUGMENTATIONS);
0677: fReportErrors = manager.getFeature(REPORT_ERRORS);
0678: fNotifyCharRefs = manager.getFeature(NOTIFY_CHAR_REFS);
0679: fNotifyXmlBuiltinRefs = manager
0680: .getFeature(NOTIFY_XML_BUILTIN_REFS);
0681: fNotifyHtmlBuiltinRefs = manager
0682: .getFeature(NOTIFY_HTML_BUILTIN_REFS);
0683: fFixWindowsCharRefs = manager.getFeature(FIX_MSWINDOWS_REFS);
0684: fScriptStripCDATADelims = manager
0685: .getFeature(SCRIPT_STRIP_CDATA_DELIMS);
0686: fScriptStripCommentDelims = manager
0687: .getFeature(SCRIPT_STRIP_COMMENT_DELIMS);
0688: fStyleStripCDATADelims = manager
0689: .getFeature(STYLE_STRIP_CDATA_DELIMS);
0690: fStyleStripCommentDelims = manager
0691: .getFeature(STYLE_STRIP_COMMENT_DELIMS);
0692: fIgnoreSpecifiedCharset = manager
0693: .getFeature(IGNORE_SPECIFIED_CHARSET);
0694: fCDATASections = manager.getFeature(CDATA_SECTIONS);
0695: fOverrideDoctype = manager.getFeature(OVERRIDE_DOCTYPE);
0696: fInsertDoctype = manager.getFeature(INSERT_DOCTYPE);
0697: fNormalizeAttributes = manager.getFeature(NORMALIZE_ATTRIBUTES);
0698:
0699: // get properties
0700: fNamesElems = getNamesValue(String.valueOf(manager
0701: .getProperty(NAMES_ELEMS)));
0702: fNamesAttrs = getNamesValue(String.valueOf(manager
0703: .getProperty(NAMES_ATTRS)));
0704: fDefaultIANAEncoding = String.valueOf(manager
0705: .getProperty(DEFAULT_ENCODING));
0706: fErrorReporter = (HTMLErrorReporter) manager
0707: .getProperty(ERROR_REPORTER);
0708: fDoctypePubid = String.valueOf(manager
0709: .getProperty(DOCTYPE_PUBID));
0710: fDoctypeSysid = String.valueOf(manager
0711: .getProperty(DOCTYPE_SYSID));
0712:
0713: } // reset(XMLComponentManager)
0714:
0715: /** Sets a feature. */
0716: public void setFeature(String featureId, boolean state)
0717: throws XMLConfigurationException {
0718:
0719: if (featureId.equals(AUGMENTATIONS)) {
0720: fAugmentations = state;
0721: } else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) {
0722: fIgnoreSpecifiedCharset = state;
0723: } else if (featureId.equals(NOTIFY_CHAR_REFS)) {
0724: fNotifyCharRefs = state;
0725: } else if (featureId.equals(NOTIFY_XML_BUILTIN_REFS)) {
0726: fNotifyXmlBuiltinRefs = state;
0727: } else if (featureId.equals(NOTIFY_HTML_BUILTIN_REFS)) {
0728: fNotifyHtmlBuiltinRefs = state;
0729: } else if (featureId.equals(FIX_MSWINDOWS_REFS)) {
0730: fFixWindowsCharRefs = state;
0731: } else if (featureId.equals(SCRIPT_STRIP_CDATA_DELIMS)) {
0732: fScriptStripCDATADelims = state;
0733: } else if (featureId.equals(SCRIPT_STRIP_COMMENT_DELIMS)) {
0734: fScriptStripCommentDelims = state;
0735: } else if (featureId.equals(STYLE_STRIP_CDATA_DELIMS)) {
0736: fStyleStripCDATADelims = state;
0737: } else if (featureId.equals(STYLE_STRIP_COMMENT_DELIMS)) {
0738: fStyleStripCommentDelims = state;
0739: } else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) {
0740: fIgnoreSpecifiedCharset = state;
0741: }
0742:
0743: } // setFeature(String,boolean)
0744:
0745: /** Sets a property. */
0746: public void setProperty(String propertyId, Object value)
0747: throws XMLConfigurationException {
0748:
0749: if (propertyId.equals(NAMES_ELEMS)) {
0750: fNamesElems = getNamesValue(String.valueOf(value));
0751: return;
0752: }
0753:
0754: if (propertyId.equals(NAMES_ATTRS)) {
0755: fNamesAttrs = getNamesValue(String.valueOf(value));
0756: return;
0757: }
0758:
0759: if (propertyId.equals(DEFAULT_ENCODING)) {
0760: fDefaultIANAEncoding = String.valueOf(value);
0761: return;
0762: }
0763:
0764: } // setProperty(String,Object)
0765:
0766: //
0767: // XMLDocumentScanner methods
0768: //
0769:
0770: /** Sets the input source. */
0771: public void setInputSource(XMLInputSource source)
0772: throws IOException {
0773:
0774: // reset state
0775: fElementCount = 0;
0776: fElementDepth = -1;
0777: fByteStream = null;
0778: fCurrentEntityStack.removeAllElements();
0779:
0780: fBeginLineNumber = 1;
0781: fBeginColumnNumber = 1;
0782: fEndLineNumber = fBeginLineNumber;
0783: fEndColumnNumber = fBeginColumnNumber;
0784:
0785: // reset encoding information
0786: fIANAEncoding = fDefaultIANAEncoding;
0787: fJavaEncoding = fIANAEncoding;
0788:
0789: // get location information
0790: String encoding = source.getEncoding();
0791: String publicId = source.getPublicId();
0792: String baseSystemId = source.getBaseSystemId();
0793: String literalSystemId = source.getSystemId();
0794: String expandedSystemId = expandSystemId(literalSystemId,
0795: baseSystemId);
0796:
0797: // open stream
0798: Reader reader = source.getCharacterStream();
0799: if (reader == null) {
0800: InputStream inputStream = source.getByteStream();
0801: if (inputStream == null) {
0802: URL url = new URL(expandedSystemId);
0803: inputStream = url.openStream();
0804: }
0805: fByteStream = new PlaybackInputStream(inputStream);
0806: String[] encodings = new String[2];
0807: if (encoding == null) {
0808: fByteStream.detectEncoding(encodings);
0809: } else {
0810: encodings[0] = encoding;
0811: }
0812: if (encodings[0] == null) {
0813: encodings[0] = fDefaultIANAEncoding;
0814: if (fReportErrors) {
0815: fErrorReporter.reportWarning("HTML1000", null);
0816: }
0817: }
0818: if (encodings[1] == null) {
0819: encodings[1] = EncodingMap
0820: .getIANA2JavaMapping(encodings[0].toUpperCase());
0821: if (encodings[1] == null) {
0822: encodings[1] = encodings[0];
0823: if (fReportErrors) {
0824: fErrorReporter.reportWarning("HTML1001",
0825: new Object[] { encodings[0] });
0826: }
0827: }
0828: }
0829: fIANAEncoding = encodings[0];
0830: fJavaEncoding = encodings[1];
0831: /* PATCH: Asgeir Asgeirsson */
0832: fIso8859Encoding = fIANAEncoding == null
0833: || fIANAEncoding.toUpperCase().startsWith(
0834: "ISO-8859")
0835: || fIANAEncoding
0836: .equalsIgnoreCase(fDefaultIANAEncoding);
0837: encoding = fIANAEncoding;
0838: reader = new InputStreamReader(fByteStream, fJavaEncoding);
0839: }
0840: fCurrentEntity = new CurrentEntity(reader, encoding, publicId,
0841: baseSystemId, literalSystemId, expandedSystemId);
0842:
0843: // set scanner and state
0844: setScanner(fContentScanner);
0845: setScannerState(STATE_START_DOCUMENT);
0846:
0847: } // setInputSource(XMLInputSource)
0848:
0849: /** Scans the document. */
0850: public boolean scanDocument(boolean complete) throws XNIException,
0851: IOException {
0852: do {
0853: if (!fScanner.scan(complete)) {
0854: return false;
0855: }
0856: } while (complete);
0857: return true;
0858: } // scanDocument(boolean):boolean
0859:
0860: /** Sets the document handler. */
0861: public void setDocumentHandler(XMLDocumentHandler handler) {
0862: fDocumentHandler = handler;
0863: } // setDocumentHandler(XMLDocumentHandler)
0864:
0865: // @since Xerces 2.1.0
0866:
0867: /** Returns the document handler. */
0868: public XMLDocumentHandler getDocumentHandler() {
0869: return fDocumentHandler;
0870: } // getDocumentHandler():XMLDocumentHandler
0871:
0872: //
0873: // Protected static methods
0874: //
0875:
0876: /** Returns the value of the specified attribute, ignoring case. */
0877: protected static String getValue(XMLAttributes attrs, String aname) {
0878: int length = attrs != null ? attrs.getLength() : 0;
0879: for (int i = 0; i < length; i++) {
0880: if (attrs.getQName(i).equalsIgnoreCase(aname)) {
0881: return attrs.getValue(i);
0882: }
0883: }
0884: return null;
0885: } // getValue(XMLAttributes,String):String
0886:
0887: /**
0888: * Expands a system id and returns the system id as a URI, if
0889: * it can be expanded. A return value of null means that the
0890: * identifier is already expanded. An exception thrown
0891: * indicates a failure to expand the id.
0892: *
0893: * @param systemId The systemId to be expanded.
0894: *
0895: * @return Returns the URI string representing the expanded system
0896: * identifier. A null value indicates that the given
0897: * system identifier is already expanded.
0898: *
0899: */
0900: public static String expandSystemId(String systemId,
0901: String baseSystemId) {
0902:
0903: // check for bad parameters id
0904: if (systemId == null || systemId.length() == 0) {
0905: return systemId;
0906: }
0907: // if id already expanded, return
0908: try {
0909: URI uri = new URI(systemId);
0910: if (uri != null) {
0911: return systemId;
0912: }
0913: } catch (URI.MalformedURIException e) {
0914: // continue on...
0915: }
0916: // normalize id
0917: String id = fixURI(systemId);
0918:
0919: // normalize base
0920: URI base = null;
0921: URI uri = null;
0922: try {
0923: if (baseSystemId == null || baseSystemId.length() == 0
0924: || baseSystemId.equals(systemId)) {
0925: String dir;
0926: try {
0927: dir = fixURI(System.getProperty("user.dir"));
0928: } catch (SecurityException se) {
0929: dir = "";
0930: }
0931: if (!dir.endsWith("/")) {
0932: dir = dir + "/";
0933: }
0934: base = new URI("file", "", dir, null, null);
0935: } else {
0936: try {
0937: base = new URI(fixURI(baseSystemId));
0938: } catch (URI.MalformedURIException e) {
0939: String dir;
0940: try {
0941: dir = fixURI(System.getProperty("user.dir"));
0942: } catch (SecurityException se) {
0943: dir = "";
0944: }
0945: if (baseSystemId.indexOf(':') != -1) {
0946: // for xml schemas we might have baseURI with
0947: // a specified drive
0948: base = new URI("file", "",
0949: fixURI(baseSystemId), null, null);
0950: } else {
0951: if (!dir.endsWith("/")) {
0952: dir = dir + "/";
0953: }
0954: dir = dir + fixURI(baseSystemId);
0955: base = new URI("file", "", dir, null, null);
0956: }
0957: }
0958: }
0959: // expand id
0960: uri = new URI(base, id);
0961: } catch (URI.MalformedURIException e) {
0962: // let it go through
0963: }
0964:
0965: if (uri == null) {
0966: return systemId;
0967: }
0968: return uri.toString();
0969:
0970: } // expandSystemId(String,String):String
0971:
0972: /**
0973: * Fixes a platform dependent filename to standard URI form.
0974: *
0975: * @param str The string to fix.
0976: *
0977: * @return Returns the fixed URI string.
0978: */
0979: protected static String fixURI(String str) {
0980:
0981: // handle platform dependent strings
0982: str = str.replace(java.io.File.separatorChar, '/');
0983:
0984: // Windows fix
0985: if (str.length() >= 2) {
0986: char ch1 = str.charAt(1);
0987: // change "C:blah" to "/C:blah"
0988: if (ch1 == ':') {
0989: char ch0 = Character.toUpperCase(str.charAt(0));
0990: if (ch0 >= 'A' && ch0 <= 'Z') {
0991: str = "/" + str;
0992: }
0993: }
0994: // change "//blah" to "file://blah"
0995: else if (ch1 == '/' && str.charAt(0) == '/') {
0996: str = "file:" + str;
0997: }
0998: }
0999:
1000: // done
1001: return str;
1002:
1003: } // fixURI(String):String
1004:
1005: /** Modifies the given name based on the specified mode. */
1006: protected static final String modifyName(String name, short mode) {
1007: switch (mode) {
1008: case NAMES_UPPERCASE:
1009: return name.toUpperCase();
1010: case NAMES_LOWERCASE:
1011: return name.toLowerCase();
1012: }
1013: return name;
1014: } // modifyName(String,short):String
1015:
1016: /**
1017: * Converts HTML names string value to constant value.
1018: *
1019: * @see #NAMES_NO_CHANGE
1020: * @see #NAMES_LOWERCASE
1021: * @see #NAMES_UPPERCASE
1022: */
1023: protected static final short getNamesValue(String value) {
1024: if (value.equals("lower")) {
1025: return NAMES_LOWERCASE;
1026: }
1027: if (value.equals("upper")) {
1028: return NAMES_UPPERCASE;
1029: }
1030: return NAMES_NO_CHANGE;
1031: } // getNamesValue(String):short
1032:
1033: /**
1034: * Fixes Microsoft Windows® specific characters.
1035: * <p>
1036: * Details about this common problem can be found at
1037: * <a href='http://www.cs.tut.fi/~jkorpela/www/windows-chars.html'>http://www.cs.tut.fi/~jkorpela/www/windows-chars.html</a>
1038: */
1039: protected int fixWindowsCharacter(int origChar) {
1040: /* PATCH: Asgeir Asgeirsson */
1041: switch (origChar) {
1042: case 130:
1043: return 8218;
1044: case 131:
1045: return 402;
1046: case 132:
1047: return 8222;
1048: case 133:
1049: return 8230;
1050: case 134:
1051: return 8224;
1052: case 135:
1053: return 8225;
1054: case 136:
1055: return 710;
1056: case 137:
1057: return 8240;
1058: case 138:
1059: return 352;
1060: case 139:
1061: return 8249;
1062: case 140:
1063: return 338;
1064: case 145:
1065: return 8216;
1066: case 146:
1067: return 8217;
1068: case 147:
1069: return 8220;
1070: case 148:
1071: return 8221;
1072: case 149:
1073: return 8226;
1074: case 150:
1075: return 8211;
1076: case 151:
1077: return 8212;
1078: case 152:
1079: return 732;
1080: case 153:
1081: return 8482;
1082: case 154:
1083: return 353;
1084: case 155:
1085: return 8250;
1086: case 156:
1087: return 339;
1088: case 159:
1089: return 376;
1090: }
1091: return origChar;
1092: } // fixWindowsCharacter(int):int
1093:
1094: //
1095: // Protected methods
1096: //
1097:
1098: // i/o
1099:
1100: /** Reads a single character. */
1101: protected int read() throws IOException {
1102: if (DEBUG_BUFFER) {
1103: System.out.print("(read: ");
1104: printBuffer();
1105: System.out.println();
1106: }
1107: if (fCurrentEntity.offset == fCurrentEntity.length) {
1108: if (load(0) == -1) {
1109: if (DEBUG_BUFFER) {
1110: System.out.println(")read: -> -1");
1111: }
1112: return -1;
1113: }
1114: }
1115: int c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1116: fCurrentEntity.columnNumber++;
1117: if (DEBUG_BUFFER) {
1118: System.out.print(")read: ");
1119: printBuffer();
1120: System.out.print(" -> ");
1121: System.out.print(c);
1122: System.out.println();
1123: }
1124: return c;
1125: } // read():int
1126:
1127: /**
1128: * Loads a new chunk of data into the buffer and returns the number of
1129: * characters loaded or -1 if no additional characters were loaded.
1130: *
1131: * @param offset The offset at which new characters should be loaded.
1132: */
1133: protected int load(int offset) throws IOException {
1134: if (DEBUG_BUFFER) {
1135: System.out.print("(load: ");
1136: printBuffer();
1137: System.out.println();
1138: }
1139: // resize buffer, if needed
1140: if (offset == fCurrentEntity.buffer.length) {
1141: int adjust = fCurrentEntity.buffer.length / 4;
1142: char[] array = new char[fCurrentEntity.buffer.length
1143: + adjust];
1144: System.arraycopy(fCurrentEntity.buffer, 0, array, 0,
1145: fCurrentEntity.length);
1146: fCurrentEntity.buffer = array;
1147: }
1148: // read a block of characters
1149: int count = fCurrentEntity.stream.read(fCurrentEntity.buffer,
1150: offset, fCurrentEntity.buffer.length - offset);
1151: fCurrentEntity.length = count != -1 ? count + offset : offset;
1152: fCurrentEntity.offset = offset;
1153: if (DEBUG_BUFFER) {
1154: System.out.print(")load: ");
1155: printBuffer();
1156: System.out.print(" -> ");
1157: System.out.print(count);
1158: System.out.println();
1159: }
1160: return count;
1161: } // load():int
1162:
1163: // debugging
1164:
1165: /** Sets the scanner. */
1166: protected void setScanner(Scanner scanner) {
1167: fScanner = scanner;
1168: if (DEBUG_SCANNER) {
1169: System.out.print("$$$ setScanner(");
1170: System.out.print(scanner != null ? scanner.getClass()
1171: .getName() : "null");
1172: System.out.println(");");
1173: }
1174: } // setScanner(Scanner)
1175:
1176: /** Sets the scanner state. */
1177: protected void setScannerState(short state) {
1178: fScannerState = state;
1179: if (DEBUG_SCANNER_STATE) {
1180: System.out.print("$$$ setScannerState(");
1181: switch (fScannerState) {
1182: case STATE_CONTENT: {
1183: System.out.print("STATE_CONTENT");
1184: break;
1185: }
1186: case STATE_MARKUP_BRACKET: {
1187: System.out.print("STATE_MARKUP_BRACKET");
1188: break;
1189: }
1190: case STATE_START_DOCUMENT: {
1191: System.out.print("STATE_START_DOCUMENT");
1192: break;
1193: }
1194: case STATE_END_DOCUMENT: {
1195: System.out.print("STATE_END_DOCUMENT");
1196: break;
1197: }
1198: }
1199: System.out.println(");");
1200: }
1201: } // setScannerState(short)
1202:
1203: // scanning
1204:
1205: /** Scans a DOCTYPE line. */
1206: protected void scanDoctype() throws IOException {
1207: String root = null;
1208: String pubid = null;
1209: String sysid = null;
1210:
1211: if (skipSpaces()) {
1212: root = scanName();
1213: if (root == null) {
1214: if (fReportErrors) {
1215: fErrorReporter.reportError("HTML1014", null);
1216: }
1217: } else {
1218: root = modifyName(root, fNamesElems);
1219: }
1220: if (skipSpaces()) {
1221: if (skip("PUBLIC", false)) {
1222: skipSpaces();
1223: pubid = scanLiteral();
1224: if (skipSpaces()) {
1225: sysid = scanLiteral();
1226: }
1227: } else if (skip("SYSTEM", false)) {
1228: skipSpaces();
1229: sysid = scanLiteral();
1230: }
1231: }
1232: }
1233: int c;
1234: while ((c = read()) != -1) {
1235: if (c == '<') {
1236: fCurrentEntity.offset--;
1237: fCurrentEntity.columnNumber--;
1238: break;
1239: }
1240: if (c == '>') {
1241: break;
1242: }
1243: if (c == '[') {
1244: skipMarkup(true);
1245: break;
1246: }
1247: }
1248:
1249: if (fDocumentHandler != null) {
1250: if (fOverrideDoctype) {
1251: pubid = fDoctypePubid;
1252: sysid = fDoctypeSysid;
1253: }
1254: fEndLineNumber = fCurrentEntity.lineNumber;
1255: fEndColumnNumber = fCurrentEntity.columnNumber;
1256: fDocumentHandler.doctypeDecl(root, pubid, sysid,
1257: locationAugs());
1258: }
1259:
1260: } // scanDoctype()
1261:
1262: /** Scans a quoted literal. */
1263: protected String scanLiteral() throws IOException {
1264: int quote = read();
1265: if (quote == '\'' || quote == '"') {
1266: StringBuffer str = new StringBuffer();
1267: int c;
1268: while ((c = read()) != -1) {
1269: if (c == quote) {
1270: break;
1271: }
1272: if (c == '\r' || c == '\n') {
1273: fCurrentEntity.offset--;
1274: fCurrentEntity.columnNumber--;
1275: // NOTE: This collapses newlines to a single space.
1276: // [Q] Is this the right thing to do here? -Ac
1277: skipNewlines();
1278: str.append(' ');
1279: } else if (c == '<') {
1280: fCurrentEntity.offset--;
1281: fCurrentEntity.columnNumber--;
1282: break;
1283: } else {
1284: str.append((char) c);
1285: }
1286: }
1287: if (c == -1) {
1288: if (fReportErrors) {
1289: fErrorReporter.reportError("HTML1007", null);
1290: }
1291: throw new EOFException();
1292: }
1293: return str.toString();
1294: } else {
1295: fCurrentEntity.offset--;
1296: fCurrentEntity.columnNumber--;
1297: }
1298: return null;
1299: } // scanLiteral():String
1300:
1301: /** Scans a name. */
1302: protected String scanName() throws IOException {
1303: if (DEBUG_BUFFER) {
1304: System.out.print("(scanName: ");
1305: printBuffer();
1306: System.out.println();
1307: }
1308: if (fCurrentEntity.offset == fCurrentEntity.length) {
1309: if (load(0) == -1) {
1310: if (DEBUG_BUFFER) {
1311: System.out.print(")scanName: ");
1312: printBuffer();
1313: System.out.println(" -> null");
1314: }
1315: return null;
1316: }
1317: }
1318: int offset = fCurrentEntity.offset;
1319: while (true) {
1320: while (fCurrentEntity.offset < fCurrentEntity.length) {
1321: char c = fCurrentEntity.buffer[fCurrentEntity.offset];
1322: if (!Character.isLetterOrDigit(c)
1323: && !(c == '-' || c == '.' || c == ':' || c == '_')) {
1324: break;
1325: }
1326: fCurrentEntity.offset++;
1327: fCurrentEntity.columnNumber++;
1328: }
1329: if (fCurrentEntity.offset == fCurrentEntity.length) {
1330: int length = fCurrentEntity.length - offset;
1331: System.arraycopy(fCurrentEntity.buffer, offset,
1332: fCurrentEntity.buffer, 0, length);
1333: int count = load(length);
1334: offset = 0;
1335: if (count == -1) {
1336: break;
1337: }
1338: } else {
1339: break;
1340: }
1341: }
1342: int length = fCurrentEntity.offset - offset;
1343: String name = length > 0 ? new String(fCurrentEntity.buffer,
1344: offset, length) : null;
1345: if (DEBUG_BUFFER) {
1346: System.out.print(")scanName: ");
1347: printBuffer();
1348: System.out.print(" -> \"");
1349: System.out.print(name);
1350: System.out.println('"');
1351: }
1352: return name;
1353: } // scanName():String
1354:
1355: /** Scans an entity reference. */
1356: protected int scanEntityRef(XMLStringBuffer str, boolean content)
1357: throws IOException {
1358: str.clear();
1359: str.append('&');
1360: while (true) {
1361: int c = read();
1362: if (c == ';') {
1363: str.append(';');
1364: break;
1365: }
1366: if (c == -1) {
1367: if (fReportErrors) {
1368: fErrorReporter.reportWarning("HTML1004", null);
1369: }
1370: if (content && fDocumentHandler != null
1371: && fElementCount >= fElementDepth) {
1372: fEndLineNumber = fCurrentEntity.lineNumber;
1373: fEndColumnNumber = fCurrentEntity.columnNumber;
1374: fDocumentHandler.characters(str, locationAugs());
1375: }
1376: return -1;
1377: }
1378: if (!Character.isLetterOrDigit((char) c) && c != '#') {
1379: if (fReportErrors) {
1380: fErrorReporter.reportWarning("HTML1004", null);
1381: }
1382: fCurrentEntity.offset--;
1383: fCurrentEntity.columnNumber--;
1384: if (content && fDocumentHandler != null
1385: && fElementCount >= fElementDepth) {
1386: fEndLineNumber = fCurrentEntity.lineNumber;
1387: fEndColumnNumber = fCurrentEntity.columnNumber;
1388: fDocumentHandler.characters(str, locationAugs());
1389: }
1390: return -1;
1391: }
1392: str.append((char) c);
1393: }
1394: if (str.length == 1) {
1395: if (content && fDocumentHandler != null
1396: && fElementCount >= fElementDepth) {
1397: fEndLineNumber = fCurrentEntity.lineNumber;
1398: fEndColumnNumber = fCurrentEntity.columnNumber;
1399: fDocumentHandler.characters(str, locationAugs());
1400: }
1401: return -1;
1402: }
1403:
1404: String name = str.toString().substring(1, str.length - 1);
1405: if (name.startsWith("#")) {
1406: int value = -1;
1407: try {
1408: if (name.startsWith("#x")) {
1409: value = Integer.parseInt(name.substring(2), 16);
1410: } else {
1411: value = Integer.parseInt(name.substring(1));
1412: }
1413: /* PATCH: Asgeir Asgeirsson */
1414: if (fFixWindowsCharRefs && fIso8859Encoding) {
1415: value = fixWindowsCharacter(value);
1416: }
1417: if (content && fDocumentHandler != null
1418: && fElementCount >= fElementDepth) {
1419: fEndLineNumber = fCurrentEntity.lineNumber;
1420: fEndColumnNumber = fCurrentEntity.columnNumber;
1421: if (fNotifyCharRefs) {
1422: XMLResourceIdentifier id = resourceId();
1423: String encoding = null;
1424: fDocumentHandler.startGeneralEntity(name, id,
1425: encoding, locationAugs());
1426: }
1427: str.clear();
1428: str.append((char) value);
1429: fDocumentHandler.characters(str, locationAugs());
1430: if (fNotifyCharRefs) {
1431: fDocumentHandler.endGeneralEntity(name,
1432: locationAugs());
1433: }
1434: }
1435: } catch (NumberFormatException e) {
1436: if (fReportErrors) {
1437: fErrorReporter.reportError("HTML1005",
1438: new Object[] { name });
1439: }
1440: if (content && fDocumentHandler != null
1441: && fElementCount >= fElementDepth) {
1442: fEndLineNumber = fCurrentEntity.lineNumber;
1443: fEndColumnNumber = fCurrentEntity.columnNumber;
1444: fDocumentHandler.characters(str, locationAugs());
1445: }
1446: }
1447: return value;
1448: }
1449:
1450: int c = HTMLEntities.get(name);
1451: if (c == -1) {
1452: if (fReportErrors) {
1453: fErrorReporter.reportWarning("HTML1006",
1454: new Object[] { name });
1455: }
1456: if (content && fDocumentHandler != null
1457: && fElementCount >= fElementDepth) {
1458: fEndLineNumber = fCurrentEntity.lineNumber;
1459: fEndColumnNumber = fCurrentEntity.columnNumber;
1460: fDocumentHandler.characters(str, locationAugs());
1461: }
1462: return -1;
1463: }
1464: if (content && fDocumentHandler != null
1465: && fElementCount >= fElementDepth) {
1466: fEndLineNumber = fCurrentEntity.lineNumber;
1467: fEndColumnNumber = fCurrentEntity.columnNumber;
1468: boolean notify = fNotifyHtmlBuiltinRefs
1469: || (fNotifyXmlBuiltinRefs && builtinXmlRef(name));
1470: if (notify) {
1471: XMLResourceIdentifier id = resourceId();
1472: String encoding = null;
1473: fDocumentHandler.startGeneralEntity(name, id, encoding,
1474: locationAugs());
1475: }
1476: str.clear();
1477: str.append((char) c);
1478: fDocumentHandler.characters(str, locationAugs());
1479: if (notify) {
1480: fDocumentHandler.endGeneralEntity(name, locationAugs());
1481: }
1482: }
1483: return c;
1484:
1485: } // scanEntityRef(XMLStringBuffer,boolean):int
1486:
1487: /** Returns true if the specified text is present and is skipped. */
1488: protected boolean skip(String s, boolean caseSensitive)
1489: throws IOException {
1490: int length = s != null ? s.length() : 0;
1491: for (int i = 0; i < length; i++) {
1492: if (fCurrentEntity.offset == fCurrentEntity.length) {
1493: System.arraycopy(fCurrentEntity.buffer,
1494: fCurrentEntity.offset - i,
1495: fCurrentEntity.buffer, 0, i);
1496: if (load(i) == -1) {
1497: fCurrentEntity.offset = 0;
1498: return false;
1499: }
1500: }
1501: char c0 = s.charAt(i);
1502: char c1 = fCurrentEntity.buffer[fCurrentEntity.offset++];
1503: fCurrentEntity.columnNumber++;
1504: if (!caseSensitive) {
1505: c0 = Character.toUpperCase(c0);
1506: c1 = Character.toUpperCase(c1);
1507: }
1508: if (c0 != c1) {
1509: fCurrentEntity.offset -= i + 1;
1510: return false;
1511: }
1512: }
1513: return true;
1514: } // skip(String):boolean
1515:
1516: /** Skips markup. */
1517: protected boolean skipMarkup(boolean balance) throws IOException {
1518: if (DEBUG_BUFFER) {
1519: System.out.print("(skipMarkup: ");
1520: printBuffer();
1521: System.out.println();
1522: }
1523: int depth = 1;
1524: boolean slashgt = false;
1525: OUTER: while (true) {
1526: if (fCurrentEntity.offset == fCurrentEntity.length) {
1527: if (load(0) == -1) {
1528: break OUTER;
1529: }
1530: }
1531: while (fCurrentEntity.offset < fCurrentEntity.length) {
1532: char c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1533: fCurrentEntity.columnNumber++;
1534: if (balance && c == '<') {
1535: depth++;
1536: } else if (c == '>') {
1537: depth--;
1538: if (depth == 0) {
1539: break OUTER;
1540: }
1541: } else if (c == '/') {
1542: if (fCurrentEntity.offset == fCurrentEntity.length) {
1543: if (load(0) == -1) {
1544: break OUTER;
1545: }
1546: }
1547: c = fCurrentEntity.buffer[fCurrentEntity.offset++];
1548: fCurrentEntity.columnNumber++;
1549: if (c == '>') {
1550: slashgt = true;
1551: depth--;
1552: if (depth == 0) {
1553: break OUTER;
1554: }
1555: } else {
1556: fCurrentEntity.offset--;
1557: fCurrentEntity.columnNumber--;
1558: }
1559: } else if (c == '\r' || c == '\n') {
1560: skipNewlines();
1561: }
1562: }
1563: }
1564: if (DEBUG_BUFFER) {
1565: System.out.print(")skipMarkup: ");
1566: printBuffer();
1567: System.out.print(" -> " + slashgt);
1568: System.out.println();
1569: }
1570: return slashgt;
1571: } // skipMarkup():boolean
1572:
1573: /** Skips whitespace. */
1574: protected boolean skipSpaces() throws IOException {
1575: if (DEBUG_BUFFER) {
1576: System.out.print("(skipSpaces: ");
1577: printBuffer();
1578: System.out.println();
1579: }
1580: boolean spaces = false;
1581: while (true) {
1582: if (fCurrentEntity.offset == fCurrentEntity.length) {
1583: if (load(0) == -1) {
1584: break;
1585: }
1586: }
1587: char c = fCurrentEntity.buffer[fCurrentEntity.offset];
1588: if (!Character.isSpace(c)) {
1589: break;
1590: }
1591: spaces = true;
1592: if (c == '\r' || c == '\n') {
1593: skipNewlines();
1594: continue;
1595: }
1596: fCurrentEntity.offset++;
1597: fCurrentEntity.columnNumber++;
1598: }
1599: if (DEBUG_BUFFER) {
1600: System.out.print(")skipSpaces: ");
1601: printBuffer();
1602: System.out.print(" -> ");
1603: System.out.print(spaces);
1604: System.out.println();
1605: }
1606: return spaces;
1607: } // skipSpaces()
1608:
1609: /** Skips newlines and returns the number of newlines skipped. */
1610: protected int skipNewlines() throws IOException {
1611: return skipNewlines(Integer.MAX_VALUE);
1612: } // skipNewlines():int
1613:
/**
 * Skips newlines at the current position and returns the number of
 * newlines skipped. A "\r\n" pair, a lone '\r' and a lone '\n' each
 * count as one newline. If any newline was skipped, the current
 * entity's line number is advanced by the count and its column number
 * is reset to 1.
 *
 * @param maxlines Upper bound checked after each newline; skipping
 *                 stops once this many newlines have been counted.
 *
 * @return The number of newlines skipped; 0 if the current character is
 *         not a newline or the input is exhausted.
 *
 * @throws IOException Thrown if loading more characters fails.
 */
protected int skipNewlines(int maxlines) throws IOException {
    if (DEBUG_BUFFER) {
        System.out.print("(skipNewlines: ");
        printBuffer();
        System.out.println();
    }
    // make sure there is a current character to look at
    if (fCurrentEntity.offset == fCurrentEntity.length) {
        if (load(0) == -1) {
            if (DEBUG_BUFFER) {
                System.out.print(")skipNewlines: ");
                printBuffer();
                System.out.println();
            }
            return 0;
        }
    }
    char c = fCurrentEntity.buffer[fCurrentEntity.offset];
    int newlines = 0;
    // NOTE(review): this local is assigned below but never read within
    // this method.
    int offset = fCurrentEntity.offset;
    if (c == '\n' || c == '\r') {
        do {
            // consume the next character
            c = fCurrentEntity.buffer[fCurrentEntity.offset++];
            if (c == '\r') {
                newlines++;
                // buffer exhausted: load more input starting at buffer
                // position `newlines`
                if (fCurrentEntity.offset == fCurrentEntity.length) {
                    offset = 0;
                    fCurrentEntity.offset = newlines;
                    if (load(newlines) == -1) {
                        break;
                    }
                }
                // a '\n' directly after the '\r' belongs to the same
                // newline, so consume it without counting it again
                if (fCurrentEntity.buffer[fCurrentEntity.offset] == '\n') {
                    fCurrentEntity.offset++;
                    offset++;
                }
            } else if (c == '\n') {
                newlines++;
                // buffer exhausted: same refill as in the '\r' case
                if (fCurrentEntity.offset == fCurrentEntity.length) {
                    offset = 0;
                    fCurrentEntity.offset = newlines;
                    if (load(newlines) == -1) {
                        break;
                    }
                }
            } else {
                // not a newline: un-read the character and stop
                fCurrentEntity.offset--;
                break;
            }
            // the loop also stops when at most one buffered character
            // remains, even if that character is a newline
        } while (newlines < maxlines
                && fCurrentEntity.offset < fCurrentEntity.length - 1);
        fCurrentEntity.lineNumber += newlines;
        fCurrentEntity.columnNumber = 1;
    }
    if (DEBUG_BUFFER) {
        System.out.print(")skipNewlines: ");
        printBuffer();
        System.out.print(" -> ");
        System.out.print(newlines);
        System.out.println();
    }
    return newlines;
} // skipNewlines(int):int
1677:
1678: // infoset utility methods
1679:
1680: /** Returns an augmentations object with a location item added. */
1681: protected final Augmentations locationAugs() {
1682: HTMLAugmentations augs = null;
1683: if (fAugmentations) {
1684: fLocationItem.setValues(fBeginLineNumber,
1685: fBeginColumnNumber, fEndLineNumber,
1686: fEndColumnNumber);
1687: augs = fInfosetAugs;
1688: augs.removeAllItems();
1689: augs.putItem(AUGMENTATIONS, fLocationItem);
1690: }
1691: return augs;
1692: } // locationAugs():Augmentations
1693:
1694: /** Returns an augmentations object with a synthesized item added. */
1695: protected final Augmentations synthesizedAugs() {
1696: HTMLAugmentations augs = null;
1697: if (fAugmentations) {
1698: augs = fInfosetAugs;
1699: augs.removeAllItems();
1700: augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
1701: }
1702: return augs;
1703: } // synthesizedAugs():Augmentations
1704:
1705: /** Returns an empty resource identifier. */
1706: protected final XMLResourceIdentifier resourceId() {
1707: /***/
1708: fResourceId.clear();
1709: return fResourceId;
1710: /***
1711: // NOTE: Unfortunately, the Xerces DOM parser classes expect a
1712: // non-null resource identifier object to be passed to
1713: // startGeneralEntity. -Ac
1714: return null;
1715: /***/
1716: } // resourceId():XMLResourceIdentifier
1717:
1718: //
1719: // Protected static methods
1720: //
1721:
1722: /** Returns true if the name is a built-in XML general entity reference. */
1723: protected static boolean builtinXmlRef(String name) {
1724: return name.equals("amp") || name.equals("lt")
1725: || name.equals("gt") || name.equals("quot")
1726: || name.equals("apos");
1727: } // builtinXmlRef(String):boolean
1728:
1729: //
1730: // Private methods
1731: //
1732:
1733: /** Prints the contents of the character buffer to standard out. */
1734: private void printBuffer() {
1735: if (DEBUG_BUFFER) {
1736: System.out.print('[');
1737: System.out.print(fCurrentEntity.length);
1738: System.out.print(' ');
1739: System.out.print(fCurrentEntity.offset);
1740: if (fCurrentEntity.length > 0) {
1741: System.out.print(" \"");
1742: for (int i = 0; i < fCurrentEntity.length; i++) {
1743: if (i == fCurrentEntity.offset) {
1744: System.out.print('^');
1745: }
1746: char c = fCurrentEntity.buffer[i];
1747: switch (c) {
1748: case '\r': {
1749: System.out.print("\\r");
1750: break;
1751: }
1752: case '\n': {
1753: System.out.print("\\n");
1754: break;
1755: }
1756: case '\t': {
1757: System.out.print("\\t");
1758: break;
1759: }
1760: case '"': {
1761: System.out.print("\\\"");
1762: break;
1763: }
1764: default: {
1765: System.out.print(c);
1766: }
1767: }
1768: }
1769: if (fCurrentEntity.offset == fCurrentEntity.length) {
1770: System.out.print('^');
1771: }
1772: System.out.print('"');
1773: }
1774: System.out.print(']');
1775: }
1776: } // printBuffer()
1777:
1778: //
1779: // Interfaces
1780: //
1781:
/**
 * Basic scanner interface. A scanner consumes part of the document each
 * time it is asked to scan, which allows parsing to be driven in a
 * pull fashion.
 *
 * @author Andy Clark
 */
public interface Scanner {

    //
    // Scanner methods
    //

    /**
     * Scans part of the document.
     *
     * @param complete True if the scanner should not return until
     *                 scanning is complete.
     *
     * @return True if additional scanning is required.
     *
     * @throws IOException Thrown if I/O error occurs.
     */
    boolean scan(boolean complete) throws IOException;

} // interface Scanner
1807:
1808: //
1809: // Classes
1810: //
1811:
1812: /**
1813: * Current entity.
1814: *
1815: * @author Andy Clark
1816: */
1817: public static class CurrentEntity {
1818:
1819: //
1820: // Data
1821: //
1822:
1823: /** Character stream. */
1824: public Reader stream;
1825:
1826: /** Encoding. */
1827: public String encoding;
1828:
1829: /** Public identifier. */
1830: public String publicId;
1831:
1832: /** Base system identifier. */
1833: public String baseSystemId;
1834:
1835: /** Literal system identifier. */
1836: public String literalSystemId;
1837:
1838: /** Expanded system identifier. */
1839: public String expandedSystemId;
1840:
1841: /** XML version. */
1842: public String version = "1.0";
1843:
1844: /** Line number. */
1845: public int lineNumber = 1;
1846:
1847: /** Column number. */
1848: public int columnNumber = 1;
1849:
1850: /** Character offset. */
1851: public int charOffset = -1;
1852:
1853: // buffer
1854:
1855: /** Character buffer. */
1856: public char[] buffer = new char[DEFAULT_BUFFER_SIZE];
1857:
1858: /** Offset into character buffer. */
1859: public int offset = 0;
1860:
1861: /** Length of characters read into character buffer. */
1862: public int length = 0;
1863:
1864: //
1865: // Constructors
1866: //
1867:
1868: /** Constructs an entity from the specified stream. */
1869: public CurrentEntity(Reader stream, String encoding,
1870: String publicId, String baseSystemId,
1871: String literalSystemId, String expandedSystemId) {
1872: this .stream = stream;
1873: this .encoding = encoding;
1874: this .publicId = publicId;
1875: this .baseSystemId = baseSystemId;
1876: this .literalSystemId = literalSystemId;
1877: this .expandedSystemId = expandedSystemId;
1878: } // <init>(Reader,String,String,String,String)
1879:
1880: } // class CurrentEntity
1881:
1882: /**
1883: * The primary HTML document scanner.
1884: *
1885: * @author Andy Clark
1886: */
1887: public class ContentScanner implements Scanner {
1888:
1889: //
1890: // Data
1891: //
1892:
1893: // temp vars
1894:
1895: /** A qualified name. */
1896: private final QName fQName = new QName();
1897:
1898: /** Attributes. */
1899: private final XMLAttributesImpl fAttributes = new XMLAttributesImpl();
1900:
1901: //
1902: // Scanner methods
1903: //
1904:
/**
 * Scans document content by running the scanner state machine: start
 * of document, content, markup bracket and end of document.
 * <p>
 * Reaching the end of the current entity is signalled internally with
 * an {@code EOFException}; it either resumes the entity below it on
 * the entity stack or, if the stack is empty, moves to the
 * end-of-document state.
 *
 * @param complete True if the scanner should keep going until the end
 *                 of the document has been reported.
 *
 * @return True if additional scanning is required; false once the end
 *         of the document has been processed.
 *
 * @throws IOException Thrown if an I/O error occurs while scanning.
 */
public boolean scan(boolean complete) throws IOException {
    // set to true when the state machine should run again right away
    // even though the caller asked for only a partial scan
    boolean next;
    do {
        try {
            next = false;
            switch (fScannerState) {
                case STATE_CONTENT: {
                    // remember where this piece of content begins
                    fBeginLineNumber = fCurrentEntity.lineNumber;
                    fBeginColumnNumber = fCurrentEntity.columnNumber;
                    int c = read();
                    if (c == '<') {
                        // markup: handled in the next pass
                        setScannerState(STATE_MARKUP_BRACKET);
                        next = true;
                    } else if (c == '&') {
                        scanEntityRef(fStringBuffer, true);
                    } else if (c == -1) {
                        // end of entity; handled by the catch below
                        throw new EOFException();
                    } else {
                        // character data: un-read the character and
                        // scan the whole run
                        fCurrentEntity.offset--;
                        fCurrentEntity.columnNumber--;
                        scanCharacters();
                    }
                    break;
                }
                case STATE_MARKUP_BRACKET: {
                    // a '<' has just been consumed; dispatch on what
                    // follows it
                    int c = read();
                    if (c == '!') {
                        // keywords are matched case-insensitively
                        if (skip("--", false)) {
                            scanComment();
                        } else if (skip("[CDATA[", false)) {
                            scanCDATA();
                        } else if (skip("DOCTYPE", false)) {
                            scanDoctype();
                        } else {
                            // unrecognized "<!" markup: report it and
                            // skip it
                            if (fReportErrors) {
                                fErrorReporter.reportError(
                                        "HTML1002", null);
                            }
                            skipMarkup(true);
                        }
                    } else if (c == '?') {
                        scanPI();
                    } else if (c == '/') {
                        scanEndElement();
                    } else if (c == -1) {
                        // input ended right after '<': report the
                        // error and pass the '<' on as characters
                        if (fReportErrors) {
                            fErrorReporter.reportError("HTML1003",
                                    null);
                        }
                        if (fDocumentHandler != null
                                && fElementCount >= fElementDepth) {
                            fStringBuffer.clear();
                            fStringBuffer.append('<');
                            fDocumentHandler.characters(
                                    fStringBuffer, null);
                        }
                        throw new EOFException();
                    } else {
                        // start tag: un-read the first character of
                        // the element name and scan the tag
                        fCurrentEntity.offset--;
                        fCurrentEntity.columnNumber--;
                        fElementCount++;
                        // NOTE(review): presumably scanStartElement
                        // sets this flag for an empty element such as
                        // <x/> -- confirm against scanStartElement.
                        fSingleBoolean[0] = false;
                        String ename = scanStartElement(fSingleBoolean);
                        if (ename != null
                                && !fSingleBoolean[0]
                                && HTMLElements.getElement(ename)
                                        .isSpecial()) {
                            // the content of a "special" element is
                            // handed over to the special scanner
                            setScanner(fSpecialScanner
                                    .setElementName(ename));
                            setScannerState(STATE_CONTENT);
                            return true;
                        }
                    }
                    setScannerState(STATE_CONTENT);
                    break;
                }
                case STATE_START_DOCUMENT: {
                    if (fDocumentHandler != null
                            && fElementCount >= fElementDepth) {
                        if (DEBUG_CALLBACKS) {
                            System.out.println("startDocument()");
                        }
                        XMLLocator locator = HTMLScanner.this;
                        String encoding = fIANAEncoding;
                        Augmentations augs = locationAugs();
                        // startDocument is invoked reflectively: first
                        // the four-argument form that takes a
                        // NamespaceContext, then, if the handler has no
                        // such method, the three-argument form
                        try {
                            // NOTE: Hack to allow the default filter to work with
                            //       old and new versions of the XNI document handler
                            //       interface. -Ac
                            Class cls = fDocumentHandler.getClass();
                            Class[] types = { XMLLocator.class,
                                    String.class,
                                    NamespaceContext.class,
                                    Augmentations.class };
                            Method method = cls.getMethod(
                                    "startDocument", types);
                            NamespaceContext nscontext = new NamespaceSupport();
                            Object[] params = { locator, encoding,
                                    nscontext, augs };
                            method.invoke(fDocumentHandler, params);
                        } catch (IllegalAccessException e) {
                            throw new XNIException(e);
                        } catch (InvocationTargetException e) {
                            throw new XNIException(e);
                        } catch (NoSuchMethodException e) {
                            try {
                                // NOTE: Hack to allow the default filter to work with
                                //       old and new versions of the XNI document handler
                                //       interface. -Ac
                                Class cls = fDocumentHandler
                                        .getClass();
                                Class[] types = { XMLLocator.class,
                                        String.class,
                                        Augmentations.class };
                                Method method = cls.getMethod(
                                        "startDocument", types);
                                Object[] params = { locator,
                                        encoding, augs };
                                method.invoke(fDocumentHandler,
                                        params);
                            } catch (IllegalAccessException ex) {
                                // NOTE: Should never reach here!
                                throw new XNIException(ex);
                            } catch (InvocationTargetException ex) {
                                // NOTE: Should never reach here!
                                throw new XNIException(ex);
                            } catch (NoSuchMethodException ex) {
                                // NOTE: Should never reach here!
                                throw new XNIException(ex);
                            }
                        }
                    }
                    // optionally report a synthesized DOCTYPE for the
                    // HTML root element
                    if (fInsertDoctype && fDocumentHandler != null) {
                        String root = HTMLElements
                                .getElement(HTMLElements.HTML).name;
                        root = modifyName(root, fNamesElems);
                        String pubid = fDoctypePubid;
                        String sysid = fDoctypeSysid;
                        fDocumentHandler.doctypeDecl(root, pubid,
                                sysid, synthesizedAugs());
                    }
                    setScannerState(STATE_CONTENT);
                    break;
                }
                case STATE_END_DOCUMENT: {
                    if (fDocumentHandler != null
                            && fElementCount >= fElementDepth) {
                        if (DEBUG_CALLBACKS) {
                            System.out.println("endDocument()");
                        }
                        fEndLineNumber = fCurrentEntity.lineNumber;
                        fEndColumnNumber = fCurrentEntity.columnNumber;
                        fDocumentHandler
                                .endDocument(locationAugs());
                    }
                    // nothing left to scan
                    return false;
                }
                default: {
                    throw new RuntimeException(
                            "unknown scanner state: "
                                    + fScannerState);
                }
            }
        } catch (EOFException e) {
            // end of the current entity: finish the document if this
            // was the outermost entity, otherwise resume the entity
            // below it on the stack
            if (fCurrentEntityStack.empty()) {
                setScannerState(STATE_END_DOCUMENT);
            } else {
                fCurrentEntity = (CurrentEntity) fCurrentEntityStack
                        .pop();
            }
            next = true;
        }
    } while (next || complete);
    return true;
} // scan(boolean):boolean
2081:
2082: //
2083: // Protected methods
2084: //
2085:
2086: /** Scans characters. */
2087: protected void scanCharacters() throws IOException {
2088: if (DEBUG_BUFFER) {
2089: System.out.print("(scanCharacters: ");
2090: printBuffer();
2091: System.out.println();
2092: }
2093: int newlines = skipNewlines();
2094: if (newlines == 0
2095: && fCurrentEntity.offset == fCurrentEntity.length) {
2096: if (DEBUG_BUFFER) {
2097: System.out.print(")scanCharacters: ");
2098: printBuffer();
2099: System.out.println();
2100: }
2101: return;
2102: }
2103: char c;
2104: int offset = fCurrentEntity.offset - newlines;
2105: for (int i = offset; i < fCurrentEntity.offset; i++) {
2106: fCurrentEntity.buffer[i] = '\n';
2107: }
2108: while (fCurrentEntity.offset < fCurrentEntity.length) {
2109: c = fCurrentEntity.buffer[fCurrentEntity.offset];
2110: if (c == '<' || c == '&' || c == '\n' || c == '\r') {
2111: break;
2112: }
2113: fCurrentEntity.offset++;
2114: fCurrentEntity.columnNumber++;
2115: }
2116: if (fCurrentEntity.offset > offset
2117: && fDocumentHandler != null
2118: && fElementCount >= fElementDepth) {
2119: fString.setValues(fCurrentEntity.buffer, offset,
2120: fCurrentEntity.offset - offset);
2121: if (DEBUG_CALLBACKS) {
2122: System.out.println("characters(" + fString + ")");
2123: }
2124: fEndLineNumber = fCurrentEntity.lineNumber;
2125: fEndColumnNumber = fCurrentEntity.columnNumber;
2126: fDocumentHandler.characters(fString, locationAugs());
2127: }
2128: if (DEBUG_BUFFER) {
2129: System.out.print(")scanCharacters: ");
2130: printBuffer();
2131: System.out.println();
2132: }
2133: } // scanCharacters()
2134:
2135: /** Scans a CDATA section. */
2136: protected void scanCDATA() throws IOException {
2137: if (DEBUG_BUFFER) {
2138: System.out.print("(scanCDATA: ");
2139: printBuffer();
2140: System.out.println();
2141: }
2142: fStringBuffer.clear();
2143: if (fCDATASections) {
2144: if (fDocumentHandler != null
2145: && fElementCount >= fElementDepth) {
2146: fEndLineNumber = fCurrentEntity.lineNumber;
2147: fEndColumnNumber = fCurrentEntity.columnNumber;
2148: if (DEBUG_CALLBACKS) {
2149: System.out.println("startCDATA()");
2150: }
2151: fDocumentHandler.startCDATA(locationAugs());
2152: }
2153: } else {
2154: fStringBuffer.append("[CDATA[");
2155: }
2156: boolean eof = scanMarkupContent(fStringBuffer, ']');
2157: if (!fCDATASections) {
2158: fStringBuffer.append("]]");
2159: }
2160: if (fDocumentHandler != null
2161: && fElementCount >= fElementDepth) {
2162: fEndLineNumber = fCurrentEntity.lineNumber;
2163: fEndColumnNumber = fCurrentEntity.columnNumber;
2164: if (fCDATASections) {
2165: if (DEBUG_CALLBACKS) {
2166: System.out.println("characters("
2167: + fStringBuffer + ")");
2168: }
2169: fDocumentHandler.characters(fStringBuffer,
2170: locationAugs());
2171: if (DEBUG_CALLBACKS) {
2172: System.out.println("endCDATA()");
2173: }
2174: fDocumentHandler.endCDATA(locationAugs());
2175: } else {
2176: if (DEBUG_CALLBACKS) {
2177: System.out.println("comment(" + fStringBuffer
2178: + ")");
2179: }
2180: fDocumentHandler.comment(fStringBuffer,
2181: locationAugs());
2182: }
2183: }
2184: if (DEBUG_BUFFER) {
2185: System.out.print(")scanCDATA: ");
2186: printBuffer();
2187: System.out.println();
2188: }
2189: if (eof) {
2190: throw new EOFException();
2191: }
2192: } // scanCDATA()
2193:
2194: /** Scans a comment. */
2195: protected void scanComment() throws IOException {
2196: if (DEBUG_BUFFER) {
2197: System.out.print("(scanComment: ");
2198: printBuffer();
2199: System.out.println();
2200: }
2201: fStringBuffer.clear();
2202: boolean eof = scanMarkupContent(fStringBuffer, '-');
2203: if (fDocumentHandler != null
2204: && fElementCount >= fElementDepth) {
2205: if (DEBUG_CALLBACKS) {
2206: System.out
2207: .println("comment(" + fStringBuffer + ")");
2208: }
2209: fEndLineNumber = fCurrentEntity.lineNumber;
2210: fEndColumnNumber = fCurrentEntity.columnNumber;
2211: fDocumentHandler.comment(fStringBuffer, locationAugs());
2212: }
2213: if (DEBUG_BUFFER) {
2214: System.out.print(")scanComment: ");
2215: printBuffer();
2216: System.out.println();
2217: }
2218: if (eof) {
2219: throw new EOFException();
2220: }
2221: } // scanComment()
2222:
2223: /** Scans markup content. */
2224: protected boolean scanMarkupContent(XMLStringBuffer buffer,
2225: char cend) throws IOException {
2226: int c = -1;
2227: OUTER: while (true) {
2228: c = read();
2229: if (c == cend) {
2230: int count = 1;
2231: while (true) {
2232: c = read();
2233: if (c == cend) {
2234: count++;
2235: continue;
2236: }
2237: break;
2238: }
2239: if (c == -1) {
2240: if (fReportErrors) {
2241: fErrorReporter
2242: .reportError("HTML1007", null);
2243: }
2244: break OUTER;
2245: }
2246: if (count < 2) {
2247: buffer.append(cend);
2248: //if (c != -1) {
2249: fCurrentEntity.offset--;
2250: fCurrentEntity.columnNumber--;
2251: //}
2252: continue;
2253: }
2254: if (c != '>') {
2255: for (int i = 0; i < count; i++) {
2256: buffer.append(cend);
2257: }
2258: fCurrentEntity.offset--;
2259: fCurrentEntity.columnNumber--;
2260: continue;
2261: }
2262: for (int i = 0; i < count - 2; i++) {
2263: buffer.append(cend);
2264: }
2265: break;
2266: } else if (c == '\n' || c == '\r') {
2267: fCurrentEntity.offset--;
2268: fCurrentEntity.columnNumber--;
2269: int newlines = skipNewlines();
2270: for (int i = 0; i < newlines; i++) {
2271: buffer.append('\n');
2272: }
2273: continue;
2274: } else if (c == -1) {
2275: if (fReportErrors) {
2276: fErrorReporter.reportError("HTML1007", null);
2277: }
2278: break;
2279: }
2280: buffer.append((char) c);
2281: }
2282: return c == -1;
2283: } // scanMarkupContent(XMLStringBuffer,char):boolean
2284:
2285: /** Scans a processing instruction. */
2286: protected void scanPI() throws IOException {
2287: if (DEBUG_BUFFER) {
2288: System.out.print("(scanPI: ");
2289: printBuffer();
2290: System.out.println();
2291: }
2292: if (fReportErrors) {
2293: fErrorReporter.reportWarning("HTML1008", null);
2294: }
2295:
2296: // scan processing instruction
2297: String target = scanName();
2298: if (target != null && !target.equalsIgnoreCase("xml")) {
2299: while (true) {
2300: int c = read();
2301: if (c == '\r' || c == '\n') {
2302: fCurrentEntity.lineNumber++;
2303: fCurrentEntity.columnNumber = 1;
2304: if (c == '\r') {
2305: c = read();
2306: if (c != '\n') {
2307: fCurrentEntity.offset--;
2308: }
2309: }
2310: continue;
2311: }
2312: if (c == -1) {
2313: break;
2314: }
2315: if (c != ' ' && c != '\t') {
2316: fCurrentEntity.offset--;
2317: fCurrentEntity.columnNumber--;
2318: break;
2319: }
2320: }
2321: fStringBuffer.clear();
2322: while (true) {
2323: int c = read();
2324: if (c == '?' || c == '/') {
2325: char c0 = (char) c;
2326: c = read();
2327: if (c == '>') {
2328: break;
2329: } else {
2330: fStringBuffer.append(c0);
2331: fCurrentEntity.offset--;
2332: fCurrentEntity.columnNumber--;
2333: continue;
2334: }
2335: } else if (c == '\r' || c == '\n') {
2336: fStringBuffer.append('\n');
2337: fCurrentEntity.lineNumber++;
2338: fCurrentEntity.columnNumber = 1;
2339: if (c == '\r') {
2340: c = read();
2341: if (c != '\n') {
2342: fCurrentEntity.offset--;
2343: }
2344: }
2345: continue;
2346: } else if (c == -1) {
2347: break;
2348: } else {
2349: fStringBuffer.append((char) c);
2350: }
2351: }
2352: XMLString data = fStringBuffer;
2353: if (fDocumentHandler != null) {
2354: fEndLineNumber = fCurrentEntity.lineNumber;
2355: fEndColumnNumber = fCurrentEntity.columnNumber;
2356: fDocumentHandler.processingInstruction(target,
2357: data, locationAugs());
2358: }
2359: }
2360:
2361: // scan xml/text declaration
2362: else {
2363: int beginLineNumber = fBeginLineNumber;
2364: int beginColumnNumber = fBeginColumnNumber;
2365: fAttributes.removeAllAttributes();
2366: int aindex = 0;
2367: while (scanPseudoAttribute(fAttributes)) {
2368: fAttributes.getName(aindex, fQName);
2369: fQName.rawname = fQName.rawname.toLowerCase();
2370: fAttributes.setName(aindex, fQName);
2371: aindex++;
2372: }
2373: if (fDocumentHandler != null) {
2374: String version = fAttributes.getValue("version");
2375: String encoding = fAttributes.getValue("encoding");
2376: String standalone = fAttributes
2377: .getValue("standalone");
2378:
2379: fBeginLineNumber = beginLineNumber;
2380: fBeginColumnNumber = beginColumnNumber;
2381: fEndLineNumber = fCurrentEntity.lineNumber;
2382: fEndColumnNumber = fCurrentEntity.columnNumber;
2383: fDocumentHandler.xmlDecl(version, encoding,
2384: standalone, locationAugs());
2385: }
2386: }
2387:
2388: if (DEBUG_BUFFER) {
2389: System.out.print(")scanPI: ");
2390: printBuffer();
2391: System.out.println();
2392: }
2393: } // scanPI()
2394:
2395: /**
2396: * Scans a start element.
2397: *
2398: * @param empty Is used for a second return value to indicate whether
2399: * the start element tag is empty (e.g. "/>").
2400: */
2401: protected String scanStartElement(boolean[] empty)
2402: throws IOException {
2403: String ename = scanName();
2404: int length = ename != null ? ename.length() : 0;
2405: int c = length > 0 ? ename.charAt(0) : -1;
2406: if (length == 0
2407: || !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
2408: if (fReportErrors) {
2409: fErrorReporter.reportError("HTML1009", null);
2410: }
2411: if (fDocumentHandler != null
2412: && fElementCount >= fElementDepth) {
2413: fStringBuffer.clear();
2414: fStringBuffer.append('<');
2415: if (length > 0) {
2416: fStringBuffer.append(ename);
2417: }
2418: fDocumentHandler.characters(fStringBuffer, null);
2419: }
2420: return null;
2421: }
2422: ename = modifyName(ename, fNamesElems);
2423: fAttributes.removeAllAttributes();
2424: int beginLineNumber = fBeginLineNumber;
2425: int beginColumnNumber = fBeginColumnNumber;
2426: while (scanAttribute(fAttributes, empty)) {
2427: // do nothing
2428: }
2429: fBeginLineNumber = beginLineNumber;
2430: fBeginColumnNumber = beginColumnNumber;
2431: if (fByteStream != null && fElementDepth == -1) {
2432: if (ename.equalsIgnoreCase("META")) {
2433: if (DEBUG_CHARSET) {
2434: System.out.println("+++ <META>");
2435: }
2436: String httpEquiv = getValue(fAttributes,
2437: "http-equiv");
2438: if (httpEquiv != null
2439: && httpEquiv
2440: .equalsIgnoreCase("content-type")) {
2441: if (DEBUG_CHARSET) {
2442: System.out.println("+++ @content-type: \""
2443: + httpEquiv + '"');
2444: }
2445: String content = getValue(fAttributes,
2446: "content");
2447: int index1 = content != null ? content
2448: .toLowerCase().indexOf("charset=") : -1;
2449: if (index1 != -1 && !fIgnoreSpecifiedCharset) {
2450: int index2 = content.indexOf(';', index1);
2451: String charset = index2 != -1 ? content
2452: .substring(index1 + 8, index2)
2453: : content.substring(index1 + 8);
2454: try {
2455: String ianaEncoding = charset;
2456: String javaEncoding = EncodingMap
2457: .getIANA2JavaMapping(ianaEncoding
2458: .toUpperCase());
2459: if (DEBUG_CHARSET) {
2460: System.out
2461: .println("+++ ianaEncoding: "
2462: + ianaEncoding);
2463: System.out
2464: .println("+++ javaEncoding: "
2465: + javaEncoding);
2466: }
2467: if (javaEncoding == null) {
2468: javaEncoding = ianaEncoding;
2469: if (fReportErrors) {
2470: fErrorReporter
2471: .reportError(
2472: "HTML1001",
2473: new Object[] { ianaEncoding });
2474: }
2475: }
2476: // patch: Marc Guillemot
2477: if (!javaEncoding.equals(fJavaEncoding)) {
2478: if (!isEncodingCompatible(
2479: javaEncoding, fJavaEncoding)) {
2480: if (fReportErrors) {
2481: fErrorReporter
2482: .reportError(
2483: "HTML1015",
2484: new Object[] {
2485: javaEncoding,
2486: fJavaEncoding });
2487: }
2488: }
2489: // change the charset
2490: else {
2491: fIso8859Encoding = ianaEncoding == null
2492: || ianaEncoding
2493: .toUpperCase()
2494: .startsWith(
2495: "ISO-8859")
2496: || ianaEncoding
2497: .equalsIgnoreCase(fDefaultIANAEncoding);
2498: fCurrentEntity.stream = new InputStreamReader(
2499: fByteStream,
2500: javaEncoding);
2501: fByteStream.playback();
2502: fElementDepth = fElementCount;
2503: fElementCount = 0;
2504: fCurrentEntity.offset = fCurrentEntity.length = 0;
2505: fCurrentEntity.lineNumber = 1;
2506: fCurrentEntity.columnNumber = 1;
2507: }
2508: }
2509: } catch (UnsupportedEncodingException e) {
2510: if (fReportErrors) {
2511: fErrorReporter.reportError(
2512: "HTML1010",
2513: new Object[] { charset });
2514: }
2515: // NOTE: If the encoding change doesn't work,
2516: // then there's no point in continuing to
2517: // buffer the input stream.
2518: fByteStream.clear();
2519: fByteStream = null;
2520: }
2521: }
2522: }
2523: } else if (ename.equalsIgnoreCase("BODY")) {
2524: fByteStream.clear();
2525: fByteStream = null;
2526: } else {
2527: HTMLElements.Element element = HTMLElements
2528: .getElement(ename);
2529: if (element.parent != null
2530: && element.parent.length > 0) {
2531: if (element.parent[0].code == HTMLElements.BODY) {
2532: fByteStream.clear();
2533: fByteStream = null;
2534: }
2535: }
2536: }
2537: }
2538: if (fDocumentHandler != null
2539: && fElementCount >= fElementDepth) {
2540: fQName.setValues(null, ename, ename, null);
2541: if (DEBUG_CALLBACKS) {
2542: System.out.println("startElement(" + fQName + ','
2543: + fAttributes + ")");
2544: }
2545: fEndLineNumber = fCurrentEntity.lineNumber;
2546: fEndColumnNumber = fCurrentEntity.columnNumber;
2547: if (empty[0]) {
2548: fDocumentHandler.emptyElement(fQName, fAttributes,
2549: locationAugs());
2550: } else {
2551: fDocumentHandler.startElement(fQName, fAttributes,
2552: locationAugs());
2553: }
2554: }
2555: return ename;
2556: } // scanStartElement():ename
2557:
2558: /**
2559: * Scans a real attribute.
2560: *
2561: * @param attributes The list of attributes.
2562: * @param empty Is used for a second return value to indicate
2563: * whether the start element tag is empty
2564: * (e.g. "/>").
2565: */
2566: protected boolean scanAttribute(XMLAttributesImpl attributes,
2567: boolean[] empty) throws IOException {
2568: return scanAttribute(attributes, empty, '/');
2569: } // scanAttribute(XMLAttributesImpl,boolean[]):boolean
2570:
2571: /**
2572: * Scans a pseudo attribute.
2573: *
2574: * @param attributes The list of attributes.
2575: */
2576: protected boolean scanPseudoAttribute(
2577: XMLAttributesImpl attributes) throws IOException {
2578: return scanAttribute(attributes, fSingleBoolean, '?');
2579: } // scanPseudoAttribute(XMLAttributesImpl):boolean
2580:
2581: /**
2582: * Scans an attribute, pseudo or real.
2583: *
2584: * @param attributes The list of attributes.
2585: * @param empty Is used for a second return value to indicate
2586: * whether the start element tag is empty
2587: * (e.g. "/>").
2588: * @param endc The end character that appears before the
2589: * closing angle bracket ('>').
2590: */
2591: protected boolean scanAttribute(XMLAttributesImpl attributes,
2592: boolean[] empty, char endc) throws IOException {
2593: boolean skippedSpaces = skipSpaces();
2594: fBeginLineNumber = fCurrentEntity.lineNumber;
2595: fBeginColumnNumber = fCurrentEntity.columnNumber;
2596: int c = read();
2597: if (c == -1) {
2598: if (fReportErrors) {
2599: fErrorReporter.reportError("HTML1007", null);
2600: }
2601: throw new EOFException();
2602: }
2603: if (c == '>') {
2604: return false;
2605: }
2606: fCurrentEntity.offset--;
2607: fCurrentEntity.columnNumber--;
2608: String aname = scanName();
2609: if (aname == null) {
2610: if (fReportErrors) {
2611: fErrorReporter.reportError("HTML1011", null);
2612: }
2613: empty[0] = skipMarkup(false);
2614: return false;
2615: }
2616: if (!skippedSpaces && fReportErrors) {
2617: fErrorReporter.reportError("HTML1013",
2618: new Object[] { aname });
2619: }
2620: aname = modifyName(aname, fNamesAttrs);
2621: skipSpaces();
2622: c = read();
2623: if (c == -1) {
2624: if (fReportErrors) {
2625: fErrorReporter.reportError("HTML1007", null);
2626: }
2627: throw new EOFException();
2628: }
2629: if (c == '/' || c == '>') {
2630: fQName.setValues(null, aname, aname, null);
2631: attributes.addAttribute(fQName, "CDATA", "");
2632: attributes.setSpecified(attributes.getLength() - 1,
2633: true);
2634: if (fAugmentations) {
2635: addLocationItem(attributes,
2636: attributes.getLength() - 1);
2637: }
2638: if (c == '/') {
2639: fCurrentEntity.offset--;
2640: fCurrentEntity.columnNumber--;
2641: empty[0] = skipMarkup(false);
2642: }
2643: return false;
2644: }
2645: /***
2646: // REVISIT: [Q] Why is this still here? -Ac
2647: if (c == '/' || c == '>') {
2648: if (c == '/') {
2649: fCurrentEntity.offset--;
2650: fCurrentEntity.columnNumber--;
2651: empty[0] = skipMarkup(false);
2652: }
2653: fQName.setValues(null, aname, aname, null);
2654: attributes.addAttribute(fQName, "CDATA", "");
2655: attributes.setSpecified(attributes.getLength()-1, true);
2656: if (fAugmentations) {
2657: addLocationItem(attributes, attributes.getLength() - 1);
2658: }
2659: return false;
2660: }
2661: /***/
2662: if (c == '=') {
2663: skipSpaces();
2664: c = read();
2665: if (c == -1) {
2666: if (fReportErrors) {
2667: fErrorReporter.reportError("HTML1007", null);
2668: }
2669: throw new EOFException();
2670: }
2671: // Xiaowei/Ac: Fix for <a href=/cgi-bin/myscript>...</a>
2672: if (c == '>') {
2673: fQName.setValues(null, aname, aname, null);
2674: attributes.addAttribute(fQName, "CDATA", "");
2675: attributes.setSpecified(attributes.getLength() - 1,
2676: true);
2677: if (fAugmentations) {
2678: addLocationItem(attributes, attributes
2679: .getLength() - 1);
2680: }
2681: return false;
2682: }
2683: fStringBuffer.clear();
2684: fNonNormAttr.clear();
2685: if (c != '\'' && c != '"') {
2686: fCurrentEntity.offset--;
2687: fCurrentEntity.columnNumber--;
2688: while (true) {
2689: c = read();
2690: // Xiaowei/Ac: Fix for <a href=/broken/>...</a>
2691: if (Character.isSpace((char) c) || c == '>') {
2692: //fCharOffset--;
2693: fCurrentEntity.offset--;
2694: fCurrentEntity.columnNumber--;
2695: break;
2696: }
2697: if (c == -1) {
2698: if (fReportErrors) {
2699: fErrorReporter.reportError("HTML1007",
2700: null);
2701: }
2702: throw new EOFException();
2703: }
2704: if (c == '&') {
2705: int ce = scanEntityRef(fStringBuffer2,
2706: false);
2707: if (ce != -1) {
2708: fStringBuffer.append((char) ce);
2709: } else {
2710: fStringBuffer.append(fStringBuffer2);
2711: }
2712: fNonNormAttr.append(fStringBuffer2);
2713: } else {
2714: fStringBuffer.append((char) c);
2715: fNonNormAttr.append((char) c);
2716: }
2717: }
2718: fQName.setValues(null, aname, aname, null);
2719: String avalue = fStringBuffer.toString();
2720: attributes.addAttribute(fQName, "CDATA", avalue);
2721:
2722: int lastattr = attributes.getLength() - 1;
2723: attributes.setSpecified(lastattr, true);
2724: attributes.setNonNormalizedValue(lastattr,
2725: fNonNormAttr.toString());
2726: if (fAugmentations) {
2727: addLocationItem(attributes, attributes
2728: .getLength() - 1);
2729: }
2730: return true;
2731: }
2732: char quote = (char) c;
2733: boolean isStart = true;
2734: boolean prevSpace = false;
2735: do {
2736: boolean acceptSpace = !fNormalizeAttributes
2737: || (!isStart && !prevSpace);
2738: c = read();
2739: if (c == -1) {
2740: if (fReportErrors) {
2741: fErrorReporter
2742: .reportError("HTML1007", null);
2743: }
2744: throw new EOFException();
2745: }
2746: if (c == '&') {
2747: isStart = false;
2748: int ce = scanEntityRef(fStringBuffer2, false);
2749: if (ce != -1) {
2750: fStringBuffer.append((char) ce);
2751: } else {
2752: fStringBuffer.append(fStringBuffer2);
2753: }
2754: fNonNormAttr.append(fStringBuffer2);
2755: } else if (c == ' ' || c == '\t') {
2756: if (acceptSpace) {
2757: fStringBuffer
2758: .append(fNormalizeAttributes ? ' '
2759: : (char) c);
2760: }
2761: fNonNormAttr.append((char) c);
2762: } else if (c == '\r' || c == '\n') {
2763: fCurrentEntity.lineNumber++;
2764: fCurrentEntity.columnNumber = 0;
2765: if (c == '\r') {
2766: int c2 = read();
2767: if (c2 != '\n') {
2768: fCurrentEntity.offset--;
2769: fCurrentEntity.columnNumber--;
2770: } else {
2771: fNonNormAttr.append('\r');
2772: c = c2;
2773: }
2774: }
2775: if (acceptSpace) {
2776: fStringBuffer
2777: .append(fNormalizeAttributes ? ' '
2778: : '\n');
2779: }
2780: fNonNormAttr.append((char) c);
2781: } else if (c != quote) {
2782: isStart = false;
2783: fStringBuffer.append((char) c);
2784: fNonNormAttr.append((char) c);
2785: }
2786: prevSpace = c == ' ' || c == '\t' || c == '\r'
2787: || c == '\n';
2788: isStart = isStart && prevSpace;
2789: } while (c != quote);
2790:
2791: if (fNormalizeAttributes) {
2792: // trailing whitespace already normalized to single space
2793: if (fStringBuffer.ch[fStringBuffer.length - 1] == ' ') {
2794: fStringBuffer.length--;
2795: }
2796: }
2797:
2798: fQName.setValues(null, aname, aname, null);
2799: String avalue = fStringBuffer.toString();
2800: attributes.addAttribute(fQName, "CDATA", avalue);
2801:
2802: int lastattr = attributes.getLength() - 1;
2803: attributes.setSpecified(lastattr, true);
2804: attributes.setNonNormalizedValue(lastattr, fNonNormAttr
2805: .toString());
2806: if (fAugmentations) {
2807: addLocationItem(attributes,
2808: attributes.getLength() - 1);
2809: }
2810: } else {
2811: fQName.setValues(null, aname, aname, null);
2812: attributes.addAttribute(fQName, "CDATA", "");
2813: attributes.setSpecified(attributes.getLength() - 1,
2814: true);
2815: fCurrentEntity.offset--;
2816: fCurrentEntity.columnNumber--;
2817: if (fAugmentations) {
2818: addLocationItem(attributes,
2819: attributes.getLength() - 1);
2820: }
2821: }
2822: return true;
2823: } // scanAttribute(XMLAttributesImpl):boolean
2824:
2825: /** Adds location augmentations to the specified attribute. */
2826: protected void addLocationItem(XMLAttributes attributes,
2827: int index) {
2828: fEndLineNumber = fCurrentEntity.lineNumber;
2829: fEndColumnNumber = fCurrentEntity.columnNumber;
2830: LocationItem locationItem = new LocationItem();
2831: locationItem.setValues(fBeginLineNumber,
2832: fBeginColumnNumber, fEndLineNumber,
2833: fEndColumnNumber);
2834: Augmentations augs = attributes.getAugmentations(index);
2835: augs.putItem(AUGMENTATIONS, locationItem);
2836: } // addLocationItem(XMLAttributes,int)
2837:
2838: /** Scans an end element. */
2839: protected void scanEndElement() throws IOException {
2840: String ename = scanName();
2841: if (fReportErrors && ename == null) {
2842: fErrorReporter.reportError("HTML1012", null);
2843: }
2844: skipMarkup(false);
2845: if (ename != null) {
2846: ename = modifyName(ename, fNamesElems);
2847: if (fDocumentHandler != null
2848: && fElementCount >= fElementDepth) {
2849: fQName.setValues(null, ename, ename, null);
2850: if (DEBUG_CALLBACKS) {
2851: System.out
2852: .println("endElement(" + fQName + ")");
2853: }
2854: fEndLineNumber = fCurrentEntity.lineNumber;
2855: fEndColumnNumber = fCurrentEntity.columnNumber;
2856: fDocumentHandler.endElement(fQName, locationAugs());
2857: }
2858: }
2859: } // scanEndElement()
2860: } // class ContentScanner
2861:
2862: /**
2863: * Special scanner used for elements whose content needs to be scanned
2864: * as plain text, ignoring markup such as elements and entity references.
2865: * For example: <SCRIPT> and <COMMENT>.
2866: *
2867: * @author Andy Clark
2868: */
2869: public class SpecialScanner implements Scanner {
2870:
2871: //
2872: // Data
2873: //
2874:
2875: /** Name of element whose content needs to be scanned as text. */
2876: protected String fElementName;
2877:
2878: /** True if <script> element. */
2879: protected boolean fScript;
2880:
2881: /** True if <style> element. */
2882: protected boolean fStyle;
2883:
2884: /** True if <textarea> element. */
2885: protected boolean fTextarea;
2886:
2887: /** True if <title> element. */
2888: protected boolean fTitle;
2889:
2890: // temp vars
2891:
2892: /** A qualified name. */
2893: private final QName fQName = new QName();
2894:
2895: /** A string buffer. */
2896: private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
2897:
2898: //
2899: // Public methods
2900: //
2901:
2902: /** Sets the element name. */
2903: public Scanner setElementName(String ename) {
2904: fElementName = ename;
2905: fScript = fElementName.equalsIgnoreCase("SCRIPT");
2906: fStyle = fElementName.equalsIgnoreCase("STYLE");
2907: fTextarea = fElementName.equalsIgnoreCase("TEXTAREA");
2908: fTitle = fElementName.equalsIgnoreCase("TITLE");
2909: return this ;
2910: } // setElementName(String):Scanner
2911:
2912: //
2913: // Scanner methods
2914: //
2915:
2916: /** Scan. */
2917: public boolean scan(boolean complete) throws IOException {
2918: boolean next;
2919: do {
2920: try {
2921: next = false;
2922: switch (fScannerState) {
2923: case STATE_CONTENT: {
2924: fBeginLineNumber = fCurrentEntity.lineNumber;
2925: fBeginColumnNumber = fCurrentEntity.columnNumber;
2926: int c = read();
2927: if (c == '<') {
2928: setScannerState(STATE_MARKUP_BRACKET);
2929: continue;
2930: }
2931: if (c == '&') {
2932: if (fTextarea || fTitle) {
2933: scanEntityRef(fStringBuffer, true);
2934: continue;
2935: }
2936: fStringBuffer.clear();
2937: fStringBuffer.append('&');
2938: } else if (c == -1) {
2939: if (fReportErrors) {
2940: fErrorReporter.reportError("HTML1007",
2941: null);
2942: }
2943: throw new EOFException();
2944: } else {
2945: fCurrentEntity.offset--;
2946: fCurrentEntity.columnNumber--;
2947: fStringBuffer.clear();
2948: }
2949: scanCharacters(fStringBuffer, -1);
2950: break;
2951: } // case STATE_CONTENT
2952: case STATE_MARKUP_BRACKET: {
2953: int delimiter = -1;
2954: int c = read();
2955: if (c == '!') {
2956: if (skip("--", false)) {
2957: fStringBuffer.clear();
2958: boolean strip = (fScript && fScriptStripCommentDelims)
2959: || (fStyle && fStyleStripCommentDelims);
2960: if (strip) {
2961: do {
2962: c = read();
2963: if (c == '\r' || c == '\n') {
2964: fCurrentEntity.columnNumber--;
2965: fCurrentEntity.offset--;
2966: break;
2967: }
2968: } while (c != -1);
2969: skipNewlines(1);
2970: delimiter = '-';
2971: } else {
2972: fStringBuffer.append("<!--");
2973: }
2974: } else if (skip("[CDATA[", false)) {
2975: fStringBuffer.clear();
2976: boolean strip = (fScript && fScriptStripCDATADelims)
2977: || (fStyle && fStyleStripCDATADelims);
2978: if (strip) {
2979: do {
2980: c = read();
2981: if (c == '\r' || c == '\n') {
2982: fCurrentEntity.columnNumber--;
2983: fCurrentEntity.offset--;
2984: break;
2985: }
2986: } while (c != -1);
2987: skipNewlines(1);
2988: delimiter = ']';
2989: } else {
2990: fStringBuffer.append("<![CDATA[");
2991: }
2992: }
2993: } else if (c == '/') {
2994: String ename = scanName();
2995: if (ename != null) {
2996: if (ename
2997: .equalsIgnoreCase(fElementName)) {
2998: if (read() == '>') {
2999: ename = modifyName(ename,
3000: fNamesElems);
3001: if (fDocumentHandler != null
3002: && fElementCount >= fElementDepth) {
3003: fQName.setValues(null,
3004: ename, ename, null);
3005: if (DEBUG_CALLBACKS) {
3006: System.out
3007: .println("endElement("
3008: + fQName
3009: + ")");
3010: }
3011: fEndLineNumber = fCurrentEntity.lineNumber;
3012: fEndColumnNumber = fCurrentEntity.columnNumber;
3013: fDocumentHandler
3014: .endElement(
3015: fQName,
3016: locationAugs());
3017: }
3018: setScanner(fContentScanner);
3019: setScannerState(STATE_CONTENT);
3020: return true;
3021: } else {
3022: fCurrentEntity.offset--;
3023: fCurrentEntity.columnNumber--;
3024: }
3025: }
3026: fStringBuffer.clear();
3027: fStringBuffer.append("</");
3028: fStringBuffer.append(ename);
3029: } else {
3030: fStringBuffer.clear();
3031: fStringBuffer.append("</");
3032: }
3033: } else {
3034: fStringBuffer.clear();
3035: fStringBuffer.append('<');
3036: fStringBuffer.append((char) c);
3037: }
3038: scanCharacters(fStringBuffer, delimiter);
3039: setScannerState(STATE_CONTENT);
3040: break;
3041: } // case STATE_MARKUP_BRACKET
3042: } // switch
3043: } // try
3044: catch (EOFException e) {
3045: setScanner(fContentScanner);
3046: if (fCurrentEntityStack.empty()) {
3047: setScannerState(STATE_END_DOCUMENT);
3048: } else {
3049: fCurrentEntity = (CurrentEntity) fCurrentEntityStack
3050: .pop();
3051: setScannerState(STATE_CONTENT);
3052: }
3053: return true;
3054: }
3055: } // do
3056: while (next || complete);
3057: return true;
3058: } // scan(boolean):boolean
3059:
3060: //
3061: // Protected methods
3062: //
3063:
3064: /** Scan characters. */
3065: protected void scanCharacters(XMLStringBuffer buffer,
3066: int delimiter) throws IOException {
3067: if (DEBUG_BUFFER) {
3068: System.out.print("(scanCharacters, delimiter="
3069: + delimiter + ": ");
3070: printBuffer();
3071: System.out.println();
3072: }
3073: boolean strip = (fScript && fScriptStripCommentDelims)
3074: || (fScript && fScriptStripCDATADelims)
3075: || (fStyle && fStyleStripCommentDelims)
3076: || (fStyle && fStyleStripCDATADelims);
3077: while (true) {
3078: int c = read();
3079: if (c == -1
3080: || (delimiter == -1 && (c == '<' || c == '&'))) {
3081: if (c != -1) {
3082: fCurrentEntity.offset--;
3083: fCurrentEntity.columnNumber--;
3084: }
3085: break;
3086: }
3087: // Patch supplied by Jonathan Baxter
3088: else if (c == '\r' || c == '\n') {
3089: fCurrentEntity.offset--;
3090: fCurrentEntity.columnNumber--;
3091: int newlines = skipNewlines();
3092: for (int i = 0; i < newlines; i++) {
3093: buffer.append('\n');
3094: }
3095: } else if (c == '\'' || c == '"') {
3096: buffer.append((char) c);
3097: final int stringChar = c;
3098: while (true) {
3099: c = read();
3100: if (c == '\\') {
3101: buffer.append((char) c);
3102: //always consume next character
3103: buffer.append((char) read());
3104: } else if (c == stringChar) {
3105: buffer.append((char) c);
3106: break;
3107: } else if (c == '\r' || c == '\n') {
3108: fCurrentEntity.offset--;
3109: fCurrentEntity.columnNumber--;
3110: int newlines = skipNewlines();
3111: for (int i = 0; i < newlines; i++) {
3112: buffer.append('\n');
3113: }
3114: break;
3115: } else {
3116: buffer.append((char) c);
3117: }
3118: }
3119: } else if (delimiter != -1 && c == (char) delimiter) {
3120: int count = 0;
3121: do {
3122: count++;
3123: c = read();
3124: } while (c == (char) delimiter);
3125: for (int i = strip && c == '>' ? 2 : 0; i < count; i++) {
3126: buffer.append((char) delimiter);
3127: }
3128: if (c == -1 || (count >= 2 && c == '>')) {
3129: if (!strip) {
3130: buffer.append((char) c);
3131: }
3132: break;
3133: }
3134: fCurrentEntity.offset--;
3135: fCurrentEntity.columnNumber--;
3136: } else {
3137: buffer.append((char) c);
3138: if (c == '\n') {
3139: fCurrentEntity.columnNumber = 1;
3140: fCurrentEntity.lineNumber++;
3141: }
3142: }
3143: }
3144: if (buffer.length > 0 && fDocumentHandler != null
3145: && fElementCount >= fElementDepth) {
3146: if (DEBUG_CALLBACKS) {
3147: System.out.println("characters(" + buffer + ")");
3148: }
3149: fEndLineNumber = fCurrentEntity.lineNumber;
3150: fEndColumnNumber = fCurrentEntity.columnNumber;
3151: fDocumentHandler.characters(buffer, locationAugs());
3152: }
3153: if (DEBUG_BUFFER) {
3154: System.out.print(")scanCharacters: ");
3155: printBuffer();
3156: System.out.println();
3157: }
3158: } // scanCharacters(StringBuffer)
3159:
3160: } // class SpecialScanner
3161:
3162: /**
3163: * A playback input stream. This class has the ability to save the bytes
3164: * read from the underlying input stream and play the bytes back later.
3165: * This class is used by the HTML scanner to switch encodings when a
3166: * <meta> tag is detected that specifies a different encoding.
3167: * <p>
3168: * If the encoding is changed, then the scanner calls the
3169: * <code>playback</code> method and re-scans the beginning of the HTML
3170: * document again. This should not be too much of a performance problem
3171: * because the <meta> tag appears at the beginning of the document.
3172: * <p>
3173: * If the <body> tag is reached without playing back the bytes,
3174: * then the buffer can be cleared by calling the <code>clear</code>
3175: * method. This stops the buffering of bytes and allows the memory used
3176: * by the buffer to be reclaimed.
3177: * <p>
3178: * <strong>Note:</strong>
3179: * If the buffer is never played back or cleared, this input stream
3180: * will continue to buffer the entire stream. Therefore, it is very
3181: * important to use this stream correctly.
3182: *
3183: * @author Andy Clark
3184: */
3185: public static class PlaybackInputStream extends FilterInputStream {
3186:
3187: //
3188: // Constants
3189: //
3190:
3191: /** Set to true to debug playback. */
3192: private static final boolean DEBUG_PLAYBACK = false;
3193:
3194: //
3195: // Data
3196: //
3197:
3198: // state
3199:
3200: /** Playback mode. */
3201: protected boolean fPlayback = false;
3202:
3203: /** Buffer cleared. */
3204: protected boolean fCleared = false;
3205:
3206: /** Encoding detected. */
3207: protected boolean fDetected = false;
3208:
3209: // buffer info
3210:
3211: /** Byte buffer. */
3212: protected byte[] fByteBuffer = new byte[1024];
3213:
3214: /** Offset into byte buffer during playback. */
3215: protected int fByteOffset = 0;
3216:
3217: /** Length of bytes read into byte buffer. */
3218: protected int fByteLength = 0;
3219:
3220: /** Pushback offset. */
3221: public int fPushbackOffset = 0;
3222:
3223: /** Pushback length. */
3224: public int fPushbackLength = 0;
3225:
3226: //
3227: // Constructors
3228: //
3229:
3230: /** Constructor. */
3231: public PlaybackInputStream(InputStream in) {
3232: super (in);
3233: } // <init>(InputStream)
3234:
3235: //
3236: // Public methods
3237: //
3238:
3239: /** Detect encoding. */
3240: public void detectEncoding(String[] encodings)
3241: throws IOException {
3242: if (fDetected) {
3243: throw new IOException(
3244: "Should not detect encoding twice.");
3245: }
3246: fDetected = true;
3247: int b1 = read();
3248: if (b1 == -1) {
3249: return;
3250: }
3251: int b2 = read();
3252: if (b2 == -1) {
3253: fPushbackLength = 1;
3254: return;
3255: }
3256: // UTF-8 BOM: 0xEFBBBF
3257: if (b1 == 0xEF && b2 == 0xBB) {
3258: int b3 = read();
3259: if (b3 == 0xBF) {
3260: fPushbackOffset = 3;
3261: encodings[0] = "UTF-8";
3262: encodings[1] = "UTF8";
3263: return;
3264: }
3265: fPushbackLength = 3;
3266: }
3267: // UTF-16 LE BOM: 0xFFFE
3268: if (b1 == 0xFF && b2 == 0xFE) {
3269: encodings[0] = "UTF-16";
3270: encodings[1] = "UnicodeLittleUnmarked";
3271: return;
3272: }
3273: // UTF-16 BE BOM: 0xFEFF
3274: else if (b1 == 0xFE && b2 == 0xFF) {
3275: encodings[0] = "UTF-16";
3276: encodings[1] = "UnicodeBigUnmarked";
3277: return;
3278: }
3279: // unknown
3280: fPushbackLength = 2;
3281: } // detectEncoding()
3282:
3283: /** Playback buffer contents. */
3284: public void playback() {
3285: fPlayback = true;
3286: } // playback()
3287:
3288: /**
3289: * Clears the buffer.
3290: * <p>
3291: * <strong>Note:</strong>
3292: * The buffer cannot be cleared during playback. Therefore, calling
3293: * this method during playback will not do anything. However, the
3294: * buffer will be cleared automatically at the end of playback.
3295: */
3296: public void clear() {
3297: if (!fPlayback) {
3298: fCleared = true;
3299: fByteBuffer = null;
3300: }
3301: } // clear()
3302:
3303: //
3304: // InputStream methods
3305: //
3306:
3307: /** Read a byte. */
3308: public int read() throws IOException {
3309: if (DEBUG_PLAYBACK) {
3310: System.out.println("(read");
3311: }
3312: if (fPushbackOffset < fPushbackLength) {
3313: return fByteBuffer[fPushbackOffset++];
3314: }
3315: if (fCleared) {
3316: return in.read();
3317: }
3318: if (fPlayback) {
3319: int c = fByteBuffer[fByteOffset++];
3320: if (fByteOffset == fByteLength) {
3321: fCleared = true;
3322: fByteBuffer = null;
3323: }
3324: if (DEBUG_PLAYBACK) {
3325: System.out.println(")read -> " + (char) c);
3326: }
3327: return c;
3328: }
3329: int c = in.read();
3330: if (c != -1) {
3331: if (fByteLength == fByteBuffer.length) {
3332: byte[] newarray = new byte[fByteLength + 1024];
3333: System.arraycopy(fByteBuffer, 0, newarray, 0,
3334: fByteLength);
3335: fByteBuffer = newarray;
3336: }
3337: fByteBuffer[fByteLength++] = (byte) c;
3338: }
3339: if (DEBUG_PLAYBACK) {
3340: System.out.println(")read -> " + (char) c);
3341: }
3342: return c;
3343: } // read():int
3344:
3345: /** Read an array of bytes. */
3346: public int read(byte[] array) throws IOException {
3347: return read(array, 0, array.length);
3348: } // read(byte[]):int
3349:
3350: /** Read an array of bytes. */
3351: public int read(byte[] array, int offset, int length)
3352: throws IOException {
3353: if (DEBUG_PLAYBACK) {
3354: System.out.println(")read(" + offset + ',' + length
3355: + ')');
3356: }
3357: if (fPushbackOffset < fPushbackLength) {
3358: int count = fPushbackLength - fPushbackOffset;
3359: if (count > length) {
3360: count = length;
3361: }
3362: System.arraycopy(fByteBuffer, fPushbackOffset, array,
3363: offset, count);
3364: fPushbackOffset += count;
3365: return count;
3366: }
3367: if (fCleared) {
3368: return in.read(array, offset, length);
3369: }
3370: if (fPlayback) {
3371: if (fByteOffset + length > fByteLength) {
3372: length = fByteLength - fByteOffset;
3373: }
3374: System.arraycopy(fByteBuffer, fByteOffset, array,
3375: offset, length);
3376: fByteOffset += length;
3377: if (fByteOffset == fByteLength) {
3378: fCleared = true;
3379: fByteBuffer = null;
3380: }
3381: return length;
3382: }
3383: int count = in.read(array, offset, length);
3384: if (count != -1) {
3385: if (fByteLength + count > fByteBuffer.length) {
3386: byte[] newarray = new byte[fByteLength + count
3387: + 512];
3388: System.arraycopy(fByteBuffer, 0, newarray, 0,
3389: fByteLength);
3390: fByteBuffer = newarray;
3391: }
3392: System.arraycopy(array, offset, fByteBuffer,
3393: fByteLength, count);
3394: fByteLength += count;
3395: }
3396: if (DEBUG_PLAYBACK) {
3397: System.out.println(")read(" + offset + ',' + length
3398: + ") -> " + count);
3399: }
3400: return count;
3401: } // read(byte[]):int
3402:
3403: } // class PlaybackInputStream
3404:
3405: /**
3406: * Location infoset item.
3407: *
3408: * @author Andy Clark
3409: */
3410: protected static class LocationItem implements HTMLEventInfo {
3411:
3412: //
3413: // Data
3414: //
3415:
3416: /** Beginning line number. */
3417: protected int fBeginLineNumber;
3418:
3419: /** Beginning column number. */
3420: protected int fBeginColumnNumber;
3421:
3422: /** Ending line number. */
3423: protected int fEndLineNumber;
3424:
3425: /** Ending column number. */
3426: protected int fEndColumnNumber;
3427:
3428: //
3429: // Public methods
3430: //
3431:
3432: /** Sets the values of this item. */
3433: public void setValues(int beginLine, int beginColumn,
3434: int endLine, int endColumn) {
3435: fBeginLineNumber = beginLine;
3436: fBeginColumnNumber = beginColumn;
3437: fEndLineNumber = endLine;
3438: fEndColumnNumber = endColumn;
3439: } // setValues(int,int,int,int)
3440:
3441: //
3442: // HTMLEventInfo methods
3443: //
3444:
3445: // location information
3446:
3447: /** Returns the line number of the beginning of this event.*/
3448: public int getBeginLineNumber() {
3449: return fBeginLineNumber;
3450: } // getBeginLineNumber():int
3451:
3452: /** Returns the column number of the beginning of this event.*/
3453: public int getBeginColumnNumber() {
3454: return fBeginColumnNumber;
3455: } // getBeginColumnNumber():int
3456:
3457: /** Returns the line number of the end of this event.*/
3458: public int getEndLineNumber() {
3459: return fEndLineNumber;
3460: } // getEndLineNumber():int
3461:
3462: /** Returns the column number of the end of this event.*/
3463: public int getEndColumnNumber() {
3464: return fEndColumnNumber;
3465: } // getEndColumnNumber():int
3466:
3467: // other information
3468:
3469: /** Returns true if this corresponding event was synthesized. */
3470: public boolean isSynthesized() {
3471: return false;
3472: } // isSynthesize():boolean
3473:
3474: //
3475: // Object methods
3476: //
3477:
3478: /** Returns a string representation of this object. */
3479: public String toString() {
3480: StringBuffer str = new StringBuffer();
3481: str.append(fBeginLineNumber);
3482: str.append(':');
3483: str.append(fBeginColumnNumber);
3484: str.append(':');
3485: str.append(fEndLineNumber);
3486: str.append(':');
3487: str.append(fEndColumnNumber);
3488: return str.toString();
3489: } // toString():String
3490:
3491: } // class LocationItem
3492:
3493: /**
3494: * To detect if 2 encoding are compatible, both must be able to read the meta tag specifying
3495: * the new encoding. This means that the byte representation of some minimal html markup must
3496: * be the same in both encodings
3497: */
3498: boolean isEncodingCompatible(final String encoding1,
3499: final String encoding2) {
3500: final String reference = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=";
3501: try {
3502: final byte[] bytesEncoding1 = reference.getBytes(encoding1);
3503: final byte[] bytesEncoding2 = reference.getBytes(encoding2);
3504: if (bytesEncoding1.length != bytesEncoding2.length) {
3505: return false;
3506: } else {
3507: for (int i = 0; i < bytesEncoding1.length; ++i) {
3508: if (bytesEncoding1[i] != bytesEncoding2[i]) {
3509: return false;
3510: }
3511: }
3512: }
3513:
3514: return true;
3515: } catch (final UnsupportedEncodingException e) {
3516: return false;
3517: }
3518: }
3519: } // class HTMLScanner
|