0001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
0002: // Version 2.5
0003: // Copyright (C) 2007 Martin Jericho
0004: // http://jerichohtml.sourceforge.net/
0005: //
0006: // This library is free software; you can redistribute it and/or
0007: // modify it under the terms of either one of the following licences:
0008: //
0009: // 1. The Eclipse Public License (EPL) version 1.0,
0010: // included in this distribution in the file licence-epl-1.0.html
0011: // or available at http://www.eclipse.org/legal/epl-v10.html
0012: //
0013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
0014: // included in this distribution in the file licence-lgpl-2.1.txt
0015: // or available at http://www.gnu.org/licenses/lgpl.txt
0016: //
0017: // This library is distributed on an "AS IS" basis,
0018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
0019: // See the individual licence texts for more details.
0020:
0021: package au.id.jericho.lib.html;
0022:
0023: import java.util.*;
0024: import java.io.*;
0025: import java.net.*;
0026:
0027: /**
0028: * Represents a source HTML document.
0029: * <p>
0030: * The first step in parsing an HTML document is always to construct a <code>Source</code> object from the source data, which can be a
0031: * <code>String</code>, <code>Reader</code>, <code>InputStream</code> or <code>URL</code>.
0032: * Each constructor uses all the evidence available to determine the original {@linkplain #getEncoding() character encoding} of the data.
0033: * <p>
0034: * Once the <code>Source</code> object has been created, you can immediately start searching for {@linkplain Tag tags} or {@linkplain Element elements} within the document
0035: * using the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
0036: * <p>
0037: * In certain circumstances you may be able to improve performance by calling the {@link #fullSequentialParse()} method before calling any
0038: * <a href="Tag.html#TagSearchMethods">tag search methods</a>. See the documentation of the {@link #fullSequentialParse()} method for details.
0039: * <p>
0040: * Any issues encountered while parsing are logged to a {@link Logger} object.
0041: * The {@link #setLogger(Logger)} method can be used to explicitly set a <code>Logger</code> implementation for a particular <code>Source</code> instance,
0042: * otherwise the static {@link Config#LoggerProvider} property determines how the logger is set by default for all <code>Source</code> instances.
0043: * See the documentation of the {@link Config#LoggerProvider} property for information about how the default logging provider is determined.
0044: * <p>
0045: * Note that many of the useful functions which can be performed on the source document are
0046: * defined in its superclass, {@link Segment}.
0047: * The source object is itself a segment which spans the entire document.
0048: * <p>
0049: * Most of the methods defined in this class are useful for determining the elements and tags
0050: * surrounding or neighbouring a particular character position in the document.
0051: * <p>
0052: * For information on how to create a modified version of this source document, see the {@link OutputDocument} class.
0053: *
0054: * @see Segment
0055: */
0056: public class Source extends Segment {
0057: final String string;
0058: private String documentSpecifiedEncoding = UNINITIALISED;
0059: private String encoding = UNINITIALISED; // null value means no encoding specified.
0060: private String encodingSpecificationInfo;
0061: private String preliminaryEncodingInfo = null;
0062: private String newLine = UNINITIALISED;
0063: private ParseText parseText = null;
0064: private OutputDocument parseTextOutputDocument = null;
0065: Logger logger; // never null
0066: private RowColumnVector[] rowColumnVectorCacheArray = null;
0067: final Cache cache = new Cache(this );
0068: boolean useAllTypesCache = true;
0069: boolean useSpecialTypesCache = true;
0070: // cached result lists:
0071: Tag[] allTagsArray = null; // non-null iff fullSequentialParse was called
0072: List allTags = null; // non-null iff fullSequentialParse was called
0073: List allStartTags = null;
0074: private List allElements = null;
0075:
0076: private static String lastNewLine = null;
0077:
0078: private static final String UNINITIALISED = "";
0079: private static final String CR = "\r";
0080: private static final String LF = "\n";
0081: private static final String CRLF = "\r\n";
0082:
0083: static final String PACKAGE_NAME = "net.htmlparser.jericho"; //Source.class.getPackage().getName();
0084:
0085: /**
0086: * Constructs a new <code>Source</code> object from the specified text.
0087: * @param text the source text.
0088: */
0089: public Source(final CharSequence text) {
0090: super (text.length());
0091: string = text.toString();
0092: setLogger(newLogger());
0093: }
0094:
0095: private Source(final EncodingDetector encodingDetector)
0096: throws IOException {
0097: this (getString(encodingDetector));
0098: encoding = encodingDetector.getEncoding();
0099: encodingSpecificationInfo = encodingDetector
0100: .getEncodingSpecificationInfo();
0101: preliminaryEncodingInfo = encodingDetector
0102: .getPreliminaryEncoding()
0103: + ": "
0104: + encodingDetector
0105: .getPreliminaryEncodingSpecificationInfo();
0106: }
0107:
// Package-private constructor: loads the whole text from the reader and, when an
// encoding name is supplied, records it as the encoding of this document.
Source(final Reader reader, final String encoding)
        throws IOException {
    this(Util.getString(reader));
    if (encoding == null) {
        return; // nothing known: the encoding fields keep their initial values
    }
    encodingSpecificationInfo = "InputStreamReader.getEncoding() of constructor argument";
    this.encoding = encoding;
}
0116:
/**
 * Constructs a new <code>Source</code> object from the content supplied by the specified <code>Reader</code>.
 * <p>
 * When the reader is an <code>InputStreamReader</code>, the value of <code>InputStreamReader.getEncoding()</code>
 * is recorded and subsequently returned by the {@link #getEncoding()} method of the created source object.
 *
 * @param reader the <code>java.io.Reader</code> from which to load the source text.
 * @throws java.io.IOException if an I/O error occurs.
 */
public Source(final Reader reader) throws IOException {
    this(reader,
            !(reader instanceof InputStreamReader)
                    ? null
                    : ((InputStreamReader) reader).getEncoding());
}
0133:
0134: /**
0135: * Constructs a new <code>Source</code> object by loading the content from the specified <code>InputStream</code>.
0136: * <p>
0137: * The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document from the raw bytes
0138: * of the specified input stream is the same as that for the {@link #Source(URL)} constructor,
0139: * except that the first step is not possible as there is no
0140: * <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header to check.
0141: *
0142: * @param inputStream the <code>java.io.InputStream</code> from which to load the source text.
0143: * @throws java.io.IOException if an I/O error occurs.
0144: * @see #getEncoding()
0145: */
public Source(final InputStream inputStream) throws IOException {
    // The stream is wrapped in an EncodingDetector and handed to the private
    // detector-based constructor, which loads the text and copies the encoding details.
    this(new EncodingDetector(inputStream));
}
0149:
0150: /**
0151: * Constructs a new <code>Source</code> object by loading the content from the specified URL.
0152: * <p>
0153: * The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document is as follows:
0154: * <br />(process termination is marked by ♦)
0155: * <ol class="HalfSeparated">
0156: * <li>If the HTTP headers received when opening a connection to the URL include a
0157: * <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header
0158: * specifying a <code>charset</code> parameter, then use the encoding specified in the value of the <code>charset</code> parameter. ♦
0159: * <li>Read the first four bytes of the input stream.
0160: * <li>If the input stream is empty, the created source document has zero length and its {@link #getEncoding()} method
0161: * returns <code>null</code>. ♦
0162: * <li>If the input stream starts with a unicode <a target="_blank" href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a> (BOM),
0163: * then use the encoding signified by the BOM. ♦
0164: * <table class="bordered" cellspacing="0" style="margin: 15px">
0165: * <tr><th>BOM Bytes</th><th>Encoding</th></tr>
0166: * <tr><td><code>EF BB BF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a></tr>
0167: * <tr><td><code>FF FE 00 00</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32</a> (little-endian)</tr>
0168: * <tr><td><code>00 00 FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32</a> (big-endian)</tr>
0169: * <tr><td><code>FF FE</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> (little-endian)</tr>
0170: * <tr><td><code>FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> (big-endian)</tr>
0171: * <tr><td><code>0E FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode">SCSU</a></tr>
0172: * <tr><td><code>2B 2F 76</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-7">UTF-7</a></tr>
0173: * <tr><td><code>DD 73 66 73</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-EBCDIC">UTF-EBCDIC</a></tr>
0174: * <tr><td><code>FB EE 28</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/BOCU-1">BOCU-1</a></tr>
0175: * </table>
0176: * <li>If the stream contains less than four bytes, then:
0177: * <ol class="Unseparated">
0178: * <li>If the stream contains either one or three bytes, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>. ♦
0179: * <li>If the stream starts with a zero byte, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a>. ♦
0180: * <li>If the second byte of the stream is zero, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a>. ♦
0181: * <li>Otherwise use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>. ♦
0182: * </ol>
0183: * <li>Determine a {@linkplain #getPreliminaryEncodingInfo() preliminary encoding} by examining the first four bytes of the input stream.
0184: * See the {@link #getPreliminaryEncodingInfo()} method for details.
0185: * <li>Read the first 2048 bytes of the input stream and decode it using the preliminary encoding to create a "preview segment".
0186: * If the detected preliminary encoding is not supported on this platform, create the preview segment using
0187: * <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> instead (this incident is logged at {@linkplain Logger#warn(String) warn} level).
0188: * <li>Search the preview segment for an <a href="#EncodingSpecification">encoding specification</a>, which should always appear at or near the top of the document.
0189: * <li>If an encoding specification is found:
0190: * <ol class="Unseparated">
0191: * <li>If the specified encoding is supported on this platform, use it. ♦
0192: * <li>If the specified encoding is not supported on this platform, use the encoding that was used to create the preview segment,
0193: * which is normally the detected {@linkplain #getPreliminaryEncodingInfo() preliminary encoding}. ♦
0194: * </ol>
0195: * <li>If the document {@linkplain #isXML() looks like XML}, then use <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. ♦
0196: * <br/>Section <a target="_blank" href="http://www.w3.org/TR/REC-xml/#charencoding">4.3.3</a> of the XML 1.0 specification states that
0197: * an XML file that is not encoded in UTF-8 must contain either a UTF-16 <a target="_blank" href="http://en.wikipedia.org/wiki/Byte_Order_Mark">BOM</a>
0198: * or an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#IDAS4MS">encoding declaration</a> in its {@linkplain StartTagType#XML_DECLARATION XML declaration}.
0199: * Since neither of these was detected, we can assume the encoding is <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
0200: * <li>Use the encoding that was used to create the preview segment, which is normally the detected {@linkplain #getPreliminaryEncodingInfo() preliminary encoding}. ♦
0201: * <br />This is the best guess, in the absence of any explicit information about the encoding, based on the first four bytes of the stream.
0202: * The <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1">HTTP protocol section 3.7.1</a>
0203: * states that an encoding of <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> can be assumed
0204: * if no <code>charset</code> parameter was included in the HTTP
0205: * <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header.
0206: * This is consistent with the preliminary encoding detected in this scenario.
0207: * </ol>
0208: *
0209: * @param url the URL from which to load the source text.
0210: * @throws java.io.IOException if an I/O error occurs.
0211: * @see #getEncoding()
0212: */
public Source(final URL url) throws IOException {
    // The URL is wrapped in an EncodingDetector and handed to the private
    // detector-based constructor, which loads the text and copies the encoding details.
    this(new EncodingDetector(url));
}
0216:
0217: private String setEncoding(final String encoding,
0218: final String encodingSpecificationInfo) {
0219: if (this .encoding == UNINITIALISED) {
0220: this .encoding = encoding;
0221: this .encodingSpecificationInfo = encodingSpecificationInfo;
0222: }
0223: return encoding;
0224: }
0225:
0226: /**
0227: * Returns the document {@linkplain #getEncoding() encoding} specified within the text of the document.
0228: * <p>
0229: * The document encoding can be specified within the document text in two ways.
0230: * They are referred to generically in this library as an <i><a name="EncodingSpecification">encoding specification</a></i>,
0231: * and are listed below in order of precedence:
0232: * <ol class="HalfSeparated">
0233: * <li>
0234: * An <a target="_blank" href="http://www.w3.org/TR/REC-xml/#IDAS4MS">encoding declaration</a> within the
0235: * {@linkplain StartTagType#XML_DECLARATION XML declaration} of an XML document,
0236: * which must be present if it has an encoding other than <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>
0237: * or <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>.
0238: * <pre><?xml version="1.0" encoding="ISO-8859-1" ?></pre>
0239: * <li>
0240: * A <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#spec-char-encoding">META declaration</a>,
0241: * which is in the form of a {@link HTMLElementName#META META} tag with attribute <code>http-equiv="Content-Type"</code>.
0242: * The encoding is specified in the <code>charset</code> parameter of a
0243: * <code><a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a></code>
0244: * HTTP header value, which is placed in the value of the meta tag's <code>content</code> attribute.
0245: * This META declaration should appear as early as possible in the {@link HTMLElementName#HEAD HEAD} element.
0246: * <pre><META http-equiv=Content-Type content="text/html; charset=iso-8859-1"></pre>
0247: * </ol>
0248: * <p>
0249: * Both of these tags must only use characters in the range U+0000 to U+007F, and in the case of the META declaration
0250: * must use ASCII encoding. This, along with the fact that they must occur at or near the beginning of the document,
0251: * assists in their detection and decoding without the need to know the exact encoding of the full text.
0252: *
0253: * @return the document {@linkplain #getEncoding() encoding} specified within the text of the document, or <code>null</code> if no encoding is specified.
0254: * @see #getEncoding()
0255: */
0256: public String getDocumentSpecifiedEncoding() {
0257: if (documentSpecifiedEncoding != UNINITIALISED)
0258: return documentSpecifiedEncoding;
0259: final Tag xmlDeclarationTag = getTagAt(0);
0260: if (xmlDeclarationTag != null
0261: && xmlDeclarationTag.getTagType() == StartTagType.XML_DECLARATION) {
0262: documentSpecifiedEncoding = ((StartTag) xmlDeclarationTag)
0263: .getAttributeValue("encoding");
0264: if (documentSpecifiedEncoding != null)
0265: return setEncoding(documentSpecifiedEncoding,
0266: xmlDeclarationTag.toString());
0267: }
0268: // Check for Content-Type http-equiv meta tag:
0269: final StartTag contentTypeMetaTag = findNextStartTag(0,
0270: "http-equiv", "Content-Type", false);
0271: if (contentTypeMetaTag != null) {
0272: final String contentValue = contentTypeMetaTag
0273: .getAttributeValue("content");
0274: if (contentValue != null) {
0275: documentSpecifiedEncoding = getCharsetParameterFromHttpHeaderValue(contentValue);
0276: if (documentSpecifiedEncoding != null)
0277: return setEncoding(documentSpecifiedEncoding,
0278: contentTypeMetaTag.toString());
0279: }
0280: }
0281: return setEncoding(null, "No encoding specified in document");
0282: }
0283:
0284: /**
0285: * Returns the character encoding scheme of the source byte stream used to create this object.
0286: * <p>
0287: * The encoding of a document defines how the original byte stream was encoded into characters.
0288: * The <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.4">HTTP specification section 3.4</a>
0289: * uses the term "character set" to refer to the encoding, and the term "charset" is similarly used in Java
0290: * (see the class <code>java.nio.charset.Charset</code>).
0291: * This often causes confusion, as a modern "coded character set" such as <a target="_blank" href="http://www.unicode.org/">Unicode</a>
0292: * can have several encodings, such as <a target="_blank" href="http://www.unicode.org/faq/utf_bom.html">UTF-8, UTF-16, and UTF-32</a>.
0293: * See the Wikipedia <a target="_blank" href="http://en.wikipedia.org/wiki/Character_encoding">character encoding</a> article
0294: * for an explanation of the terminology.
0295: * <p>
0296: * This method makes the best possible effort to return the name of the encoding used to decode the original source byte stream
0297: * into character data. This decoding takes place in the constructor when a parameter based on a byte stream such as an
0298: * <code>InputStream</code> or <code>URL</code> is used to specify the source text.
0299: * The documentation of the {@link #Source(InputStream)} and {@link #Source(URL)} constructors describes how the return value of this
0300: * method is determined in these cases.
0301: * It is also possible in some circumstances for the encoding to be determined in the {@link #Source(Reader)} constructor.
0302: * <p>
0303: * If a constructor was used that specifies the source text directly in character form (not requiring the decoding of a byte sequence)
0304: * then the document itself is searched for an <a href="#EncodingSpecification">encoding specification</a>. In this case, this
0305: * method returns the same value as the {@link #getDocumentSpecifiedEncoding()} method.
0306: * <p>
0307: * The {@link #getEncodingSpecificationInfo()} method returns a simple description of how the value of this method was determined.
0308: *
0309: * @return the character encoding scheme of the source byte stream used to create this object, or <code>null</code> if the encoding is not known.
0310: * @see #getEncodingSpecificationInfo()
0311: */
0312: public String getEncoding() {
0313: if (encoding == UNINITIALISED)
0314: getDocumentSpecifiedEncoding();
0315: return encoding;
0316: }
0317:
0318: /**
0319: * Returns a concise description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
0320: * <p>
0321: * The description is intended for informational purposes only.
0322: * It is not guaranteed to have any particular format and can not be reliably parsed.
0323: *
0324: * @return a concise description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
0325: * @see #getEncoding()
0326: */
0327: public String getEncodingSpecificationInfo() {
0328: if (encoding == UNINITIALISED)
0329: getDocumentSpecifiedEncoding();
0330: return encodingSpecificationInfo;
0331: }
0332:
0333: /**
0334: * Returns the preliminary encoding of the source document together with a concise description of how it was determined.
0335: * <p>
0336: * It is sometimes necessary for the {@link #Source(InputStream)} and {@link #Source(URL)} constructors to search the document for an
0337: * <a href="#EncodingSpecification">encoding specification</a> in order to determine the exact {@linkplain #getEncoding() encoding}
0338: * of the source byte stream.
0339: * <p>
0340: * In order to search for the {@linkplain #getDocumentSpecifiedEncoding() document specified encoding} before the exact encoding is known,
0341: * a <i>preliminary encoding</i> is determined using the first four bytes of the input stream.
0342: * <p>
0343: * Because the encoding specification must only use characters in the range U+0000 to U+007F, the preliminary encoding need only have the following
0344: * basic properties determined:
0345: * <ul>
0346: * <li>Code unit size (8-bit, 16-bit or 32-bit)
0347: * <li>Byte order (big-endian or little-endian) if the code unit size is 16-bit or 32-bit
0348: * <li>Basic encoding of characters in the range U+0000 to U+007F (current implementation only distinguishes between ASCII and EBCDIC)
0349: * </ul>
0350: * <p>
0351: * The encodings used to represent the most commonly encountered combinations of these basic properties are:
0352: * <ul>
0353: * <li><a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>: 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible encoding
0354: * <li><a target="_blank" href="http://en.wikipedia.org/wiki/EBCDIC_037">Cp037</a>: 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a>-compatible encoding
0355: * <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a>: 16-bit big-endian encoding
0356: * <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a>: 16-bit little-endian encoding
0357: * <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32BE</a>: 32-bit big-endian encoding (not supported on most java platforms)
0358: * <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32LE</a>: 32-bit little-endian encoding (not supported on most java platforms)
0359: * </ul>
0360: * Note: all encodings with a code unit size greater than 8 bits are assumed to use an
0361: * <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible low-order byte.
0362: * <p>
0363: * In some descriptions returned by this method, and the documentation below, a pattern is used to help demonstrate the contents of the first four bytes of the stream.
0364: * The patterns use the characters "<code>00</code>" to signify a zero byte, "<code>XX</code>" to signify a non-zero byte, and "<code>??</code>" to signify
0365: * a byte that can be either zero or non-zero.
0366: * <p>
0367: * The algorithm for determining the preliminary encoding is as follows:
0368: * <ol class="HalfSeparated">
0369: * <li>Byte pattern "<code>00 00</code>..." : If the stream starts with two zero bytes, the default 32-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32BE</a> is used.
0370: * <li>Byte pattern "<code>00 XX</code>..." : If the stream starts with a single zero byte, the default 16-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a> is used.
0371: * <li>Byte pattern "<code>XX ?? 00 00</code>..." : If the third and fourth bytes of the stream are zero, the default 32-bit little-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32LE</a> is used.
0372: * <li>Byte pattern "<code>XX 00</code>..." or "<code>XX ?? XX 00</code>..." : If the second or fourth byte of the stream is zero, the default 16-bit little-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a> is used.
0373: * <li>Byte pattern "<code>XX XX 00 XX</code>..." : If the third byte of the stream is zero, the default 16-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a> is used (assumes the first character is > U+00FF).
0374: * <li>Byte pattern "<code>4C XX XX XX</code>..." : If the first four bytes are consistent with the <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a> encoding of
0375: * an {@linkplain StartTagType#XML_DECLARATION XML declaration} ("<code><?xm</code>") or
0376: * a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} ("<code><!DO</code>"),
0377: * or any other string starting with the EBCDIC character '<' followed by three non-ASCII characters (8th bit set),
0378: * which is consistent with EBCDIC alphanumeric characters,
0379: * the default <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a>-compatible encoding
0380: * <a target="_blank" href="http://en.wikipedia.org/wiki/EBCDIC_037">Cp037</a> is used.
0381: * <li>Byte pattern "<code>XX XX XX XX</code>..." : Otherwise, if all of the first four bytes of the stream are non-zero,
0382: * the default 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible encoding
0383: * <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> is used.
0384: * </ol>
0385: * <p>
0386: * If it was not necessary to search for a {@linkplain #getDocumentSpecifiedEncoding() document specified encoding} when determining the
0387: * {@linkplain #getEncoding() encoding} of this source document from a byte stream, this method returns <code>null</code>.
0388: * <p>
0389: * See the documentation of the {@link #Source(InputStream)} and {@link #Source(URL)} constructors for more detailed information about when the detection of a
0390: * preliminary encoding is required.
0391: * <p>
0392: * The description returned by this method is intended for informational purposes only.
0393: * It is not guaranteed to have any particular format and can not be reliably parsed.
0394: *
0395: * @return the preliminary encoding of the source document together with a concise description of how it was determined, or <code>null</code> if no preliminary encoding was required.
0396: * @see #getEncoding()
0397: */
0398: public String getPreliminaryEncodingInfo() {
0399: return preliminaryEncodingInfo;
0400: }
0401:
0402: /**
0403: * Indicates whether the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>.
0404: * <p>
0405: * The algorithm used to determine this is designed to be relatively inexpensive and to provide an accurate result in
0406: * most normal situations.
0407: * An exact determination of whether the source document is XML would require a much more complex analysis of the text.
0408: * <p>
0409: * The algorithm is as follows:
0410: * <ol class="HalfSeparated">
0411: * <li>If the document begins with an {@linkplain StartTagType#XML_DECLARATION XML declaration}, it is an XML document.
0412: * <li>If the document contains a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} that contains the text
0413: * "<code>xhtml</code>", it is an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document, and hence
0414: * also an XML document.
0415: * <li>If none of the above conditions are met, assume the document is normal HTML, and therefore not an XML document.
0416: * </ol>
0417: * <p>
0418: * As of version 2.5, this method no longer returns <code>true</code> if the document doesn't contain an {@link HTMLElementName#HTML HTML} element.
0419: * The library is often used to parse partial HTML documents, so the lack of an {@link HTMLElementName#HTML HTML} element is not a reliable test for an XML document.
0420: *
0421: * @return <code>true</code> if the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>, otherwise <code>false</code>.
0422: */
0423: public boolean isXML() {
0424: final Tag xmlDeclarationTag = getTagAt(0);
0425: if (xmlDeclarationTag != null
0426: && xmlDeclarationTag.getTagType() == StartTagType.XML_DECLARATION)
0427: return true;
0428: final Tag doctypeTag = findNextTag(0,
0429: StartTagType.DOCTYPE_DECLARATION);
0430: // if document has a DOCTYPE declaration and it contains the text "xhtml", it is an XML document:
0431: if (doctypeTag != null
0432: && getParseText().indexOf("xhtml", doctypeTag.begin,
0433: doctypeTag.end) != -1)
0434: return true;
0435: return false;
0436: }
0437:
0438: /**
0439: * Returns the <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> character sequence used in the source document.
0440: * <p>
0441: * If the document does not contain any newline characters, this method returns <code>null</code>.
0442: * <p>
0443: * The three possible return values (aside from <code>null</code>) are <code>"\n"</code>, <code>"\r\n"</code> and <code>"\r"</code>.
0444: *
0445: * @return the <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> character sequence used in the source document, or <code>null</code> if none is present.
0446: */
0447: public String getNewLine() {
0448: if (newLine != UNINITIALISED)
0449: return newLine;
0450: for (int i = 0; i < end; i++) {
0451: char ch = string.charAt(i);
0452: if (ch == '\n')
0453: return newLine = lastNewLine = LF;
0454: if (ch == '\r')
0455: return newLine = lastNewLine = (++i < end && string
0456: .charAt(i) == '\n') ? CRLF : CR;
0457: }
0458: return newLine = null;
0459: }
0460:
0461: String getBestGuessNewLine() {
0462: final String newLine = getNewLine();
0463: if (newLine != null)
0464: return newLine;
0465: if (lastNewLine != null)
0466: return lastNewLine;
0467: return Config.NewLine;
0468: }
0469:
0470: /**
0471: * Returns the row number of the specified character position in the source document.
0472: * @param pos the position in the source document.
0473: * @return the row number of the specified character position in the source document.
0474: * @throws IndexOutOfBoundsException if the specified position is not within the bounds of the document.
0475: * @see #getColumn(int pos)
0476: * @see #getRowColumnVector(int pos)
0477: */
0478: public int getRow(final int pos) {
0479: return getRowColumnVector(pos).getRow();
0480: }
0481:
0482: /**
0483: * Returns the column number of the specified character position in the source document.
0484: * @param pos the position in the source document.
0485: * @return the column number of the specified character position in the source document.
0486: * @throws IndexOutOfBoundsException if the specified position is not within the bounds of the document.
0487: * @see #getRow(int pos)
0488: * @see #getRowColumnVector(int pos)
0489: */
0490: public int getColumn(final int pos) {
0491: return getRowColumnVector(pos).getColumn();
0492: }
0493:
0494: /**
0495: * Returns a {@link RowColumnVector} object representing the row and column number of the specified character position in the source document.
0496: * @param pos the position in the source document.
0497: * @return a {@link RowColumnVector} object representing the row and column number of the specified character position in the source document.
0498: * @throws IndexOutOfBoundsException if the specified position is not within the bounds of the document.
0499: * @see #getRow(int pos)
0500: * @see #getColumn(int pos)
0501: */
0502: public RowColumnVector getRowColumnVector(final int pos) {
0503: if (pos > end)
0504: throw new IndexOutOfBoundsException();
0505: if (rowColumnVectorCacheArray == null)
0506: rowColumnVectorCacheArray = RowColumnVector
0507: .getCacheArray(this );
0508: return RowColumnVector.get(rowColumnVectorCacheArray, pos);
0509: }
0510:
0511: /**
0512: * Returns the source text as a <code>String</code>.
0513: * @return the source text as a <code>String</code>.
0514: */
0515: public String toString() {
0516: return string;
0517: }
0518:
0519: /**
0520: * Parses all of the {@linkplain Tag tags} in this source document sequentially from beginning to end.
0521: * <p>
0522: * Calling this method can greatly improve performance if most or all of the tags in the document need to be parsed.
0523: * <p>
0524: * Calling the {@link #findAllTags()}, {@link #findAllStartTags()}, {@link #findAllElements()} or {@link #getChildElements()} method on the <code>Source</code> object
0525: * performs a full sequential parse automatically.
0526: * There are however still circumstances where it should be called manually, such as when it is known that most or all of the tags in the document will need to be parsed,
0527: * but none of the abovementioned methods are used, or are called only after calling one or more other <a href="Tag.html#TagSearchMethods">tag search methods</a>.
0528: * <p>
0529: * If this method is called manually, it should be called soon after the <code>Source</code> object is created,
0530: * before any <a href="Tag.html#TagSearchMethods">tag search methods</a> are called.
0531: * <p>
0532: * By default, tags are parsed only as needed, which is referred to as <i><a name="ParseOnDemand">parse on demand</a></i> mode.
0533: * In this mode, every call to a tag search method that is not returning previously cached tags must perform a relatively complex check to determine whether a
0534: * potential tag is in a {@linkplain TagType#isValidPosition(Source,int,int[]) valid position}.
0535: * <p>
0536: * Generally speaking, a tag is in a valid position if it does not appear inside any other tag.
0537: * {@linkplain TagType#isServerTag() Server tags} can appear anywhere in a document, including inside other tags, so this relates only to non-server tags.
0538: * Theoretically, checking whether a specified position in the document is enclosed in another tag is only possible if every preceding tag has been parsed,
0539: * otherwise it is impossible to tell whether one of the delimiters of the enclosing tag was in fact enclosed by some other tag before it, thereby invalidating it.
0540: * <p>
0541: * When this method is called, each tag is parsed in sequence starting from the beginning of the document, making it easy to check whether each potential
0542: * tag is in a valid position.
0543: * In <i>parse on demand</i> mode a compromise technique must be used for this check, since the theoretical requirement of having parsed all preceding tags
0544: * is no longer practical.
0545: * This compromise involves only checking whether the position is enclosed by other tags with {@linkplain TagType#getTagTypesIgnoringEnclosedMarkup() certain tag types}.
0546: * The added complexity of this technique makes parsing each tag slower compared to when a full sequential parse is performed, but when only a few tags need
0547: * parsing this is an extremely beneficial trade-off.
0548: * <p>
0549: * The documentation of the {@link TagType#isValidPosition(Source, int pos, int[] fullSequentialParseData)} method,
0550: * which is called internally by the parser to perform the valid position check,
0551: * includes a more detailed explanation of the differences between the two modes of operation.
0552: * <p>
0553: * Calling this method a second or subsequent time has no effect.
0554: * <p>
0555: * This method returns the same list of tags as the {@link Source#findAllTags() Source.findAllTags()} method, but as an array instead of a list.
0556: * <p>
0557: * If this method is called after any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are called,
0558: * the {@linkplain #getCacheDebugInfo() cache} is cleared of any previously found tags before being restocked via the full sequential parse.
0559: * This is significant if the {@link Segment#ignoreWhenParsing()} method has been called since the tags were first found, as any tags inside the
0560: * ignored segments will no longer be returned by any of the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
0561: * <p>
0562: * See also the {@link Tag} class documentation for more general details about how tags are parsed.
0563: *
0564: * @return an array of all {@linkplain Tag tags} in this source document.
0565: */
0566: public Tag[] fullSequentialParse() {
0567: // The assumeNoNestedTags flag tells the parser not to bother checking for tags inside other tags
0568: // if the user knows that the document doesn't contain any server tags.
0569: // This results in a more efficient search, but the difference during benchmark tests was only minimal -
0570: // about 12% speed improvement in a 1MB document containing 70,000 tags, 75% of which were inside a comment tag.
0571: // With such a small improvement in a document specifically designed to show an an exaggerated improvement,
0572: // it is not worth documenting this feature.
0573: // The flag has been retained internally however as it does not have a measurable performance impact to check for it.
0574: if (allTagsArray != null)
0575: return allTagsArray;
0576: final boolean assumeNoNestedTags = false;
0577: if (cache.getTagCount() != 0)
0578: cache.clear();
0579: final boolean useAllTypesCacheSave = useAllTypesCache;
0580: try {
0581: useAllTypesCache = false;
0582: useSpecialTypesCache = false;
0583: return Tag.parseAll(this , assumeNoNestedTags);
0584: } finally {
0585: useAllTypesCache = useAllTypesCacheSave;
0586: useSpecialTypesCache = true;
0587: }
0588: }
0589:
0590: /**
0591: * Returns a list of the top-level {@linkplain Element elements} in the document element hierarchy.
0592: * <p>
0593: * The objects in the list are all of type {@link Element}.
0594: * <p>
0595: * The term <i><a name="TopLevelElement">top-level element</a></i> refers to an element that is not nested within any other element in the document.
0596: * <p>
0597: * The term <i><a name="DocumentElementHierarchy">document element hierarchy</a></i> refers to the hierarchy of elements that make up this source document.
0598: * The source document itself is not considered to be part of the hierarchy, meaning there is typically more than one top-level element.
0599: * Even when the source represents an entire HTML document, the {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} and/or an
0600: * {@linkplain StartTagType#XML_DECLARATION XML declaration} often exist as top-level elements along with the {@link HTMLElementName#HTML HTML} element itself.
0601: * <p>
0602: * The {@link Element#getChildElements()} method can be used to get the children of the top-level elements, with recursive use providing a means to
0603: * visit every element in the document hierarchy.
0604: * <p>
0605: * The document element hierarchy differs from that of the <a target="_blank" href="http://en.wikipedia.org/wiki/Document_Object_Model">Document Object Model</a>
0606: * in that it is only a representation of the elements that are physically present in the source text. Unlike the DOM, it does not include any "implied" HTML elements
0607: * such as {@link HTMLElementName#TBODY TBODY} if they are not present in the source text.
0608: * <p>
0609: * Elements formed from {@linkplain TagType#isServerTag() server tags} are not included in the hierarchy at all.
0610: * <p>
0611: * Structural errors in this source document such as overlapping elements are reported in the {@linkplain #getLogger() log}.
0612: * When elements are found to overlap, the position of the start tag determines the location of the element in the hierarchy.
0613: * <p>
0614: * Calling this method on the <code>Source</code> object performs a {@linkplain #fullSequentialParse() full sequential parse} automatically.
0615: * <p>
0616: * A visual representation of the document element hierarchy can be obtained by calling:<br />
0617: * {@link #getSourceFormatter()}<code>.</code>{@link SourceFormatter#setIndentAllElements(boolean) setIndentAllElements(true)}<code>.</code>{@link SourceFormatter#setCollapseWhiteSpace(boolean) setCollapseWhiteSpace(true)}<code>.</code>{@link SourceFormatter#setTidyTags(boolean) setTidyTags(true)}<code>.</code>{@link SourceFormatter#toString() toString()}
0618: *
0619: * @return a list of the top-level {@linkplain Element elements} in the document element hierarchy, guaranteed not <code>null</code>.
0620: * @see Element#getParentElement()
0621: * @see Element#getChildElements()
0622: * @see Element#getDepth()
0623: */
0624: public List getChildElements() {
0625: if (childElements == null) {
0626: if (length() == 0) {
0627: childElements = Collections.EMPTY_LIST;
0628: } else {
0629: if (allTags == null)
0630: fullSequentialParse();
0631: childElements = new ArrayList();
0632: int pos = 0;
0633: while (true) {
0634: final StartTag childStartTag = source
0635: .findNextStartTag(pos);
0636: if (childStartTag == null)
0637: break;
0638: if (!Config.IncludeServerTagsInElementHierarchy
0639: && childStartTag.getTagType().isServerTag()) {
0640: pos = childStartTag.end;
0641: continue;
0642: }
0643: final Element childElement = childStartTag
0644: .getElement();
0645: childElement.getChildElements(0);
0646: if (childElement.parentElement == Element.NOT_CACHED) { // make sure element was not added as a child of a descendent element (can happen with overlapping elements)
0647: childElement.parentElement = null;
0648: childElements.add(childElement);
0649: }
0650: pos = childElement.end;
0651: }
0652: }
0653: }
0654: return childElements;
0655: }
0656:
0657: /**
0658: * Formats the HTML source by laying out each non-inline-level element on a new line with an appropriate indent.
0659: * <p>
0660: * The output format can be configured by setting any number of properties on the returned {@link SourceFormatter} instance before
0661: * {@linkplain SourceFormatter#writeTo(Writer) obtaining its output}.
0662: * <p>
0663: * To create a <code>SourceFormatter</code> instance based on a {@link Segment} rather than an entire <code>Source</code> document,
0664: * use {@linkplain SourceFormatter#SourceFormatter(Segment) new SourceFormatter(segment)} instead.
0665: *
0666: * @return an instance of {@link SourceFormatter} based on this source document.
0667: */
0668: public SourceFormatter getSourceFormatter() {
0669: return new SourceFormatter(this );
0670: }
0671:
0672: /**
0673: * Returns a list of all {@linkplain Tag tags} in this source document.
0674: * <p>
0675: * Calling this method on the <code>Source</code> object performs a {@linkplain #fullSequentialParse() full sequential parse} automatically.
0676: * <p>
0677: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0678: *
0679: * @return a list of all {@linkplain Tag tags} in this source document.
0680: */
0681: public List findAllTags() {
0682: if (allTags == null)
0683: fullSequentialParse();
0684: return allTags;
0685: }
0686:
0687: /**
0688: * Returns a list of all {@linkplain StartTag start tags} in this source document.
0689: * <p>
0690: * Calling this method on the <code>Source</code> object performs a {@linkplain #fullSequentialParse() full sequential parse} automatically.
0691: * <p>
0692: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0693: *
0694: * @return a list of all {@linkplain StartTag start tags} in this source document.
0695: */
0696: public List findAllStartTags() {
0697: if (allStartTags == null) {
0698: final List allTags = findAllTags();
0699: allStartTags = new ArrayList(allTags.size());
0700: for (final Iterator i = allTags.iterator(); i.hasNext();) {
0701: final Object next = i.next();
0702: if (next instanceof StartTag)
0703: allStartTags.add(next);
0704: }
0705: }
0706: return allStartTags;
0707: }
0708:
0709: /**
0710: * Returns a list of all {@linkplain Element elements} in this source document.
0711: * <p>
0712: * Calling this method on the <code>Source</code> object performs a {@linkplain #fullSequentialParse() full sequential parse} automatically.
0713: * <p>
0714: * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags()} method.
0715: *
0716: * @return a list of all {@linkplain Element elements} in this source document.
0717: */
0718: public List findAllElements() {
0719: if (allElements == null) {
0720: final List allStartTags = findAllStartTags();
0721: if (allStartTags.isEmpty())
0722: return Collections.EMPTY_LIST;
0723: allElements = new ArrayList(allStartTags.size());
0724: for (final Iterator i = allStartTags.iterator(); i
0725: .hasNext();) {
0726: final StartTag startTag = (StartTag) i.next();
0727: allElements.add(startTag.getElement());
0728: }
0729: }
0730: return allElements;
0731: }
0732:
0733: /**
0734: * Returns the {@link Element} with the specified <code>id</code> attribute value.
0735: * <p>
0736: * This simulates the script method
0737: * <code><a target="_blank" href="http://www.w3.org/TR/1998/REC-DOM-Level-1-19981001/level-one-html.html#ID-36113835">getElementById</a></code>
0738: * defined in DOM HTML level 1.
0739: * <p>
0740: * This is equivalent to {@link #findNextStartTag(int,String,String,boolean) findNextStartTag}<code>(0,"id",id,true).</code>{@link StartTag#getElement() getElement()}, assuming that the element exists.
0741: * <p>
0742: * A well formed HTML document should have no more than one element with any given <code>id</code> attribute value.
0743: *
0744: * @param id the <code>id</code> attribute value (case sensitive) to search for, must not be <code>null</code>.
0745: * @return the {@link Element} with the specified <code>id</code> attribute value, or <code>null</code> if no such element exists.
0746: */
0747: public Element getElementById(final String id) {
0748: final StartTag startTag = findNextStartTag(0, Attribute.ID, id,
0749: true);
0750: return startTag == null ? null : startTag.getElement();
0751: }
0752:
0753: /**
0754: * Returns the {@link Tag} at the specified position in the source document.
0755: * <p>
0756: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0757: * <p>
0758: * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags.
0759: *
0760: * @param pos the position in the source document, may be out of bounds.
0761: * @return the {@link Tag} at the specified position in the source document, or <code>null</code> if no tag exists at the specified position or it is out of bounds.
0762: */
0763: public final Tag getTagAt(final int pos) {
0764: return Tag.getTagAt(this , pos);
0765: }
0766:
0767: /**
0768: * Returns the {@link Tag} beginning at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
0769: * <p>
0770: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0771: *
0772: * @param pos the position in the source document from which to start the search, may be out of bounds.
0773: * @return the {@link Tag} beginning at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0774: */
0775: public Tag findPreviousTag(final int pos) {
0776: return Tag.findPreviousOrNextTag(this , pos, true);
0777: }
0778:
0779: /**
0780: * Returns the {@link Tag} of the specified {@linkplain TagType type} beginning at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
0781: * <p>
0782: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0783: *
0784: * @param pos the position in the source document from which to start the search, may be out of bounds.
0785: * @param tagType the <code>TagType</code> to search for.
0786: * @return the {@link Tag} with the specified {@linkplain TagType type} beginning at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0787: */
0788: public Tag findPreviousTag(final int pos, final TagType tagType) {
0789: return Tag.findPreviousOrNextTag(this , pos, tagType, true);
0790: }
0791:
0792: /**
0793: * Returns the {@link Tag} beginning at or immediately following the specified position in the source document.
0794: * <p>
0795: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0796: * <p>
0797: * Use {@link Tag#findNextTag()} to find the tag immediately following another tag.
0798: *
0799: * @param pos the position in the source document from which to start the search, may be out of bounds.
0800: * @return the {@link Tag} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0801: */
0802: public Tag findNextTag(final int pos) {
0803: return Tag.findPreviousOrNextTag(this , pos, false);
0804: }
0805:
0806: /**
0807: * Returns the {@link Tag} of the specified {@linkplain TagType type} beginning at or immediately following the specified position in the source document.
0808: * <p>
0809: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0810: *
0811: * @param pos the position in the source document from which to start the search, may be out of bounds.
0812: * @param tagType the <code>TagType</code> to search for.
0813: * @return the {@link Tag} with the specified {@linkplain TagType type} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0814: */
0815: public Tag findNextTag(final int pos, final TagType tagType) {
0816: return Tag.findPreviousOrNextTag(this , pos, tagType, false);
0817: }
0818:
0819: /**
0820: * Returns the {@link Tag} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
0821: * <p>
0822: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0823: *
0824: * @param pos the position in the source document, may be out of bounds.
0825: * @return the {@link Tag} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if the position is not within a tag or is out of bounds.
0826: */
0827: public Tag findEnclosingTag(final int pos) {
0828: return findEnclosingTag(pos, null);
0829: }
0830:
0831: /**
0832: * Returns the {@link Tag} of the specified {@linkplain TagType type} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
0833: * <p>
0834: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0835: *
0836: * @param pos the position in the source document, may be out of bounds.
0837: * @param tagType the <code>TagType</code> to search for.
0838: * @return the {@link Tag} of the specified {@linkplain TagType type} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if the position is not within a tag of the specified type or is out of bounds.
0839: */
0840: public Tag findEnclosingTag(final int pos, final TagType tagType) {
0841: final Tag tag = findPreviousTag(pos, tagType);
0842: if (tag == null || tag.end <= pos)
0843: return null;
0844: return tag;
0845: }
0846:
0847: /**
0848: * Returns the {@link Element} beginning at or immediately following the specified position in the source document.
0849: * <p>
0850: * This is equivalent to {@link #findNextStartTag(int) findNextStartTag(pos)}<code>.</code>{@link StartTag#getElement() getElement()},
0851: * assuming the result is not <code>null</code>.
0852: *
0853: * @param pos the position in the source document from which to start the search, may be out of bounds.
0854: * @return the {@link Element} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0855: */
0856: public Element findNextElement(final int pos) {
0857: final StartTag startTag = findNextStartTag(pos);
0858: return startTag == null ? null : startTag.getElement();
0859: }
0860:
0861: /**
0862: * Returns the {@link Element} with the specified {@linkplain Element#getName() name} beginning at or immediately following the specified position in the source document.
0863: * <p>
0864: * This is equivalent to {@link #findNextStartTag(int,String) findNextStartTag(pos,name)}<code>.</code>{@link StartTag#getElement() getElement()},
0865: * assuming the result is not <code>null</code>.
0866: * <p>
0867: * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to
0868: * {@link #findNextElement(int) findNextElement(pos)}.
0869: * <p>
0870: * Specifying an argument to the <code>name</code> parameter that ends in a colon (<code>:</code>) searches for all elements
0871: * in the specified XML namespace.
0872: * <p>
0873: * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
0874: *
0875: * @param pos the position in the source document from which to start the search, may be out of bounds.
0876: * @param name the {@linkplain Element#getName() name} of the element to search for.
0877: * @return the {@link Element} with the specified {@linkplain Element#getName() name} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0878: */
0879: public Element findNextElement(final int pos, String name) {
0880: final StartTag startTag = findNextStartTag(pos, name);
0881: return startTag == null ? null : startTag.getElement();
0882: }
0883:
0884: /**
0885: * Returns the {@link Element} with the specified attribute name/value pair beginning at or immediately following the specified position in the source document.
0886: * <p>
0887: * This is equivalent to {@link #findNextStartTag(int,String,String,boolean) findNextStartTag(pos,attributeName,value,valueCaseSensitive)}<code>.</code>{@link StartTag#getElement() getElement()},
0888: * assuming the result is not <code>null</code>.
0889: *
0890: * @param pos the position in the source document from which to start the search, may be out of bounds.
0891: * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
0892: * @param value the value of the specified attribute to search for, must not be <code>null</code>.
0893: * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
0894: * @return the {@link Element} with the specified attribute name/value pair beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0895: */
0896: public Element findNextElement(final int pos,
0897: final String attributeName, final String value,
0898: final boolean valueCaseSensitive) {
0899: final StartTag startTag = findNextStartTag(pos, attributeName,
0900: value, valueCaseSensitive);
0901: return startTag == null ? null : startTag.getElement();
0902: }
0903:
0904: /**
0905: * Returns the {@link StartTag} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
0906: * <p>
0907: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0908: *
0909: * @param pos the position in the source document from which to start the search, may be out of bounds.
0910: * @return the {@link StartTag} at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0911: */
0912: public StartTag findPreviousStartTag(final int pos) {
0913: return StartTag.findPreviousOrNext(this , pos, true);
0914: }
0915:
0916: /**
0917: * Returns the {@link StartTag} with the specified {@linkplain StartTag#getName() name} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
0918: * <p>
0919: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0920: * <p>
0921: * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to
0922: * {@link #findPreviousStartTag(int) findPreviousStartTag(pos)}.
0923: * <p>
0924: * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
0925: *
0926: * @param pos the position in the source document from which to start the search, may be out of bounds.
0927: * @param name the {@linkplain StartTag#getName() name} of the start tag to search for.
0928: * @return the {@link StartTag} with the specified {@linkplain StartTag#getName() name} at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0929: */
0930: public StartTag findPreviousStartTag(final int pos, String name) {
0931: if (name != null)
0932: name = name.toLowerCase();
0933: final boolean isXMLTagName = Tag.isXMLName(name);
0934: return StartTag.findPreviousOrNext(this , pos, name,
0935: isXMLTagName, true);
0936: }
0937:
0938: /**
0939: * Returns the {@link StartTag} beginning at or immediately following the specified position in the source document.
0940: * <p>
0941: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0942: *
0943: * @param pos the position in the source document from which to start the search, may be out of bounds.
0944: * @return the {@link StartTag} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0945: */
0946: public StartTag findNextStartTag(final int pos) {
0947: return StartTag.findPreviousOrNext(this , pos, false);
0948: }
0949:
0950: /**
0951: * Returns the {@link StartTag} with the specified {@linkplain StartTag#getName() name} beginning at or immediately following the specified position in the source document.
0952: * <p>
0953: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0954: * <p>
0955: * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to
0956: * {@link #findNextStartTag(int) findNextStartTag(pos)}.
0957: * <p>
0958: * Specifying an argument to the <code>name</code> parameter that ends in a colon (<code>:</code>) searches for all start tags
0959: * in the specified XML namespace.
0960: * <p>
0961: * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
0962: *
0963: * @param pos the position in the source document from which to start the search, may be out of bounds.
0964: * @param name the {@linkplain StartTag#getName() name} of the start tag to search for.
0965: * @return the {@link StartTag} with the specified {@linkplain StartTag#getName() name} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0966: */
0967: public StartTag findNextStartTag(final int pos, String name) {
0968: if (name != null)
0969: name = name.toLowerCase();
0970: final boolean isXMLTagName = Tag.isXMLName(name);
0971: return StartTag.findPreviousOrNext(this , pos, name,
0972: isXMLTagName, false);
0973: }
0974:
0975: /**
0976: * Returns the {@link StartTag} with the specified attribute name/value pair beginning at or immediately following the specified position in the source document.
0977: * <p>
0978: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0979: *
0980: * @param pos the position in the source document from which to start the search, may be out of bounds.
0981: * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
0982: * @param value the value of the specified attribute to search for, must not be <code>null</code>.
0983: * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
0984: * @return the {@link StartTag} with the specified attribute name/value pair beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
0985: */
0986: public StartTag findNextStartTag(final int pos,
0987: final String attributeName, final String value,
0988: final boolean valueCaseSensitive) {
0989: return StartTag.findNext(this , pos, attributeName, value,
0990: valueCaseSensitive);
0991: }
0992:
0993: /**
0994: * Returns the {@link EndTag} beginning at or immediately preceding the specified position in the source document.
0995: * <p>
0996: * See the {@link Tag} class documentation for more details about the behaviour of this method.
0997: *
0998: * @param pos the position in the source document from which to start the search, may be out of bounds.
0999: * @return the {@link EndTag} beginning at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
1000: */
1001: public EndTag findPreviousEndTag(final int pos) {
1002: return EndTag.findPreviousOrNext(this , pos, true);
1003: }
1004:
1005: /**
1006: * Returns the {@linkplain EndTagType#NORMAL normal} {@link EndTag} with the specified {@linkplain EndTag#getName() name} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
1007: * <p>
1008: * See the {@link Tag} class documentation for more details about the behaviour of this method.
1009: *
1010: * @param pos the position in the source document from which to start the search, may be out of bounds.
1011: * @param name the {@linkplain StartTag#getName() name} of the end tag to search for, must not be <code>null</code>.
1012: * @return the {@linkplain EndTagType#NORMAL normal} {@link EndTag} with the specified {@linkplain EndTag#getName() name} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
1013: */
1014: public EndTag findPreviousEndTag(final int pos, final String name) {
1015: if (name == null)
1016: throw new IllegalArgumentException(
1017: "name argument must not be null");
1018: return EndTag.findPreviousOrNext(this , pos, name.toLowerCase(),
1019: EndTagType.NORMAL, true);
1020: }
1021:
1022: /**
1023: * Returns the {@link EndTag} beginning at or immediately following the specified position in the source document.
1024: * <p>
1025: * See the {@link Tag} class documentation for more details about the behaviour of this method.
1026: *
1027: * @param pos the position in the source document from which to start the search, may be out of bounds.
1028: * @return the {@link EndTag} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
1029: */
1030: public EndTag findNextEndTag(final int pos) {
1031: return EndTag.findPreviousOrNext(this , pos, false);
1032: }
1033:
1034: /**
1035: * Returns the {@linkplain EndTagType#NORMAL normal} {@link EndTag} with the specified {@linkplain EndTag#getName() name} beginning at or immediately following the specified position in the source document.
1036: * <p>
1037: * See the {@link Tag} class documentation for more details about the behaviour of this method.
1038: *
1039: * @param pos the position in the source document from which to start the search, may be out of bounds.
1040: * @param name the {@linkplain StartTag#getName() name} of the end tag to search for, must not be <code>null</code>.
1041: * @return the {@linkplain EndTagType#NORMAL normal} {@link EndTag} with the specified {@linkplain EndTag#getName() name} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
1042: */
1043: public EndTag findNextEndTag(final int pos, final String name) {
1044: return findNextEndTag(pos, name, EndTagType.NORMAL);
1045: }
1046:
1047: /**
1048: * Returns the {@link EndTag} with the specified {@linkplain EndTag#getName() name} and {@linkplain EndTagType type} beginning at or immediately following the specified position in the source document.
1049: * <p>
1050: * See the {@link Tag} class documentation for more details about the behaviour of this method.
1051: *
1052: * @param pos the position in the source document from which to start the search, may be out of bounds.
1053: * @param name the {@linkplain StartTag#getName() name} of the end tag to search for, must not be <code>null</code>.
1054: * @param endTagType the {@linkplain EndTagType type} of the end tag to search for, must not be <code>null</code>.
1055: * @return the {@link EndTag} with the specified {@linkplain EndTag#getName() name} and {@linkplain EndTagType type} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
1056: */
1057: public EndTag findNextEndTag(final int pos, final String name,
1058: final EndTagType endTagType) {
1059: if (name == null)
1060: throw new IllegalArgumentException(
1061: "name argument must not be null");
1062: return EndTag.findPreviousOrNext(this , pos, name.toLowerCase(),
1063: endTagType, false);
1064: }
1065:
1066: /**
1067: * Returns the most nested {@link Element} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
1068: * <p>
1069: * The specified position can be anywhere inside the {@linkplain Element#getStartTag() start tag}, {@linkplain Element#getEndTag() end tag},
1070: * or {@linkplain Element#getContent() content} of the element. There is no requirement that the returned element has an end tag, and it
1071: * may be a {@linkplain TagType#isServerTag() server tag} or HTML {@linkplain StartTagType#COMMENT comment}.
1072: * <p>
1073: * See the {@link Tag} class documentation for more details about the behaviour of this method.
1074: *
1075: * @param pos the position in the source document, may be out of bounds.
1076: * @return the most nested {@link Element} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if the position is not within an element or is out of bounds.
1077: */
1078: public Element findEnclosingElement(final int pos) {
1079: return findEnclosingElement(pos, null);
1080: }
1081:
1082: /**
1083: * Returns the most nested {@link Element} with the specified {@linkplain Element#getName() name} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
1084: * <p>
1085: * The specified position can be anywhere inside the {@linkplain Element#getStartTag() start tag}, {@linkplain Element#getEndTag() end tag},
1086: * or {@linkplain Element#getContent() content} of the element. There is no requirement that the returned element has an end tag, and it
1087: * may be a {@linkplain TagType#isServerTag() server tag} or HTML {@linkplain StartTagType#COMMENT comment}.
1088: * <p>
1089: * See the {@link Tag} class documentation for more details about the behaviour of this method.
1090: * <p>
1091: * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
1092: *
1093: * @param pos the position in the source document, may be out of bounds.
1094: * @param name the {@linkplain Element#getName() name} of the element to search for.
1095: * @return the most nested {@link Element} with the specified {@linkplain Element#getName() name} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
1096: */
1097: public Element findEnclosingElement(final int pos, String name) {
1098: int startBefore = pos;
1099: if (name != null)
1100: name = name.toLowerCase();
1101: final boolean isXMLTagName = Tag.isXMLName(name);
1102: while (true) {
1103: StartTag startTag = StartTag.findPreviousOrNext(this ,
1104: startBefore, name, isXMLTagName, true);
1105: if (startTag == null)
1106: return null;
1107: Element element = startTag.getElement();
1108: if (pos < element.end)
1109: return element;
1110: startBefore = startTag.begin - 1;
1111: }
1112: }
1113:
1114: /**
1115: * Returns the {@link CharacterReference} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
1116: * <p>
1117: * Character references positioned within an HTML {@linkplain StartTagType#COMMENT comment} are <b>NOT</b> ignored.
1118: *
1119: * @param pos the position in the source document from which to start the search, may be out of bounds.
1120: * @return the {@link CharacterReference} beginning at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
1121: */
1122: public CharacterReference findPreviousCharacterReference(
1123: final int pos) {
1124: return CharacterReference.findPreviousOrNext(this , pos, true);
1125: }
1126:
1127: /**
1128: * Returns the {@link CharacterReference} beginning at or immediately following the specified position in the source document.
1129: * <p>
1130: * Character references positioned within an HTML {@linkplain StartTagType#COMMENT comment} are <b>NOT</b> ignored.
1131: *
1132: * @param pos the position in the source document from which to start the search, may be out of bounds.
1133: * @return the {@link CharacterReference} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
1134: */
1135: public CharacterReference findNextCharacterReference(final int pos) {
1136: return CharacterReference.findPreviousOrNext(this , pos, false);
1137: }
1138:
1139: /**
1140: * Returns the end position of the <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a> that starts at the
1141: * specified position.
1142: * <p>
1143: * This implementation first checks that the character at the specified position is a valid XML Name start character as defined by the
1144: * {@link Tag#isXMLNameStartChar(char)} method. If this is not the case, the value <code>-1</code> is returned.
1145: * <p>
1146: * Once the first character has been checked, subsequent characters are checked using the {@link Tag#isXMLNameChar(char)} method until
1147: * one is found that is not a valid XML Name character or the end of the document is reached. This position is then returned.
1148: *
1149: * @param pos the position in the source document of the first character of the XML Name.
1150: * @return the end position of the <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a> that starts at the specified position.
1151: * @throws IndexOutOfBoundsException if the specified position is not within the bounds of the document.
1152: */
1153: public int findNameEnd(int pos) {
1154: if (!Tag.isXMLNameStartChar(string.charAt(pos++)))
1155: return -1;
1156: while (pos < string.length()
1157: && Tag.isXMLNameChar(string.charAt(pos)))
1158: pos++;
1159: return pos;
1160: }
1161:
1162: /**
1163: * Parses any {@link Attributes} starting at the specified position.
1164: * This method is only used in the unusual situation where attributes exist outside of a start tag.
1165: * The {@link StartTag#getAttributes()} method should be used in normal situations.
1166: * <p>
1167: * The returned Attributes segment always begins at <code>pos</code>,
1168: * and ends at the end of the last attribute before either <code>maxEnd</code> or
1169: * the first occurrence of "/>" or ">" outside of a quoted attribute value, whichever comes first.
1170: * <p>
1171: * Only returns <code>null</code> if the segment contains a major syntactical error
1172: * or more than the {@linkplain Attributes#getDefaultMaxErrorCount() default maximum} number of
1173: * minor syntactical errors.
1174: * <p>
1175: * This is equivalent to
1176: * {@link #parseAttributes(int,int,int) parseAttributes}<code>(pos,maxEnd,</code>{@link Attributes#getDefaultMaxErrorCount()}<code>)</code>.
1177: *
1178: * @param pos the position in the source document at the beginning of the attribute list, may be out of bounds.
1179: * @param maxEnd the maximum end position of the attribute list, or -1 if no maximum.
1180: * @return the {@link Attributes} starting at the specified position, or <code>null</code> if too many errors occur while parsing or the specified position is out of bounds.
1181: * @see StartTag#getAttributes()
1182: * @see Segment#parseAttributes()
1183: */
1184: public Attributes parseAttributes(final int pos, final int maxEnd) {
1185: return parseAttributes(pos, maxEnd, Attributes
1186: .getDefaultMaxErrorCount());
1187: }
1188:
1189: /**
1190: * Parses any {@link Attributes} starting at the specified position.
1191: * This method is only used in the unusual situation where attributes exist outside of a start tag.
1192: * The {@link StartTag#getAttributes()} method should be used in normal situations.
1193: * <p>
1194: * Only returns <code>null</code> if the segment contains a major syntactical error
1195: * or more than the specified number of minor syntactical errors.
1196: * <p>
1197: * The <code>maxErrorCount</code> argument overrides the {@linkplain Attributes#getDefaultMaxErrorCount() default maximum error count}.
1198: * <p>
1199: * See {@link #parseAttributes(int pos, int maxEnd)} for more information.
1200: *
1201: * @param pos the position in the source document at the beginning of the attribute list, may be out of bounds.
1202: * @param maxEnd the maximum end position of the attribute list, or -1 if no maximum.
1203: * @param maxErrorCount the maximum number of minor errors allowed while parsing.
1204: * @return the {@link Attributes} starting at the specified position, or <code>null</code> if too many errors occur while parsing or the specified position is out of bounds.
1205: * @see StartTag#getAttributes()
1206: * @see #parseAttributes(int pos, int maxEnd)
1207: */
1208: public Attributes parseAttributes(final int pos, final int maxEnd,
1209: final int maxErrorCount) {
1210: return Attributes.construct(this , pos, maxEnd, maxErrorCount);
1211: }
1212:
1213: /**
1214: * Causes the specified range of the source text to be ignored when parsing.
1215: * <p>
1216: * See the documentation of the {@link Segment#ignoreWhenParsing()} method for more information.
1217: *
1218: * @param begin the beginning character position in the source text.
1219: * @param end the end character position in the source text.
1220: */
1221: public void ignoreWhenParsing(final int begin, final int end) {
1222: if (wasFullSequentialParseCalled())
1223: throw new IllegalStateException(
1224: "ignoreWhenParsing can not be used after fullSequentialParse() has been called");
1225: if (parseTextOutputDocument == null) {
1226: parseTextOutputDocument = new OutputDocument(getParseText());
1227: parseText = null;
1228: }
1229: parseTextOutputDocument.replaceWithSpaces(begin, end);
1230: }
1231:
1232: /**
1233: * Causes all of the segments in the specified collection to be ignored when parsing.
1234: * <p>
1235: * This is equivalent to calling {@link Segment#ignoreWhenParsing()} on each segment in the collection.
1236: */
1237: public void ignoreWhenParsing(final Collection segments) {
1238: for (final Iterator i = segments.iterator(); i.hasNext();) {
1239: ((Segment) i.next()).ignoreWhenParsing();
1240: }
1241: }
1242:
1243: /**
1244: * Sets the {@link Logger} that handles log messages.
1245: * <p>
1246: * Specifying a <code>null</code> argument disables logging completely for operations performed on this <code>Source</code> object.
1247: * <p>
1248: * A logger instance is created automatically for each <code>Source</code> object using the {@link LoggerProvider}
1249: * specified by the static {@link Config#LoggerProvider} property.
1250: * The name used for all automatically created logger instances is "<code>net.htmlparser.jericho</code>".
1251: * <p>
1252: * Use of this method with a non-null argument is therefore not usually necessary,
1253: * unless specifying an instance of {@link WriterLogger} or a user-defined {@link Logger} implementation.
1254: *
1255: * @param logger the logger that will handle log messages, or <code>null</code> to disable logging.
1256: * @see Config#LoggerProvider
1257: */
1258: public void setLogger(final Logger logger) {
1259: this .logger = (logger != null ? logger
1260: : LoggerDisabled.INSTANCE);
1261: }
1262:
1263: /**
1264: * Returns the {@link Logger} that handles log messages.
1265: * <p>
1266: * A logger instance is created automatically for each <code>Source</code> object using the {@link LoggerProvider}
1267: * specified by the static {@link Config#LoggerProvider} property.
1268: * This can be overridden by calling the {@link #setLogger(Logger)} method.
1269: * The name used for all automatically created logger instances is "<code>net.htmlparser.jericho</code>".
1270: *
1271: * @return the {@link Logger} that handles log messages, or <code>null</code> if logging is disabled.
1272: */
1273: public Logger getLogger() {
1274: return logger != LoggerDisabled.INSTANCE ? logger : null;
1275: }
1276:
1277: /**
1278: * Clears the {@linkplain #getCacheDebugInfo() tag cache} of all tags.
1279: * <p>
1280: * This method may be useful after calling the {@link Segment#ignoreWhenParsing()} method so that any tags previously found within the ignored segments
1281: * will no longer be returned by the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
1282: */
1283: public void clearCache() {
1284: cache.clear();
1285: allTagsArray = null;
1286: allTags = null;
1287: allStartTags = null;
1288: allElements = null;
1289: }
1290:
1291: /**
1292: * Returns a string representation of the tag cache, useful for debugging purposes.
1293: * @return a string representation of the tag cache, useful for debugging purposes.
1294: */
1295: public String getCacheDebugInfo() {
1296: return cache.toString();
1297: }
1298:
1299: /**
1300: * Gets a list of all the tags that have been parsed so far.
1301: * <p>
1302: * This information may be useful for debugging purposes.
1303: * Execution of this method collects information from the internal cache and is relatively expensive.
1304: *
1305: * @return a list of all the tags that have been parsed so far.
1306: * @see #getCacheDebugInfo()
1307: */
1308: List getParsedTags() {
1309: final ArrayList list = new ArrayList();
1310: for (final Iterator i = cache.getTagIterator(); i.hasNext();)
1311: list.add(i.next());
1312: return list;
1313: }
1314:
1315: /**
1316: * Returns the {@linkplain ParseText parse text} of this source document.
1317: * <p>
1318: * This method is normally only of interest to users who wish to create <a href="TagType.html#Custom">custom tag types</a>.
1319: * <p>
1320: * The parse text is defined as the entire text of the source document in lower case, with all
1321: * {@linkplain Segment#ignoreWhenParsing() ignored} segments replaced by space characters.
1322: *
1323: * @return the {@linkplain ParseText parse text} of this source document.
1324: */
1325: public final ParseText getParseText() {
1326: if (parseText == null) {
1327: if (parseTextOutputDocument != null) {
1328: parseText = new ParseText(parseTextOutputDocument);
1329: parseTextOutputDocument = null;
1330: } else {
1331: parseText = new ParseText(this );
1332: }
1333: }
1334: return parseText;
1335: }
1336:
1337: /**
1338: * Formats the HTML source by laying out each non-inline-level element on a new line with an appropriate indent.
1339: * <p>
1340: * This method has been deprecated as of version 2.4 and replaced with the {@link #getSourceFormatter()} method.
1341: *
1342: * @param indentString the string to use for indentation.
1343: * @param tidyTags specifies whether to replace the original text of each tag with the output from its {@link Tag#tidy()} method.
1344: * @param collapseWhiteSpace specifies whether to collapse the white space in the text between the tags.
1345: * @param indentAllElements specifies whether to indent all elements, including {@linkplain HTMLElements#getBlockLevelElementNames() block-level elements} and those with preformatted contents.
1346: * @return a {@link CharStreamSource} that produces the output.
1347: * @deprecated Use {@link #getSourceFormatter()}<code>.</code>{@link SourceFormatter#setIndentString(String) setIndentString(indentString)}<code>.</code>{@link SourceFormatter#setTidyTags(boolean) setTidyTags(tidyTags)}<code>.</code>{@link SourceFormatter#setCollapseWhiteSpace(boolean) setCollapseWhiteSpace(collapseWhiteSpace)}<code>.</code>{@link SourceFormatter#setIndentAllElements(boolean) setIndentAllElements(indentAllElements)} instead.
1348: */
1349: public CharStreamSource indent(final String indentString,
1350: final boolean tidyTags, final boolean collapseWhiteSpace,
1351: final boolean indentAllElements) {
1352: return getSourceFormatter().setIndentString(indentString)
1353: .setTidyTags(tidyTags).setCollapseWhiteSpace(
1354: collapseWhiteSpace).setIndentAllElements(
1355: indentAllElements);
1356: }
1357:
1358: /**
1359: * {@linkplain #setLogger(Logger) Sets the logger} to an implementation that sends all output to a specified <code>Writer</code>.
1360: * <p>
1361: * This method has been deprecated as of version 2.4 in favour of the more generic {@link #setLogger(Logger)} method.
1362: *
1363: * @param writer the destination <code>java.io.Writer</code> for log messages.
1364: * @deprecated Use {@link #setLogger(Logger) setLogger}<code>(new </code>{@link WriterLogger#WriterLogger(Writer) WriterLogger}<code>(writer))</code> instead.
1365: */
1366: public void setLogWriter(final Writer writer) {
1367: setLogger(new WriterLogger(writer));
1368: }
1369:
1370: /**
1371: * Returns the destination <code>Writer</code> for log messages.
1372: * <p>
1373: * This method has been deprecated as of version 2.4 in favour of the more generic {@link #getLogger()} method.
1374: * <p>
1375: * Returns <code>null</code> if the {@linkplain #getLogger() current logger} is not an instance of {@link WriterLogger}.
1376: *
1377: * @return the destination <code>Writer</code> for log messages, or <code>null</code> if the {@linkplain #getLogger() current logger} is not an instance of {@link WriterLogger}.
1378: * @deprecated Use <code>((</code>{@link WriterLogger}<code>)</code>{@link #getLogger()}<code>).</code>{@link WriterLogger#getWriter() getWriter()} instead.
1379: */
1380: public Writer getLogWriter() {
1381: if (!(logger instanceof WriterLogger))
1382: return null;
1383: return ((WriterLogger) getLogger()).getWriter();
1384: }
1385:
1386: /**
1387: * Writes the specified message to the log.
1388: * <p>
1389: * This method has been deprecated as of version 2.4 as logging is now performed via the {@link Logger} interface
1390: * obtained via the {@link #getLogger()} method.
1391: *
1392: * @param message the message to log
1393: * @deprecated Use {@link #getLogger()}<code>.info(message)</code> instead.
1394: */
1395: public void log(final String message) {
1396: logger.info(message);
1397: }
1398:
1399: /**
1400: * Indicates whether logging is currently enabled.
1401: * <p>
1402: * This method has been deprecated as of version 2.4 as its purpose was to allow efficient use of the {@link #log(String)} method, which has been deprecated.
1403: *
1404: * @return <code>true</code> if logging is currently enabled, otherwise <code>false</code>.
1405: * @deprecated Use {@link #getLogger()}<code>.isInfoEnabled()</code> instead.
1406: */
1407: public boolean isLoggingEnabled() {
1408: return logger.isInfoEnabled();
1409: }
1410:
1411: static String getCharsetParameterFromHttpHeaderValue(
1412: final String httpHeaderValue) {
1413: final int charsetParameterPos = httpHeaderValue.toLowerCase()
1414: .indexOf("charset=");
1415: if (charsetParameterPos == -1)
1416: return null;
1417: final int charsetBegin = charsetParameterPos + 8;
1418: int charsetEnd = httpHeaderValue.indexOf(';', charsetBegin);
1419: final String charset = (charsetEnd == -1) ? httpHeaderValue
1420: .substring(charsetBegin) : httpHeaderValue.substring(
1421: charsetBegin, charsetEnd);
1422: return charset.trim();
1423: }
1424:
1425: static Logger newLogger() {
1426: return LoggerFactory.getLogger(PACKAGE_NAME);
1427: }
1428:
1429: private static String getString(
1430: final EncodingDetector encodingDetector) throws IOException {
1431: try {
1432: return Util.getString(encodingDetector.openReader());
1433: } catch (IOException ex) {
1434: try {
1435: Logger logger = newLogger();
1436: if (logger.isInfoEnabled())
1437: logger
1438: .info("IOException constructing encoded source. Encoding: "
1439: + encodingDetector.getEncoding()
1440: + " - "
1441: + encodingDetector
1442: .getEncodingSpecificationInfo()
1443: + ". PreliminaryEncoding: "
1444: + encodingDetector
1445: .getPreliminaryEncoding()
1446: + " - "
1447: + encodingDetector
1448: .getPreliminaryEncodingSpecificationInfo());
1449: } catch (Exception ex2) {
1450: } // make sure attempting to log does not cause a new exception
1451: throw ex;
1452: }
1453: }
1454:
1455: private boolean wasFullSequentialParseCalled() {
1456: return allTagsArray != null;
1457: }
1458: }
|