001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024:
025: /**
026: * Represents an HTML <a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#h-5.3.1">Numeric Character Reference</a>.
027: * <p>
028: * A numeric character reference can be one of two types:
029: * <dl>
030: * <dt><a name="DecimalCharacterReference">Decimal Character Reference</a>
031: * <dd>A numeric character reference specifying the unicode code point in decimal notation.<br />
032: * This is signified by the absence of an '<code>x</code>' character after the '<code>#</code>', (eg "<code>&amp;#62;</code>").
033: * <dt><a name="HexadecimalCharacterReference">Hexadecimal Character Reference</a>
034: * <dd>A numeric character reference specifying the unicode code point in hexadecimal notation.<br />
035: * This is signified by the presence of an '<code>x</code>' character after the '<code>#</code>', (eg "<code>&amp;#x3e;</code>").
036: * </dl>
037: * <p>
038: * Static methods to {@linkplain #encode(CharSequence) encode} and {@linkplain #decode(CharSequence) decode} strings
039: * and single characters can be found in the {@link CharacterReference} superclass.
040: * <p>
041: * <code>NumericCharacterReference</code> instances are obtained using one of the following methods:
042: * <ul>
043: * <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
044: * <li>{@link Source#findNextCharacterReference(int pos)}
045: * <li>{@link Source#findPreviousCharacterReference(int pos)}
046: * <li>{@link Segment#findAllCharacterReferences()}
047: * </ul>
048: *
049: * @see CharacterReference
050: * @see CharacterEntityReference
051: */
052: public class NumericCharacterReference extends CharacterReference {
053: private boolean hex;
054:
055: private NumericCharacterReference(final Source source,
056: final int begin, final int end, final int codePoint,
057: final boolean hex) {
058: super (source, begin, end, codePoint);
059: this .hex = hex;
060: }
061:
062: /**
063: * Indicates whether this numeric character reference specifies the unicode code point in decimal format.
064: * <p>
065: * A numeric character reference in decimal format is referred to in this library as a
066: * <a href="#DecimalCharacterReference">decimal character reference</a>.
067: *
068: * @return <code>true</code> if this numeric character reference specifies the unicode code point in decimal format, otherwise <code>false</code>.
069: * @see #isHexadecimal()
070: */
071: public boolean isDecimal() {
072: return !hex;
073: }
074:
075: /**
076: * Indicates whether this numeric character reference specifies the unicode code point in hexadecimal format.
077: * <p>
078: * A numeric character reference in hexadecimal format is referred to in this library as a
079: * <a href="#HexadecimalCharacterReference">hexadecimal character reference</a>.
080: *
081: * @return <code>true</code> if this numeric character reference specifies the unicode code point in hexadecimal format, otherwise <code>false</code>.
082: * @see #isDecimal()
083: */
084: public boolean isHexadecimal() {
085: return hex;
086: }
087:
088: /**
089: * Encodes the specified text, escaping special characters into numeric character references.
090: * <p>
091: * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
092: * <p>
093: * This method encodes all character references in <a href="#DecimalCharacterReference">decimal format</a>, and is exactly the same as calling
094: * {@link #encodeDecimal(CharSequence)}.
095: * <p>
096: * To encode text using both character entity references and numeric character references, use the<br />
097: * {@link CharacterReference#encode(CharSequence)} method instead.
098: * <p>
099: * To encode text using <a href="#HexadecimalCharacterReference">hexadecimal character references</a> only,
100: * use the {@link #encodeHexadecimal(CharSequence)} method instead.
101: *
102: * @param unencodedText the text to encode.
103: * @return the encoded string.
104: * @see #decode(CharSequence)
105: */
106: public static String encode(final CharSequence unencodedText) {
107: if (unencodedText == null)
108: return null;
109: final StringBuffer sb = new StringBuffer(
110: unencodedText.length() * 2);
111: for (int i = 0; i < unencodedText.length(); i++) {
112: final char ch = unencodedText.charAt(i);
113: if (requiresEncoding(ch)) {
114: appendDecimalCharacterReferenceString(sb, ch);
115: } else {
116: sb.append(ch);
117: }
118: }
119: return sb.toString();
120: }
121:
122: /**
123: * Encodes the specified text, escaping special characters into <a href="#DecimalCharacterReference">decimal character references</a>.
124: * <p>
125: * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
126: * <p>
127: * To encode text using both character entity references and numeric character references, use the<br />
128: * {@link CharacterReference#encode(CharSequence)} method instead.
129: * <p>
130: * To encode text using <a href="#HexadecimalCharacterReference">hexadecimal character references</a> only,
131: * use the {@link #encodeHexadecimal(CharSequence)} method instead.
132: *
133: * @param unencodedText the text to encode.
134: * @return the encoded string.
135: * @see #decode(CharSequence)
136: */
137: public static String encodeDecimal(final CharSequence unencodedText) {
138: return encode(unencodedText);
139: }
140:
141: /**
142: * Encodes the specified text, escaping special characters into <a href="#HexadecimalCharacterReference">hexadecimal character references</a>.
143: * <p>
144: * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
145: * <p>
146: * To encode text using both character entity references and numeric character references, use the<br />
147: * {@link CharacterReference#encode(CharSequence)} method instead.
148: * <p>
149: * To encode text using <a href="#DecimalCharacterReference">decimal character references</a> only,
150: * use the {@link #encodeDecimal(CharSequence)} method instead.
151: *
152: * @param unencodedText the text to encode.
153: * @return the encoded string.
154: * @see #decode(CharSequence)
155: */
156: public static String encodeHexadecimal(
157: final CharSequence unencodedText) {
158: if (unencodedText == null)
159: return null;
160: final StringBuffer sb = new StringBuffer(
161: unencodedText.length() * 2);
162: for (int i = 0; i < unencodedText.length(); i++) {
163: final char ch = unencodedText.charAt(i);
164: if (requiresEncoding(ch)) {
165: appendHexadecimalCharacterReferenceString(sb, ch);
166: } else {
167: sb.append(ch);
168: }
169: }
170: return sb.toString();
171: }
172:
173: /**
174: * Returns the correct encoded form of this numeric character reference.
175: * <p>
176: * The returned string uses the same radix as the original character reference in the source document,
177: * i.e. decimal format if {@link #isDecimal()} is <code>true</code>, and hexadecimal format if {@link #isHexadecimal()} is <code>true</code>.
178: * <p>
179: * Note that the returned string is not necessarily the same as the original source text used to create this object.
180: * This library recognises certain invalid forms of character references,
181: * as detailed in the {@link #decode(CharSequence) decode(CharSequence)} method.
182: * <p>
183: * To retrieve the original source text, use the {@link #toString() toString()} method instead.
184: * <p>
185: * <dl>
186: * <dt>Example:</dt>
187: * <dd><code>CharacterReference.parse("&#62").getCharacterReferenceString()</code> returns "<code>&#62;</code>"</dd>
188: * </dl>
189: *
190: * @return the correct encoded form of this numeric character reference.
191: * @see CharacterReference#getCharacterReferenceString(int codePoint)
192: */
193: public String getCharacterReferenceString() {
194: return hex ? getHexadecimalCharacterReferenceString(codePoint)
195: : getDecimalCharacterReferenceString(codePoint);
196: }
197:
198: /**
199: * Returns the numeric character reference encoded form of the specified unicode code point.
200: * <p>
201: * This method returns the character reference in decimal format, and is exactly the same as calling
202: * {@link #getDecimalCharacterReferenceString(int codePoint)}.
203: * <p>
204: * To get either the character entity reference or numeric character reference, use the<br />
205: * {@link CharacterReference#getCharacterReferenceString(int codePoint)} method instead.
206: * <p>
207: * To get the character reference in hexadecimal format, use the {@link #getHexadecimalCharacterReferenceString(int codePoint)} method instead.
208: * <p>
209: * <dl>
210: * <dt>Examples:</dt>
211: * <dd><code>NumericCharacterReference.getCharacterReferenceString(62)</code> returns "<code>&#62;</code>"</dd>
212: * <dd><code>NumericCharacterReference.getCharacterReferenceString('>')</code> returns "<code>&#62;</code>"</dd>
213: * </dl>
214: *
215: * @return the numeric character reference encoded form of the specified unicode code point.
216: * @see CharacterReference#getCharacterReferenceString(int codePoint)
217: */
218: public static String getCharacterReferenceString(final int codePoint) {
219: return getDecimalCharacterReferenceString(codePoint);
220: }
221:
222: static CharacterReference construct(
223: final Source source,
224: final int begin,
225: final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
226: // only called from CharacterReference.construct(), so we can assume that first characters are "&#"
227: final ParseText parseText = source.getParseText();
228: int codePointStringBegin = begin + 2;
229: boolean hex;
230: if (hex = (parseText.charAt(codePointStringBegin) == 'x'))
231: codePointStringBegin++;
232: final int unterminatedMaxCodePoint = hex ? unterminatedCharacterReferenceSettings.hexadecimalCharacterReferenceMaxCodePoint
233: : unterminatedCharacterReferenceSettings.decimalCharacterReferenceMaxCodePoint;
234: final int maxSourcePos = parseText.length() - 1;
235: String codePointString;
236: int end;
237: int x = codePointStringBegin;
238: boolean unterminated = false;
239: while (true) {
240: final char ch = parseText.charAt(x);
241: if (ch == ';') {
242: end = x + 1;
243: codePointString = parseText.substring(
244: codePointStringBegin, x);
245: break;
246: }
247: if ((ch < '0' || ch > '9')
248: && (!hex || ch < 'a' || ch > 'f')) {
249: // At this point we were either expecting a decimal digit (if hex is false), or a hexadecimal digit (if hex is true),
250: // but have found something else, meaning the character reference is unterminated.
251: unterminated = true;
252: } else if (x == maxSourcePos) {
253: // At this point, we have a valid digit but are at the last position in the source text without the terminating semicolon.
254: unterminated = true;
255: x++; // include this digit
256: }
257: if (unterminated) {
258: // Different browsers react differently to unterminated numeric character references.
259: // The behaviour of this method is determined by the settings in the unterminatedCharacterReferenceSettings parameter.
260: if (unterminatedMaxCodePoint == INVALID_CODE_POINT) {
261: // reject:
262: return null;
263: } else {
264: // accept:
265: end = x;
266: codePointString = parseText.substring(
267: codePointStringBegin, x);
268: break;
269: }
270: }
271: x++;
272: }
273: if (codePointString.length() == 0)
274: return null;
275: int codePoint = INVALID_CODE_POINT;
276: try {
277: codePoint = Integer
278: .parseInt(codePointString, hex ? 16 : 10);
279: if (unterminated && codePoint > unterminatedMaxCodePoint)
280: return null;
281: if (codePoint > MAX_CODE_POINT)
282: codePoint = INVALID_CODE_POINT;
283: } catch (NumberFormatException ex) {
284: // This should only happen if number is larger than Integer.MAX_VALUE.
285: if (unterminated)
286: return null;
287: // If it is a terminated reference just ignore the exception as codePoint will remain with its value of INVALID_CODE_POINT.
288: }
289: return new NumericCharacterReference(source, begin, end,
290: codePoint, hex);
291: }
292:
293: public String getDebugInfo() {
294: final StringBuffer sb = new StringBuffer();
295: sb.append('"');
296: if (hex)
297: appendHexadecimalCharacterReferenceString(sb, codePoint);
298: else
299: appendDecimalCharacterReferenceString(sb, codePoint);
300: sb.append("\" ");
301: appendUnicodeText(sb, codePoint);
302: sb.append(' ').append(super.getDebugInfo());
303: return sb.toString();
304: }
305: }
|