001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024: import java.io.*;
025:
026: /**
027: * Represents an HTML <a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">Character Reference</a>,
028: * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}.
029: * <p>
030: * This class, together with its subclasses, contains static methods to perform most required operations
031: * without having to instantiate an object.
032: * <p>
033: * Instances of this class are useful when the positions of character references in a source document are required,
034: * or to replace the found character references with customised text.
035: * <p>
036: * <code>CharacterReference</code> instances are obtained using one of the following methods:
037: * <ul>
038: * <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
039: * <li>{@link Source#findNextCharacterReference(int pos)}
040: * <li>{@link Source#findPreviousCharacterReference(int pos)}
041: * <li>{@link Segment#findAllCharacterReferences()}
042: * </ul>
043: */
044: public abstract class CharacterReference extends Segment {
/**
 * Marker value for a code point that is not valid unicode.
 * <p>
 * It can result from parsing a numeric character reference that lies outside the valid unicode range
 * of 0x000000-0x10FFFF, or from any other invalid character reference.
 */
public static final int INVALID_CODE_POINT = -1;

/**
 * The highest code point allowed by unicode, 0x10FFFF (decimal 1114111).
 * Character.MAX_CODE_POINT can take its place once java 1.5 is available.
 */
static final int MAX_CODE_POINT = 0x10FFFF;

// Not a constant: the value is assigned during the static class initialisation of CharacterEntityReference.
static int MAX_ENTITY_REFERENCE_LENGTH;

/** How many spaces stand in for one tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */
private static final int TAB_LENGTH = 4;

// The unicode code point of this character reference, as supplied to the constructor.
int codePoint;

/**
 * Creates a character reference spanning the given positions of the given source.
 *
 * @param source the source document, passed on to the superclass constructor.
 * @param begin the begin position, passed on to the superclass constructor.
 * @param end the end position, passed on to the superclass constructor.
 * @param codePoint the unicode code point stored for this character reference.
 */
CharacterReference(final Source source, final int begin, final int end, final int codePoint) {
    super(source, begin, end);
    this.codePoint = codePoint;
}
070:
071: /**
072: * Returns the <a target="_blank" href="http://www.unicode.org">unicode</a> code point represented by this character reference.
073: * @return the unicode code point represented by this character reference.
074: */
075: public int getCodePoint() {
076: return codePoint;
077: }
078:
079: /**
080: * Returns the character represented by this character reference.
081: * <p>
082: * If this character reference represents a unicode
* <a target="_blank" href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary code point</a>,
084: * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result.
085: *
086: * @return the character represented by this character reference.
087: */
088: public char getChar() {
089: return (char) codePoint;
090: }
091:
092: /**
093: * Indicates whether this character reference is terminated by a semicolon (<code>;</code>).
094: * <p>
095: * Conversely, this library defines an <i><a name="Unterminated">unterminated</a></i> character reference as one which does
096: * not end with a semicolon.
097: * <p>
098: * The SGML specification allows unterminated character references in some circumstances, and because the
099: * HTML 4.01 specification states simply that
100: * "<a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">authors may use SGML character references</a>",
101: * it follows that they are also valid in HTML documents, although their use is strongly discouraged.
102: * <p>
103: * Unterminated character references are not allowed in <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> documents.
104: *
105: * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>.
106: * @see #decode(CharSequence encodedText, boolean insideAttributeValue)
107: */
108: public boolean isTerminated() {
109: return source.charAt(end - 1) == ';';
110: }
111:
112: /**
113: * Encodes the specified text, escaping special characters into character references.
114: * <p>
115: * Each character is encoded only if the {@link #requiresEncoding(char)} method would return <code>true</code> for that character,
116: * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if its unicode
117: * code point is greater than U+007F.
118: * <p>
119: * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
120: * which depending on the current setting of the static {@link Config#IsApostropheEncoded} property,
* is either left unencoded (default setting), or encoded as the numeric character reference "<code>&amp;#39;</code>".
* <p>
* This method never encodes an apostrophe into its character entity reference {@link CharacterEntityReference#_apos &amp;apos;}
124: * as this entity is not defined for use in HTML. See the comments in the {@link CharacterEntityReference} class for more information.
125: * <p>
126: * To encode text using only numeric character references, use the<br />
127: * {@link NumericCharacterReference#encode(CharSequence)} method instead.
128: *
129: * @param unencodedText the text to encode.
130: * @return the encoded string.
131: * @see #decode(CharSequence)
132: */
133: public static String encode(final CharSequence unencodedText) {
134: if (unencodedText == null)
135: return null;
136: return appendEncode(
137: new StringBuffer(unencodedText.length() * 2),
138: unencodedText, false).toString();
139: }
140:
141: /**
142: * Encodes the specified character into a character reference if {@linkplain #requiresEncoding(char) required}.
143: * <p>
144: * The encoding of the character follows the same rules as for each character in the {@link #encode(CharSequence unencodedText)} method.
145: *
146: * @param ch the character to encode.
147: * @return a character reference if appropriate, otherwise a string containing the original character.
148: */
149: public static String encode(final char ch) {
150: return appendEncode(
151: new StringBuffer(MAX_ENTITY_REFERENCE_LENGTH), ch)
152: .toString();
153: }
154:
155: /**
156: * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup.
157: * <p>
158: * This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions:
159: * <ul>
160: * <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
* are converted to "<code>&lt;br /&gt;</code>". CR/LF pairs are treated as a single line break.
* <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&amp;nbsp;</code>"
163: * while ensuring the last is always a normal space.
164: * <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
165: * </ul>
166: * <p>
167: * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of
168: * spaces to be rendered, but also allows the line to wrap in the middle of it.
169: * <p>
170: * Note that zero-width spaces (U+200B) are converted to the numeric character reference
* "<code>&amp;#8203;</code>" through the normal encoding process, but IE6 does not render them properly
172: * either encoded or unencoded.
173: * <p>
174: * There is no method provided to reverse this encoding.
175: *
176: * @param unencodedText the text to encode.
177: * @return the encoded string with whitespace formatting converted to markup.
178: * @see #encode(CharSequence)
179: */
180: public static String encodeWithWhiteSpaceFormatting(
181: final CharSequence unencodedText) {
182: if (unencodedText == null)
183: return null;
184: return appendEncode(
185: new StringBuffer(unencodedText.length() * 2),
186: unencodedText, true).toString();
187: }
188:
189: /**
190: * Decodes the specified HTML encoded text into normal text.
191: * <p>
192: * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
193: * are converted to their respective characters.
194: * <p>
195: * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
196: * <p>
197: * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
198: * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
199: * <p>
200: * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
201: * some browsers also recognise them in a case-insensitive way.
202: * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
203: *
204: * @param encodedText the text to decode.
205: * @return the decoded string.
206: * @see #encode(CharSequence)
207: */
208: public static String decode(final CharSequence encodedText) {
209: return decode(encodedText, false, false);
210: }
211:
212: /**
213: * Decodes the specified HTML encoded text into normal text.
214: * <p>
215: * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
216: * are converted to their respective characters.
217: * <p>
218: * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the
219: * value of the <code>insideAttributeValue</code> parameter and the
220: * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
221: * <p>
222: * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
223: * some browsers also recognise them in a case-insensitive way.
224: * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
225: *
226: * @param encodedText the text to decode.
227: * @param insideAttributeValue specifies whether the encoded text is inside an attribute value.
228: * @return the decoded string.
229: * @see #decode(CharSequence)
230: * @see #encode(CharSequence)
231: */
232: public static String decode(final CharSequence encodedText,
233: final boolean insideAttributeValue) {
234: return decode(encodedText, insideAttributeValue, false);
235: }
236:
237: private static String decode(final CharSequence encodedText,
238: final boolean insideAttributeValue,
239: final boolean convertNonBreakingSpaces) {
240: if (encodedText == null)
241: return null;
242: for (int i = 0; i < encodedText.length(); i++) {
243: if (encodedText.charAt(i) == '&')
244: return appendDecode(
245: new StringBuffer(encodedText.length()),
246: encodedText, i, insideAttributeValue,
247: convertNonBreakingSpaces).toString();
248: }
249: return encodedText.toString();
250: }
251:
252: /**
253: * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}.
254: * <p>
255: * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
256: * <p>
257: * The result is how the text would normally be rendered by a
258: * <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>,
259: * assuming it does not contain any tags.
260: * <p>
261: * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
262: * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
263: * See the discussion of the <code>insideAttributeValue</code> parameter of the {@link #decode(CharSequence, boolean insideAttributeValue)}
264: * method for a more detailed explanation of this topic.
265: *
266: * @param text the source text
267: * @return the decoded text with collapsed white space.
268: * @see FormControl#getPredefinedValues()
269: */
270: public static String decodeCollapseWhiteSpace(
271: final CharSequence text) {
272: return decodeCollapseWhiteSpace(text, false);
273: }
274:
275: static String decodeCollapseWhiteSpace(final CharSequence text,
276: final boolean convertNonBreakingSpaces) {
277: return decode(appendCollapseWhiteSpace(new StringBuffer(text
278: .length()), text), false, convertNonBreakingSpaces);
279: }
280:
281: /**
282: * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again.
283: * <p>
284: * This process ensures that the specified encoded text does not contain any remaining unencoded characters.
285: * <p>
286: * IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence) decode} method
287: * followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation
288: * may be used in future.
289: *
290: * @param encodedText the text to re-encode.
291: * @return the re-encoded string.
292: */
293: public static String reencode(final CharSequence encodedText) {
294: return encode(decode(encodedText, true));
295: }
296:
297: /**
298: * Returns the encoded form of this character reference.
299: * <p>
300: * The exact behaviour of this method depends on the class of this object.
301: * See the {@link CharacterEntityReference#getCharacterReferenceString()} and
302: * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
303: * <p>
304: * <dl>
305: * <dt>Examples:</dt>
* <dd><code>CharacterReference.parse("&amp;GT;").getCharacterReferenceString()</code> returns "<code>&amp;gt;</code>"</dd>
* <dd><code>CharacterReference.parse("&amp;#x3E;").getCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
308: * </dl>
309: *
310: * @return the encoded form of this character reference.
311: * @see #getCharacterReferenceString(int codePoint)
312: * @see #getDecimalCharacterReferenceString()
313: */
314: public abstract String getCharacterReferenceString();
315:
316: /**
317: * Returns the encoded form of the specified unicode code point.
318: * <p>
319: * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
320: * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form.
321: * <p>
322: * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
* which is encoded as the numeric character reference "<code>&amp;#39;</code>" instead of its character entity reference
* "<code>&amp;apos;</code>".
325: * <p>
326: * <dl>
327: * <dt>Examples:</dt>
* <dd><code>CharacterReference.getCharacterReferenceString(62)</code> returns "<code>&amp;gt;</code>"</dd>
* <dd><code>CharacterReference.getCharacterReferenceString('&gt;')</code> returns "<code>&amp;gt;</code>"</dd>
* <dd><code>CharacterReference.getCharacterReferenceString('&#9786;')</code> returns "<code>&amp;#9786;</code>"</dd>
331: * </dl>
332: *
333: * @param codePoint the unicode code point to encode.
334: * @return the encoded form of the specified unicode code point.
335: * @see #getHexadecimalCharacterReferenceString(int codePoint)
336: */
337: public static String getCharacterReferenceString(final int codePoint) {
338: String characterReferenceString = null;
339: if (codePoint != CharacterEntityReference._apos)
340: characterReferenceString = CharacterEntityReference
341: .getCharacterReferenceString(codePoint);
342: if (characterReferenceString == null)
343: characterReferenceString = NumericCharacterReference
344: .getCharacterReferenceString(codePoint);
345: return characterReferenceString;
346: }
347:
348: /**
349: * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of this character reference.
350: * <p>
351: * This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
352: * <p>
353: * <dl>
354: * <dt>Example:</dt>
* <dd><code>CharacterReference.parse("&amp;gt;").getDecimalCharacterReferenceString()</code> returns "<code>&amp;#62;</code>"</dd>
356: * </dl>
357: *
358: * @return the decimal encoded form of this character reference.
359: * @see #getCharacterReferenceString()
360: * @see #getHexadecimalCharacterReferenceString()
361: */
362: public String getDecimalCharacterReferenceString() {
363: return getDecimalCharacterReferenceString(codePoint);
364: }
365:
366: /**
367: * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point.
368: * <p>
369: * <dl>
370: * <dt>Example:</dt>
* <dd><code>CharacterReference.getDecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
372: * </dl>
373: *
374: * @param codePoint the unicode code point to encode.
375: * @return the decimal encoded form of the specified unicode code point.
376: * @see #getCharacterReferenceString(int codePoint)
377: * @see #getHexadecimalCharacterReferenceString(int codePoint)
378: */
379: public static String getDecimalCharacterReferenceString(
380: final int codePoint) {
381: return appendDecimalCharacterReferenceString(
382: new StringBuffer(), codePoint).toString();
383: }
384:
385: /**
386: * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of this character reference.
387: * <p>
388: * This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
389: * <p>
390: * <dl>
391: * <dt>Example:</dt>
* <dd><code>CharacterReference.parse("&amp;gt;").getHexadecimalCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
393: * </dl>
394: *
395: * @return the hexadecimal encoded form of this character reference.
396: * @see #getCharacterReferenceString()
397: * @see #getDecimalCharacterReferenceString()
398: */
399: public String getHexadecimalCharacterReferenceString() {
400: return getHexadecimalCharacterReferenceString(codePoint);
401: }
402:
403: /**
404: * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of the specified unicode code point.
405: * <p>
406: * <dl>
407: * <dt>Example:</dt>
* <dd><code>CharacterReference.getHexadecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#x3e;</code>"</dd>
409: * </dl>
410: *
411: * @param codePoint the unicode code point to encode.
412: * @return the hexadecimal encoded form of the specified unicode code point.
413: * @see #getCharacterReferenceString(int codePoint)
414: * @see #getDecimalCharacterReferenceString(int codePoint)
415: */
416: public static String getHexadecimalCharacterReferenceString(
417: final int codePoint) {
418: return appendHexadecimalCharacterReferenceString(
419: new StringBuffer(), codePoint).toString();
420: }
421:
422: /**
423: * Returns the unicode code point of this character reference in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
424: * <p>
425: * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
426: * <p>
427: * <dl>
428: * <dt>Example:</dt>
* <dd><code>CharacterReference.parse("&amp;gt;").getUnicodeText()</code> returns "<code>U+003E</code>"</dd>
430: * </dl>
431: *
432: * @return the unicode code point of this character reference in U+ notation.
433: * @see #getUnicodeText(int codePoint)
434: */
435: public String getUnicodeText() {
436: return getUnicodeText(codePoint);
437: }
438:
439: /**
440: * Returns the specified unicode code point in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
441: * <p>
442: * <dl>
443: * <dt>Example:</dt>
* <dd><code>CharacterReference.getUnicodeText('&gt;')</code> returns "<code>U+003E</code>"</dd>
445: * </dl>
446: *
447: * @param codePoint the unicode code point.
448: * @return the specified unicode code point in U+ notation.
449: */
450: public static String getUnicodeText(final int codePoint) {
451: return appendUnicodeText(new StringBuffer(), codePoint)
452: .toString();
453: }
454:
455: static final StringBuffer appendUnicodeText(final StringBuffer sb,
456: final int codePoint) {
457: sb.append("U+");
458: final String hex = Integer.toString(codePoint, 16)
459: .toUpperCase();
460: for (int i = 4 - hex.length(); i > 0; i--)
461: sb.append('0');
462: sb.append(hex);
463: return sb;
464: }
465:
466: /**
467: * Parses a single encoded character reference text into a <code>CharacterReference</code> object.
468: * <p>
469: * The character reference must be at the start of the given text, but may contain other characters at the end.
470: * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
471: * <p>
472: * If the text does not represent a valid character reference, this method returns <code>null</code>.
473: * <p>
474: * <a href="#Unterminated">Unterminated</a> character references are always accepted, regardless of the settings in the
475: * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
476: * <p>
477: * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
478: * <p>
479: * <dl>
480: * <dt>Example:</dt>
* <dd><code>CharacterReference.parse("&amp;gt;").getChar()</code> returns '<code>&gt;</code>'</dd>
482: * </dl>
483: *
484: * @param characterReferenceText the text containing a single encoded character reference.
485: * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
486: * @see #decode(CharSequence)
487: */
488: public static CharacterReference parse(
489: final CharSequence characterReferenceText) {
490: return construct(
491: new Source(characterReferenceText.toString()),
492: 0,
493: Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
494: }
495:
496: /**
497: * Parses a single encoded character reference text into a unicode code point.
498: * <p>
499: * The character reference must be at the start of the given text, but may contain other characters at the end.
500: * <p>
501: * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
502: * <p>
503: * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}<code>.</code>{@link #getCodePoint()},
504: * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a
505: * <code>NullPointerException</code>.
506: * <p>
507: * <dl>
508: * <dt>Example:</dt>
* <dd><code>CharacterReference.getCodePointFromCharacterReferenceString("&amp;gt;")</code> returns <code>62</code></dd>
510: * </dl>
511: *
512: * @param characterReferenceText the text containing a single encoded character reference.
* @return the unicode code point representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference.
514: */
515: public static int getCodePointFromCharacterReferenceString(
516: final CharSequence characterReferenceText) {
517: final CharacterReference characterReference = parse(characterReferenceText);
518: return (characterReference != null) ? characterReference
519: .getCodePoint() : INVALID_CODE_POINT;
520: }
521:
522: /**
523: * Indicates whether the specified character would need to be encoded in HTML text.
524: * <p>
525: * This is the case if a {@linkplain CharacterEntityReference character entity reference} exists for the character, or the unicode code point is greater than U+007F.
526: * <p>
527: * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
528: * which only returns <code>true</code> if the static {@link Config#IsApostropheEncoded} property
529: * is currently set to <code>true</code>.
530: *
531: * @param ch the character to test.
532: * @return <code>true</code> if the specified character would need to be encoded in HTML text, otherwise <code>false</code>.
533: */
534: public static final boolean requiresEncoding(final char ch) {
535: return ch > 127
536: || (CharacterEntityReference.getName(ch) != null && (ch != '\'' || Config.IsApostropheEncoded));
537: }
538:
539: /**
540: * Returns a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
541: *
542: * @param writer the destination for the encoded text
543: * @return a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
544: * @see #encode(CharSequence unencodedText)
545: */
546: public static Writer getEncodingFilterWriter(final Writer writer) {
547: return new EncodingFilterWriter(writer);
548: }
549:
550: private static final class EncodingFilterWriter extends
551: FilterWriter {
552: StringBuffer sb = new StringBuffer(MAX_ENTITY_REFERENCE_LENGTH);
553:
554: public EncodingFilterWriter(final Writer writer) {
555: super (writer);
556: }
557:
558: public void write(final char ch) throws IOException {
559: sb.setLength(0);
560: appendEncode(sb, ch);
561: if (sb.length() == 1)
562: out.write(sb.charAt(0));
563: else
564: Util.appendTo(out, sb);
565: }
566:
567: public void write(final int chInt) throws IOException {
568: write((char) chInt);
569: }
570:
571: public void write(final char[] cbuf, final int off,
572: final int len) throws IOException {
573: final int end = off + len;
574: for (int i = off; i < end; i++)
575: write(cbuf[i]);
576: }
577:
578: public void write(final String str, final int off, final int len)
579: throws IOException {
580: final int end = off + len;
581: for (int i = off; i < end; i++)
582: write(str.charAt(i));
583: }
584: }
585:
586: private static StringBuffer appendEncode(final StringBuffer sb,
587: char ch) {
588: if (appendEncodeCheckForWhiteSpaceFormatting(sb, ch, false))
589: return sb;
590: return sb.append(ch);
591: }
592:
593: static StringBuffer appendEncode(final StringBuffer sb,
594: CharSequence unencodedText,
595: final boolean whiteSpaceFormatting) {
596: if (unencodedText == null)
597: return sb;
598: int beginPos = 0;
599: int endPos = unencodedText.length();
600: if (unencodedText instanceof Segment) {
601: // this might improve performance slightly
602: final Segment segment = (Segment) unencodedText;
603: final int segmentOffset = segment.getBegin();
604: beginPos = segmentOffset;
605: endPos += segmentOffset;
606: unencodedText = segment.source.string;
607: }
608: final boolean isApostropheEncoded = Config.IsApostropheEncoded;
609: for (int i = beginPos; i < endPos; i++) {
610: char ch = unencodedText.charAt(i);
611: if (appendEncodeCheckForWhiteSpaceFormatting(sb, ch,
612: whiteSpaceFormatting))
613: continue;
614: // need to process white space
615: // whiteSpaceFormatting tries to simulate the formatting characters by converting them to markup
616: int spaceCount;
617: int nexti = i + 1;
618: if (ch != ' ') {
619: if (ch != '\t') {
620: // must be line feed, carriage return or form feed, since zero-width space should have been processed as a character reference string
621: if (ch == '\r' && nexti < endPos
622: && unencodedText.charAt(nexti) == '\n')
623: i++; // process cr/lf pair as one line break
624: sb.append("<br />"); // add line break
625: continue;
626: } else {
627: spaceCount = TAB_LENGTH;
628: }
629: } else {
630: spaceCount = 1;
631: }
632: while (nexti < endPos) {
633: ch = unencodedText.charAt(nexti);
634: if (ch == ' ')
635: spaceCount += 1;
636: else if (ch == '\t')
637: spaceCount += TAB_LENGTH;
638: else
639: break;
640: nexti++;
641: }
642: if (spaceCount == 1) {
643: // handle the very common case of a single character to improve efficiency slightly
644: sb.append(' ');
645: continue;
646: }
647: if (spaceCount % 2 == 1)
648: sb.append(' '); // fist character is a space if we have an odd number of spaces
649: while (spaceCount >= 2) {
650: sb.append(" "); // use alternating and spaces to keep original number of spaces
651: spaceCount -= 2;
652: }
653: // note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line
654: i = nexti - 1; // minus 1 because top level for loop will add it again
655: }
656: return sb;
657: }
658:
659: private static final boolean appendEncodeCheckForWhiteSpaceFormatting(
660: final StringBuffer sb, char ch,
661: final boolean whiteSpaceFormatting) {
662: final String characterEntityReferenceName = CharacterEntityReference
663: .getName(ch);
664: if (characterEntityReferenceName != null) {
665: if (ch == '\'') {
666: if (Config.IsApostropheEncoded)
667: sb.append("'");
668: else
669: sb.append(ch);
670: } else {
671: CharacterEntityReference
672: .appendCharacterReferenceString(sb,
673: characterEntityReferenceName);
674: }
675: } else if (ch > 127) {
676: appendDecimalCharacterReferenceString(sb, ch);
677: } else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) {
678: sb.append(ch);
679: } else {
680: return false;
681: }
682: return true;
683: }
684:
685: static CharacterReference findPreviousOrNext(final Source source,
686: final int pos, final boolean previous) {
687: return findPreviousOrNext(
688: source,
689: pos,
690: Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL,
691: previous);
692: }
693:
694: private static CharacterReference findPreviousOrNext(
695: final Source source,
696: int pos,
697: final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings,
698: final boolean previous) {
699: final ParseText parseText = source.getParseText();
700: pos = previous ? parseText.lastIndexOf('&', pos) : parseText
701: .indexOf('&', pos);
702: while (pos != -1) {
703: final CharacterReference characterReference = construct(
704: source, pos, unterminatedCharacterReferenceSettings);
705: if (characterReference != null)
706: return characterReference;
707: pos = previous ? parseText.lastIndexOf('&', pos - 1)
708: : parseText.indexOf('&', pos + 1);
709: }
710: return null;
711: }
712:
713: static final StringBuffer appendHexadecimalCharacterReferenceString(
714: final StringBuffer sb, final int codePoint) {
715: return sb.append("&#x").append(Integer.toString(codePoint, 16))
716: .append(';');
717: }
718:
719: static final StringBuffer appendDecimalCharacterReferenceString(
720: final StringBuffer sb, final int codePoint) {
721: return sb.append("&#").append(codePoint).append(';');
722: }
723:
724: private static CharacterReference construct(
725: final Source source,
726: final int begin,
727: final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
728: try {
729: if (source.getParseText().charAt(begin) != '&')
730: return null;
731: return (source.getParseText().charAt(begin + 1) == '#') ? NumericCharacterReference
732: .construct(source, begin,
733: unterminatedCharacterReferenceSettings)
734: : CharacterEntityReference
735: .construct(
736: source,
737: begin,
738: unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint);
739: } catch (IndexOutOfBoundsException ex) {
740: return null;
741: }
742: }
743:
744: private static StringBuffer appendDecode(final StringBuffer sb,
745: final CharSequence encodedText, int pos,
746: final boolean insideAttributeValue,
747: final boolean convertNonBreakingSpaces) {
748: final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings = Config.CurrentCompatibilityMode
749: .getUnterminatedCharacterReferenceSettings(insideAttributeValue);
750: int lastEnd = 0;
751: final Source source = new Source(encodedText);
752: while (true) {
753: final CharacterReference characterReference = findPreviousOrNext(
754: source, pos,
755: unterminatedCharacterReferenceSettings, false);
756: if (characterReference == null)
757: break;
758: if (lastEnd != characterReference.getBegin())
759: Util.appendTo(sb, encodedText, lastEnd,
760: characterReference.getBegin());
761: final char ch = characterReference.getChar();
762: sb
763: .append((ch == CharacterEntityReference._nbsp && convertNonBreakingSpaces) ? ' '
764: : ch);
765: pos = lastEnd = characterReference.getEnd();
766: }
767: if (lastEnd != encodedText.length())
768: Util.appendTo(sb, encodedText, lastEnd, encodedText
769: .length());
770: return sb;
771: }
772: }
|