Source Code Cross Referenced for CharacterReference.java in » HTML-Parser » jericho-html » au » id » jericho » lib » html » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » HTML Parser » jericho html » au.id.jericho.lib.html
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002:        // Version 2.5
003:        // Copyright (C) 2007 Martin Jericho
004:        // http://jerichohtml.sourceforge.net/
005:        //
006:        // This library is free software; you can redistribute it and/or
007:        // modify it under the terms of either one of the following licences:
008:        //
009:        // 1. The Eclipse Public License (EPL) version 1.0,
010:        // included in this distribution in the file licence-epl-1.0.html
011:        // or available at http://www.eclipse.org/legal/epl-v10.html
012:        //
013:        // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014:        // included in this distribution in the file licence-lgpl-2.1.txt
015:        // or available at http://www.gnu.org/licenses/lgpl.txt
016:        //
017:        // This library is distributed on an "AS IS" basis,
018:        // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019:        // See the individual licence texts for more details.
020:
021:        package au.id.jericho.lib.html;
022:
023:        import java.util.*;
024:        import java.io.*;
025:
026:        /**
027:         * Represents an HTML <a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">Character Reference</a>,
028:         * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}.
029:         * <p>
030:         * This class, together with its subclasses, contains static methods to perform most required operations
031:         * without having to instantiate an object.
032:         * <p>
033:         * Instances of this class are useful when the positions of character references in a source document are required,
034:         * or to replace the found character references with customised text.
035:         * <p>
036:         * <code>CharacterReference</code> instances are obtained using one of the following methods:
037:         * <ul>
038:         *  <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
039:         *  <li>{@link Source#findNextCharacterReference(int pos)}
040:         *  <li>{@link Source#findPreviousCharacterReference(int pos)}
041:         *  <li>{@link Segment#findAllCharacterReferences()}
042:         * </ul>
043:         */
044:        public abstract class CharacterReference extends Segment {
045:            int codePoint; // unicode code point represented by this reference, assigned in the constructor and returned by getCodePoint()
046:
047:            /**
048:             * Represents an invalid unicode code point.
049:             * <p>
050:             * This can be the result of parsing a numeric character reference outside of the valid unicode range of 0x000000-0x10FFFF, or any other invalid character reference.
051:             */
052:            public static final int INVALID_CODE_POINT = -1;
053:
054:            /**
055:             * The maximum codepoint allowed by unicode, 0x10FFFF (decimal 1114111).
056:             * This can be replaced by Character.MAX_CODE_POINT in java 1.5
057:             */
058:            static final int MAX_CODE_POINT = 0x10FFFF;
059:
060:            static int MAX_ENTITY_REFERENCE_LENGTH; // set in CharacterEntityReference static class initialisation; used in this class as the initial buffer capacity in encode(char)
061:
062:            /** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */
063:            private static final int TAB_LENGTH = 4;
064:
065:            CharacterReference(final Source source, final int begin, final int end, final int codePoint) {
066:                // Source and position bookkeeping is delegated to the Segment superclass
067:                super(source, begin, end);
068:                this.codePoint = codePoint;
069:            }
070:
071:            /**
072:             * Returns the <a target="_blank" href="http://www.unicode.org">unicode</a> code point represented by this character reference.
073:             * @return the unicode code point represented by this character reference.
074:             */
075:            public int getCodePoint() {
076:                return this.codePoint; // the value handed to the constructor
077:            }
078:
079:            /**
080:             * Returns the character represented by this character reference.
081:             * <p>
082:             * If this character reference represents a unicode
083:             * <a target="_blank" href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary code point</a>,
084:             * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result.
085:             *
086:             * @return the character represented by this character reference.
087:             */
088:            public char getChar() {
089:                return (char) this.codePoint; // narrowing cast: only the least significant 16 bits survive
090:            }
091:
092:            /**
093:             * Indicates whether this character reference is terminated by a semicolon (<code>;</code>).
094:             * <p>
095:             * Conversely, this library defines an <i><a name="Unterminated">unterminated</a></i> character reference as one which does
096:             * not end with a semicolon.
097:             * <p>
098:             * The SGML specification allows unterminated character references in some circumstances, and because the
099:             * HTML 4.01 specification states simply that
100:             * "<a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">authors may use SGML character references</a>",
101:             * it follows that they are also valid in HTML documents, although their use is strongly discouraged.
102:             * <p>
103:             * Unterminated character references are not allowed in <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> documents.
104:             *
105:             * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>.
106:             * @see #decode(CharSequence encodedText, boolean insideAttributeValue)
107:             */
108:            public boolean isTerminated() {
109:                return ';' == source.charAt(end - 1); // inspect the source character at index end - 1
110:            }
111:
112:            /**
113:             * Encodes the specified text, escaping special characters into character references.
114:             * <p>
115:             * Each character is encoded only if the {@link #requiresEncoding(char)} method would return <code>true</code> for that character,
116:             * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if its unicode
117:             * code point is greater than U+007F.
118:             * <p>
119:             * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
120:             * which depending on the current setting of the static {@link Config#IsApostropheEncoded} property,
121:             * is either left unencoded (default setting), or encoded as the numeric character reference "<code>&amp;#39;</code>".
122:             * <p>
123:             * This method never encodes an apostrophe into its character entity reference {@link CharacterEntityReference#_apos &amp;apos;}
124:             * as this entity is not defined for use in HTML.  See the comments in the {@link CharacterEntityReference} class for more information.
125:             * <p>
126:             * To encode text using only numeric character references, use the<br />
127:             * {@link NumericCharacterReference#encode(CharSequence)} method instead.
128:             *
129:             * @param unencodedText  the text to encode.
130:             * @return the encoded string.
131:             * @see #decode(CharSequence)
132:             */
133:            public static String encode(final CharSequence unencodedText) {
134:                if (unencodedText == null)
135:                    return null;
136:                // Pre-size the buffer to twice the input length; the trailing false is the flag that encodeWithWhiteSpaceFormatting sets to true
137:                final StringBuffer encoded = new StringBuffer(unencodedText.length() * 2);
138:                return appendEncode(encoded, unencodedText, false).toString();
139:            }
140:
141:            /**
142:             * Encodes the specified character into a character reference if {@linkplain #requiresEncoding(char) required}.
143:             * <p>
144:             * The encoding of the character follows the same rules as for each character in the {@link #encode(CharSequence unencodedText)} method.
145:             *
146:             * @param ch  the character to encode.
147:             * @return a character reference if appropriate, otherwise a string containing the original character.
148:             */
149:            public static String encode(final char ch) {
150:                // The buffer starts with a capacity of MAX_ENTITY_REFERENCE_LENGTH and is filled by the shared append helper
151:                final StringBuffer encoded = new StringBuffer(MAX_ENTITY_REFERENCE_LENGTH);
152:                return appendEncode(encoded, ch).toString();
153:            }
154:
155:            /**
156:             * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup.
157:             * <p>
158:             * This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions:
159:             * <ul>
160:             *  <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
161:             *   are converted to "<code>&lt;br /&gt;</code>".  CR/LF pairs are treated as a single line break.
162:             *  <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&amp;nbsp;</code>"
163:             *   while ensuring the last is always a normal space.
164:             *  <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
165:             * </ul>
166:             * <p>
167:             * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of
168:             * spaces to be rendered, but also allows the line to wrap in the middle of it.
169:             * <p>
170:             * Note that zero-width spaces (U+200B) are converted to the numeric character reference
171:             * "<code>&amp;#x200B;</code>" through the normal encoding process, but IE6 does not render them properly
172:             * either encoded or unencoded.
173:             * <p>
174:             * There is no method provided to reverse this encoding.
175:             *
176:             * @param unencodedText  the text to encode.
177:             * @return the encoded string with whitespace formatting converted to markup.
178:             * @see #encode(CharSequence)
179:             */
180:            public static String encodeWithWhiteSpaceFormatting(
181:                    final CharSequence unencodedText) {
182:                // Null in, null out; otherwise encode through the shared append helper
183:                // with its third argument set to true (encode(CharSequence) passes false)
184:                return (unencodedText == null)
185:                        ? null
186:                        : appendEncode(new StringBuffer(unencodedText.length() * 2), unencodedText, true).toString();
187:            }
188:
189:            /**
190:             * Decodes the specified HTML encoded text into normal text.
191:             * <p>
192:             * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
193:             * are converted to their respective characters.
194:             * <p>
195:             * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
196:             * <p>
197:             * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
198:             * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
199:             * <p>
200:             * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
201:             * some browsers also recognise them in a case-insensitive way.
202:             * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
203:             *
204:             * @param encodedText  the text to decode.
205:             * @return the decoded string.
206:             * @see #encode(CharSequence)
207:             */
208:            public static String decode(final CharSequence encodedText) {
209:                return decode(encodedText, false); // same as the two-argument form with insideAttributeValue switched off
210:            }
211:
212:            /**
213:             * Decodes the specified HTML encoded text into normal text.
214:             * <p>
215:             * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
216:             * are converted to their respective characters.
217:             * <p>
218:             * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the
219:             * value of the <code>insideAttributeValue</code> parameter and the
220:             * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
221:             * <p>
222:             * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
223:             * some browsers also recognise them in a case-insensitive way.
224:             * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
225:             *
226:             * @param encodedText  the text to decode.
227:             * @param insideAttributeValue  specifies whether the encoded text is inside an attribute value.
228:             * @return the decoded string.
229:             * @see #decode(CharSequence)
230:             * @see #encode(CharSequence)
231:             */
232:            public static String decode(final CharSequence encodedText,
233:                    final boolean insideAttributeValue) {
234:                return decode(encodedText, insideAttributeValue, false);
235:            }
236:
237:            private static String decode(final CharSequence encodedText,
238:                    final boolean insideAttributeValue,
239:                    final boolean convertNonBreakingSpaces) {
240:                if (encodedText == null)
241:                    return null;
242:                // Scan for the first '&'; text without one is returned unchanged, without allocating a buffer
243:                int firstAmpersand = -1;
244:                for (int pos = 0; firstAmpersand < 0 && pos < encodedText.length(); pos++)
245:                    if (encodedText.charAt(pos) == '&')
246:                        firstAmpersand = pos;
247:                if (firstAmpersand < 0)
248:                    return encodedText.toString();
249:                return appendDecode(new StringBuffer(encodedText.length()), encodedText, firstAmpersand, insideAttributeValue, convertNonBreakingSpaces).toString();
250:            }
251:
252:            /**
253:             * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}.
254:             * <p>
255:             * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
256:             * <p>
257:             * The result is how the text would normally be rendered by a
258:             * <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>,
259:             * assuming it does not contain any tags.
260:             * <p>
261:             * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
262:             * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
263:             * See the discussion of the <code>insideAttributeValue</code> parameter of the {@link #decode(CharSequence, boolean insideAttributeValue)}
264:             * method for a more detailed explanation of this topic.
265:             *
266:             * @param text  the source text
267:             * @return the decoded text with collapsed white space.
268:             * @see FormControl#getPredefinedValues()
269:             */
270:            public static String decodeCollapseWhiteSpace(
271:                    final CharSequence text) {
272:                return decodeCollapseWhiteSpace(text, false);
273:            }
274:
275:            static String decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces) {
276:                // Collapses the white space of text, then decodes the result as text outside an attribute value; null yields null.
277:                if (text == null) return null; // match encode/decode rather than throwing NullPointerException from text.length()
278:                return decode(appendCollapseWhiteSpace(new StringBuffer(text.length()), text), false, convertNonBreakingSpaces);
279:            }
280:
281:            /**
282:             * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again.
283:             * <p>
284:             * This process ensures that the specified encoded text does not contain any remaining unencoded characters.
285:             * <p>
286:             * IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence,boolean) decode} method
287:             * with <code>insideAttributeValue</code> set to <code>true</code>, followed by the {@link #encode(CharSequence) encode} method,
288:             * but a more efficient implementation may be used in future.
289:             *
290:             * @param encodedText  the text to re-encode.
291:             * @return the re-encoded string.
292:             */
293:            public static String reencode(final CharSequence encodedText) {
294:                return encode(decode(encodedText, true, false)); // decode with insideAttributeValue set, then encode again
295:            }
296:
297:            /**
298:             * Returns the encoded form of this character reference.
299:             * <p>
300:             * The exact behaviour of this method depends on the class of this object.
301:             * See the {@link CharacterEntityReference#getCharacterReferenceString()} and
302:             * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
303:             * <p>
304:             * <dl>
305:             *  <dt>Examples:</dt>
306:             *   <dd><code>CharacterReference.parse("&amp;GT;").getCharacterReferenceString()</code> returns "<code>&amp;gt;</code>"</dd>
307:             *   <dd><code>CharacterReference.parse("&amp;#x3E;").getCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
308:             * </dl>
309:             *
310:             * @return the encoded form of this character reference.
311:             * @see #getCharacterReferenceString(int codePoint)
312:             * @see #getDecimalCharacterReferenceString()
313:             */
314:            public abstract String getCharacterReferenceString(); // supplied by the subclasses CharacterEntityReference and NumericCharacterReference
315:
316:            /**
317:             * Returns the encoded form of the specified unicode code point.
318:             * <p>
319:             * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
320:             * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form.
321:             * <p>
322:             * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
323:             * which is encoded as the numeric character reference "<code>&amp;#39;</code>" instead of its character entity reference
324:             * "<code>&amp;apos;</code>".
325:             * <p>
326:             * <dl>
327:             *  <dt>Examples:</dt>
328:             *   <dd><code>CharacterReference.getCharacterReferenceString(62)</code> returns "<code>&amp;gt;</code>"</dd>
329:             *   <dd><code>CharacterReference.getCharacterReferenceString('&gt;')</code> returns "<code>&amp;gt;</code>"</dd>
330:             *   <dd><code>CharacterReference.getCharacterReferenceString('&#9786;')</code> returns "<code>&amp;#9786;</code>"</dd>
331:             * </dl>
332:             *
333:             * @param codePoint  the unicode code point to encode.
334:             * @return the encoded form of the specified unicode code point.
335:             * @see #getHexadecimalCharacterReferenceString(int codePoint)
336:             */
337:            public static String getCharacterReferenceString(final int codePoint) {
338:                // The apostrophe is never given its entity name here; it always takes the numeric form
339:                if (codePoint != CharacterEntityReference._apos) {
340:                    final String entityForm = CharacterEntityReference.getCharacterReferenceString(codePoint);
341:                    if (entityForm != null)
342:                        return entityForm;
343:                }
344:                // No entity form was produced, so fall back to the numeric character reference
345:                return NumericCharacterReference.getCharacterReferenceString(codePoint);
346:            }
347:
348:            /**
349:             * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of this character reference.
350:             * <p>
351:             * This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
352:             * <p>
353:             * <dl>
354:             *  <dt>Example:</dt>
355:             *  <dd><code>CharacterReference.parse("&amp;gt;").getDecimalCharacterReferenceString()</code> returns "<code>&amp;#62;</code>"</dd>
356:             * </dl>
357:             *
358:             * @return the decimal encoded form of this character reference.
359:             * @see #getCharacterReferenceString()
360:             * @see #getHexadecimalCharacterReferenceString()
361:             */
362:            public String getDecimalCharacterReferenceString() {
363:                return getDecimalCharacterReferenceString(this.codePoint); // static form applied to this reference's code point
364:            }
365:
366:            /**
367:             * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point.
368:             * <p>
369:             * <dl>
370:             *  <dt>Example:</dt>
371:             *  <dd><code>CharacterReference.getDecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
372:             * </dl>
373:             *
374:             * @param codePoint  the unicode code point to encode.
375:             * @return the decimal encoded form of the specified unicode code point.
376:             * @see #getCharacterReferenceString(int codePoint)
377:             * @see #getHexadecimalCharacterReferenceString(int codePoint)
378:             */
379:            public static String getDecimalCharacterReferenceString(
380:                    final int codePoint) {
381:                return appendDecimalCharacterReferenceString(
382:                        new StringBuffer(), codePoint).toString();
383:            }
384:
385:            /**
386:             * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of this character reference.
387:             * <p>
388:             * This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
389:             * <p>
390:             * <dl>
391:             *  <dt>Example:</dt>
392:             *  <dd><code>CharacterReference.parse("&amp;gt;").getHexadecimalCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
393:             * </dl>
394:             *
395:             * @return the hexadecimal encoded form of this character reference.
396:             * @see #getCharacterReferenceString()
397:             * @see #getDecimalCharacterReferenceString()
398:             */
399:            public String getHexadecimalCharacterReferenceString() {
400:                return getHexadecimalCharacterReferenceString(this.codePoint); // static form applied to this reference's code point
401:            }
402:
403:            /**
404:             * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of the specified unicode code point.
405:             * <p>
406:             * <dl>
407:             *  <dt>Example:</dt>
408:             *  <dd><code>CharacterReference.getHexadecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#x3e;</code>"</dd>
409:             * </dl>
410:             *
411:             * @param codePoint  the unicode code point to encode.
412:             * @return the hexadecimal encoded form of the specified unicode code point.
413:             * @see #getCharacterReferenceString(int codePoint)
414:             * @see #getDecimalCharacterReferenceString(int codePoint)
415:             */
416:            public static String getHexadecimalCharacterReferenceString(
417:                    final int codePoint) {
418:                return appendHexadecimalCharacterReferenceString(
419:                        new StringBuffer(), codePoint).toString();
420:            }
421:
422:            /**
423:             * Returns the unicode code point of this character reference in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
424:             * <p>
425:             * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
426:             * <p>
427:             * <dl>
428:             *  <dt>Example:</dt>
429:             *  <dd><code>CharacterReference.parse("&amp;gt;").getUnicodeText()</code> returns "<code>U+003E</code>"</dd>
430:             * </dl>
431:             *
432:             * @return the unicode code point of this character reference in U+ notation.
433:             * @see #getUnicodeText(int codePoint)
434:             */
435:            public String getUnicodeText() {
436:                return getUnicodeText(this.codePoint); // static form applied to this reference's code point
437:            }
438:
439:            /**
440:             * Returns the specified unicode code point in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
441:             * <p>
442:             * <dl>
443:             *  <dt>Example:</dt>
444:             *  <dd><code>CharacterReference.getUnicodeText('&gt;')</code> returns "<code>U+003E</code>"</dd>
445:             * </dl>
446:             *
447:             * @param codePoint  the unicode code point.
448:             * @return the specified unicode code point in U+ notation.
449:             */
450:            public static String getUnicodeText(final int codePoint) {
451:                final StringBuffer unicodeText = new StringBuffer(); // filled in by appendUnicodeText
452:                return appendUnicodeText(unicodeText, codePoint).toString();
453:            }
454:
455:            static final StringBuffer appendUnicodeText(final StringBuffer sb,
456:                    final int codePoint) {
457:                final String hexDigits = Integer.toString(codePoint, 16).toUpperCase();
458:                sb.append("U+");
459:                // Left-pad with zeros up to four digits; longer values are written in full
460:                int padding = 4 - hexDigits.length();
461:                while (padding-- > 0)
462:                    sb.append('0');
463:                return sb.append(hexDigits); // append returns the same buffer, which is handed back to the caller
464:            }
465:
466:            /**
467:             * Parses a single encoded character reference text into a <code>CharacterReference</code> object.
468:             * <p>
469:             * The character reference must be at the start of the given text, but may contain other characters at the end.
470:             * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
471:             * <p>
472:             * If the text does not represent a valid character reference, this method returns <code>null</code>.
473:             * <p>
474:             * <a href="#Unterminated">Unterminated</a> character references are always accepted, regardless of the settings in the
475:             * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
476:             * <p>
477:             * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
478:             * <p>
479:             * <dl>
480:             *  <dt>Example:</dt>
481:             *  <dd><code>CharacterReference.parse("&amp;gt;").getChar()</code> returns '<code>&gt;</code>'</dd>
482:             * </dl>
483:             *
484:             * @param characterReferenceText  the text containing a single encoded character reference.
485:             * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
486:             * @see #decode(CharSequence)
487:             */
488:            public static CharacterReference parse(
489:                    final CharSequence characterReferenceText) {
490:                // Wrap the text in its own Source so the reference can be constructed at position 0,
491:                // using the ACCEPT_ALL setting for unterminated character references
492:                final Source referenceSource = new Source(characterReferenceText.toString());
493:                return construct(referenceSource, 0, Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
494:            }
495:
496:            /**
497:             * Parses a single encoded character reference text into a unicode code point.
498:             * <p>
499:             * The character reference must be at the start of the given text, but may contain other characters at the end.
500:             * <p>
501:             * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
502:             * <p>
503:             * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}<code>.</code>{@link #getCodePoint()},
504:             * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a
505:             * <code>NullPointerException</code>.
506:             * <p>
507:             * <dl>
508:             *  <dt>Example:</dt>
509:             *  <dd><code>CharacterReference.getCodePointFromCharacterReferenceString("&amp;gt;")</code> returns <code>38</code></dd>
510:             * </dl>
511:             *
512:             * @param characterReferenceText  the text containing a single encoded character reference.
513:             * @return the unicode code point representing representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference.
514:             */
515:            public static int getCodePointFromCharacterReferenceString(
516:                    final CharSequence characterReferenceText) {
517:                final CharacterReference characterReference = parse(characterReferenceText);
518:                return (characterReference != null) ? characterReference
519:                        .getCodePoint() : INVALID_CODE_POINT;
520:            }
521:
522:            /**
523:             * Indicates whether the specified character would need to be encoded in HTML text.
524:             * <p>
525:             * This is the case if a {@linkplain CharacterEntityReference character entity reference} exists for the character, or the unicode code point is greater than U+007F.
526:             * <p>
527:             * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
528:             * which only returns <code>true</code> if the static {@link Config#IsApostropheEncoded} property
529:             * is currently set to <code>true</code>.
530:             *
531:             * @param ch  the character to test.
532:             * @return <code>true</code> if the specified character would need to be encoded in HTML text, otherwise <code>false</code>.
533:             */
534:            public static final boolean requiresEncoding(final char ch) {
535:                return ch > 127
536:                        || (CharacterEntityReference.getName(ch) != null && (ch != '\'' || Config.IsApostropheEncoded));
537:            }
538:
539:            /**
540:             * Returns a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
541:             *
542:             * @param writer  the destination for the encoded text
543:             * @return a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
544:             * @see #encode(CharSequence unencodedText)
545:             */
546:            public static Writer getEncodingFilterWriter(final Writer writer) {
547:                return new EncodingFilterWriter(writer);
548:            }
549:
550:            private static final class EncodingFilterWriter extends
551:                    FilterWriter {
552:                StringBuffer sb = new StringBuffer(MAX_ENTITY_REFERENCE_LENGTH);
553:
554:                public EncodingFilterWriter(final Writer writer) {
555:                    super (writer);
556:                }
557:
558:                public void write(final char ch) throws IOException {
559:                    sb.setLength(0);
560:                    appendEncode(sb, ch);
561:                    if (sb.length() == 1)
562:                        out.write(sb.charAt(0));
563:                    else
564:                        Util.appendTo(out, sb);
565:                }
566:
567:                public void write(final int chInt) throws IOException {
568:                    write((char) chInt);
569:                }
570:
571:                public void write(final char[] cbuf, final int off,
572:                        final int len) throws IOException {
573:                    final int end = off + len;
574:                    for (int i = off; i < end; i++)
575:                        write(cbuf[i]);
576:                }
577:
578:                public void write(final String str, final int off, final int len)
579:                        throws IOException {
580:                    final int end = off + len;
581:                    for (int i = off; i < end; i++)
582:                        write(str.charAt(i));
583:                }
584:            }
585:
586:            private static StringBuffer appendEncode(final StringBuffer sb,
587:                    char ch) {
588:                if (appendEncodeCheckForWhiteSpaceFormatting(sb, ch, false))
589:                    return sb;
590:                return sb.append(ch);
591:            }
592:
593:            static StringBuffer appendEncode(final StringBuffer sb,
594:                    CharSequence unencodedText,
595:                    final boolean whiteSpaceFormatting) {
596:                if (unencodedText == null)
597:                    return sb;
598:                int beginPos = 0;
599:                int endPos = unencodedText.length();
600:                if (unencodedText instanceof  Segment) {
601:                    // this might improve performance slightly
602:                    final Segment segment = (Segment) unencodedText;
603:                    final int segmentOffset = segment.getBegin();
604:                    beginPos = segmentOffset;
605:                    endPos += segmentOffset;
606:                    unencodedText = segment.source.string;
607:                }
608:                final boolean isApostropheEncoded = Config.IsApostropheEncoded;
609:                for (int i = beginPos; i < endPos; i++) {
610:                    char ch = unencodedText.charAt(i);
611:                    if (appendEncodeCheckForWhiteSpaceFormatting(sb, ch,
612:                            whiteSpaceFormatting))
613:                        continue;
614:                    // need to process white space
615:                    // whiteSpaceFormatting tries to simulate the formatting characters by converting them to markup
616:                    int spaceCount;
617:                    int nexti = i + 1;
618:                    if (ch != ' ') {
619:                        if (ch != '\t') {
620:                            // must be line feed, carriage return or form feed, since zero-width space should have been processed as a character reference string
621:                            if (ch == '\r' && nexti < endPos
622:                                    && unencodedText.charAt(nexti) == '\n')
623:                                i++; // process cr/lf pair as one line break
624:                            sb.append("<br />"); // add line break
625:                            continue;
626:                        } else {
627:                            spaceCount = TAB_LENGTH;
628:                        }
629:                    } else {
630:                        spaceCount = 1;
631:                    }
632:                    while (nexti < endPos) {
633:                        ch = unencodedText.charAt(nexti);
634:                        if (ch == ' ')
635:                            spaceCount += 1;
636:                        else if (ch == '\t')
637:                            spaceCount += TAB_LENGTH;
638:                        else
639:                            break;
640:                        nexti++;
641:                    }
642:                    if (spaceCount == 1) {
643:                        // handle the very common case of a single character to improve efficiency slightly
644:                        sb.append(' ');
645:                        continue;
646:                    }
647:                    if (spaceCount % 2 == 1)
648:                        sb.append(' '); // fist character is a space if we have an odd number of spaces
649:                    while (spaceCount >= 2) {
650:                        sb.append("&nbsp; "); // use alternating &nbsp; and spaces to keep original number of spaces
651:                        spaceCount -= 2;
652:                    }
653:                    // note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line
654:                    i = nexti - 1; // minus 1 because top level for loop will add it again
655:                }
656:                return sb;
657:            }
658:
659:            private static final boolean appendEncodeCheckForWhiteSpaceFormatting(
660:                    final StringBuffer sb, char ch,
661:                    final boolean whiteSpaceFormatting) {
662:                final String characterEntityReferenceName = CharacterEntityReference
663:                        .getName(ch);
664:                if (characterEntityReferenceName != null) {
665:                    if (ch == '\'') {
666:                        if (Config.IsApostropheEncoded)
667:                            sb.append("&#39;");
668:                        else
669:                            sb.append(ch);
670:                    } else {
671:                        CharacterEntityReference
672:                                .appendCharacterReferenceString(sb,
673:                                        characterEntityReferenceName);
674:                    }
675:                } else if (ch > 127) {
676:                    appendDecimalCharacterReferenceString(sb, ch);
677:                } else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) {
678:                    sb.append(ch);
679:                } else {
680:                    return false;
681:                }
682:                return true;
683:            }
684:
685:            static CharacterReference findPreviousOrNext(final Source source,
686:                    final int pos, final boolean previous) {
687:                return findPreviousOrNext(
688:                        source,
689:                        pos,
690:                        Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL,
691:                        previous);
692:            }
693:
694:            private static CharacterReference findPreviousOrNext(
695:                    final Source source,
696:                    int pos,
697:                    final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings,
698:                    final boolean previous) {
699:                final ParseText parseText = source.getParseText();
700:                pos = previous ? parseText.lastIndexOf('&', pos) : parseText
701:                        .indexOf('&', pos);
702:                while (pos != -1) {
703:                    final CharacterReference characterReference = construct(
704:                            source, pos, unterminatedCharacterReferenceSettings);
705:                    if (characterReference != null)
706:                        return characterReference;
707:                    pos = previous ? parseText.lastIndexOf('&', pos - 1)
708:                            : parseText.indexOf('&', pos + 1);
709:                }
710:                return null;
711:            }
712:
713:            static final StringBuffer appendHexadecimalCharacterReferenceString(
714:                    final StringBuffer sb, final int codePoint) {
715:                return sb.append("&#x").append(Integer.toString(codePoint, 16))
716:                        .append(';');
717:            }
718:
719:            static final StringBuffer appendDecimalCharacterReferenceString(
720:                    final StringBuffer sb, final int codePoint) {
721:                return sb.append("&#").append(codePoint).append(';');
722:            }
723:
724:            private static CharacterReference construct(
725:                    final Source source,
726:                    final int begin,
727:                    final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
728:                try {
729:                    if (source.getParseText().charAt(begin) != '&')
730:                        return null;
731:                    return (source.getParseText().charAt(begin + 1) == '#') ? NumericCharacterReference
732:                            .construct(source, begin,
733:                                    unterminatedCharacterReferenceSettings)
734:                            : CharacterEntityReference
735:                                    .construct(
736:                                            source,
737:                                            begin,
738:                                            unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint);
739:                } catch (IndexOutOfBoundsException ex) {
740:                    return null;
741:                }
742:            }
743:
744:            private static StringBuffer appendDecode(final StringBuffer sb,
745:                    final CharSequence encodedText, int pos,
746:                    final boolean insideAttributeValue,
747:                    final boolean convertNonBreakingSpaces) {
748:                final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings = Config.CurrentCompatibilityMode
749:                        .getUnterminatedCharacterReferenceSettings(insideAttributeValue);
750:                int lastEnd = 0;
751:                final Source source = new Source(encodedText);
752:                while (true) {
753:                    final CharacterReference characterReference = findPreviousOrNext(
754:                            source, pos,
755:                            unterminatedCharacterReferenceSettings, false);
756:                    if (characterReference == null)
757:                        break;
758:                    if (lastEnd != characterReference.getBegin())
759:                        Util.appendTo(sb, encodedText, lastEnd,
760:                                characterReference.getBegin());
761:                    final char ch = characterReference.getChar();
762:                    sb
763:                            .append((ch == CharacterEntityReference._nbsp && convertNonBreakingSpaces) ? ' '
764:                                    : ch);
765:                    pos = lastEnd = characterReference.getEnd();
766:                }
767:                if (lastEnd != encodedText.length())
768:                    Util.appendTo(sb, encodedText, lastEnd, encodedText
769:                            .length());
770:                return sb;
771:            }
772:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.