001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024:
025: /**
026: * Encapsulates global configuration properties which determine the behaviour of various functions.
027: * <p>
028: * All of the properties in this class are static, affecting all objects and threads.
029: * Multiple concurrent configurations are not possible.
030: * <p>
031: * Properties that relate to <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>
032: * compatibility issues are stored in instances of the {@link Config.CompatibilityMode} class.
033: * This allows all of the properties in the compatibility mode to be set as a block by setting the static
034: * {@link #CurrentCompatibilityMode} property to a different instance.
035: *
036: * @see Config.CompatibilityMode
037: */
038: public final class Config {
/** Not instantiable: this class only exposes static members. */
private Config() {}
041:
042: /**
043: * Determines the string used to separate a single column's multiple values in the output of the {@link FormFields#getColumnValues(Map)} method.
044: * <p>
045: * The situation where a single column has multiple values only arises if {@link FormField#getUserValueCount()}<code>>1</code>
046: * on the relevant form field, which usually indicates a poorly designed form.
047: * <p>
048: * The default value is "<code>,</code>" (a comma, not including the quotes).
049: * <p>
050: * Must not be <code>null</code>.
051: */
052: public static String ColumnMultipleValueSeparator = ",";
053:
054: /**
055: * Determines the string that represents the value <code>true</code> in the output of the {@link FormFields#getColumnValues(Map)} method.
056: * <p>
057: * The default value is "<code>true</code>" (without the quotes).
058: * <p>
059: * Must not be <code>null</code>.
060: */
061: public static String ColumnValueTrue = Boolean.toString(true);
062:
063: /**
064: * Determines the string that represents the value <code>false</code> in the output of the {@link FormFields#getColumnValues(Map)} method.
065: * <p>
066: * The default value is <code>null</code>, which represents no output at all.
067: */
068: public static String ColumnValueFalse = null;
069:
070: /**
071: * Determines the currently active {@linkplain Config.CompatibilityMode compatibility mode}.
072: * <p>
073: * The default setting is {@link Config.CompatibilityMode#IE} (MS Internet Explorer 6.0).
074: * <p>
075: * Must not be <code>null</code>.
076: */
077: public static CompatibilityMode CurrentCompatibilityMode = CompatibilityMode.IE;
078:
079: /**
080: * Determines whether apostrophes are encoded when calling the {@link CharacterReference#encode(CharSequence)} method.
081: * <p>
082: * A value of <code>false</code> means {@linkplain CharacterEntityReference#_apos apostrophe}
083: * (U+0027) characters are not encoded.
084: * The only time apostrophes need to be encoded is within an attribute value delimited by
085: * single quotes (apostrophes), so in most cases ignoring apostrophes is perfectly safe and
086: * enhances the readability of the source document.
087: * <p>
088: * Note that apostrophes are always encoded as a {@linkplain NumericCharacterReference numeric character reference}, never as the
089: * character entity reference {@link CharacterEntityReference#_apos &apos;}.
090: * <p>
091: * The default value is <code>false</code>.
092: */
093: public static boolean IsApostropheEncoded = false;
094:
095: /**
096: * Determines the string used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in text output throughout the library.
097: * <p>
098: * The default value is the standard new line character sequence of the host platform, determined by <code>System.getProperty("line.separator")</code>.
099: */
100: public static String NewLine = System.getProperty("line.separator");
101:
102: /**
103: * Determines the {@link LoggerProvider} that is used to create the default {@link Logger} object for each new {@link Source} object.
104: * <p>
105: * The {@link LoggerProvider} interface contains several predefined <code>LoggerProvider</code> instances which this property can be set to,
106: * mostly representing wrappers to common logging frameworks.
107: * <p>
108: * The default value is <code>null</code>, which results in the auto-detection of the most appropriate logging mechanism according to the following algorithm:
109: * <p>
110: * <ol>
111: * <li>If the class <code>org.slf4j.impl.StaticLoggerBinder</code> is detected:
112: * <ul>
113: * <li>If the class <code>org.slf4j.impl.JDK14LoggerFactory</code> is detected, use {@link LoggerProvider#JAVA}.
114: * <li>If the class <code>org.slf4j.impl.Log4jLoggerFactory</code> is detected, use {@link LoggerProvider#LOG4J}.
115: * <li>If the class <code>org.slf4j.impl.JCLLoggerFactory</code> is NOT detected, use {@link LoggerProvider#SLF4J}.
116: * </ul>
117: * <li>If the class <code>org.apache.commons.logging.Log</code> is detected:
118: * <blockquote>
119: * Create an instance of it using the commons-logging <code>LogFactory</code> class.
120: * <ul>
121: * <li>If the created <code>Log</code> is of type <code>org.apache.commons.logging.impl.Jdk14Logger</code>, use {@link LoggerProvider#JAVA}.
122: * <li>If the created <code>Log</code> is of type <code>org.apache.commons.logging.impl.Log4JLogger</code>, use {@link LoggerProvider#LOG4J}.
123: * <li>otherwise, use {@link LoggerProvider#JCL}.
124: * </ul>
125: * </blockquote>
126: * <li>If the class <code>org.apache.log4j.Logger</code> is detected, use {@link LoggerProvider#LOG4J}.
127: * <li>otherwise, use {@link LoggerProvider#JAVA}.
128: * </ol>
129: *
130: * @see Source#setLogger(Logger)
131: */
132: public static LoggerProvider LoggerProvider = null;
133:
134: /**
135: * Used in Element.getChildElements.
136: * Will only make this public if someone makes a convincing argument why you would ever need to include server tags in an element hierarchy.
137: */
138: static final boolean IncludeServerTagsInElementHierarchy = false;
139:
140: /**
141: * Represents a set of maximum unicode code points to be recognised for the three types of
142: * <a href="CharacterReference.html#Unterminated">unterminated</a> character reference in a given context.
143: * <p>
144: * The three types of character reference are:
145: * <ul>
146: * <li>{@linkplain CharacterEntityReference Character entity reference}
147: * <li><a href="NumericCharacterReference.html#DecimalCharacterReference">Decimal character reference</a>
148: * <li><a href="NumericCharacterReference.html#HexadecimalCharacterReference">Hexadecimal character reference</a>
149: * </ul>
150: * <p>
151: * The two types of contexts used in this library are:
152: * <ul>
153: * <li>Inside an attribute value
154: * <li>Outside an attribute value
155: * </ul>
156: */
157: static class UnterminatedCharacterReferenceSettings {
158: // use volatile fields to make them thread safe
159: public volatile int characterEntityReferenceMaxCodePoint;
160: public volatile int decimalCharacterReferenceMaxCodePoint;
161: public volatile int hexadecimalCharacterReferenceMaxCodePoint;
162:
163: public static UnterminatedCharacterReferenceSettings ACCEPT_ALL = new UnterminatedCharacterReferenceSettings(
164: CompatibilityMode.CODE_POINTS_ALL,
165: CompatibilityMode.CODE_POINTS_ALL,
166: CompatibilityMode.CODE_POINTS_ALL);
167:
168: public UnterminatedCharacterReferenceSettings() {
169: this (CompatibilityMode.CODE_POINTS_NONE,
170: CompatibilityMode.CODE_POINTS_NONE,
171: CompatibilityMode.CODE_POINTS_NONE);
172: }
173:
174: public UnterminatedCharacterReferenceSettings(
175: final int characterEntityReferenceMaxCodePoint,
176: final int decimalCharacterReferenceMaxCodePoint,
177: final int hexadecimalCharacterReferenceMaxCodePoint) {
178: this .characterEntityReferenceMaxCodePoint = characterEntityReferenceMaxCodePoint;
179: this .decimalCharacterReferenceMaxCodePoint = decimalCharacterReferenceMaxCodePoint;
180: this .hexadecimalCharacterReferenceMaxCodePoint = hexadecimalCharacterReferenceMaxCodePoint;
181: }
182:
183: public String toString() {
184: return Config.NewLine
185: + " Character entity reference: "
186: + getDescription(characterEntityReferenceMaxCodePoint)
187: + Config.NewLine
188: + " Decimal character reference: "
189: + getDescription(decimalCharacterReferenceMaxCodePoint)
190: + Config.NewLine
191: + " Haxadecimal character reference: "
192: + getDescription(hexadecimalCharacterReferenceMaxCodePoint);
193: }
194:
195: private String getDescription(final int codePoint) {
196: if (codePoint == CompatibilityMode.CODE_POINTS_NONE)
197: return "None";
198: if (codePoint == CompatibilityMode.CODE_POINTS_ALL)
199: return "All";
200: return "0x" + Integer.toString(codePoint, 16);
201: }
202: }
203:
204: /**
205: * Represents a set of configuration parameters that relate to
206: * <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a> compatibility issues.
207: * <p>
208: * The predefined compatibility modes {@link #IE}, {@link #MOZILLA}, {@link #OPERA} and {@link #XHTML} provide an easy means of
209: * ensuring the library interprets the markup in a way consistent with some of the most commonly used browsers,
210: * at least in relation to the behaviour described by the properties in this class.
211: * <p>
212: * The properties of any <code>CompatibilityMode</code> object can be modified individually, including those in
213: * the predefined instances as well as newly constructed instances.
214: * Take note however that modifying the properties of the predefined instances has a global affect.
215: * <p>
216: * The currently active compatibility mode is stored in the static
217: * {@link Config#CurrentCompatibilityMode} property.
218: * <p>
219: */
220: public static final class CompatibilityMode {
221: private String name;
222: private volatile boolean formFieldNameCaseInsensitive;
223: volatile UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsInsideAttributeValue;
224: volatile UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsOutsideAttributeValue;
225:
226: /**
227: * Indicates the recognition of all unicode code points.
228: * <p>
229: * This value is used in properties which specify a maximum unicode code point to be recognised by the parser.
230: *
231: * @see #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue)
232: * @see #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)
233: * @see #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)
234: */
235: public static final int CODE_POINTS_ALL = CharacterReference.MAX_CODE_POINT;
236:
237: /**
238: * Indicates the recognition of no unicode code points.
239: * <p>
240: * This value is used in properties which specify a maximum unicode code point to be recognised by the parser.
241: *
242: * @see #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue)
243: * @see #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)
244: * @see #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)
245: */
246: public static final int CODE_POINTS_NONE = CharacterReference.INVALID_CODE_POINT;
247:
248: /**
249: * <a target="_blank" href="http://www.microsoft.com/windows/ie/">Microsoft Internet Explorer</a> compatibility mode.
250: * <p>
251: * <code>{@link #getName() Name} = IE</code><br />
252: * <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = true</code><br />
253: * <table cellspacing="0" cellpadding="0">
254: * <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute)
255: * <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+00FF<td align="center">U+00FF
256: * <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
257: * <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_NONE None}
258: * </table>
259: */
260: public static final CompatibilityMode IE = new CompatibilityMode(
261: "IE", true, new UnterminatedCharacterReferenceSettings(
262: 0xFF, CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes
263: new UnterminatedCharacterReferenceSettings(0xFF,
264: CODE_POINTS_ALL, CODE_POINTS_NONE) // outside attributes
265: );
266:
267: /**
268: * <a target="_blank" href="http://www.mozilla.org/products/mozilla1.x/">Mozilla</a> /
269: * <a target="_blank" href="http://www.mozilla.org/products/firefox/">Firefox</a> /
270: * <a target="_blank" href="http://browser.netscape.com/">Netscape</a> compatibility mode.
271: * <p>
272: * <code>{@link #getName() Name} = Mozilla</code><br />
273: * <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = false</code><br />
274: * <table cellspacing="0" cellpadding="0">
275: * <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute)
276: * <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+00FF<td align="center">{@linkplain #CODE_POINTS_ALL All}
277: * <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
278: * <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
279: * </table>
280: */
281: public static final CompatibilityMode MOZILLA = new CompatibilityMode(
282: "Mozilla", false,
283: new UnterminatedCharacterReferenceSettings(0xFF,
284: CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes
285: new UnterminatedCharacterReferenceSettings(
286: CODE_POINTS_ALL, CODE_POINTS_ALL,
287: CODE_POINTS_ALL) // outside attributes
288: );
289:
290: /**
291: * Opera compatibility mode.
292: * <p>
293: * <code>{@link #getName() Name} = Opera</code><br />
294: * <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = true</code><br />
295: * <table cellspacing="0" cellpadding="0">
296: * <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute)
297: * <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+003E<td align="center">{@linkplain #CODE_POINTS_ALL All}
298: * <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
299: * <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
300: * </table>
301: */
302: public static final CompatibilityMode OPERA = new CompatibilityMode(
303: "Opera", true,
304: new UnterminatedCharacterReferenceSettings(0x3E,
305: CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes
306: new UnterminatedCharacterReferenceSettings(
307: CODE_POINTS_ALL, CODE_POINTS_ALL,
308: CODE_POINTS_ALL) // outside attributes
309: );
310:
311: /**
312: * <a target="_blank" href="http://www.w3.org/TR/xhtml1/#xhtml">XHTML</a> compatibility mode.
313: * <p>
314: * <code>{@link #getName() Name} = XHTML</code><br />
315: * <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = false</code><br />
316: * <table cellspacing="0" cellpadding="0">
317: * <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute)
318: * <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None}
319: * <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None}
320: * <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None}
321: * </table>
322: */
323: public static final CompatibilityMode XHTML = new CompatibilityMode(
324: "XHTML");
325:
326: /**
327: * Constructs a new <code>CompatibilityMode</code> with the given {@linkplain #getName() name}.
328: * <p>
329: * All properties in the new instance are initially assigned their default values, which are the same as the strict
330: * rules of the {@link #XHTML} compatibility mode.
331: *
332: * @param name the {@linkplain #getName() name} of the new compatibility mode
333: */
334: public CompatibilityMode(final String name) {
335: this (name, false,
336: new UnterminatedCharacterReferenceSettings(),
337: new UnterminatedCharacterReferenceSettings());
338: }
339:
340: private CompatibilityMode(
341: final String name,
342: final boolean formFieldNameCaseInsensitive,
343: final UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsInsideAttributeValue,
344: final UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsOutsideAttributeValue) {
345: this .name = name;
346: this .formFieldNameCaseInsensitive = formFieldNameCaseInsensitive;
347: this .unterminatedCharacterReferenceSettingsInsideAttributeValue = unterminatedCharacterReferenceSettingsInsideAttributeValue;
348: this .unterminatedCharacterReferenceSettingsOutsideAttributeValue = unterminatedCharacterReferenceSettingsOutsideAttributeValue;
349: }
350:
351: /**
352: * Returns the name of this compatibility mode.
353: * @return the name of this compatibility mode.
354: */
355: public String getName() {
356: return name;
357: }
358:
359: /**
360: * Indicates whether {@linkplain FormField#getName() form field names} are treated as case insensitive.
361: * <p>
362: * Microsoft Internet Explorer treats field names as case insensitive,
363: * while Mozilla treats them as case sensitive.
364: * <p>
365: * The value of this property in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}
366: * affects all instances of the {@link FormFields} class.
367: * It should be set to the desired configuration before any instances of <code>FormFields</code> are created.
368: *
369: * @return <code>true</code> if {@linkplain FormField#getName() form field names} are treated as case insensitive, otherwise <code>false</code>.
370: * @see #setFormFieldNameCaseInsensitive(boolean)
371: */
372: public boolean isFormFieldNameCaseInsensitive() {
373: return formFieldNameCaseInsensitive;
374: }
375:
376: /**
377: * Sets whether {@linkplain FormField#getName() form field names} are treated as case insensitive.
378: * <p>
379: * See {@link #isFormFieldNameCaseInsensitive()} for the documentation of this property.
380: *
381: * @param value the new value of the property
382: */
383: public void setFormFieldNameCaseInsensitive(final boolean value) {
384: formFieldNameCaseInsensitive = value;
385: }
386:
387: /**
388: * Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
389: * {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context.
390: * <p>
391: * For example, if <code>getUnterminatedCharacterEntityReferenceMaxCodePoint(true)</code> has the value <code>0xFF</code> (U+00FF)
392: * in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then:
393: * <ul>
394: * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&gt",true)}
395: * returns "<code>></code>".<br />
396: * The string is recognised as the character entity reference {@link CharacterEntityReference#_gt &gt;}
397: * despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>,
398: * because its unicode code point U+003E is below the maximum of U+00FF set by this property.
399: * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&euro",true)}
400: * returns "<code>&euro</code>".<br />
401: * The string is not recognised as the character entity reference {@link CharacterEntityReference#_euro &euro;}
402: * because it is <a href="CharacterReference.html#Unterminated">unterminated</a>
403: * and its unicode code point U+20AC is above the maximum of U+00FF set by this property.
404: * </ul>
405: * <p>
406: * See the documentation of the {@link Attribute#getValue()} method for further discussion.
407: *
408: * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
409: * @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context.
410: * @see #setUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint)
411: */
412: public int getUnterminatedCharacterEntityReferenceMaxCodePoint(
413: final boolean insideAttributeValue) {
414: return getUnterminatedCharacterReferenceSettings(insideAttributeValue).characterEntityReferenceMaxCodePoint;
415: }
416:
417: /**
418: * Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
419: * {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context.
420: * <p>
421: * See {@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property.
422: *
423: * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
424: * @param maxCodePoint the maximum unicode code point.
425: */
426: public void setUnterminatedCharacterEntityReferenceMaxCodePoint(
427: final boolean insideAttributeValue,
428: final int maxCodePoint) {
429: getUnterminatedCharacterReferenceSettings(insideAttributeValue).characterEntityReferenceMaxCodePoint = maxCodePoint;
430: }
431:
432: /**
433: * Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
434: * <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context.
435: * <p>
436: * For example, if <code>getUnterminatedDecimalCharacterReferenceMaxCodePoint(true)</code> had the hypothetical value <code>0xFF</code> (U+00FF)
437: * in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then:
438: * <ul>
439: * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#62",true)}
440: * returns "<code>></code>".<br />
441: * The string is recognised as the numeric character reference <code>&#62;</code>
442: * despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>,
443: * because its unicode code point U+003E is below the maximum of U+00FF set by this property.
444: * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#8364",true)}
445: * returns "<code>&#8364</code>".<br />
446: * The string is not recognised as the numeric character reference <code>&#8364;</code>
447: * because it is <a href="CharacterReference.html#Unterminated">unterminated</a>
448: * and its unicode code point U+20AC is above the maximum of U+00FF set by this property.
449: * </ul>
450: *
451: * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
452: * @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context.
453: * @see #setUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint)
454: */
455: public int getUnterminatedDecimalCharacterReferenceMaxCodePoint(
456: final boolean insideAttributeValue) {
457: return getUnterminatedCharacterReferenceSettings(insideAttributeValue).decimalCharacterReferenceMaxCodePoint;
458: }
459:
460: /**
461: * Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
462: * <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context.
463: * <p>
464: * See {@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property.
465: *
466: * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
467: * @param maxCodePoint the maximum unicode code point.
468: */
469: public void setUnterminatedDecimalCharacterReferenceMaxCodePoint(
470: final boolean insideAttributeValue,
471: final int maxCodePoint) {
472: getUnterminatedCharacterReferenceSettings(insideAttributeValue).decimalCharacterReferenceMaxCodePoint = maxCodePoint;
473: }
474:
475: /**
476: * Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
477: * <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character reference</a> which is to be recognised in the specified context.
478: * <p>
479: * For example, if <code>getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(true)</code> had the hypothetical value <code>0xFF</code> (U+00FF)
480: * in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then:
481: * <ul>
482: * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#x3e",true)}
483: * returns "<code>></code>".<br />
484: * The string is recognised as the numeric character reference <code>&#x3e;</code>
485: * despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>,
486: * because its unicode code point U+003E is below the maximum of U+00FF set by this property.
487: * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#x20ac",true)}
488: * returns "<code>&#x20ac</code>".<br />
489: * The string is not recognised as the numeric character reference <code>&#20ac;</code>
490: * because it is <a href="CharacterReference.html#Unterminated">unterminated</a>
491: * and its unicode code point U+20AC is above the maximum of U+00FF set by this property.
492: * </ul>
493: *
494: * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
495: * @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character reference</a> which is to be recognised in the specified context.
496: * @see #setUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint)
497: */
498: public int getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(
499: final boolean insideAttributeValue) {
500: return getUnterminatedCharacterReferenceSettings(insideAttributeValue).hexadecimalCharacterReferenceMaxCodePoint;
501: }
502:
503: /**
504: * Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
505: * <a href="NumericCharacterReference.html#HexadecimalCharacterReference">headecimal character reference</a> which is to be recognised in the specified context.
506: * <p>
507: * See {@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property.
508: *
509: * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
510: * @param maxCodePoint the maximum unicode code point.
511: */
512: public void setUnterminatedHexadecimalCharacterReferenceMaxCodePoint(
513: final boolean insideAttributeValue,
514: final int maxCodePoint) {
515: getUnterminatedCharacterReferenceSettings(insideAttributeValue).hexadecimalCharacterReferenceMaxCodePoint = maxCodePoint;
516: }
517:
518: /**
519: * Returns a string representation of this object useful for debugging purposes.
520: * @return a string representation of this object useful for debugging purposes.
521: */
522: public String getDebugInfo() {
523: return "Form field name case insensitive: "
524: + formFieldNameCaseInsensitive
525: + Config.NewLine
526: + "Maximum codepoints in unterminated character references:"
527: + Config.NewLine
528: + " Inside attribute values:"
529: + unterminatedCharacterReferenceSettingsInsideAttributeValue
530: + Config.NewLine
531: + " Outside attribute values:"
532: + unterminatedCharacterReferenceSettingsOutsideAttributeValue;
533: }
534:
535: /**
536: * Returns the {@linkplain #getName() name} of this compatibility mode.
537: * @return the {@linkplain #getName() name} of this compatibility mode.
538: */
539: public String toString() {
540: return getName();
541: }
542:
543: UnterminatedCharacterReferenceSettings getUnterminatedCharacterReferenceSettings(
544: final boolean insideAttributeValue) {
545: return insideAttributeValue ? unterminatedCharacterReferenceSettingsInsideAttributeValue
546: : unterminatedCharacterReferenceSettingsOutsideAttributeValue;
547: }
548: }
549: }
|