001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.io.*;
024:
025: /**
026: * Represents a single <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#h-3.2.2">attribute</a>
027: * name/value segment within a {@link StartTag}.
028: * <p>
029: * An instance of this class is a representation of a single attribute in the source document and is not modifiable.
030: * The {@link OutputDocument#replace(Attributes, Map)} and {@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} methods
031: * provide the means to add, delete or modify attributes and their values in an {@link OutputDocument}.
032: * <p>
033: * Obtained using the {@link Attributes#get(String key)} method.
034: * <p>
035: * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-attr">attributes</a>.
036: *
037: * @see Attributes
038: */
039: public final class Attribute extends Segment {
040: private final String key;
041: private final Segment nameSegment;
042: private final Segment valueSegment;
043: private final Segment valueSegmentIncludingQuotes;
044:
045: static final String CHECKED = "checked";
046: static final String CLASS = "class";
047: static final String DISABLED = "disabled";
048: static final String ID = "id";
049: static final String MULTIPLE = "multiple";
050: static final String NAME = "name";
051: static final String SELECTED = "selected";
052: static final String STYLE = "style";
053: static final String TYPE = "type";
054: static final String VALUE = "value";
055:
056: /**
057: * Constructs a new Attribute with no value part, called from Attributes class.
058: * <p>
059: * Note that the resulting Attribute segment has the same span as the supplied nameSegment.
060: *
061: * @param source the {@link Source} document.
062: * @param key the name of this attribute in lower case.
063: * @param nameSegment the segment representing the name.
064: */
065: Attribute(final Source source, final String key,
066: final Segment nameSegment) {
067: this (source, key, nameSegment, null, null);
068: }
069:
070: /**
071: * Constructs a new Attribute, called from Attributes class.
072: * <p>
073: * The resulting Attribute segment begins at the start of the nameSegment
074: * and finishes at the end of the valueSegmentIncludingQuotes. If this attribute
075: * has no value, it finishes at the end of the nameSegment.
076: * <p>
077: * If this attribute has no value, the <code>valueSegment</code> and <code>valueSegmentIncludingQuotes</code> must be null.
078: * The <valueSegmentIncludingQuotes</code> parameter must not be null if the <code>valueSegment</code> is not null, and vice versa
079: *
080: * @param source the {@link Source} document.
081: * @param key the name of this attribute in lower case.
082: * @param nameSegment the segment spanning the name.
083: * @param valueSegment the segment spanning the value.
084: * @param valueSegmentIncludingQuotes the segment spanning the value, including quotation marks if any.
085: */
086: Attribute(final Source source, final String key,
087: final Segment nameSegment, final Segment valueSegment,
088: final Segment valueSegmentIncludingQuotes) {
089: super (source, nameSegment.getBegin(),
090: (valueSegmentIncludingQuotes == null ? nameSegment
091: .getEnd() : valueSegmentIncludingQuotes
092: .getEnd()));
093: this .key = key;
094: this .nameSegment = nameSegment;
095: this .valueSegment = valueSegment;
096: this .valueSegmentIncludingQuotes = valueSegmentIncludingQuotes;
097: }
098:
099: /**
100: * Returns the name of this attribute in lower case.
101: * <p>
102: * This package treats all attribute names as case insensitive, consistent with
103: * <a target="_blank" href="http://www.w3.org/TR/html401/">HTML</a> but not consistent with
104: * <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a>.
105: *
106: * @return the name of this attribute in lower case.
107: * @see #getName()
108: */
109: public String getKey() {
110: return key;
111: }
112:
113: /**
114: * Returns the name of this attribute in original case.
115: * <p>
116: * This is exactly equivalent to {@link #getNameSegment()}<code>.toString()</code>.
117: *
118: * @return the name of this attribute in original case.
119: * @see #getKey()
120: */
121: public String getName() {
122: return nameSegment.toString();
123: }
124:
125: /**
126: * Returns the segment spanning the {@linkplain #getName() name} of this attribute.
127: * @return the segment spanning the {@linkplain #getName() name} of this attribute.
128: * @see #getName()
129: */
130: public Segment getNameSegment() {
131: return nameSegment;
132: }
133:
134: /**
135: * Indicates whether this attribute has a value.
136: * <p>
137: * This method also returns <code>true</code> if this attribute has been assigned a zero-length value.
138: * <p>
139: * It only returns <code>false</code> if this attribute appears in
140: * <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-boolean_attribute-1">minimized form</a>.
141: *
142: * @return <code>true</code> if this attribute has a value, otherwise <code>false</code>.
143: */
144: public boolean hasValue() {
145: return valueSegment != null;
146: }
147:
148: /**
149: * Returns the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute,
150: * or <code>null</code> if it {@linkplain #hasValue() has no value}.
151: * <p>
152: * This is equivalent to {@link CharacterReference}<code>.</code>{@link CharacterReference#decode(CharSequence,boolean) decode}<code>(</code>{@link #getValueSegment()}<code>,true)</code>.
153: * <p>
154: * Note that before version 1.4.1 this method returned the raw value of the attribute as it appears in the source document,
155: * without {@linkplain CharacterReference#decode(CharSequence,boolean) decoding}.
156: * <p>
157: * To obtain the raw value without decoding, use {@link #getValueSegment()}<code>.toString()</code>.
158: * <p>
159: * Special attention should be given to attributes that contain URLs, such as the
160: * <code><a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a></code> attribute.
161: * When such an attribute contains a URL with parameters (as described in the
162: * <a target="_blank" href="http://www.w3.org/MarkUp/html-spec/html-spec_8.html#SEC8.2.1">form-urlencoded media type</a>),
163: * the ampersand (<code>&</code>) characters used to separate the parameters should be
164: * {@linkplain CharacterReference#encode(CharSequence) encoded} to prevent the parameter names from being
165: * unintentionally interpreted as {@linkplain CharacterEntityReference character entity references}.
166: * This requirement is explicitly stated in the
167: * <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#h-5.3.2">HTML 4.01 specification section 5.3.2</a>.
168: * <p>
169: * For example, take the following element in the source document:
170: * <div style="margin: 0.5em"><code><a href="Report.jsp?chapt=2&sect=3">next</a></code></div>
171: * By default, calling
172: * {@link Element#getAttributes() getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue}<code>("href")</code>
173: * on this element returns the string
174: * "<code>Report.jsp?chapt=2§=3</code>", since the text "<code>&sect</code>" is interpreted as the rarely used
175: * character entity reference {@link CharacterEntityReference#_sect &sect;} (U+00A7), despite the fact that it is
176: * missing the {@linkplain CharacterReference#isTerminated() terminating semicolon} (<code>;</code>).
177: * <p>
178: * Most browsers recognise <a href="CharacterReference.html#Unterminated">unterminated</a> character entity references
179: * in attribute values representing a codepoint of U+00FF or below, but ignore those representing codepoints above this value.
180: * One relatively popular browser only recognises those representing a codepoint of U+003E or below, meaning it would
181: * have interpreted the URL in the above example differently to most other browsers.
182: * Most browsers also use different rules depending on whether the unterminated character reference is inside or outside
183: * of an attribute value, with both of these possibilities further split into different rules for
184: * {@linkplain CharacterEntityReference character entity references},
185: * <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character references</a>, and
186: * <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character references</a>.
187: * <p>
188: * The behaviour of this library is determined by the current {@linkplain Config.CompatibilityMode compatibility mode} setting,
189: * which is determined by the {@link Config#CurrentCompatibilityMode} property.
190: *
191: * @return the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}.
192: */
193: public String getValue() {
194: return CharacterReference.decode(valueSegment, true);
195: }
196:
197: /**
198: * Returns the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}.
199: * @return the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}.
200: * @see #getValue()
201: */
202: public Segment getValueSegment() {
203: return valueSegment;
204: }
205:
206: /**
207: * Returns the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any,
208: * or <code>null</code> if it {@linkplain #hasValue() has no value}.
209: * <p>
210: * If the value is not enclosed by quotation marks, this is the same as the {@linkplain #getValueSegment() value segment}
211: *
212: * @return the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any, or <code>null</code> if it {@linkplain #hasValue() has no value}.
213: */
214: public Segment getValueSegmentIncludingQuotes() {
215: return valueSegmentIncludingQuotes;
216: }
217:
218: /**
219: * Returns the character used to quote the value.
220: * <p>
221: * The return value is either a double-quote (<code>"</code>), a single-quote (<code>'</code>), or a space.
222: *
223: * @return the character used to quote the value, or a space if the value is not quoted or this attribute has no value.
224: */
225: public char getQuoteChar() {
226: if (valueSegment == valueSegmentIncludingQuotes)
227: return ' '; // no quotes
228: return source.charAt(valueSegmentIncludingQuotes.getBegin());
229: }
230:
231: /**
232: * Returns a string representation of this object useful for debugging purposes.
233: * @return a string representation of this object useful for debugging purposes.
234: */
235: public String getDebugInfo() {
236: final StringBuffer sb = new StringBuffer().append(key).append(
237: super .getDebugInfo()).append(",name=").append(
238: nameSegment.getDebugInfo());
239: if (hasValue())
240: sb.append(",value=").append(valueSegment.getDebugInfo())
241: .append('"').append(valueSegment).append('"')
242: .append(Config.NewLine);
243: else
244: sb.append(",NO VALUE").append(Config.NewLine);
245: return sb.toString();
246: }
247:
248: Tag appendTidy(final StringBuffer sb, Tag nextTag) {
249: sb.append(' ');
250: Util.appendTo(sb, nameSegment);
251: if (valueSegment != null) {
252: sb.append("=\"");
253: while (nextTag != null
254: && nextTag.begin < valueSegment.begin)
255: nextTag = nextTag.findNextTag();
256: if (nextTag == null || nextTag.begin >= valueSegment.end) {
257: appendTidyValue(sb, valueSegment);
258: } else {
259: int i = valueSegment.begin;
260: while (nextTag != null
261: && nextTag.begin < valueSegment.end) {
262: appendTidyValue(sb, new Segment(source, i,
263: nextTag.begin));
264: if (nextTag.end > valueSegment.end) {
265: sb.append(new Segment(source, nextTag.begin,
266: i = valueSegment.end));
267: break;
268: }
269: sb.append(nextTag);
270: i = nextTag.end;
271: nextTag = nextTag.findNextTag();
272: }
273: if (i < valueSegment.end)
274: appendTidyValue(sb, new Segment(source, i,
275: valueSegment.end));
276: }
277: sb.append('"');
278: }
279: return nextTag;
280: }
281:
282: private static void appendTidyValue(final StringBuffer sb,
283: final CharSequence unencodedValue) {
284: CharacterReference.appendEncode(sb, CharacterReference.decode(
285: unencodedValue, true), false);
286: }
287:
288: static StringBuffer appendHTML(final StringBuffer sb,
289: final CharSequence name, final CharSequence value) {
290: sb.append(' ');
291: Util.appendTo(sb, name);
292: if (value != null) {
293: sb.append("=\"");
294: CharacterReference.appendEncode(sb, value, false);
295: sb.append('"');
296: }
297: return sb;
298: }
299:
300: static Writer appendHTML(final Writer writer,
301: final CharSequence name, final CharSequence value)
302: throws IOException {
303: writer.write(' ');
304: Util.appendTo(writer, name);
305: if (value != null) {
306: writer.write("=\"");
307: writer.write(CharacterReference.encode(value));
308: writer.write('"');
309: }
310: return writer;
311: }
312: }
|