001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: /**
024: * Represents the text from the {@linkplain Source source} document that is to be parsed.
025: * <p>
026: * This class is normally only of interest to users who wish to create <a href="TagType.html#Custom">custom tag types</a>.
027: * <p>
028: * The parse text is defined as the entire text of the source document in lower case, with all
029: * {@linkplain Segment#ignoreWhenParsing() ignored} segments replaced by space characters.
030: * <p>
031: * The text is stored in lower case to make case insensitive parsing as efficient as possible.
032: * <p>
033: * This class provides many methods which are also provided by the <code>java.lang.String</code> class,
034: * but adds an extra parameter called <code>breakAtIndex</code> to the various <code>indexOf</code> methods.
035: * This parameter allows a search on only a specified segment of the text, which is not possible using the normal <code>String</code> class.
036: * <p>
037: * <code>ParseText</code> instances are obtained using the {@link Source#getParseText()} method.
038: */
039: public final class ParseText implements CharSequence {
040: private final char[] text;
041:
042: /** A value to use as the <code>breakAtIndex</code> argument in certain methods to indicate that the search should continue to the start or end of the parse text. */
043: public static final int NO_BREAK = -1;
044:
045: /**
046: * Constructs a new <code>ParseText</code> object based on the specified <code>CharSequence</code>.
047: * @param charSequence the character sequence upon which the parse text is based.
048: */
049: ParseText(final CharSequence charSequence) {
050: text = new char[charSequence.length()];
051: for (int i = 0; i < text.length; i++)
052: text[i] = Character.toLowerCase(charSequence.charAt(i));
053: }
054:
055: /**
056: * Constructs a new <code>ParseText</code> object based on the specified {@link OutputDocument}.
057: * @param outputDocument the {@link OutputDocument} upon which the parse text is based.
058: */
059: ParseText(final OutputDocument outputDocument) {
060: this (outputDocument.toString());
061: }
062:
063: /**
064: * Indicates whether this parse text contains the specified string at the specified position.
065: * <p>
066: * This method is analogous to the <code>java.lang.String.startsWith(String prefix, int toffset)</code> method.
067: *
068: * @param str a string.
069: * @param pos the position (index) in this parse text at which to check for the specified string.
070: * @return <code>true</code> if this parse text contains the specified string at the specified position, otherwise <code>false</code>.
071: */
072: public boolean containsAt(final String str, final int pos) {
073: for (int i = 0; i < str.length(); i++)
074: if (str.charAt(i) != text[pos + i])
075: return false;
076: return true;
077: }
078:
079: /**
080: * Returns the character at the specified index.
081: * @param index the index of the character.
082: * @return the character at the specified index, which is always in lower case.
083: */
084: public char charAt(final int index) {
085: return text[index];
086: }
087:
088: /**
089: * Returns the index within this parse text of the first occurrence of the specified character,
090: * starting the search at the position specified by <code>fromIndex</code>.
091: * <p>
092: * If the specified character is not found then -1 is returned.
093: *
094: * @param searchChar a character.
095: * @param fromIndex the index to start the search from.
096: * @return the index within this parse text of the first occurrence of the specified character within the specified range, or -1 if the character is not found.
097: */
098: public int indexOf(final char searchChar, final int fromIndex) {
099: return indexOf(searchChar, fromIndex, NO_BREAK);
100: }
101:
102: /**
103: * Returns the index within this parse text of the first occurrence of the specified character,
104: * starting the search at the position specified by <code>fromIndex</code>,
105: * and breaking the search at the index specified by <code>breakAtIndex</code>.
106: * <p>
107: * The position specified by <code>breakAtIndex</code> is not included in the search.
108: * <p>
109: * If the search is to continue to the end of the text,
110: * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
111: * <p>
112: * If the specified character is not found then -1 is returned.
113: *
114: * @param searchChar a character.
115: * @param fromIndex the index to start the search from.
116: * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the end of the text.
117: * @return the index within this parse text of the first occurrence of the specified character within the specified range, or -1 if the character is not found.
118: */
119: public int indexOf(final char searchChar, final int fromIndex,
120: final int breakAtIndex) {
121: final int actualBreakAtIndex = (breakAtIndex == NO_BREAK
122: || breakAtIndex > text.length ? text.length
123: : breakAtIndex);
124: for (int i = (fromIndex < 0 ? 0 : fromIndex); i < actualBreakAtIndex; i++)
125: if (text[i] == searchChar)
126: return i;
127: return -1;
128: }
129:
130: /**
131: * Returns the index within this parse text of the last occurrence of the specified character,
132: * searching backwards starting at the position specified by <code>fromIndex</code>.
133: * <p>
134: * If the specified character is not found then -1 is returned.
135: *
136: * @param searchChar a character.
137: * @param fromIndex the index to start the search from.
138: * @return the index within this parse text of the last occurrence of the specified character within the specified range, or -1 if the character is not found.
139: */
140: public int lastIndexOf(final char searchChar, final int fromIndex) {
141: return lastIndexOf(searchChar, fromIndex, NO_BREAK);
142: }
143:
144: /**
145: * Returns the index within this parse text of the last occurrence of the specified character,
146: * searching backwards starting at the position specified by <code>fromIndex</code>,
147: * and breaking the search at the index specified by <code>breakAtIndex</code>.
148: * <p>
149: * The position specified by <code>breakAtIndex</code> is not included in the search.
150: * <p>
151: * If the search is to continue to the start of the text,
152: * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
153: * <p>
154: * If the specified character is not found then -1 is returned.
155: *
156: * @param searchChar a character.
157: * @param fromIndex the index to start the search from.
158: * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the start of the text.
159: * @return the index within this parse text of the last occurrence of the specified character within the specified range, or -1 if the character is not found.
160: */
161: public int lastIndexOf(final char searchChar, final int fromIndex,
162: final int breakAtIndex) {
163: for (int i = (fromIndex > text.length ? text.length : fromIndex); i > breakAtIndex; i--)
164: if (text[i] == searchChar)
165: return i;
166: return -1;
167: }
168:
169: /**
170: * Returns the index within this parse text of the first occurrence of the specified string,
171: * starting the search at the position specified by <code>fromIndex</code>.
172: * <p>
173: * If the specified string is not found then -1 is returned.
174: *
175: * @param searchString a string.
176: * @param fromIndex the index to start the search from.
177: * @return the index within this parse text of the first occurrence of the specified string within the specified range, or -1 if the string is not found.
178: */
179: public int indexOf(final String searchString, final int fromIndex) {
180: return (searchString.length() == 1) ? indexOf(searchString
181: .charAt(0), fromIndex, NO_BREAK) : indexOf(searchString
182: .toCharArray(), fromIndex, NO_BREAK);
183: }
184:
185: /**
186: * Returns the index within this parse text of the first occurrence of the specified character array,
187: * starting the search at the position specified by <code>fromIndex</code>.
188: * <p>
189: * If the specified character array is not found then -1 is returned.
190: *
191: * @param searchCharArray a character array.
192: * @param fromIndex the index to start the search from.
193: * @return the index within this parse text of the first occurrence of the specified character array within the specified range, or -1 if the character array is not found.
194: */
195: public int indexOf(final char[] searchCharArray, final int fromIndex) {
196: return indexOf(searchCharArray, fromIndex, NO_BREAK);
197: }
198:
199: /**
200: * Returns the index within this parse text of the first occurrence of the specified string,
201: * starting the search at the position specified by <code>fromIndex</code>,
202: * and breaking the search at the index specified by <code>breakAtIndex</code>.
203: * <p>
204: * The position specified by <code>breakAtIndex</code> is not included in the search.
205: * <p>
206: * If the search is to continue to the end of the text,
207: * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
208: * <p>
209: * If the specified string is not found then -1 is returned.
210: *
211: * @param searchString a string.
212: * @param fromIndex the index to start the search from.
213: * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the end of the text.
214: * @return the index within this parse text of the first occurrence of the specified string within the specified range, or -1 if the string is not found.
215: */
216: public int indexOf(final String searchString, final int fromIndex,
217: final int breakAtIndex) {
218: return (searchString.length() == 1) ? indexOf(searchString
219: .charAt(0), fromIndex, breakAtIndex) : indexOf(
220: searchString.toCharArray(), fromIndex, breakAtIndex);
221: }
222:
223: /**
224: * Returns the index within this parse text of the first occurrence of the specified character array,
225: * starting the search at the position specified by <code>fromIndex</code>,
226: * and breaking the search at the index specified by <code>breakAtIndex</code>.
227: * <p>
228: * The position specified by <code>breakAtIndex</code> is not included in the search.
229: * <p>
230: * If the search is to continue to the end of the text,
231: * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
232: * <p>
233: * If the specified character array is not found then -1 is returned.
234: *
235: * @param searchCharArray a character array.
236: * @param fromIndex the index to start the search from.
237: * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the end of the text.
238: * @return the index within this parse text of the first occurrence of the specified character array within the specified range, or -1 if the character array is not found.
239: */
240: public int indexOf(final char[] searchCharArray,
241: final int fromIndex, final int breakAtIndex) {
242: if (searchCharArray.length == 0)
243: return fromIndex;
244: final char firstChar = searchCharArray[0];
245: final int lastPossibleBreakAtIndex = text.length
246: - searchCharArray.length + 1;
247: final int actualBreakAtIndex = (breakAtIndex == NO_BREAK || breakAtIndex > lastPossibleBreakAtIndex) ? lastPossibleBreakAtIndex
248: : breakAtIndex;
249: outerLoop: for (int i = (fromIndex < 0 ? 0 : fromIndex); i < actualBreakAtIndex; i++) {
250: if (text[i] == firstChar) {
251: for (int j = 1; j < searchCharArray.length; j++)
252: if (searchCharArray[j] != text[j + i])
253: continue outerLoop;
254: return i;
255: }
256: }
257: return -1;
258: }
259:
260: /**
261: * Returns the index within this parse text of the last occurrence of the specified string,
262: * searching backwards starting at the position specified by <code>fromIndex</code>.
263: * <p>
264: * If the specified string is not found then -1 is returned.
265: *
266: * @param searchString a string.
267: * @param fromIndex the index to start the search from.
268: * @return the index within this parse text of the last occurrence of the specified string within the specified range, or -1 if the string is not found.
269: */
270: public int lastIndexOf(final String searchString,
271: final int fromIndex) {
272: return (searchString.length() == 1) ? lastIndexOf(searchString
273: .charAt(0), fromIndex, NO_BREAK) : lastIndexOf(
274: searchString.toCharArray(), fromIndex, NO_BREAK);
275: }
276:
277: /**
278: * Returns the index within this parse text of the last occurrence of the specified character array,
279: * searching backwards starting at the position specified by <code>fromIndex</code>.
280: * <p>
281: * If the specified character array is not found then -1 is returned.
282: *
283: * @param searchCharArray a character array.
284: * @param fromIndex the index to start the search from.
285: * @return the index within this parse text of the last occurrence of the specified character array within the specified range, or -1 if the character array is not found.
286: */
287: public int lastIndexOf(final char[] searchCharArray,
288: final int fromIndex) {
289: return lastIndexOf(searchCharArray, fromIndex, NO_BREAK);
290: }
291:
292: /**
293: * Returns the index within this parse text of the last occurrence of the specified string,
294: * searching backwards starting at the position specified by <code>fromIndex</code>,
295: * and breaking the search at the index specified by <code>breakAtIndex</code>.
296: * <p>
297: * The position specified by <code>breakAtIndex</code> is not included in the search.
298: * <p>
299: * If the search is to continue to the start of the text,
300: * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
301: * <p>
302: * If the specified string is not found then -1 is returned.
303: *
304: * @param searchString a string.
305: * @param fromIndex the index to start the search from.
306: * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the start of the text.
307: * @return the index within this parse text of the last occurrence of the specified string within the specified range, or -1 if the string is not found.
308: */
309: public int lastIndexOf(final String searchString,
310: final int fromIndex, final int breakAtIndex) {
311: return (searchString.length() == 1) ? lastIndexOf(searchString
312: .charAt(0), fromIndex, breakAtIndex) : lastIndexOf(
313: searchString.toCharArray(), fromIndex, breakAtIndex);
314: }
315:
316: /**
317: * Returns the index within this parse text of the last occurrence of the specified character array,
318: * searching backwards starting at the position specified by <code>fromIndex</code>,
319: * and breaking the search at the index specified by <code>breakAtIndex</code>.
320: * <p>
321: * The position specified by <code>breakAtIndex</code> is not included in the search.
322: * <p>
323: * If the search is to continue to the start of the text,
324: * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
325: * <p>
326: * If the specified character array is not found then -1 is returned.
327: *
328: * @param searchCharArray a character array.
329: * @param fromIndex the index to start the search from.
330: * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the start of the text.
331: * @return the index within this parse text of the last occurrence of the specified character array within the specified range, or -1 if the character array is not found.
332: */
333: public int lastIndexOf(final char[] searchCharArray, int fromIndex,
334: final int breakAtIndex) {
335: if (searchCharArray.length == 0)
336: return fromIndex;
337: final int rightIndex = text.length - searchCharArray.length;
338: if (breakAtIndex > rightIndex)
339: return -1;
340: if (fromIndex > rightIndex)
341: fromIndex = rightIndex;
342: final int lastCharIndex = searchCharArray.length - 1;
343: final char lastChar = searchCharArray[lastCharIndex];
344: final int actualBreakAtPos = breakAtIndex + lastCharIndex;
345: outerLoop: for (int i = fromIndex + lastCharIndex; i > actualBreakAtPos; i--) {
346: if (text[i] == lastChar) {
347: final int startIndex = i - lastCharIndex;
348: for (int j = lastCharIndex - 1; j >= 0; j--)
349: if (searchCharArray[j] != text[j + startIndex])
350: continue outerLoop;
351: return startIndex;
352: }
353: }
354: return -1;
355: }
356:
357: /**
358: * Returns the length of the parse text.
359: * @return the length of the parse text.
360: */
361: public int length() {
362: return text.length;
363: }
364:
365: /**
366: * Returns a new string that is a substring of this parse text.
367: * <p>
368: * The substring begins at the specified <code>beginIndex</code> and extends to the character at index <code>endIndex</code> - 1.
369: * Thus the length of the substring is <code>endIndex-beginIndex</code>.
370: *
371: * @param beginIndex the begin index, inclusive.
372: * @param endIndex the end index, exclusive.
373: * @return a new string that is a substring of this parse text.
374: */
375: public String substring(final int beginIndex, final int endIndex) {
376: return new String(text, beginIndex, endIndex - beginIndex);
377: }
378:
379: /**
380: * Returns a new character sequence that is a subsequence of this sequence.
381: * <p>
382: * This is equivalent to {@link #substring(int,int) substring(beginIndex,endIndex)}.
383: *
384: * @param beginIndex the begin index, inclusive.
385: * @param endIndex the end index, exclusive.
386: * @return a new character sequence that is a subsequence of this sequence.
387: */
388: public CharSequence subSequence(final int beginIndex,
389: final int endIndex) {
390: return substring(beginIndex, endIndex);
391: }
392:
393: /**
394: * Returns the content of the parse text as a <code>String</code>.
395: * @return the content of the parse text as a <code>String</code>.
396: */
397: public String toString() {
398: return new String(text);
399: }
400: }
|