001:/*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033:package websphinx;
034:
035:import java.util.Hashtable;
036:import java.util.StringTokenizer;
037:import java.util.Enumeration;
038:import rcm.enum.ArrayEnumeration;
039:import rcm.util.Str; // for Str.parseNumber
040:
041:/**
042: * Region of an HTML page.
043: *
044: */
045:public class Region {
046:
047: protected Page source;
048: protected int start;
049: protected int end;
050:
051: /**
052: * Makes a Region.
053: * @param page Page containing region
054: * @param start Starting offset of region in page content
055: * @param end Ending offset of region in page
056: */
057: public Region (Page page, int start, int end) {
058: this .source = page;
059: this .start = start;
060: this .end = end;
061: }
062:
063: /**
064: * Makes a Region by copying another region's parameters.
065: * @param region Region to copy
066: */
067: public Region (Region region) {
068: this .source = region.source;
069: this .start = region.start;
070: this .end = region.end;
071: if (region.names != null)
072: this .names = (Hashtable)region.names.clone ();
073: }
074:
075: /**
076: * Gets page containing the region.
077: * @return page containing the region
078: */
079: public Page getSource () {
080: return source;
081: }
082:
083: /**
084: * Gets starting offset of region in page content.
085: * @return zero-based offset where region begins in page content
086: */
087: public int getStart () {
088: return start;
089: }
090:
091: /**
092: * Gets offset after end of region.
093: * @return zero-based offset just after the end of the region.
094: */
095: public int getEnd () {
096: return end;
097: }
098:
099: /**
100: * Gets length of the region. Equivalent to getEnd() - getStart().
101: * @return length of the HTML region in bytes.
102: */
103: public int getLength () {
104: return end - start;
105: }
106:
107: /**
108: * Converts the region to HTML, e.g. "<tag><tag><tag>text text</tag>"
109: * If the region does not contain HTML, then this function quotes all the <, >, &
110: * characters found in the page content, and wraps the result
111: * in <PRE> and </PRE>.
112: * @return a string consisting of the HTML content contained by this region.
113: */
114: public String toHTML () {
115: return source.substringHTML (start, end);
116: }
117:
118: /**
119: * Converts the region to tagless text, e.g. "text text".
120: * @return a string consisting of the text in the page contained by this region
121: */
122: public String toText () {
123: return source.substringText (start, end);
124: }
125:
126: /**
127: * Converts the region to HTML tags with no text, e.g. "<tag><tag></tag>".
128: * @return a string consisting of the tags in the page contained by this region
129: */
130: public String toTags () {
131: return source.substringText (start, end);
132: }
133:
134: /**
135: * Gets region as raw content.
136: * @return string representation of the region
137: */
138: public String toString () {
139: return source.substringContent (start, end);
140: }
141:
142: /**
143: * Get the root HTML element of the region.
144: * @return first HTML element whose start tag is
145: * completely in the region.
146: */
147: public Element getRootElement () {
148: Element[] elements = source.getElements ();
149: if (elements == null)
150: return null;
151:
152: int k = Region.findStart (elements, start);
153: if (k == elements.length)
154: return null;
155:
156: Element root = elements[k];
157: Tag startTag = root.getStartTag ();
158: if (startTag.getEnd() > end)
159: return null;
160:
161: return root;
162: }
163:
164: /**
165: * Finds a region that starts at or after a given position.
166: * @param regions array of regions sorted by starting offset
167: * @param p Desired starting offset
168: * @return index <i>k</i> into regions such that:
169: * <OL><LI>forall j<k: regions[j].start < p
170: * <LI>regions[k].start >= p
171: * </OL>
172: */
173: public static int findStart (Region[] regions, int p) {
174: // returns k such that forall j<k : regions[j].start < p
175: // && regions[k].start >= p
176: int lo = 0;
177: int hi = regions.length;
178: // invariant: forall j<lo : regions[j].start < p
179: // && forall j>=hi : regions[j].start >= p
180: while (lo != hi) {
181: int mid = (hi + lo) / 2;
182: if (regions[mid].start < p)
183: lo = mid+1;
184: else
185: hi = mid;
186: }
187: return hi;
188: }
189:
190: /**
191: * Finds a region that ends at or after a given position.
192: * @param regions array of regions sorted by ending offset
193: * @param p Desired ending offset
194: * @return index <i>k</i> into regions such that:
195: * <OL><LI>forall j<k: regions[j].end < p
196: * <LI>regions[k].end >= p
197: * </OL>
198: */
199: public static int findEnd (Region[] regions, int p) {
200: // returns k such that forall j<k : regions[j].end < p
201: // && regions[k].end >= p
202: int lo = 0;
203: int hi = regions.length;
204: // invariant: forall j<lo : regions[j].end < p
205: // && forall j>=hi : regions[j].end >= p
206: while (lo != hi) {
207: int mid = (hi + lo) / 2;
208: if (regions[mid].end < p)
209: lo = mid+1;
210: else
211: hi = mid;
212: }
213: return hi;
214: }
215:
216: /**
217: * Makes a new Region containing two regions.
218: * @param r end of spanning region
219: * @return region from the beginning of this region to the end of r. Both regions must have
220: * the same source, and r must end after this region starts.
221: */
222: public Region span (Region r) {
223: return new Region (source, start, r.end);
224: }
225:
226: protected Hashtable names = null;
227:
228: static final int INITIAL_SIZE = 4;
229: // typically only a handful of names are set
230:
231: /**
232: * Default value for labels set with setLabel (name). Value of TRUE is
233: * "true".
234: */
235: public static final String TRUE = "true".intern ();
236:
237: /**
238: * Set an object-valued label.
239: * @param name name of label (case-sensitive, whitespace permitted)
240: * @param value value set for label. If null, the label is removed.
241: */
242: public void setObjectLabel (String name, Object value) {
243: if (value == null)
244: removeLabel (name);
245: else {
246: if (names == null)
247: names = new Hashtable (INITIAL_SIZE);
248: names.put (name, value);
249: }
250: }
251:
252: /**
253: * Get an object-valued label.
254: * @param name name of label (case-sensitive, whitespace permitted)
255: * @return Object value set for label, or null if label not set
256: */
257: public Object getObjectLabel (String name) {
258: return names != null ? names.get (name) : null;
259: }
260:
261: /**
262: * Enumerate the labels of the region.
263: * @return enumeration producing label names
264: */
265: public Enumeration enumerateObjectLabels () {
266: return names != null ? names.keys () : new ArrayEnumeration (null);
267: }
268:
269: /**
270: * Get a String containing the labels of the region.
271: * @return string containing the label names, separated by spaces
272: */
273: public String getObjectLabels () {
274: Enumeration enum = enumerateObjectLabels ();
275: StringBuffer buf = new StringBuffer ();
276: while (enum.hasMoreElements ()) {
277: if (buf.length() > 0)
278: buf.append (' ');
279: buf.append ((String)enum.nextElement());
280: }
281: return buf.toString();
282: }
283:
284: /**
285: * Set a string-valued label.
286: * @param name name of label (case-sensitive, whitespace permitted)
287: * @param value value set for label. If null, the label is removed.
288: */
289: public void setLabel (String name, String value) {
290: setObjectLabel (name, value);
291: }
292:
293: /**
294: * Set a label on the region. The value of the label defaults to TRUE.
295: * @param name name of label (case-sensitive, whitespace permitted)
296: */
297: public void setLabel (String name) {
298: setObjectLabel (name, TRUE);
299: }
300:
301: /**
302: * Get a label's value.
303: * @param name name of label (case-sensitive, whitespace permitted)
304: * @return value of label, or null if label not set
305: */
306: public String getLabel (String name) {
307: Object obj = getObjectLabel (name);
308: if (obj == null)
309: return null;
310: else if (obj instanceof Region[])
311: return null; // NIY
312: else if (obj instanceof Region)
313: return ((Region)obj).toText ();
314: else
315: return obj.toString();
316: }
317:
318: /**
319: * Get a label's value. If the label is not set, return defaultValue.
320: * @param name name of label (case-sensitive, whitespace permitted)
321: * @param defaultValue default value that should be returned if label is not set
322: * @return value of label, or defaultValue if not set
323:
324: */
325: public String getLabel (String name, String defaultValue) {
326: String val = getLabel (name);
327: return (val != null) ? val : defaultValue;
328: }
329:
330: /**
331: * Get a label's value as a number. Returns the first number (integral or floating point) that can be
332: * parsed from the label's value, skipping an arbitrary amount of junk.
333: * @param name name of label (case-sensitive, whitespace permitted)
334: * @param defaultValue default value that should be returned if label is not set
335: * @return numeric value of label, or defaultValue if not set or no number is found
336:
337: */
338: public Number getNumericLabel (String name, Number defaultValue) {
339: String val = getLabel (name);
340: if (val == null)
341: return defaultValue;
342: try {
343: return Str.parseNumber (val);
344: } catch (NumberFormatException e) {
345: return defaultValue;
346: }
347: }
348:
349: /**
350: * Test if a label is set.
351: * @param name name of label (case-sensitive, whitespace permitted)
352: * @return true if label is set, otherwise false
353: */
354: public boolean hasLabel (String name) {
355: return names != null && names.containsKey (name);
356: }
357:
358: /**
359: * Test if one or more of several labels are set.
360: * @param expr a list of label names separated by spaces
361: * @return true if region has at least one of the labels in expr
362: */
363: public boolean hasAnyLabels (String expr) {
364: StringTokenizer tok = new StringTokenizer (expr);
365: while (tok.hasMoreElements ())
366: if (hasLabel (tok.nextToken()))
367: return true;
368: return false;
369: }
370:
371: /**
372: * Test if one or more of several labels are set.
373: * @param labels an array of label names
374: * @return true if region has at least one of the labels
375: */
376: public boolean hasAnyLabels (String[] labels) {
377: for (int i=0; i<labels.length; ++i)
378: if (hasLabel (labels[i]))
379: return true;
380: return false;
381: }
382:
383: /**
384: * Test if all of several labels are set.
385: * @param expr a list of label names separated by spaces
386: * @return true if region has at least one of the labels in expr
387: */
388: public boolean hasAllLabels (String expr) {
389: StringTokenizer tok = new StringTokenizer (expr);
390: while (tok.hasMoreElements ())
391: if (!hasLabel (tok.nextToken()))
392: return false;
393: return true;
394: }
395:
396: /**
397: * Test if all of several labels are set.
398: * @param labels an array of label names
399: * @return true if region has all of the labels
400: */
401: public boolean hasAllLabels (String[] labels) {
402: for (int i=0; i<labels.length; ++i)
403: if (!hasLabel (labels[i]))
404: return false;
405: return true;
406: }
407:
408:
409: /**
410: * Remove a label.
411: * @param name name of label (case-sensitive, whitespace permitted)
412: */
413: public void removeLabel (String name) {
414: if (names != null)
415: names.remove (name);
416: }
417:
418: /**
419: * Name a subregion (by setting a label to point to it).
420: * @param name label name (case-sensitive, whitespace permitted)
421: * @param region subregion to name
422: */
423: public void setField (String name, Region region) {
424: setObjectLabel (name, region);
425: }
426:
427: /**
428: * Get a named subregion.
429: * @param name label name (case-sensitive, whitespace permitted)
430: * @return the named region, or null if label not set to a region
431: */
432: public Region getField (String name) {
433: try {
434: return (Region)getObjectLabel (name);
435: } catch (ClassCastException e) {
436: return null;
437: }
438: }
439:
440: /**
441: * Name a set of subregions (by pointing a label to them).
442: * @param name label name (case-sensitive, whitespace permitted)
443: * @param regions list of subregions
444: */
445: public void setFields (String name, Region[] regions) {
446: setObjectLabel (name, regions);
447: }
448:
449: /**
450: * Get a set of named subregions. Note that subregions named with
451: * setField() cannot be retrieved with getFields(); use getField() instead.
452: * @param name label name (case-sensitive, whitespace permitted)
453: * @return the named subregions, or null if label not set to a set
454: * of subregions
455: */
456: public Region[] getFields (String name) {
457: try {
458: return (Region[])getObjectLabel (name);
459: } catch (ClassCastException e) {
460: return null;
461: }
462: }
463:
464:}
|