001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.net.URL;
036: import java.net.URLConnection; //#ifdef JDK1.1
037: import java.net.HttpURLConnection; //#endif JDK1.1
038: import java.io.IOException;
039: import java.io.InputStream;
040: import rcm.util.Str;
041:
/**
 * A Web page. Although a Page can represent any MIME type, it mainly
 * supports HTML pages, which are automatically parsed. The parsing produces
 * a list of tags, a list of words, an HTML parse tree, and a list of links.
 */
public class Page extends Region {

    // Typical page length in bytes; used as the initial buffer size when
    // the server sends no Content-Length header.
    static final int TYPICAL_LENGTH = 20240;

    // Permanent content -- survives discardContent()
    Link origin;                   // link followed to fetch this page (null if built from a string)
    long lastModified = 0;         // Last-Modified header value, 0 if unknown
    long expiration = 0;           // Expires header value, 0 if unknown
    String contentType;            // MIME type, e.g. "text/html", null if unknown
    String contentEncoding;        // Content-Encoding header, null if unknown
    int responseCode = -1;         // HTTP response code, -1 if unknown
    String responseMessage = null; // HTTP response message, null if fetch failed or unknown
    URL base;                      // base URL used to resolve relative links
    String title;                  // page title, filled in by the parser
    Link[] links;                  // links found by the parser; kept even after content is discarded

    int contentLock;
    // If page was downloaded from Net, represents number of
    // callers who want to keep the content.
    // If page was created from a string, set to -1.

    // Discardable content (thrown away when contentLock falls to 0)
    byte[] contentBytes;           // raw bytes of the page
    String content;                // page content as a String
    Region[] tokens;               // token sequence: tags and whitespace-delimited text
    Text[] words;                  // whitespace- and tag-delimited text runs
    Tag[] tags;                    // tag sequence only
    Element[] elements;            // HTML elements, in inorder traversal of the parse tree
    Element root;                  // root of the HTML parse tree (null if page is not HTML)
    String canonicalTags;          // cached substringCanonicalTags() result for the whole page
078:
    /**
     * Make a Page by downloading and parsing a Link.
     * Uses unlimited download parameters and a fresh HTMLParser.
     * @param link Link to download
     * @exception IOException if an error occurs in downloading the page
     */
    public Page(Link link) throws IOException {
        this(link, DownloadParameters.NO_LIMITS, new HTMLParser());
    }
086:
    /**
     * Make a Page by downloading a Link.
     * @param link Link to download
     * @param dp Download parameters to use
     * @exception IOException if an error occurs in downloading the page
     */
    public Page(Link link, DownloadParameters dp) throws IOException {
        this(link, dp, new HTMLParser());
    }
095:
    /**
     * Make a Page by downloading a Link.
     * @param link Link to download
     * @param dp Download parameters to use
     * @param parser HTML parser to use
     * @exception IOException if an error occurs in downloading the page
     */
    public Page(Link link, DownloadParameters dp, HTMLParser parser)
            throws IOException {
        super(null, 0, 0);
        source = this;          // this page is its own source region
        origin = link;
        base = getURL();        // defaults to the link's URL; download() updates it on redirect
        download(dp, parser);
        link.setPage(this);     // connect the originating link back to this page
    }
110:
    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     */
    public Page(URL url, String html) {
        this(url, html, new HTMLParser());
    }
120:
    /**
     * Make a Page from a URL and a string of HTML.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param url URL to use as a base for relative links on the page
     * @param html the HTML content of the page
     * @param parser HTML parser to use
     */
    public Page(URL url, String html, HTMLParser parser) {
        super(null, 0, html.length());
        source = this;
        base = url;
        this.content = html;
        // NOTE(review): encodes with the platform default charset -- confirm
        this.contentBytes = html.getBytes();
        contentLock = -1;   // -1 marks string-backed content: never discarded
        parse(parser);
    }
137:
    /**
     * Make a Page from a string of content. The content is not parsed.
     * The created page has no originating link, so calls to getURL(),
     * getProtocol(), etc. will fail.
     * @param content HTML content of the page
     */
    public Page(String content) {
        super(null, 0, content.length());
        // FIX: don't think base==null will work
        source = this;
        this.content = content;
        // NOTE(review): encodes with the platform default charset -- confirm
        this.contentBytes = content.getBytes();
        contentLock = -1;   // string-backed content is never discarded
    }
150:
151: /**
152: * Make a Page from a byte array of content. The content is not parsed.
153: * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
154: * @param content byte content of the page */
155: public Page(byte[] content) {
156: super (null, 0, content.length);
157: // FIX: don't think base==null will work
158: source = this ;
159: this .contentBytes = new byte[content.length];
160: System.arraycopy(content, 0, this .contentBytes, 0,
161: content.length);
162: this .content = new String(content);
163: contentLock = -1;
164: }
165:
166: //
167: // Downloading
168: //
169:
170: // This code generates SecurityExceptions in Netscape 4.0,
171: // and it doesn't seem to be necessary anyway: redirects are followed
172: // by Netscape and JDK by default, despite the fact that the JDK
173: // docs claim that setFollowRedirects() defaults to false
174:
175: //static {
176: //try {
177: // HttpURLConnection.setFollowRedirects (true);
178: //} catch (Throwable t) { }
179: //}
180:
    /**
     * Download the page. The downloaded page is parsed
     * if its MIME type is HTML or unspecified.
     * @param dp download parameters (maximum page size, etc.)
     * @param parser HTML parser to use
     * @exception IOException if an error occurs in downloading the page,
     * including an HTTP response code of 300 or greater, or a page
     * exceeding dp.getMaxPageSize() kilobytes
     */
    public void download(DownloadParameters dp, HTMLParser parser)
            throws IOException {
        URLConnection conn = Access.getAccess().openConnection(origin);

        // fetch and store final redirected URL and response headers
        InputStream in = conn.getInputStream();
        base = conn.getURL();   // final URL after any redirects
        lastModified = conn.getLastModified();
        expiration = conn.getExpiration();
        contentType = conn.getContentType();
        contentEncoding = conn.getContentEncoding();

        //#ifdef JDK1.1
        // get HTTP response codes
        if (conn instanceof HttpURLConnection) {
            HttpURLConnection httpconn = (HttpURLConnection) conn;

            responseCode = httpconn.getResponseCode();
            responseMessage = httpconn.getResponseMessage();
            if (responseMessage == null)
                responseMessage = "unknown error";

            if (responseCode >= 300)
                // HTTP failure (3xx/4xx/5xx); redirects the connection
                // could follow have already been followed at this point
                throw new IOException(responseCode + " "
                        + responseMessage);
        }
        //#endif JDK1.1

        // System.err.println ("Original URL: " + origin.getURL());
        // System.err.println ("Final URL: " + conn.getURL());

        // download content, enforcing the page-size limit from dp
        int maxKB = dp.getMaxPageSize();
        int maxBytes = (maxKB > 0) ? maxKB * 1024 : Integer.MAX_VALUE;
        int expectedLength = conn.getContentLength();
        if (expectedLength > maxBytes)
            throw new IOException("Page greater than " + maxBytes
                    + " bytes");
        if (expectedLength == -1)
            expectedLength = TYPICAL_LENGTH; // no Content-Length: guess
        // NOTE(review): if the server reports Content-Length: 0 but the
        // stream still yields bytes, buf.length is 0 and the doubling below
        // (Math.min(0*2, maxBytes) == 0) cannot grow it -- TODO confirm
        byte[] buf = new byte[expectedLength];
        int n;
        int total = 0;

        while ((n = in.read(buf, total, buf.length - total)) != -1) {
            total += n;
            if (total > maxBytes)
                throw new IOException("Page greater than " + maxBytes
                        + " bytes");
            if (total == buf.length) {
                // buffer full: try to read one more character
                int c = in.read();
                if (c == -1)
                    break; // EOF, we're done
                else {
                    // need more space in array. Double the array, but don't make
                    // it bigger than maxBytes.
                    byte[] newbuf = new byte[Math.min(buf.length * 2,
                            maxBytes)];
                    System.arraycopy(buf, 0, newbuf, 0, buf.length);
                    buf = newbuf;
                    buf[total++] = (byte) c;
                }
            }
        }
        in.close();

        if (total != buf.length) {
            // resize the array to be precisely total bytes long
            byte[] newbuf = new byte[total];
            System.arraycopy(buf, 0, newbuf, 0, total);
            buf = newbuf;
        }

        contentBytes = buf;
        // NOTE(review): decodes with the platform default charset -- confirm
        content = new String(buf);
        start = 0;
        end = total;
        contentLock = 1; // the downloader holds the initial content lock

        // parse the response if it is (or might be) HTML
        if (contentType == null || contentType.startsWith("text/html")
                || contentType.startsWith("content/unknown"))
            parse(parser);
    }
273:
    /**
     * Best-effort download with default parameters, deliberately ignoring
     * all errors. Used to transparently re-fetch content that was
     * discarded; callers must check hasContent()/accessor results.
     */
    void downloadSafely() {
        try {
            download(new DownloadParameters(), new HTMLParser());
        } catch (Throwable e) {
            // intentionally ignored: failure simply leaves content absent
        }
    }
280:
281: //
282: // Parsing
283: //
284:
    /**
     * Parse the page. Assumes the page has already been downloaded;
     * if not, a best-effort download is attempted first.
     * @param parser HTML parser to use
     * @exception RuntimeException if an I/O error occurs while parsing
     */
    public void parse(HTMLParser parser) {
        if (!hasContent())
            downloadSafely();
        try {
            parser.parse(this);
        } catch (IOException e) {
            // NOTE(review): wrapping via toString() loses the original
            // cause/stack trace
            throw new RuntimeException(e.toString());
        }
    }
299:
300: /**
301: * Test whether page has been parsed. Pages are parsed during
302: * download only if its MIME type is HTML or unspecified.
303: * @return true if page was parsed, false if not
304: */
305: public boolean isParsed() {
306: return tokens != null;
307: }
308:
309: /**
310: * Test whether page is HTML.
311: * @return true if page is HTML.
312: */
313: public boolean isHTML() {
314: return root != null;
315: }
316:
    /**
     * Test whether the page is a GIF or JPEG image, by comparing the
     * first bytes of the content against well-known magic numbers.
     * @return true if page is a GIF or JPEG image, false if not
     */
    public boolean isImage() {
        byte[] bytes = getContentBytes();
        return startsWith(bytes, GIF_MAGIC)
                || startsWith(bytes, JPG_MAGIC);
    }

    // "GIF8" -- common prefix of the GIF87a and GIF89a signatures
    private static final byte[] GIF_MAGIC = { (byte) 'G', (byte) 'I',
            (byte) 'F', (byte) '8' };
    // JFIF signature: SOI marker (FF D8), APP0 marker (FF E0),
    // APP0 segment length 00 10, then the ASCII identifier "JFIF"
    // (octal literals: 0377=0xFF, 0330=0xD8, 0340=0xE0, 020=0x10)
    private static final byte[] JPG_MAGIC = { (byte) 0377, (byte) 0330,
            (byte) 0377, (byte) 0340, (byte) 0, (byte) 020, (byte) 'J',
            (byte) 'F', (byte) 'I', (byte) 'F' };
332:
333: private boolean startsWith(byte[] bytes, byte[] prefix) {
334: if (prefix.length > bytes.length)
335: return false;
336: for (int i = 0, n = prefix.length; i < n; ++i)
337: if (bytes[i] != prefix[i])
338: return false;
339: return true;
340: }
341:
342: //
343: // Content management
344: //
345:
346: /**
347: * Lock the page's content (to prevent it from being discarded).
348: * This method increments a lock counter, representing all the
349: * callers interested in preserving the content. The lock
350: * counter is set to 1 when the page is initially downloaded.
351: */
352: public void keepContent() {
353: if (contentLock > 0)
354: ++contentLock;
355: }
356:
    /**
     * Unlock the page's content (allowing it to be garbage-collected, to
     * save space during a Web crawl). This method decrements a lock counter.
     * If the counter falls to
     * 0 (meaning no callers are interested in the content),
     * the content is released. At least the following
     * fields are discarded: content, tokens, tags, words, elements, and
     * root. After the content has been discarded, calling getContent()
     * (or getTokens(), getTags(), etc.) will force the page to be downloaded
     * again. Hopefully the download will come from the cache, however.
     * <P> Links are not considered part of the content, and are not subject to
     * discarding by this method. Also, if the page was created from a string
     * (rather than by downloading), its content is not subject to discarding
     * (since there would be no way to recover it).
     */
    public void discardContent() {
        if (contentLock == 0) // already discarded
            return;

        if (--contentLock > 0) // somebody else still has a lock on the content
            return;

        // String-backed pages (negative lock) fall through the decrement
        // above but are caught here, since they have no originating link.
        if (origin == null)
            return; // without an origin, we'd have no way to recover this page

        //System.err.println ("discarding content of " + toDescription());
        contentBytes = null;
        content = null;
        tokens = null;
        tags = null;
        words = null;
        elements = null;
        root = null;
        canonicalTags = null;

        // keep links, but isolate them from the element tree
        // (the instanceof test also skips null array entries)
        if (links != null) {
            for (int i = 0; i < links.length; ++i)
                if (links[i] instanceof Link)
                    ((Link) links[i]).discardContent();
        }

        // FIX: debugging only: disconnect this page from its parent
        //origin.page = null;
        //origin = null;

        contentLock = 0; // mark fully discarded
    }
405:
406: /**
407: * Test if page content is available.
408: * @return true if content is downloaded and available, false if content has not been downloaded
409: * or has been discarded.
410: */
411: public final boolean hasContent() {
412: return contentLock != 0;
413: }
414:
415: //
416: // Page accessors
417: //
418:
419: /**
420: * Get depth of page in crawl.
421: * @return depth of page from root (depth of page is same as depth of its originating link)
422: */
423: public int getDepth() {
424: return origin != null ? origin.getDepth() : 0;
425: }
426:
427: /**
428: * Get the Link that points to this page.
429: * @return the Link object that was used to download this page.
430: */
431: public Link getOrigin() {
432: return origin;
433: }
434:
    /**
     * Get the base URL, relative to which the page's links were interpreted.
     * The base URL defaults to the URL of the
     * Link that was used to download the page. If any redirects occur
     * while downloading the page, the final location becomes the new base
     * URL. Lastly, if a &lt;BASE&gt; element is found in the page, that
     * becomes the new base URL.
     * @return the page's base URL.
     */
    public URL getBase() {
        return base;
    }
447:
448: /**
449: * Get the URL.
450: * @return the URL of the link that was used to download this page
451: */
452: public URL getURL() {
453: return origin != null ? origin.getURL() : null;
454: }
455:
456: /**
457: * Get the title of the page.
458: * @return the page's title, or null if the page hasn't been parsed.
459: */
460: public String getTitle() {
461: return title;
462: }
463:
464: /**
465: * Get the content of the page as a String. May not work properly for
466: * binary data like images; use getContentBytes instead.
467: * @return the String content of the page.
468: */
469: public String getContent() {
470: if (!hasContent())
471: downloadSafely();
472: return content;
473: }
474:
475: /**
476: * Get the content of the page as an array of bytes.
477: * @return the content of the page in binary form.
478: */
479: public byte[] getContentBytes() {
480: if (!hasContent())
481: downloadSafely();
482: return contentBytes;
483: }
484:
485: /**
486: * Get the token sequence of the page. Tokens are tags and whitespace-delimited text.
487: * @return token regions in the page, or null if the page hasn't been downloaded or parsed.
488: */
489: public Region[] getTokens() {
490: if (!hasContent())
491: downloadSafely();
492: return tokens;
493: }
494:
495: /**
496: * Get the tag sequence of the page.
497: * @return tags in the page, or null if the page hasn't been downloaded or parsed.
498: */
499: public Tag[] getTags() {
500: if (!hasContent())
501: downloadSafely();
502: return tags;
503: }
504:
505: /**
506: * Get the words in the page. Words are whitespace- and tag-delimited text.
507: * @return words in the page, or null if the page hasn't been downloaded or parsed.
508: */
509: public Text[] getWords() {
510: if (!hasContent())
511: downloadSafely();
512: return words;
513: }
514:
515: /**
516: * Get the HTML elements in the page. All elements in the page
517: * are included in the list, in the order they would appear in
518: * an inorder traversal of the HTML parse tree.
519: * @return HTML elements in the page ordered by inorder, or null if the page
520: * hasn't been downloaded or parsed.
521: */
522: public Element[] getElements() {
523: if (!hasContent())
524: downloadSafely();
525: return elements;
526: }
527:
528: /**
529: * Get the root HTML element of the page.
530: * @return first top-level HTML element in the page, or null
531: * if the page hasn't been downloaded or parsed.
532: */
533: public Element getRootElement() {
534: if (!hasContent())
535: downloadSafely();
536: return root;
537: }
538:
539: /**
540: * Get the links found in the page.
541: * @return links in the page, or null
542: * if the page hasn't been downloaded or parsed.
543: */
544: public Link[] getLinks() {
545: return links;
546: }
547:
548: /**
549: * Convert the link's URL to a String
550: * @return the URL represented as a string
551: */
552: public String toURL() {
553: return origin != null ? origin.toURL() : null;
554: }
555:
556: /**
557: * Generate a human-readable description of the page.
558: * @return a description of the link, in the form "title [url]".
559: */
560: public String toDescription() {
561: return (title != null && title.length() > 0 ? title + " " : "")
562: + "[" + getURL() + "]";
563: }
564:
    /**
     * Get the content of the page as a String (equivalent to
     * getContent()).
     * @return the String content of the page
     */
    public String toString() {
        return getContent();
    }
572:
    /**
     * Get last-modified date of page.
     * @return the date when the page was last modified, or 0 if not known.
     * The value is milliseconds since January 1, 1970 GMT (as produced by
     * URLConnection.getLastModified() during download)
     */
    public long getLastModified() {
        return lastModified;
    }
581:
    /**
     * Set last-modified date of page.
     * @param last the date when the page was last modified, or 0 if not known.
     * The value is milliseconds since January 1, 1970 GMT
     */
    public void setLastModified(long last) {
        lastModified = last;
    }
590:
    /**
     * Get expiration date of page.
     * @return the expiration date of the page, or 0 if not known.
     * The value is milliseconds since January 1, 1970 GMT (as produced by
     * URLConnection.getExpiration() during download).
     */
    public long getExpiration() {
        return expiration;
    }
599:
    /**
     * Set expiration date of page.
     * @param expire the expiration date of the page, or 0 if not known.
     * The value is milliseconds since January 1, 1970 GMT.
     */
    public void setExpiration(long expire) {
        expiration = expire;
    }
608:
609: /**
610: * Get MIME type of page.
611: * @return the MIME type of page, such as "text/html", or null if not known.
612: */
613: public String getContentType() {
614: return contentType;
615: }
616:
617: /**
618: * Set MIME type of page.
619: * @param type the MIME type of page, such as "text/html", or null if not known.
620: */
621: public void setContentType(String type) {
622: contentType = type;
623: }
624:
625: /**
626: * Get content encoding of page.
627: * @return the encoding type of page, such as "base-64", or null if not known.
628: */
629: public String getContentEncoding() {
630: return contentEncoding;
631: }
632:
633: /**
634: * Set content encoding of page.
635: * @param encoding the encoding type of page, such as "base-64", or null if not known.
636: */
637: public void setContentEncoding(String encoding) {
638: contentEncoding = encoding;
639: }
640:
641: /**
642: * Get response code returned by the Web server. For list of
643: * possible values, see java.net.HttpURLConnection.
644: * @return response code, such as 200 (for OK) or 404 (not found).
645: * Code is -1 if unknown.
646: * @see java.net.HttpURLConnection
647: */
648: public int getResponseCode() {
649: return responseCode;
650: }
651:
652: /**
653: * Get response message returned by the Web server.
654: * @return response message, such as "OK" or "Not Found". The response message is null if the page failed to be fetched or not known.
655: */
656: public String getResponseMessage() {
657: return responseMessage;
658: }
659:
660: /**
661: * Get raw content found in a region.
662: * @param start starting offset of region
663: * @param end ending offset of region
664: * @return raw HTML contained in the region
665: */
666: public String substringContent(int start, int end) {
667: return getContent().substring(start, end);
668: }
669:
670: /**
671: * Get HTML found in a region.
672: * @param start starting offset of region
673: * @param end ending offset of region
674: * @return representation of region as HTML
675: */
676: public String substringHTML(int start, int end) {
677: String s = getContent().substring(start, end);
678: if (!isHTML()) {
679: s = Str.replace(s, "&", "&");
680: s = Str.replace(s, "<", "<");
681: s = Str.replace(s, ">", ">");
682: s = "<PRE>" + s + "</PRE>";
683: }
684: return s;
685: }
686:
    /**
     * Get tagless text found in a region.
     * Runs of whitespace and tags are reduced to a single space character.
     * @param start starting offset of region
     * @param end ending offset of region
     * @return tagless text contained in the region (empty string if the
     * page has not been parsed)
     */
    public String substringText(int start, int end) {
        if (words == null)
            return ""; // page is not parsed

        // FIX: find some other mapping
        // Append every word lying entirely within [start, end),
        // joined by single spaces.
        StringBuffer buf = new StringBuffer();
        for (int j = findStart(words, start); j < words.length; ++j) {
            if (words[j].end > end)
                break; // words are ordered; the rest lie beyond the region
            else {
                if (buf.length() > 0)
                    buf.append(' ');
                buf.append(words[j].text);
            }
        }
        return buf.toString();
    }
711:
    /**
     * Get HTML tags found in a region. Whitespace and text among the
     * tags are deleted.
     * @param start starting offset of region
     * @param end ending offset of region
     * @return tags contained in the region (empty string if the page has
     * not been parsed)
     */
    public String substringTags(int start, int end) {
        if (tags == null)
            return ""; // page is not parsed

        // FIX: find some other mapping
        // Append the raw text of every tag lying entirely within
        // [start, end), joined by single spaces.
        StringBuffer buf = new StringBuffer();
        for (int j = findStart(tags, start); j < tags.length; ++j) {
            if (tags[j].end > end)
                break; // tags are ordered; the rest lie beyond the region
            else {
                if (buf.length() > 0)
                    buf.append(' ');
                buf.append(getContent().substring(tags[j].start,
                        tags[j].end));
            }
        }
        return buf.toString();
    }
737:
    /**
     * Get canonicalized HTML tags found in a region.
     * A canonicalized tag looks like the following:
     * <PRE>
     * &lt;tagname#index attr=value attr=value attr=value ...&gt;
     * </PRE>
     * where tagname and attr are all lowercase, index is the tag's
     * index in the page's tokens array. Attributes are sorted in
     * increasing order by attribute name. Attributes without values
     * omit the entire "=value" portion. Values are delimited by a
     * space. All occurrences of &lt;, &gt;, space, and % characters
     * in a value are URL-encoded (e.g., space is converted to %20).
     * Thus the only occurrences of these characters in the canonical
     * tag are the tag delimiters.
     *
     * <P>For example, raw HTML that looks like:
     * <PRE>
     * &lt;IMG SRC="http://foo.com/map&lt;&gt;.gif" ISMAP&gt;Image&lt;/IMG&gt;
     * </PRE>
     * would be canonicalized to:
     * <PRE>
     * &lt;img ismap src=http://foo.com/map%3C%3E.gif&gt;&lt;/img&gt;
     * </PRE>
     * <P>
     * Comment and declaration tags (whose tag name is !) are omitted
     * from the canonicalization.
     *
     * @param start starting offset of region
     * @param end ending offset of region
     * @return canonicalized tags contained in the region
     */
    public String substringCanonicalTags(int start, int end) {
        if (tokens == null)
            return ""; // page is not parsed

        // whole-page requests are cached in canonicalTags
        boolean all = (start == this.start && end == this.end);

        if (all && canonicalTags != null)
            return canonicalTags;

        // FIX: find some other mapping
        StringBuffer buf = new StringBuffer();
        for (int j = findStart(tokens, start); j < tokens.length; ++j) {
            if (tokens[j].end > end)
                break; // tokens are ordered; the rest lie beyond the region
            else if (tokens[j] instanceof Tag)
                Tagexp.canonicalizeTag(buf, (Tag) tokens[j], j);
        }

        String result = buf.toString();
        if (all)
            canonicalTags = result; // cache for next whole-page request
        return result;
    }
792:
793: public static void main(String[] args) throws Exception {
794: int method = Link.GET;
795:
796: for (int i = 0; i < args.length; ++i) {
797: if (args[i].equals("-post"))
798: method = Link.POST;
799: else if (args[i].equals("-get"))
800: method = Link.GET;
801: else {
802: Link link = method == Link.GET ? new Link(args[i])
803: : new Link(args[i]); // FIX: POST?
804: try {
805: Page p = new Page(link);
806: System.out.write(p.getContentBytes());
807: } catch (IOException e) {
808: System.out.println(e);
809: }
810: }
811: }
812: }
813:
814: }
|