001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.util.Enumeration;
036: import java.io.File;
037: import java.net.URL;
038: import java.net.MalformedURLException;
039: import rcm.util.Prioritized;
040:
041: /**
042: * Link to a Web page.
043: *
044: * @author Rob Miller
045: * @see Page
046: */
047: public class Link extends Element implements Prioritized {
048:
049: protected URL url;
050:
051: private String directory;
052: private String filename;
053: private String query;
054: private String ref;
055: private Page page;
056: private int depth;
057: private String text = ""; // stored text of link anchor
058: private int status = LinkEvent.NONE;
059: private float priority;
060: private DownloadParameters dp;
061:
062: // timeouts, etc. to use when downloading this link
063:
/**
 * Builds a link from the start and end tags of a link element.
 * The target URL is obtained from the start tag via
 * {@link #urlFromHref(Tag, URL)}, and the link's depth is one more than
 * the depth reported by the start tag's source.
 * The tags must be on the same page.
 *
 * @param startTag Start tag of element
 * @param endTag End tag of element
 * @param base Base URL used for relative references
 * @exception java.net.MalformedURLException if no URL can be built from
 *            the start tag and the base
 */
public Link(Tag startTag, Tag endTag, URL base) throws MalformedURLException {
    super(startTag, endTag);
    this.url = urlFromHref(startTag, base);
    this.depth = 1 + startTag.getSource().getDepth();
}
077:
/**
 * Builds a root link (depth 0) that points at the given URL.
 * Because there is no real element behind such a link, a placeholder
 * start tag on an empty page is used and the end tag is null.
 *
 * @param url URL the link points to
 */
public Link(URL url) {
    super(new Tag(new Page(""), 0, 0, "", true), null);
    this.depth = 0;
    this.url = url;
}
086:
/**
 * Builds a root link that points at a local file.
 * The file is converted with {@link #FileToURL(File)}.
 *
 * @param file file the link points to
 * @exception java.net.MalformedURLException if the file's path cannot be
 *            turned into a URL
 */
public Link(File file) throws MalformedURLException {
    this(Link.FileToURL(file));
}
093:
/**
 * Make a root link (depth 0) from a string URL.
 *
 * @param href absolute URL in string form, e.g. "http://www.cs.cmu.edu/"
 * @exception java.net.MalformedURLException if the URL is invalid
 */
public Link(String href) throws MalformedURLException {
    // Link(URL) already sets the depth to 0, so nothing more to do here.
    this(new URL(href));
}
102:
103: /**
104: * Eliminate all references to page content.
105: */
106: public void discardContent() {
107: parent = null;
108: child = null;
109: sibling = null;
110: }
111:
112: /**
113: * Disconnect this link from its downloaded page (throwing away the page).
114: */
115: public void disconnect() {
116: page = null;
117: status = LinkEvent.NONE;
118: }
119:
120: /**
121: * Get depth of link in crawl.
122: * @return depth of link from root (depth of roots is 0)
123: */
124: public int getDepth() {
125: return depth;
126: }
127:
128: /**
129: * Get the URL.
130: * @return the URL of the link
131: */
132: public URL getURL() {
133: return url;
134: }
135:
136: /**
137: * Get the network protocol of the link, like "ftp" or "http".
138: * @return the protocol portion of the link's URL
139: */
140: public String getProtocol() {
141: return getURL().getProtocol();
142: }
143:
144: /**
145: * Get the hostname of the link, like "www.cs.cmu.edu".
146: * @return the hostname portion of the link's URL
147: */
148: public String getHost() {
149: return getURL().getHost();
150: }
151:
152: /**
153: * Get the port number of the link.
154: * @return the port number of the link's URL, or -1 if no port number
155: * is explicitly specified in the URL
156: */
157: public int getPort() {
158: return getURL().getPort();
159: }
160:
161: /**
162: * Get the filename part of the link, which includes the pathname
163: * and query but not the anchor reference.
164: * Equivalent to getURL().getFile().
165: * @return the filename portion of the link's URL
166: */
167: public String getFile() {
168: return getURL().getFile();
169: }
170:
171: /**
172: * Get the directory part of the link, like "/home/dir/".
173: * Always starts and ends with '/'.
174: * @return the directory portion of the link's URL
175: */
176: public String getDirectory() {
177: if (directory == null)
178: parseURL();
179: return directory;
180: }
181:
182: /**
183: * Get the filename part of the link, like "index.html".
184: * Never contains '/'; may be the empty string.
185: * @return the filename portion of the link's URL
186: */
187: public String getFilename() {
188: if (filename == null)
189: parseURL();
190: return filename;
191: }
192:
193: /**
194: * Get the query part of the link.
195: * Either starts with a '?', or is empty.
196: * @return the query portion of the link's URL
197: */
198: public String getQuery() {
199: if (query == null)
200: parseURL();
201: return query;
202: }
203:
204: /**
205: * Get the anchor reference of the link, like "#ref".
206: * Either starts with '#', or is empty.
207: * @return the anchor reference portion of the link's URL
208: */
209: public String getRef() {
210: if (ref == null)
211: parseURL();
212: return ref;
213: }
214:
215: /**
216: * Get the URL of a page, omitting any anchor reference (like #ref).
217: * @return the URL sans anchor reference
218: */
219: public URL getPageURL() {
220: return getPageURL(getURL());
221: }
222:
223: /**
224: * Get the URL of a page, omitting any anchor reference (like #ref).
225: * @return the URL sans anchor reference
226: */
227: public static URL getPageURL(URL url) {
228: String href = url.toExternalForm();
229: int i = href.indexOf('#');
230: try {
231: return (i != -1) ? new URL(href.substring(0, i)) : url;
232: } catch (MalformedURLException e) {
233: return url;
234: }
235: }
236:
237: /**
238: * Get the URL of a Web service, omitting any query or anchor reference.
239: * @return the URL sans query and anchor reference
240: */
241: public URL getServiceURL() {
242: return getServiceURL(getURL());
243: }
244:
245: /**
246: * Get the URL of a Web service, omitting any query or anchor reference.
247: * @return the URL sans query and anchor reference
248: */
249: public static URL getServiceURL(URL url) {
250: String href = url.toExternalForm();
251: int i = href.indexOf('?');
252: try {
253: return (i != -1 && url.getProtocol().equals("http")) ? new URL(
254: href.substring(0, i))
255: : getPageURL(url);
256: } catch (MalformedURLException e) {
257: return url;
258: }
259: }
260:
261: /**
262: * Get the URL of a page's directory.
263: * @return the URL sans filename, query and anchor reference
264: */
265: public URL getDirectoryURL() {
266: return getDirectoryURL(getURL());
267: }
268:
269: /**
270: * Get the URL of a page's directory.
271: * @return the URL sans filename, query and anchor reference
272: */
273: public static URL getDirectoryURL(URL url) {
274: String file = url.getFile();
275: int qmark = file.indexOf('?');
276: if (qmark == -1 || !url.getProtocol().equals("http"))
277: qmark = file.length();
278: // find pivotal separator (between directory and filename)
279: int pivot = file.lastIndexOf('/', Math.max(qmark - 1, 0));
280: try {
281: if (pivot == -1)
282: return new URL(url, "/");
283: else if (pivot == file.length() - 1)
284: return url;
285: else
286: return new URL(url, file.substring(0, pivot + 1));
287: } catch (MalformedURLException e) {
288: return url;
289: }
290: }
291:
292: /**
293: * Get the URL of a page's parent directory.
294: * @return the URL sans filename, query and anchor reference
295: */
296: public URL getParentURL() {
297: return getParentURL(getURL());
298: }
299:
300: /**
301: * Get the URL of a page's parent directory.
302: * @return the URL sans filename, query and anchor reference
303: */
304: public static URL getParentURL(URL url) {
305: URL dirURL = getDirectoryURL(url);
306: if (!dirURL.equals(url))
307: return dirURL;
308:
309: String dir = dirURL.getFile();
310: int lastSlash = dir.length() - 1;
311: if (lastSlash == 0)
312: return dirURL;
313:
314: int penultSlash = dir.lastIndexOf('/', lastSlash - 1);
315:
316: if (penultSlash == -1)
317: return dirURL;
318:
319: try {
320: return new URL(url, dir.substring(0, penultSlash + 1));
321: } catch (MalformedURLException e) {
322: return dirURL;
323: }
324: }
325:
326: // computes relative HREF for URL <there> when the current location
327: // is URL <here>
328: public static String relativeTo(URL here, URL there) {
329: if (here == null)
330: return there.toString();
331: //System.err.println ("From: " + here);
332: //System.err.println ("To: " + there);
333: if (here.getProtocol().equals(there.getProtocol())
334: && here.getHost().equals(there.getHost())
335: && here.getPort() == there.getPort()) {
336: String fn = relativeTo(here.getFile(), there.getFile());
337: String ref = there.getRef();
338: return (ref != null) ? fn + ref : fn;
339: } else {
340: //System.err.println ("Use: " + there);
341: return there.toString();
342: }
343: }
344:
345: // computes relative HREF for URL <there> when the current location
346: // is URL <here>
347: public static String relativeTo(URL here, String there) {
348: if (here == null)
349: return there;
350: try {
351: return relativeTo(here, new URL(here, there));
352: } catch (MalformedURLException e) {
353: return there;
354: }
355: }
356:
357: // computes relative HREF for filename <there> when the current location
358: // is filename <here>
359: private static String relativeTo(String here, String there) {
360: StringBuffer result = new StringBuffer();
361:
362: int lcp = 0;
363:
364: while (true) {
365: int i = here.indexOf('/', lcp);
366: int j = there.indexOf('/', lcp);
367:
368: if (i == -1 || i != j
369: || !here.regionMatches(lcp, there, lcp, i - lcp))
370: break;
371: lcp = i + 1;
372: }
373:
374: // assert: first lcp characters of here and there are identical
375: // and (lcp==0 or here[lcp-1] == '/')
376:
377: // here[0..lcp-1] is the common ancestor directory of here and there
378:
379: // count hops up from here to the common ancestor
380: for (int i = here.indexOf('/', lcp); i != -1; i = here.indexOf(
381: '/', i + 1)) {
382: result.append("..");
383: result.append('/');
384: }
385:
386: // append path down from common ancestor to there
387: result.append(there.substring(lcp));
388:
389: //System.out.println ("Use: " + result);
390: //System.out.println ();
391:
392: return result.toString();
393: }
394:
395: /**
396: * Convert a local filename to a URL.
397: * For example, if the filename is "C:\FOO\BAR\BAZ",
398: * the resulting URL is "file:/C:/FOO/BAR/BAZ".
399: * @param file File to convert
400: * @return URL corresponding to file
401: */
402: public static URL FileToURL(File file) throws MalformedURLException {
403: return new URL("file:"
404: + toURLDelimiters(file.getAbsolutePath()));
405: }
406:
407: /**
408: * Convert a file: URL to a filename appropriate to the
409: * current system platform. For example, on MS Windows,
410: * if the URL is "file:/FOO/BAR/BAZ", the resulting
411: * filename is "\FOO\BAR\BAZ".
412: * @param url URL to convert
413: * @return File corresponding to url
414: * @exception MalformedURLException if url is not a
415: * file: URL.
416: */
417: public static File URLToFile(URL url) throws MalformedURLException {
418: if (!url.getProtocol().equals("file"))
419: throw new MalformedURLException();
420:
421: String path = url.getFile();
422: path = path.replace('/', File.separatorChar);
423: // for MSWindows: change pathnames of the
424: // form /X:/ to X:/
425: if (path.length() > 3 && path.charAt(0) == File.separatorChar
426: && path.charAt(2) == ':'
427: && path.charAt(3) == File.separatorChar)
428: path = path.substring(1);
429:
430: return new File(path);
431: }
432:
433: public static String toURLDelimiters(String path) {
434: path = path.replace('\\', '/');
435: if (!path.startsWith("/"))
436: path = "/" + path;
437: return path;
438: }
439:
440: /**
441: * Get the downloaded page to which the link points.
442: * @return the Page object, or null if the page hasn't been downloaded.
443: */
444: public Page getPage() {
445: return page;
446: }
447:
448: /**
449: * Set the page corresponding to this link.
450: * @param page Page to which this link points
451: */
452: public void setPage(Page page) {
453: this .page = page;
454: }
455:
456: /**
457: * Use the HTTP GET method to download this link.
458: */
459: public static final int GET = 0;
460: /**
461: * Use the HTTP POST method to access this link.
462: */
463: public static final int POST = 1;
464:
465: /**
466: * Get the method used to access this link.
467: * @return GET or POST.
468: */
469: public int getMethod() {
470: return GET;
471: }
472:
473: /**
474: * Convert the link's URL to a String
475: * @return the URL represented as a string
476: */
477: public String toURL() {
478: return getURL().toExternalForm();
479: }
480:
481: /**
482: * Generate a human-readable description of the link.
483: * @return a description of the link, in the form "[url]".
484: */
485: public String toDescription() {
486: return (text.length() > 0 ? text + " " : "") + "[" + getURL()
487: + "]";
488: }
489:
490: /**
491: * Convert the region to tagless text.
492: * @return a string consisting of the text in the page contained by this region
493: */
494: public String toText() {
495: return text;
496: }
497:
498: /**
499: * Set the tagless-text representation of this region.
500: * @param text a string consisting of the text in the page contained by this region
501: */
502: public void setText(String text) {
503: this .text = text;
504: }
505:
506: private void parseURL() {
507: String protocol = getProtocol();
508: String file = getFile();
509:
510: int qmark = file.indexOf('?');
511: if (qmark == -1 || !protocol.equals("http")) {
512: query = "";
513: qmark = file.length();
514: } else {
515: query = file.substring(qmark + 1);
516: file = file.substring(0, qmark);
517: }
518:
519: int slash = file.lastIndexOf('/', Math.max(qmark - 1, 0));
520: if (slash == -1) {
521: directory = "";
522: filename = file;
523: } else {
524: directory = file.substring(0, slash + 1);
525: filename = file.substring(slash + 1);
526: }
527:
528: ref = getURL().getRef();
529: if (ref == null)
530: ref = "";
531: }
532:
533: /**
534: * Construct the URL for a link element, from its start tag and a base URL (for relative references).
535: * @param tag Start tag of link, such as <A HREF="/foo/index.html">.
536: * @param base Base URL used for relative references
537: * @return URL to which the link points
538: */
539: protected URL urlFromHref(Tag tag, URL base)
540: throws MalformedURLException {
541: // element is a link -- make an instance of Link.
542: String hrefAttr = getHrefAttributeName(tag);
543: String href = tag.getHTMLAttribute(hrefAttr);
544: if (tag.tagName == Tag.APPLET) {
545: String codebase = tag.getHTMLAttribute("codebase");
546: if (codebase != null)
547: base = new URL(base, codebase);
548: }
549: return new URL(base, href);
550: }
551:
552: /**
553: * Copy the link's start tag, replacing the URL. Note that the name of the attribute containing the URL
554: * varies from tag to tag: sometimes it is called HREF, sometimes SRC, sometimes CODE, etc.
555: * This method changes the appropriate attribute for this tag.
556: * @param newHref New URL or relative reference; e.g. "http://www.cs.cmu.edu/" or "/foo/index.html".
557: * @return copy of this link's start tag with its URL attribute replaced. The copy is
558: * a region of a fresh page containing only the tag.
559: */
560: public Tag replaceHref(String newHref) {
561: Tag tag = startTag;
562:
563: if (tag.getTagName() == Tag.APPLET) {
564: int i = newHref.lastIndexOf('/');
565: if (i != -1) {
566: tag = startTag.replaceHTMLAttribute("codebase", newHref
567: .substring(0, i + 1));
568: newHref = newHref.substring(i + 1);
569: }
570: }
571: String hrefAttrName = getHrefAttributeName(tag);
572: if (hrefAttrName == null)
573: return tag;
574: return tag.replaceHTMLAttribute(hrefAttrName, newHref);
575: }
576:
577: private static String getHrefAttributeName(Tag tag) {
578: return (String) HTMLParser.linktag.get(tag.getTagName());
579: }
580:
581: /**
582: * Get the status of the link. Possible values are defined in LinkEvent.
583: * @return last event that happened to this link
584: */
585: public int getStatus() {
586: return status;
587: }
588:
589: /**
590: * Set the status of the link. Possible values are defined in LinkEvent.
591: * @param event the event that just happened to this link
592: */
593: public void setStatus(int event) {
594: status = event;
595: }
596:
597: /**
598: * Get the priority of the link in the crawl.
599: */
600: public float getPriority() {
601: return priority;
602: }
603:
604: /**
605: * Set the priority of the link in the crawl.
606: */
607: public void setPriority(float priority) {
608: this .priority = priority;
609: }
610:
611: /**
612: * Get the download parameters used for this link. Default is null.
613: */
614: public DownloadParameters getDownloadParameters() {
615: return dp;
616: }
617:
618: /**
619: * Set the download parameters used for this link.
620: */
621: public void setDownloadParameters(DownloadParameters dp) {
622: this .dp = dp;
623: }
624:
625: /*
626: * Testing
627: *
628:
629: public static void main (String[] args) throws Exception {
630: if (args[0].equals ("file"))
631: System.out.println (Link.FileToURL (new File (args[1])));
632: else if (args[0].equals ("url"))
633: System.out.println (Link.URLToFile (new URL (args[1])));
634: }
635: *
636: *
637: */
638: }
|