001: /* Link
002: *
003: * $Id: Link.java 4667 2006-09-26 20:38:48Z paul_jack $
004: *
005: * Created on Mar 7, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.extractor;
026:
027: import java.io.Serializable;
028:
/**
 * Link represents one discovered "edge" of the web graph: the source
 * URI, the destination URI, and the type of reference (represented by the
 * context in which it was found).
 *
 * As such, it is a suitably generic item to be returned from generic
 * link-extraction utility code.
 *
 * Instances hold the four values handed to the constructor and never
 * reassign them afterwards.
 *
 * @author gojomo
 */
public class Link implements Serializable {

    private static final long serialVersionUID = 7660959085498739376L;

    /* contexts for when another syntax (XPath-like or header-based)
     * is unavailable */
    // NOTE: the intern() calls are kept as in the original declarations; a
    // method call is not a constant expression, so these values are
    // initialised at class-load time rather than being compile-time constants.
    /** stand-in value for embeds without other context */
    public static final String EMBED_MISC = "=EMBED_MISC".intern();
    /** stand-in value for js-discovered urls without other context */
    public static final String JS_MISC = "=JS_MISC".intern();
    /** stand-in value for navlink urls without other context */
    public static final String NAVLINK_MISC = "=NAVLINK_MISC".intern();
    /** stand-in value for speculative/aggressively extracted urls without other context */
    public static final String SPECULATIVE_MISC = "=SPECULATIVE_MISC"
            .intern();
    /** stand-in value for prerequisite without other context */
    public static final String PREREQ_MISC = "=PREREQ_MISC".intern();

    /* hop types */
    /** navigation links, like A/@HREF */
    public static final char NAVLINK_HOP = 'L'; // TODO: change to 'N' to avoid 'L'ink confusion?
    /** implied prerequisite links, like dns or robots */
    public static final char PREREQ_HOP = 'P';
    /** embedded links necessary to render the page, like IMG/@SRC */
    public static final char EMBED_HOP = 'E';
    /** speculative/aggressively extracted links, perhaps embed or nav, as in javascript */
    public static final char SPECULATIVE_HOP = 'X';
    /** referral/redirect links, like header 'Location:' on a 301/302 response */
    public static final char REFER_HOP = 'R';

    /** URI where this Link was discovered */
    private final CharSequence source;
    /** URI (absolute) where this Link points */
    private final CharSequence destination;
    /** context of discovery -- will be an XPath-like element[/@attribute]
     * fragment for HTML URIs, a header name with trailing ':' for header
     * values, or one of the stand-in constants when other context is
     * unavailable */
    private final CharSequence context;
    /** hop-type, as character abbreviation */
    private final char hopType;

    /**
     * Create a Link with the given fields. The arguments are stored as
     * passed; no copying or validation is performed.
     *
     * @param source URI where the link was discovered
     * @param destination URI the link points to
     * @param context context of discovery (see {@link #elementContext} and
     *        the <code>*_MISC</code> stand-in constants)
     * @param hopType hop-type character, e.g. one of the
     *        <code>*_HOP</code> constants
     */
    public Link(CharSequence source, CharSequence destination,
            CharSequence context, char hopType) {
        this.source = source;
        this.destination = destination;
        this.context = context;
        this.hopType = hopType;
    }

    /**
     * @return the context in which this link was discovered, as given to
     *         the constructor.
     */
    public CharSequence getContext() {
        return context;
    }

    /**
     * @return the URI this link points to, as given to the constructor.
     */
    public CharSequence getDestination() {
        return destination;
    }

    /**
     * @return the URI where this link was discovered, as given to the
     *         constructor.
     */
    public CharSequence getSource() {
        return source;
    }

    /**
     * @return the hop-type character given to the constructor.
     */
    public char getHopType() {
        return hopType;
    }

    /**
     * Create a suitable XPath-like context from an element name and optional
     * attribute name.
     *
     * With an attribute the result is {@code element/@attribute}. Without
     * one (attribute is null) the result is the element name alone, so the
     * element is not lost from the context; if the element is null as well
     * the result is the empty string.
     *
     * @param element element name; expected to be non-null
     * @param attribute attribute name, or null if there is none
     * @return CharSequence context
     */
    public static CharSequence elementContext(CharSequence element,
            CharSequence attribute) {
        if (attribute == null) {
            // The attribute is optional: fall back to the bare element name
            // instead of discarding the element altogether.
            return element == null ? "" : element.toString();
        }
        return element + "/@" + attribute;
    }

    /**
     * @return the destination, hop-type character and context, separated by
     *         single spaces.
     */
    @Override
    public String toString() {
        return this.destination + " " + this.hopType + " " + this.context;
    }
}
|