001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * CandidateURI.java
020: * Created on Sep 30, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.datamodel;
025:
026: import java.io.IOException;
027: import java.io.ObjectInputStream;
028: import java.io.ObjectOutputStream;
029: import java.io.PrintWriter;
030: import java.io.Serializable;
031: import java.util.ArrayList;
032: import java.util.Iterator;
033: import java.util.List;
034:
035: import org.apache.commons.httpclient.URIException;
036: import org.archive.crawler.extractor.Link;
037: import org.archive.net.UURI;
038: import org.archive.net.UURIFactory;
039: import org.archive.util.ArchiveUtils;
040: import org.archive.util.Reporter;
041:
042: import st.ata.util.AList;
043: import st.ata.util.HashtableAList;
044:
045: /**
046: * A URI, discovered or passed-in, that may be scheduled.
047: * When scheduled, a CandidateURI becomes a {@link CrawlURI}
048: * made with the data contained herein. A CandidateURI
049: * contains just the fields necessary to perform quick in-scope analysis.
050: *
051: * <p>Has a flexible attribute list that will be promoted into
052: * any {@link CrawlURI} created from this CandidateURI. Use it
053: * to add custom data or state needed later doing custom processing.
054: * See accessors/setters {@link #putString(String, String)},
055: * {@link #getString(String)}, etc.
056: *
057: * @author Gordon Mohr
058: */
059: public class CandidateURI implements Serializable, Reporter,
060: CoreAttributeConstants {
061: private static final long serialVersionUID = -7152937921526560388L;
062:
063: /** Highest scheduling priority.
064: * Before any others of its class.
065: */
066: public static final int HIGHEST = 0;
067:
068: /** High scheduling priority.
069: * After any {@link #HIGHEST}.
070: */
071: public static final int HIGH = 1;
072:
073: /** Medium priority.
074: * After any {@link #HIGH}.
075: */
076: public static final int MEDIUM = 2;
077:
078: /** Normal/low priority.
079: * Whenever/end of queue.
080: */
081: public static final int NORMAL = 3;
082:
083: private int schedulingDirective = NORMAL;
084:
085: /**
086: * Usuable URI under consideration. Transient to allow
087: * more efficient custom serialization
088: */
089: private transient UURI uuri;
090:
091: /** Seed status */
092: private boolean isSeed = false;
093:
094: private boolean forceRevisit = false; // even if already visited
095:
096: /** String of letters indicating how this URI was reached from a seed.
097: * <pre>
098: * P precondition
099: * R redirection
100: * E embedded (as frame, src, link, codebase, etc.)
101: * X speculative embed (as from javascript, some alternate-format extractors
102: * L link</pre>
103: * For example LLLE (an embedded image on a page 3 links from seed).
104: */
105: private String pathFromSeed;
106:
107: /**
108: * Where this URI was (presently) discovered. . Transient to allow
109: * more efficient custom serialization
110: */
111: private transient UURI via;
112:
113: /**
114: * Context of URI's discovery, as per the 'context' in Link
115: */
116: private CharSequence viaContext;
117:
118: /**
119: * Flexible dynamic attributes list.
120: * <p>
121: * The attribute list is a flexible map of key/value pairs for storing
122: * status of this URI for use by other processors. By convention the
123: * attribute list is keyed by constants found in the
124: * {@link CoreAttributeConstants} interface. Use this list to carry
125: * data or state produced by custom processors rather change the
126: * classes {@link CrawlURI} or this class, CandidateURI.
127: *
128: * Transient to allow more efficient custom serialization.
129: */
130: private transient AList alist;
131:
132: /**
133: * Cache of this candidate uuri as a string.
134: *
135: * Profiling shows us spending about 1-2% of total elapsed time in
136: * toString.
137: */
138: private String cachedCandidateURIString = null;
139:
140: /**
141: * Frontier/Scheduler lifecycle info.
142: * This is an identifier set by the Frontier for its
143: * purposes. Usually its the name of the Frontier queue
144: * this URI gets queued to. Values can be host + port
145: * or IP, etc.
146: */
147: private String classKey;
148:
149: /**
150: * Constructor.
151: * Protected access to block access to default constructor.
152: */
153: protected CandidateURI() {
154: super ();
155: }
156:
157: /**
158: * @param u uuri instance this CandidateURI wraps.
159: */
160: public CandidateURI(UURI u) {
161: this .uuri = u;
162: }
163:
164: /**
165: * @param u uuri instance this CandidateURI wraps.
166: * @param pathFromSeed
167: * @param via
168: * @param viaContext
169: */
170: public CandidateURI(UURI u, String pathFromSeed, UURI via,
171: CharSequence viaContext) {
172: this .uuri = u;
173: this .pathFromSeed = pathFromSeed;
174: this .via = via;
175: this .viaContext = viaContext;
176: }
177:
178: /**
179: * Set the <tt>isSeed</tt> attribute of this URI.
180: * @param b Is this URI a seed, true or false.
181: */
182: public void setIsSeed(boolean b) {
183: this .isSeed = b;
184: if (this .isSeed) {
185: if (pathFromSeed == null) {
186: this .pathFromSeed = "";
187: }
188: // seeds created on redirect must have a via to be recognized; don't clear
189: // setVia(null);
190: }
191: }
192:
193: /**
194: * @return UURI
195: */
196: public UURI getUURI() {
197: return this .uuri;
198: }
199:
200: /**
201: * @return Whether seeded.
202: */
203: public boolean isSeed() {
204: return this .isSeed;
205: }
206:
207: /**
208: * @return path (hop-types) from seed
209: */
210: public String getPathFromSeed() {
211: return this .pathFromSeed;
212: }
213:
214: /**
215: * @return URI via which this one was discovered
216: */
217: public UURI getVia() {
218: return this .via;
219: }
220:
221: /**
222: * @return CharSequence context in which this one was discovered
223: */
224: public CharSequence getViaContext() {
225: return this .viaContext;
226: }
227:
228: /**
229: * @param string
230: */
231: protected void setPathFromSeed(String string) {
232: pathFromSeed = string;
233: }
234:
235: /**
236: * Called when making a copy of another CandidateURI.
237: * @param alist AList to use.
238: */
239: protected void setAList(AList alist) {
240: this .alist = alist;
241: }
242:
243: public void setVia(UURI via) {
244: this .via = via;
245: }
246:
247: /**
248: * @return This candidate URI as a string wrapped with 'CandidateURI(' +
249: * ')'.
250: */
251: public synchronized String getCandidateURIString() {
252: if (this .cachedCandidateURIString == null) {
253: this .cachedCandidateURIString = "CandidateURI("
254: + toString() + ")";
255: }
256: return this .cachedCandidateURIString;
257: }
258:
259: /**
260: * Method returns string version of this URI's referral URI.
261: * @return String version of referral URI
262: */
263: public String flattenVia() {
264: return (via == null) ? "" : via.toString();
265: }
266:
267: /**
268: * @return The UURI this CandidateURI wraps as a string
269: * (We used return what {@link #getCandidateURIString()}
270: * returns on a toString -- use that method if you still need
271: * this functionality).
272: * @see #getCandidateURIString()
273: */
274: public String toString() {
275: return getURIString();
276: }
277:
278: /**
279: * @return URI String
280: * @deprecated Use {@link #toString()}.
281: */
282: public String getURIString() {
283: return getUURI().toString();
284: }
285:
286: /**
287: * Compares the domain of this CandidateURI with that of another
288: * CandidateURI
289: *
290: * @param other The other CandidateURI
291: *
292: * @return True if both are in the same domain, false otherwise.
293: * @throws URIException
294: */
295: public boolean sameDomainAs(CandidateURI other) throws URIException {
296: String domain = getUURI().getHost();
297: if (domain == null) {
298: return false;
299: }
300: while (domain.lastIndexOf('.') > domain.indexOf('.')) {
301: // While has more than one dot, lop off first segment
302: domain = domain.substring(domain.indexOf('.') + 1);
303: }
304: if (other.getUURI().getHost() == null) {
305: return false;
306: }
307: return other.getUURI().getHost().endsWith(domain);
308: }
309:
310: /**
311: * If this method returns true, this URI should be fetched even though
312: * it already has been crawled. This also implies
313: * that this URI will be scheduled for crawl before any other waiting
314: * URIs for the same host.
315: *
316: * This value is used to refetch any expired robots.txt or dns-lookups.
317: *
318: * @return true if crawling of this URI should be forced
319: */
320: public boolean forceFetch() {
321: return forceRevisit;
322: }
323:
324: /**
325: * Method to signal that this URI should be fetched even though
326: * it already has been crawled. Setting this to true also implies
327: * that this URI will be scheduled for crawl before any other waiting
328: * URIs for the same host.
329: *
330: * This value is used to refetch any expired robots.txt or dns-lookups.
331: *
332: * @param b set to true to enforce the crawling of this URI
333: */
334: public void setForceFetch(boolean b) {
335: forceRevisit = b;
336: }
337:
338: /**
339: * @return Returns the schedulingDirective.
340: */
341: public int getSchedulingDirective() {
342: return schedulingDirective;
343: }
344:
345: /**
346: * @param schedulingDirective The schedulingDirective to set.
347: */
348: public void setSchedulingDirective(int schedulingDirective) {
349: this .schedulingDirective = schedulingDirective;
350: }
351:
352: /**
353: * @return True if needs immediate scheduling.
354: */
355: public boolean needsImmediateScheduling() {
356: return schedulingDirective == HIGH;
357: }
358:
359: /**
360: * @return True if needs soon but not top scheduling.
361: */
362: public boolean needsSoonScheduling() {
363: return schedulingDirective == MEDIUM;
364: }
365:
366: /**
367: * Tally up the number of transitive (non-simple-link) hops at
368: * the end of this CandidateURI's pathFromSeed.
369: *
370: * In some cases, URIs with greater than zero but less than some
371: * threshold such hops are treated specially.
372: *
373: * <p>TODO: consider moving link-count in here as well, caching
374: * calculation, and refactoring CrawlScope.exceedsMaxHops() to use this.
375: *
376: * @return Transhop count.
377: */
378: public int getTransHops() {
379: String path = getPathFromSeed();
380: int transCount = 0;
381: for (int i = path.length() - 1; i >= 0; i--) {
382: if (path.charAt(i) == Link.NAVLINK_HOP) {
383: break;
384: }
385: transCount++;
386: }
387: return transCount;
388: }
389:
390: /**
391: * Given a string containing a URI, then optional whitespace
392: * delimited hops-path and via info, create a CandidateURI
393: * instance.
394: *
395: * @param uriHopsViaString String with a URI.
396: * @return A CandidateURI made from passed <code>uriHopsViaString</code>.
397: * @throws URIException
398: */
399: public static CandidateURI fromString(String uriHopsViaString)
400: throws URIException {
401: String args[] = uriHopsViaString.split("\\s+");
402: String pathFromSeeds = (args.length > 1 && !args[1].equals("-")) ? args[1]
403: : "";
404: UURI via = (args.length > 2 && !args[2].equals("-")) ? UURIFactory
405: .getInstance(args[2])
406: : null;
407: CharSequence viaContext = (args.length > 3 && !args[3]
408: .equals("-")) ? args[2] : null;
409: return new CandidateURI(UURIFactory.getInstance(args[0]),
410: pathFromSeeds, via, viaContext);
411: }
412:
413: public static CandidateURI createSeedCandidateURI(UURI uuri) {
414: CandidateURI c = new CandidateURI(uuri);
415: c.setIsSeed(true);
416: return c;
417: }
418:
419: /**
420: * Utility method for creation of CandidateURIs found extracting
421: * links from this CrawlURI.
422: * @param baseUURI BaseUURI for <code>link</code>.
423: * @param link Link to wrap CandidateURI in.
424: * @return New candidateURI wrapper around <code>link</code>.
425: * @throws URIException
426: */
427: public CandidateURI createCandidateURI(UURI baseUURI, Link link)
428: throws URIException {
429: UURI u = (link.getDestination() instanceof UURI) ? (UURI) link
430: .getDestination() : UURIFactory.getInstance(baseUURI,
431: link.getDestination().toString());
432: CandidateURI newCaURI = new CandidateURI(u, getPathFromSeed()
433: + link.getHopType(), getUURI(), link.getContext());
434: newCaURI.inheritFrom(this );
435: return newCaURI;
436: }
437:
438: /**
439: * Utility method for creation of CandidateURIs found extracting
440: * links from this CrawlURI.
441: * @param baseUURI BaseUURI for <code>link</code>.
442: * @param link Link to wrap CandidateURI in.
443: * @param scheduling How new CandidateURI should be scheduled.
444: * @param seed True if this CandidateURI is a seed.
445: * @return New candidateURI wrapper around <code>link</code>.
446: * @throws URIException
447: */
448: public CandidateURI createCandidateURI(UURI baseUURI, Link link,
449: int scheduling, boolean seed) throws URIException {
450: final CandidateURI caURI = createCandidateURI(baseUURI, link);
451: caURI.setSchedulingDirective(scheduling);
452: caURI.setIsSeed(seed);
453: return caURI;
454: }
455:
456: /**
457: * Inherit (copy) the relevant keys-values from the ancestor.
458: *
459: * @param ancestor
460: */
461: protected void inheritFrom(CandidateURI ancestor) {
462: List heritableKeys = (List) ancestor
463: .getObject(A_HERITABLE_KEYS);
464: if (heritableKeys != null) {
465: getAList().copyKeysFrom(heritableKeys.iterator(),
466: ancestor.getAList());
467: }
468: }
469:
470: /**
471: * Get the token (usually the hostname + port) which indicates
472: * what "class" this CrawlURI should be grouped with,
473: * for the purposes of ensuring only one item of the
474: * class is processed at once, all items of the class
475: * are held for a politeness period, etc.
476: *
477: * @return Token (usually the hostname) which indicates
478: * what "class" this CrawlURI should be grouped with.
479: */
480: public String getClassKey() {
481: return classKey;
482: }
483:
484: public void setClassKey(String key) {
485: classKey = key;
486: }
487:
488: /**
489: * Assumption is that only one thread at a time will ever be accessing
490: * a particular CandidateURI.
491: *
492: * @return the attribute list.
493: */
494: public AList getAList() {
495: if (this .alist == null) {
496: this .alist = new HashtableAList();
497: }
498: return this .alist;
499: }
500:
501: protected void clearAList() {
502: this .alist = null;
503: }
504:
505: public void putObject(String key, Object value) {
506: getAList().putObject(key, value);
507: }
508:
509: public Object getObject(String key) {
510: return getAList().getObject(key);
511: }
512:
513: public String getString(String key) {
514: return getAList().getString(key);
515: }
516:
517: public void putString(String key, String value) {
518: getAList().putString(key, value);
519: }
520:
521: public long getLong(String key) {
522: return getAList().getLong(key);
523: }
524:
525: public void putLong(String key, long value) {
526: getAList().putLong(key, value);
527: }
528:
529: public int getInt(String key) {
530: return getAList().getInt(key);
531: }
532:
533: public void putInt(String key, int value) {
534: getAList().putInt(key, value);
535: }
536:
537: public boolean containsKey(String key) {
538: return getAList().containsKey(key);
539: }
540:
541: public void remove(String key) {
542: getAList().remove(key);
543: }
544:
545: public Iterator keys() {
546: return getAList().getKeys();
547: }
548:
549: /**
550: * @return True if this CandidateURI was result of a redirect:
551: * i.e. Its parent URI redirected to here, this URI was what was in
552: * the 'Location:' or 'Content-Location:' HTTP Header.
553: */
554: public boolean isLocation() {
555: return this .pathFromSeed != null
556: && this .pathFromSeed.length() > 0
557: && this .pathFromSeed
558: .charAt(this .pathFromSeed.length() - 1) == Link.REFER_HOP;
559: }
560:
561: /**
562: * Custom serialization writing 'uuri' and 'via' as Strings, rather
563: * than the bloated full serialization of their object classes, and
564: * an empty alist as 'null'. Shrinks serialized form by 50% or more
565: * in short tests.
566: *
567: * @param stream
568: * @throws IOException
569: */
570: private void writeObject(ObjectOutputStream stream)
571: throws IOException {
572: stream.defaultWriteObject();
573: stream.writeUTF(uuri.toString());
574: stream.writeObject((via == null) ? null : via.getURI());
575: stream.writeObject((alist == null) ? null : alist);
576: }
577:
578: /**
579: * Custom deserialization to reconstruct UURI instances from more
580: * compact Strings.
581: *
582: * @param stream
583: * @throws IOException
584: * @throws ClassNotFoundException
585: */
586: private void readObject(ObjectInputStream stream)
587: throws IOException, ClassNotFoundException {
588: stream.defaultReadObject();
589: uuri = readUuri(stream.readUTF());
590: via = readUuri((String) stream.readObject());
591: alist = (AList) stream.readObject();
592: }
593:
594: /**
595: * Read a UURI from a String, handling a null or URIException
596: *
597: * @param u String or null from which to create UURI
598: * @return the best UURI instance creatable
599: */
600: protected UURI readUuri(String u) {
601: if (u == null) {
602: return null;
603: }
604: try {
605: return UURIFactory.getInstance(u);
606: } catch (URIException ux) {
607: // simply continue to next try
608: }
609: try {
610: // try adding an junk scheme
611: return UURIFactory.getInstance("invalid:" + u);
612: } catch (URIException ux) {
613: ux.printStackTrace();
614: // ignored; method continues
615: }
616: try {
617: // return total junk
618: return UURIFactory.getInstance("invalid:");
619: } catch (URIException e) {
620: e.printStackTrace();
621: return null;
622: }
623: }
624:
625: //
626: // Reporter implementation
627: //
628:
629: public String singleLineReport() {
630: return ArchiveUtils.singleLineReport(this );
631: }
632:
633: public void singleLineReportTo(PrintWriter w) {
634: String className = this .getClass().getName();
635: className = className.substring(className.lastIndexOf(".") + 1);
636: w.print(className);
637: w.print(" ");
638: w.print(getUURI().toString());
639: w.print(" ");
640: w.print(pathFromSeed);
641: w.print(" ");
642: w.print(flattenVia());
643: }
644:
645: /* (non-Javadoc)
646: * @see org.archive.util.Reporter#singleLineLegend()
647: */
648: public String singleLineLegend() {
649: return "className uri hopsPath viaUri";
650: }
651:
652: /* (non-Javadoc)
653: * @see org.archive.util.Reporter#getReports()
654: */
655: public String[] getReports() {
656: // none but default: empty options
657: return new String[] {};
658: }
659:
660: /* (non-Javadoc)
661: * @see org.archive.util.Reporter#reportTo(java.lang.String, java.io.Writer)
662: */
663: public void reportTo(String name, PrintWriter writer) {
664: singleLineReportTo(writer);
665: writer.print("\n");
666: }
667:
668: /* (non-Javadoc)
669: * @see org.archive.util.Reporter#reportTo(java.io.Writer)
670: */
671: public void reportTo(PrintWriter writer) throws IOException {
672: reportTo(null, writer);
673: }
674:
675: /** Make the given key 'heritable', meaning its value will be
676: * added to descendant CandidateURIs. Only keys with immutable
677: * values should be made heritable -- the value instance may
678: * be shared until the AList is serialized/deserialized.
679: *
680: * @param key to make heritable
681: */
682: public void makeHeritable(String key) {
683: @SuppressWarnings("unchecked")
684: List<String> heritableKeys = (List<String>) getObject(A_HERITABLE_KEYS);
685: if (heritableKeys == null) {
686: heritableKeys = new ArrayList<String>();
687: heritableKeys.add(A_HERITABLE_KEYS);
688: putObject(A_HERITABLE_KEYS, heritableKeys);
689: }
690: heritableKeys.add(key);
691: }
692:
693: /** Make the given key non-'heritable', meaning its value will
694: * not be added to descendant CandidateURIs. Only meaningful if
695: * key was previously made heritable.
696: *
697: * @param key to make non-heritable
698: */
699: public void makeNonHeritable(String key) {
700: List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
701: if (heritableKeys == null) {
702: return;
703: }
704: heritableKeys.remove(key);
705: if (heritableKeys.size() == 1) {
706: // only remaining heritable key is itself; disable completely
707: remove(A_HERITABLE_KEYS);
708: }
709: }
710: }
|