001: /* SURTPrefixSet
002: *
003: * $Id: SurtPrefixSet.java 4644 2006-09-20 22:40:21Z paul_jack $
004: *
005: * Created on Jul 23, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.util;
026:
027: import java.io.BufferedInputStream;
028: import java.io.BufferedOutputStream;
029: import java.io.BufferedReader;
030: import java.io.FileInputStream;
031: import java.io.FileOutputStream;
032: import java.io.FileWriter;
033: import java.io.IOException;
034: import java.io.InputStream;
035: import java.io.InputStreamReader;
036: import java.io.PrintStream;
037: import java.io.Reader;
038: import java.util.Iterator;
039: import java.util.SortedSet;
040: import java.util.TreeSet;
041:
042: import org.apache.commons.httpclient.URIException;
043: import org.archive.net.UURI;
044: import org.archive.net.UURIFactory;
045: import org.archive.util.iterator.LineReadingIterator;
046: import org.archive.util.iterator.RegexpLineIterator;
047:
048: /**
049: * Specialized TreeSet for keeping a set of String prefixes.
050: *
051: * Redundant prefixes (those that are themselves prefixed
052: * by other set entries) are eliminated.
053: *
054: * @author gojomo
055: */
056: public class SurtPrefixSet extends TreeSet<String> {
057:
058: private static final long serialVersionUID = 2598365040524933110L;
059:
060: private static final String SURT_PREFIX_DIRECTIVE = "+";
061:
062: /**
063: * Test whether the given String is prefixed by one
064: * of this set's entries.
065: *
066: * @param s
067: * @return True if contains prefix.
068: */
069: public boolean containsPrefixOf(String s) {
070: SortedSet sub = headSet(s);
071: // because redundant prefixes have been eliminated,
072: // only a test against last item in headSet is necessary
073: if (!sub.isEmpty() && s.startsWith((String) sub.last())) {
074: return true; // prefix substring exists
075: } // else: might still exist exactly (headSet does not contain boundary)
076: return contains(s); // exact string exists, or no prefix is there
077: }
078:
079: /**
080: * Maintains additional invariant: if one entry is a
081: * prefix of another, keep only the prefix.
082: *
083: * @see java.util.Collection#add(java.lang.Object)
084: */
085: public boolean add(String s) {
086: SortedSet sub = headSet(s);
087: if (!sub.isEmpty() && s.startsWith((String) sub.last())) {
088: // no need to add; prefix is already present
089: return false;
090: }
091: boolean retVal = super .add(s);
092: sub = tailSet(s + "\0");
093: while (!sub.isEmpty() && ((String) sub.first()).startsWith(s)) {
094: // remove redundant entries
095: sub.remove(sub.first());
096: }
097: return retVal;
098: }
099:
100: /**
101: * Read a set of SURT prefixes from a reader source; keep sorted and
102: * with redundant entries removed.
103: *
104: * @param r reader over file of SURT_format strings
105: * @throws IOException
106: */
107: public void importFrom(Reader r) {
108: BufferedReader reader = new BufferedReader(r);
109: String s;
110:
111: Iterator iter = new RegexpLineIterator(
112: new LineReadingIterator(reader),
113: RegexpLineIterator.COMMENT_LINE,
114: RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
115: RegexpLineIterator.ENTRY);
116:
117: while (iter.hasNext()) {
118: s = (String) iter.next();
119: add(s.toLowerCase());
120: }
121: }
122:
123: /**
124: * @param r Where to read from.
125: */
126: public void importFromUris(Reader r) {
127: BufferedReader reader = new BufferedReader(r);
128: String s;
129:
130: Iterator iter = new RegexpLineIterator(
131: new LineReadingIterator(reader),
132: RegexpLineIterator.COMMENT_LINE,
133: RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
134: RegexpLineIterator.ENTRY);
135:
136: while (iter.hasNext()) {
137: s = (String) iter.next();
138: // s is a URI (or even fragmentary hostname), not a SURT
139: addFromPlain(s);
140: }
141: }
142:
143: /**
144: * Import SURT prefixes from a reader with mixed URI and SURT prefix
145: * format.
146: *
147: * @param r the reader to import the prefixes from
148: * @param deduceFromSeeds true to also import SURT prefixes implied
149: * from normal URIs/hostname seeds
150: */
151: public void importFromMixed(Reader r, boolean deduceFromSeeds) {
152: BufferedReader reader = new BufferedReader(r);
153: String s;
154:
155: Iterator iter = new RegexpLineIterator(
156: new LineReadingIterator(reader),
157: RegexpLineIterator.COMMENT_LINE,
158: RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
159: RegexpLineIterator.ENTRY);
160:
161: while (iter.hasNext()) {
162: s = (String) iter.next();
163: if (s.startsWith(SURT_PREFIX_DIRECTIVE)) {
164: // it's specifically a SURT prefix line
165: String u = s.substring(SURT_PREFIX_DIRECTIVE.length())
166: .trim();
167: if (u.indexOf("(") > 0) {
168: // formal SURT prefix; toLowerCase just in case
169: add(u.toLowerCase());
170: } else {
171: // hostname/normal form URI from which
172: // to deduce SURT prefix
173: addFromPlain(u);
174: }
175:
176: continue;
177: } else {
178: if (deduceFromSeeds) {
179: // also deducing 'implied' SURT prefixes
180: // from normal URIs/hostname seeds
181: addFromPlain(s);
182: }
183: }
184: }
185: }
186:
187: /**
188: * Given a plain URI or hostname, deduce an implied SURT prefix from
189: * it and add to active prefixes.
190: *
191: * @param u String of URI or hostname
192: */
193: private void addFromPlain(String u) {
194: u = prefixFromPlain(u);
195: add(u);
196: }
197:
198: /**
199: * Given a plain URI or hostname/hostname+path, deduce an implied SURT
200: * prefix from it. Results may be unpredictable on strings that cannot
201: * be interpreted as URIs.
202: *
203: * UURI 'fixup' is applied to the URI that is built.
204: *
205: * @param u URI or almost-URI to consider
206: * @return implied SURT prefix form
207: */
208: public static String prefixFromPlain(String u) {
209: u = ArchiveUtils.addImpliedHttpIfNecessary(u);
210: u = coerceFromHttpsForComparison(u);
211: boolean trailingSlash = u.endsWith("/");
212: // ensure all typical UURI cleanup (incl. IDN-punycoding) is done
213: try {
214: u = UURIFactory.getInstance(u).toString();
215: } catch (URIException e) {
216: e.printStackTrace();
217: // allow to continue with original string uri
218: }
219: // except: don't let UURI-fixup add a trailing slash
220: // if it wasn't already there (presence or absence of
221: // such slash has special meaning specifying implied
222: // SURT prefixes)
223: if (!trailingSlash && u.endsWith("/")) {
224: u = u.substring(0, u.length() - 1);
225: }
226: // convert to full SURT
227: u = SURT.fromURI(u);
228: // truncate to implied prefix
229: u = SurtPrefixSet.asPrefix(u);
230: return u;
231: }
232:
233: /**
234: * For SURT comparisons -- prefixes or candidates being checked against
235: * those prefixes -- we treat https URIs as if they were http.
236: *
237: * @param u string to coerce if it has https scheme
238: * @return string converted to http scheme, or original if not necessary
239: */
240: private static String coerceFromHttpsForComparison(String u) {
241: if (u.startsWith("https://")) {
242: u = "http" + u.substring("https".length());
243: }
244: return u;
245: }
246:
247: /**
248: * Utility method for truncating a SURT that came from a
249: * full URI (as a seed, for example) into a prefix
250: * for determining inclusion.
251: *
252: * This involves:
253: * <pre>
254: * (1) removing the last path component, if any
255: * (anything after the last '/', if there are
256: * at least 3 '/'s)
257: * (2) removing a trailing ')', if present, opening
258: * the possibility of proper subdomains. (This
259: * means that the presence or absence of a
260: * trailing '/' after a hostname in a seed list
261: * is significant for the how the SURT prefix is
262: * created, even though it is not signficant for
263: * the URI's treatment as a seed.)
264: * </pre>
265: *
266: * @param s String to work on.
267: * @return As prefix.
268: */
269: private static String asPrefix(String s) {
270: // Strip last path-segment, if more than 3 slashes
271: s = s.replaceAll("^(.*//.*/)[^/]*", "$1");
272: // Strip trailing ")", if present and NO path (no 3rd slash).
273: if (!s.endsWith("/")) {
274: s = s.replaceAll("^(.*)\\)", "$1");
275: }
276: return s;
277: }
278:
279: /**
280: * Calculate the SURT form URI to use as a candidate against prefixes
281: * from the given Object (CandidateURI or UURI)
282: *
283: * @param object CandidateURI or UURI
284: * @return SURT form of URI for evaluation, or null if unavailable
285: */
286: public static String getCandidateSurt(Object object) {
287: UURI u = UURI.from(object);
288: if (u == null) {
289: return null;
290: }
291: String candidateSurt = u.getSurtForm();
292: // also want to treat https as http
293: candidateSurt = coerceFromHttpsForComparison(candidateSurt);
294: return candidateSurt;
295: }
296:
297: /**
298: * @param fw
299: * @throws IOException
300: */
301: public void exportTo(FileWriter fw) throws IOException {
302: Iterator iter = this .iterator();
303: while (iter.hasNext()) {
304: fw.write((String) iter.next() + "\n");
305: }
306: }
307:
308: /**
309: * Changes all prefixes so that they enforce an exact host. For
310: * prefixes that already include a ')', this means discarding
311: * anything after ')' (path info). For prefixes that don't include
312: * a ')' -- domain prefixes open to subdomains -- add the closing
313: * ')' (or ",)").
314: */
315: public void convertAllPrefixesToHosts() {
316: SurtPrefixSet iterCopy = (SurtPrefixSet) this .clone();
317: Iterator iter = iterCopy.iterator();
318: while (iter.hasNext()) {
319: String prefix = (String) iter.next();
320: String convPrefix = convertPrefixToHost(prefix);
321: if (prefix != convPrefix) {
322: // if returned value not unchanged, update set
323: this .remove(prefix);
324: this .add(convPrefix);
325: }
326: }
327: }
328:
329: public static String convertPrefixToHost(String prefix) {
330: if (prefix.endsWith(")")) {
331: return prefix; // no change necessary
332: }
333: if (prefix.indexOf(')') < 0) {
334: // open-ended domain prefix
335: if (!prefix.endsWith(",")) {
336: prefix += ",";
337: }
338: prefix += ")";
339: } else {
340: // prefix with excess path-info
341: prefix = prefix.substring(0, prefix.indexOf(')') + 1);
342: }
343: return prefix;
344: }
345:
346: /**
347: * Changes all prefixes so that they only enforce a general
348: * domain (allowing subdomains).For prefixes that don't include
349: * a ')', no change is necessary. For others, truncate everything
350: * from the ')' onward. Additionally, truncate off "www," if it
351: * appears.
352: */
353: public void convertAllPrefixesToDomains() {
354: SurtPrefixSet iterCopy = (SurtPrefixSet) this .clone();
355: Iterator iter = iterCopy.iterator();
356: while (iter.hasNext()) {
357: String prefix = (String) iter.next();
358: String convPrefix = convertPrefixToDomain(prefix);
359: if (prefix != convPrefix) {
360: // if returned value not unchanged, update set
361: this .remove(prefix);
362: this .add(convPrefix);
363: }
364: }
365: }
366:
367: public static String convertPrefixToDomain(String prefix) {
368: if (prefix.indexOf(')') >= 0) {
369: prefix = prefix.substring(0, prefix.indexOf(')'));
370: }
371: // strip 'www,' when present
372: if (prefix.endsWith("www,")) {
373: prefix = prefix.substring(0, prefix.length() - 4);
374: }
375: return prefix;
376: }
377:
378: /**
379: * Allow class to be used as a command-line tool for converting
380: * URL lists (or naked host or host/path fragments implied
381: * to be HTTP URLs) to implied SURT prefix form.
382: *
383: * Read from stdin or first file argument. Writes to stdout.
384: *
385: * @param args cmd-line arguments: may include input file
386: * @throws IOException
387: */
388: public static void main(String[] args) throws IOException {
389: InputStream in = args.length > 0 ? new BufferedInputStream(
390: new FileInputStream(args[0])) : System.in;
391: PrintStream out = args.length > 1 ? new PrintStream(
392: new BufferedOutputStream(new FileOutputStream(args[1])))
393: : System.out;
394: BufferedReader br = new BufferedReader(
395: new InputStreamReader(in));
396: String line;
397: while ((line = br.readLine()) != null) {
398: if (line.indexOf("#") > 0)
399: line = line.substring(0, line.indexOf("#"));
400: line = line.trim();
401: if (line.length() == 0)
402: continue;
403: out.println(prefixFromPlain(line));
404: }
405: br.close();
406: out.close();
407: }
408: }
|