001: /* SURT
002: *
003: * $Id: SURT.java 4919 2007-02-20 23:25:20Z gojomo $
004: *
005: * Created on Jul 16, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.util;
026:
027: import java.io.BufferedInputStream;
028: import java.io.BufferedOutputStream;
029: import java.io.BufferedReader;
030: import java.io.FileInputStream;
031: import java.io.FileOutputStream;
032: import java.io.IOException;
033: import java.io.InputStream;
034: import java.io.InputStreamReader;
035: import java.io.PrintStream;
036: import java.util.regex.Matcher;
037:
038: /**
039: * Sort-friendly URI Reordering Transform.
040: *
041: * Converts URIs of the form:
042: *
043: * scheme://userinfo@domain.tld:port/path?query#fragment
044: *
045: * ...into...
046: *
047: * scheme://(tld,domain,:port@userinfo)/path?query#fragment
048: *
049: * The '(' ')' characters serve as an unambiguous notice that the so-called
050: * 'authority' portion of the URI ([userinfo@]host[:port] in http URIs) has
051: * been transformed; the commas prevent confusion with regular hostnames.
052: *
053: * This remedies the 'problem' with standard URIs that the host portion of a
054: * regular URI, with its dotted-domains, is actually in reverse order from
055: * the natural hierarchy that's usually helpful for grouping and sorting.
056: *
057: * The value of respecting URI case variance is considered negligible: it
058: * is vanishingly rare for case-variance to be meaningful, while URI case-
059: * variance often arises from people's confusion or sloppiness, and they
060: * only correct it insofar as necessary to avoid blatant problems. Thus
061: * the usual SURT form is considered to be flattened to all lowercase, and
062: * not completely reversible.
063: *
064: * @author gojomo
065: */
066: public class SURT {
067: static char DOT = '.';
068: static String BEGIN_TRANSFORMED_AUTHORITY = "(";
069: static String TRANSFORMED_HOST_DELIM = ",";
070: static String END_TRANSFORMED_AUTHORITY = ")";
071:
072: // 1: scheme://
073: // 2: userinfo (if present)
074: // 3: @ (if present)
075: // 4: dotted-quad host
076: // 5: other host
077: // 6: :port
078: // 7: path
079: static String URI_SPLITTER = "^(\\w+://)(?:([-\\w\\.!~\\*'\\(\\)%;:&=+$,]+?)(@))?"
080: +
081: // 1 2 3
082: "(?:((?:\\d{1,3}\\.){3}\\d{1,3})|(\\S+?))(:\\d+)?(/\\S*)?$";
083:
084: // 4 5 6 7
085:
086: // RFC2396
087: // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
088: // "$" | ","
089: // unreserved = alphanum | mark
090: // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
091: // userinfo = *( unreserved | escaped |
092: // ";" | ":" | "&" | "=" | "+" | "$" | "," )
093: // escaped = "%" hex hex
094:
095: /**
096: * Utility method for creating the SURT form of the URI in the
097: * given String.
098: *
099: * By default, does not preserve casing.
100: *
101: * @param s String URI to be converted to SURT form
102: * @return SURT form
103: */
104: public static String fromURI(String s) {
105: return fromURI(s, false);
106: }
107:
108: /**
109: * Utility method for creating the SURT form of the URI in the
110: * given String.
111: *
112: * If it appears a bit convoluted in its approach, note that it was
113: * optimized to minimize object-creation after allocation-sites profiling
114: * indicated this method was a top source of garbage in long-running crawls.
115: *
116: * Assumes that the String URI has already been cleaned/fixed (eg
117: * by UURI fixup) in ways that put it in its crawlable form for
118: * evaluation.
119: *
120: * @param s String URI to be converted to SURT form
121: * @param preserveCase whether original case should be preserved
122: * @return SURT form
123: */
124: public static String fromURI(String s, boolean preserveCase) {
125: Matcher m = TextUtils.getMatcher(URI_SPLITTER, s);
126: if (!m.matches()) {
127: // not an authority-based URI scheme; return unchanged
128: TextUtils.recycleMatcher(m);
129: return s;
130: }
131: // preallocate enough space for SURT form, which includes
132: // 3 extra characters ('(', ')', and one more ',' than '.'s
133: // in original)
134: StringBuffer builder = new StringBuffer(s.length() + 3);
135: append(builder, s, m.start(1), m.end(1)); // scheme://
136: builder.append(BEGIN_TRANSFORMED_AUTHORITY); // '('
137:
138: if (m.start(4) > -1) {
139: // dotted-quad ip match: don't reverse
140: append(builder, s, m.start(4), m.end(4));
141: } else {
142: // other hostname match: do reverse
143: int hostSegEnd = m.end(5);
144: int hostStart = m.start(5);
145: for (int i = m.end(5) - 1; i >= hostStart; i--) {
146: if (s.charAt(i - 1) != DOT && i > hostStart) {
147: continue;
148: }
149: append(builder, s, i, hostSegEnd); // rev host segment
150: builder.append(TRANSFORMED_HOST_DELIM); // ','
151: hostSegEnd = i - 1;
152: }
153: }
154:
155: append(builder, s, m.start(6), m.end(6)); // :port
156: append(builder, s, m.start(3), m.end(3)); // at
157: append(builder, s, m.start(2), m.end(2)); // userinfo
158: builder.append(END_TRANSFORMED_AUTHORITY); // ')'
159: append(builder, s, m.start(7), m.end(7)); // path
160: if (!preserveCase) {
161: for (int i = 0; i < builder.length(); i++) {
162: builder.setCharAt(i, Character.toLowerCase(builder
163: .charAt((i))));
164: }
165: }
166: TextUtils.recycleMatcher(m);
167: return builder.toString();
168: }
169:
170: private static void append(StringBuffer b, CharSequence cs,
171: int start, int end) {
172: if (start < 0) {
173: return;
174: }
175: b.append(cs, start, end);
176: }
177:
178: /**
179: * Allow class to be used as a command-line tool for converting
180: * URL lists (or naked host or host/path fragments implied
181: * to be HTTP URLs) to SURT form. Lines that cannot be converted
182: * are returned unchanged.
183: *
184: *
185: * Read from stdin or first file argument. Writes to stdout or
186: * second argument filename
187: *
188: * @param args cmd-line arguments
189: * @throws IOException
190: */
191: public static void main(String[] args) throws IOException {
192: InputStream in = args.length > 0 ? new BufferedInputStream(
193: new FileInputStream(args[0])) : System.in;
194: PrintStream out = args.length > 1 ? new PrintStream(
195: new BufferedOutputStream(new FileOutputStream(args[1])))
196: : System.out;
197: BufferedReader br = new BufferedReader(
198: new InputStreamReader(in));
199: String line;
200: while ((line = br.readLine()) != null) {
201: if (line.indexOf("#") > 0)
202: line = line.substring(0, line.indexOf("#"));
203: line = line.trim();
204: if (line.length() == 0)
205: continue;
206: line = ArchiveUtils.addImpliedHttpIfNecessary(line);
207: out.println(SURT.fromURI(line));
208: }
209: br.close();
210: out.close();
211: }
212: }
|