Source Code Cross Referenced for SurtPrefixSet.java in » Web-Crawler » heritrix » org » archive » util » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.util

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        /* SURTPrefixSet
002:         *
003:         * $Id: SurtPrefixSet.java 4644 2006-09-20 22:40:21Z paul_jack $
004:         *
005:         * Created on Jul 23, 2004
006:         *
007:         * Copyright (C) 2004 Internet Archive.
008:         *
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         *
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         *
016:         * Heritrix is distributed in the hope that it will be useful,
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         *
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         */
025:        package org.archive.util;
026:
027:        import java.io.BufferedInputStream;
028:        import java.io.BufferedOutputStream;
029:        import java.io.BufferedReader;
030:        import java.io.FileInputStream;
031:        import java.io.FileOutputStream;
032:        import java.io.FileWriter;
033:        import java.io.IOException;
034:        import java.io.InputStream;
035:        import java.io.InputStreamReader;
036:        import java.io.PrintStream;
037:        import java.io.Reader;
038:        import java.util.Iterator;
039:        import java.util.SortedSet;
040:        import java.util.TreeSet;
041:
042:        import org.apache.commons.httpclient.URIException;
043:        import org.archive.net.UURI;
044:        import org.archive.net.UURIFactory;
045:        import org.archive.util.iterator.LineReadingIterator;
046:        import org.archive.util.iterator.RegexpLineIterator;
047:
048:        /**
049:         * Specialized TreeSet for keeping a set of String prefixes. 
050:         * 
051:         * Redundant prefixes (those that are themselves prefixed
052:         * by other set entries) are eliminated.
053:         * 
054:         * @author gojomo
055:         */
056:        public class SurtPrefixSet extends TreeSet<String> {
057:
058:            private static final long serialVersionUID = 2598365040524933110L;
059:
060:            private static final String SURT_PREFIX_DIRECTIVE = "+";
061:
062:            /**
063:             * Test whether the given String is prefixed by one
064:             * of this set's entries. 
065:             * 
066:             * @param s
067:             * @return True if contains prefix.
068:             */
069:            public boolean containsPrefixOf(String s) {
070:                SortedSet sub = headSet(s);
071:                // because redundant prefixes have been eliminated,
072:                // only a test against last item in headSet is necessary
073:                if (!sub.isEmpty() && s.startsWith((String) sub.last())) {
074:                    return true; // prefix substring exists
075:                } // else: might still exist exactly (headSet does not contain boundary)
076:                return contains(s); // exact string exists, or no prefix is there
077:            }
078:
079:            /** 
080:             * Maintains additional invariant: if one entry is a 
081:             * prefix of another, keep only the prefix. 
082:             * 
083:             * @see java.util.Collection#add(java.lang.Object)
084:             */
085:            public boolean add(String s) {
086:                SortedSet sub = headSet(s);
087:                if (!sub.isEmpty() && s.startsWith((String) sub.last())) {
088:                    // no need to add; prefix is already present
089:                    return false;
090:                }
091:                boolean retVal = super .add(s);
092:                sub = tailSet(s + "\0");
093:                while (!sub.isEmpty() && ((String) sub.first()).startsWith(s)) {
094:                    // remove redundant entries
095:                    sub.remove(sub.first());
096:                }
097:                return retVal;
098:            }
099:
100:            /**
101:             * Read a set of SURT prefixes from a reader source; keep sorted and 
102:             * with redundant entries removed.
103:             * 
104:             * @param r reader over file of SURT_format strings
105:             * @throws IOException
106:             */
107:            public void importFrom(Reader r) {
108:                BufferedReader reader = new BufferedReader(r);
109:                String s;
110:
111:                Iterator iter = new RegexpLineIterator(
112:                        new LineReadingIterator(reader),
113:                        RegexpLineIterator.COMMENT_LINE,
114:                        RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
115:                        RegexpLineIterator.ENTRY);
116:
117:                while (iter.hasNext()) {
118:                    s = (String) iter.next();
119:                    add(s.toLowerCase());
120:                }
121:            }
122:
123:            /**
124:             * @param r Where to read from.
125:             */
126:            public void importFromUris(Reader r) {
127:                BufferedReader reader = new BufferedReader(r);
128:                String s;
129:
130:                Iterator iter = new RegexpLineIterator(
131:                        new LineReadingIterator(reader),
132:                        RegexpLineIterator.COMMENT_LINE,
133:                        RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
134:                        RegexpLineIterator.ENTRY);
135:
136:                while (iter.hasNext()) {
137:                    s = (String) iter.next();
138:                    // s is a URI (or even fragmentary hostname), not a SURT
139:                    addFromPlain(s);
140:                }
141:            }
142:
143:            /**
144:             * Import SURT prefixes from a reader with mixed URI and SURT prefix
145:             * format. 
146:             * 
147:             * @param r  the reader to import the prefixes from
148:             * @param deduceFromSeeds   true to also import SURT prefixes implied
149:             *                          from normal URIs/hostname seeds
150:             */
151:            public void importFromMixed(Reader r, boolean deduceFromSeeds) {
152:                BufferedReader reader = new BufferedReader(r);
153:                String s;
154:
155:                Iterator iter = new RegexpLineIterator(
156:                        new LineReadingIterator(reader),
157:                        RegexpLineIterator.COMMENT_LINE,
158:                        RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
159:                        RegexpLineIterator.ENTRY);
160:
161:                while (iter.hasNext()) {
162:                    s = (String) iter.next();
163:                    if (s.startsWith(SURT_PREFIX_DIRECTIVE)) {
164:                        // it's specifically a SURT prefix line
165:                        String u = s.substring(SURT_PREFIX_DIRECTIVE.length())
166:                                .trim();
167:                        if (u.indexOf("(") > 0) {
168:                            // formal SURT prefix; toLowerCase just in case
169:                            add(u.toLowerCase());
170:                        } else {
171:                            // hostname/normal form URI from which 
172:                            // to deduce SURT prefix
173:                            addFromPlain(u);
174:                        }
175:
176:                        continue;
177:                    } else {
178:                        if (deduceFromSeeds) {
179:                            // also deducing 'implied' SURT prefixes 
180:                            // from normal URIs/hostname seeds
181:                            addFromPlain(s);
182:                        }
183:                    }
184:                }
185:            }
186:
187:            /**
188:             * Given a plain URI or hostname, deduce an implied SURT prefix from
189:             * it and add to active prefixes. 
190:             * 
191:             * @param u String of URI or hostname
192:             */
193:            private void addFromPlain(String u) {
194:                u = prefixFromPlain(u);
195:                add(u);
196:            }
197:
198:            /**
199:             * Given a plain URI or hostname/hostname+path, deduce an implied SURT 
200:             * prefix from it. Results may be unpredictable on strings that cannot
201:             * be interpreted as URIs. 
202:             * 
203:             * UURI 'fixup' is applied to the URI that is built. 
204:             *
205:             * @param u URI or almost-URI to consider
206:             * @return implied SURT prefix form
207:             */
208:            public static String prefixFromPlain(String u) {
209:                u = ArchiveUtils.addImpliedHttpIfNecessary(u);
210:                u = coerceFromHttpsForComparison(u);
211:                boolean trailingSlash = u.endsWith("/");
212:                // ensure all typical UURI cleanup (incl. IDN-punycoding) is done
213:                try {
214:                    u = UURIFactory.getInstance(u).toString();
215:                } catch (URIException e) {
216:                    e.printStackTrace();
217:                    // allow to continue with original string uri
218:                }
219:                // except: don't let UURI-fixup add a trailing slash
220:                // if it wasn't already there (presence or absence of
221:                // such slash has special meaning specifying implied
222:                // SURT prefixes)
223:                if (!trailingSlash && u.endsWith("/")) {
224:                    u = u.substring(0, u.length() - 1);
225:                }
226:                // convert to full SURT
227:                u = SURT.fromURI(u);
228:                // truncate to implied prefix
229:                u = SurtPrefixSet.asPrefix(u);
230:                return u;
231:            }
232:
233:            /**
234:             * For SURT comparisons -- prefixes or candidates being checked against
235:             * those prefixes -- we treat https URIs as if they were http.
236:             * 
237:             * @param u string to coerce if it has https scheme
238:             * @return string converted to http scheme, or original if not necessary
239:             */
240:            private static String coerceFromHttpsForComparison(String u) {
241:                if (u.startsWith("https://")) {
242:                    u = "http" + u.substring("https".length());
243:                }
244:                return u;
245:            }
246:
247:            /**
248:             * Utility method for truncating a SURT that came from a 
249:             * full URI (as a seed, for example) into a prefix
250:             * for determining inclusion.
251:             * 
252:             * This involves: 
253:             * <pre>
254:             *    (1) removing the last path component, if any
255:             *        (anything after the last '/', if there are
256:             *        at least 3 '/'s)
257:             *    (2) removing a trailing ')', if present, opening
258:             *        the possibility of proper subdomains. (This
259:             *        means that the presence or absence of a
260:             *        trailing '/' after a hostname in a seed list
261:             *        is significant for the how the SURT prefix is 
262:             *        created, even though it is not signficant for 
263:             *        the URI's treatment as a seed.)
264:             * </pre>
265:             *
266:             * @param s String to work on.
267:             * @return As prefix.
268:             */
269:            private static String asPrefix(String s) {
270:                // Strip last path-segment, if more than 3 slashes
271:                s = s.replaceAll("^(.*//.*/)[^/]*", "$1");
272:                // Strip trailing ")", if present and NO path (no 3rd slash).
273:                if (!s.endsWith("/")) {
274:                    s = s.replaceAll("^(.*)\\)", "$1");
275:                }
276:                return s;
277:            }
278:
279:            /**
280:             * Calculate the SURT form URI to use as a candidate against prefixes
281:             * from the given Object (CandidateURI or UURI)
282:             * 
283:             * @param object CandidateURI or UURI
284:             * @return SURT form of URI for evaluation, or null if unavailable
285:             */
286:            public static String getCandidateSurt(Object object) {
287:                UURI u = UURI.from(object);
288:                if (u == null) {
289:                    return null;
290:                }
291:                String candidateSurt = u.getSurtForm();
292:                // also want to treat https as http
293:                candidateSurt = coerceFromHttpsForComparison(candidateSurt);
294:                return candidateSurt;
295:            }
296:
297:            /**
298:             * @param fw
299:             * @throws IOException
300:             */
301:            public void exportTo(FileWriter fw) throws IOException {
302:                Iterator iter = this .iterator();
303:                while (iter.hasNext()) {
304:                    fw.write((String) iter.next() + "\n");
305:                }
306:            }
307:
308:            /**
309:             * Changes all prefixes so that they enforce an exact host. For
310:             * prefixes that already include a ')', this means discarding 
311:             * anything after ')' (path info). For prefixes that don't include
312:             * a ')' -- domain prefixes open to subdomains -- add the closing
313:             * ')' (or ",)").  
314:             */
315:            public void convertAllPrefixesToHosts() {
316:                SurtPrefixSet iterCopy = (SurtPrefixSet) this .clone();
317:                Iterator iter = iterCopy.iterator();
318:                while (iter.hasNext()) {
319:                    String prefix = (String) iter.next();
320:                    String convPrefix = convertPrefixToHost(prefix);
321:                    if (prefix != convPrefix) {
322:                        // if returned value not unchanged, update set
323:                        this .remove(prefix);
324:                        this .add(convPrefix);
325:                    }
326:                }
327:            }
328:
329:            public static String convertPrefixToHost(String prefix) {
330:                if (prefix.endsWith(")")) {
331:                    return prefix; // no change necessary
332:                }
333:                if (prefix.indexOf(')') < 0) {
334:                    // open-ended domain prefix
335:                    if (!prefix.endsWith(",")) {
336:                        prefix += ",";
337:                    }
338:                    prefix += ")";
339:                } else {
340:                    // prefix with excess path-info
341:                    prefix = prefix.substring(0, prefix.indexOf(')') + 1);
342:                }
343:                return prefix;
344:            }
345:
346:            /**
347:             * Changes all prefixes so that they only enforce a general
348:             * domain (allowing subdomains).For prefixes that don't include
349:             * a ')', no change is necessary. For others, truncate everything
350:             * from the ')' onward. Additionally, truncate off "www," if it
351:             * appears.
352:             */
353:            public void convertAllPrefixesToDomains() {
354:                SurtPrefixSet iterCopy = (SurtPrefixSet) this .clone();
355:                Iterator iter = iterCopy.iterator();
356:                while (iter.hasNext()) {
357:                    String prefix = (String) iter.next();
358:                    String convPrefix = convertPrefixToDomain(prefix);
359:                    if (prefix != convPrefix) {
360:                        // if returned value not unchanged, update set
361:                        this .remove(prefix);
362:                        this .add(convPrefix);
363:                    }
364:                }
365:            }
366:
367:            public static String convertPrefixToDomain(String prefix) {
368:                if (prefix.indexOf(')') >= 0) {
369:                    prefix = prefix.substring(0, prefix.indexOf(')'));
370:                }
371:                // strip 'www,' when present
372:                if (prefix.endsWith("www,")) {
373:                    prefix = prefix.substring(0, prefix.length() - 4);
374:                }
375:                return prefix;
376:            }
377:
378:            /**
379:             * Allow class to be used as a command-line tool for converting 
380:             * URL lists (or naked host or host/path fragments implied
381:             * to be HTTP URLs) to implied SURT prefix form. 
382:             * 
383:             * Read from stdin or first file argument. Writes to stdout. 
384:             *
385:             * @param args cmd-line arguments: may include input file
386:             * @throws IOException
387:             */
388:            public static void main(String[] args) throws IOException {
389:                InputStream in = args.length > 0 ? new BufferedInputStream(
390:                        new FileInputStream(args[0])) : System.in;
391:                PrintStream out = args.length > 1 ? new PrintStream(
392:                        new BufferedOutputStream(new FileOutputStream(args[1])))
393:                        : System.out;
394:                BufferedReader br = new BufferedReader(
395:                        new InputStreamReader(in));
396:                String line;
397:                while ((line = br.readLine()) != null) {
398:                    if (line.indexOf("#") > 0)
399:                        line = line.substring(0, line.indexOf("#"));
400:                    line = line.trim();
401:                    if (line.length() == 0)
402:                        continue;
403:                    out.println(prefixFromPlain(line));
404:                }
405:                br.close();
406:                out.close();
407:            }
408:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.