Source Code Cross Referenced for ExtractorUniversal.java in » Web-Crawler » heritrix » org » archive » crawler » extractor » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.extractor
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /* Copyright (C) 2003 Internet Archive.
002:         *
003:         * This file is part of the Heritrix web crawler (crawler.archive.org).
004:         *
005:         * Heritrix is free software; you can redistribute it and/or modify
006:         * it under the terms of the GNU Lesser Public License as published by
007:         * the Free Software Foundation; either version 2.1 of the License, or
008:         * any later version.
009:         *
010:         * Heritrix is distributed in the hope that it will be useful,
011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013:         * GNU Lesser Public License for more details.
014:         *
015:         * You should have received a copy of the GNU Lesser Public License
016:         * along with Heritrix; if not, write to the Free Software
017:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018:         *
019:         * Created on Jan 15, 2004
020:         *
021:         */
022:        package org.archive.crawler.extractor;
023:
024:        import java.io.IOException;
025:        import java.io.InputStream;
026:        import java.util.regex.Matcher;
027:
028:        import javax.management.AttributeNotFoundException;
029:
030:        import org.archive.crawler.datamodel.CoreAttributeConstants;
031:        import org.archive.crawler.datamodel.CrawlURI;
032:        import org.archive.crawler.settings.SimpleType;
033:        import org.archive.crawler.settings.Type;
034:        import org.archive.net.UURI;
035:        import org.archive.util.TextUtils;
036:
037:        /**
038:         * A last ditch extractor that will look at the raw byte code and try to extract
039:         * anything that <i>looks</i> like a link.
040:         *
041:         * If used, it should always be specified as the last link extractor in the
042:         * order file.
043:         * <p>
044:         * To accomplish this it will scan through the bytecode and try and build up
045:         * strings of consecutive bytes that all represent characters that are valid
046:         * in a URL (see #isURLableChar(int) for details).
047:         * Once it hits the end of such a string (i.e. finds a character that
048:         * should not be in a URL) it will try to determine if it has found a URL.
049:         * This is done by seeing if the string is an IP address prefixed with
050:         * http(s):// or contains a dot followed by a Top Level Domain and end of
051:         * string or a slash.
052:         *
053:         * @author Kristinn Sigurdsson
054:         */
055:        public class ExtractorUniversal extends Extractor implements 
056:                CoreAttributeConstants {
057:
058:            private static final long serialVersionUID = -7593380118857156939L;
059:
060:            //    private static final Logger logger =
061:            //        Logger.getLogger(ExtractorUniversal.class.getName());
062:
063:            private static String ATTR_MAX_DEPTH_BYTES = "max-depth-bytes";
064:
065:            /** Default value for how far into an unknown document we should scan
066:             * - 10k. A value of 0 or lower will disable this.
067:             */
068:            private static long DEFAULT_MAX_DEPTH_BYTES = 10240;
069:
070:            private static String ATTR_MAX_URL_LENGTH = "max-url-length";
071:
072:            /** Maximum length for a URI that we try to match.*/
073:            private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH;
074:
075:            /**
076:             * Matches any string that begins with http:// or https:// followed by
077:             * something that looks like an ip address (four numbers, none longer then
078:             * 3 chars seperated by 3 dots). Does <b>not</b> ensure that the numbers are
079:             * each in the range 0-255.
080:             */
081:            static final String IP_ADDRESS = "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)";
082:
083:            /**
084:             * Regular expression listing known TLDs (lower case, without the leading
085:             * '.'), each optionally followed by a '/' slash and arbitrary trailing text.
086:             * isTLD(String) applies it with Matcher.matches(), i.e. to the whole string.
087:             */
088:            public static final String TLDs = "(ac(/.*)?)" // ac  Ascension Island
089:                    + "|(ad(/.*)?)" // ad  Andorra
090:                    + "|(ae(/.*)?)" // ae  United Arab Emirates
091:                    + "|(af(/.*)?)" // af  Afghanistan
092:                    + "|(ag(/.*)?)" // ag  Antigua and Barbuda
093:                    + "|(ai(/.*)?)" // ai  Anguilla
094:                    + "|(al(/.*)?)" // al  Albania
095:                    + "|(am(/.*)?)" // am  Armenia
096:                    + "|(an(/.*)?)" // an  Netherlands Antilles
097:                    + "|(ao(/.*)?)" // ao  Angola
098:                    + "|(aero(/.*)?)" // aero Air-transport industry
099:                    + "|(aq(/.*)?)" // aq  Antarctica
100:                    + "|(ar(/.*)?)" // ar  Argentina
101:                    + "|(as(/.*)?)" // as  American Samoa
102:                    + "|(at(/.*)?)" // at  Austria
103:                    + "|(au(/.*)?)" // au  Australia
104:                    + "|(aw(/.*)?)" // aw  Aruba
105:                    + "|(az(/.*)?)" // az  Azerbaijan
106:                    + "|(ba(/.*)?)" // ba  Bosnia Hercegovina
107:                    + "|(bb(/.*)?)" // bb  Barbados
108:                    + "|(bd(/.*)?)" // bd  Bangladesh
109:                    + "|(be(/.*)?)" // be  Belgium
110:                    + "|(bf(/.*)?)" // bf  Burkina Faso
111:                    + "|(bg(/.*)?)" // bg  Bulgaria
112:                    + "|(bh(/.*)?)" // bh  Bahrain
113:                    + "|(bi(/.*)?)" // bi  Burundi
114:                    + "|(biz(/.*)?)" // biz Businesses
115:                    + "|(bj(/.*)?)" // bj  Benin
116:                    + "|(bm(/.*)?)" // bm  Bermuda
117:                    + "|(bn(/.*)?)" // bn  Brunei Darussalam
118:                    + "|(bo(/.*)?)" // bo  Bolivia
119:                    + "|(br(/.*)?)" // br  Brazil
120:                    + "|(bs(/.*)?)" // bs  Bahamas
121:                    + "|(bt(/.*)?)" // bt  Bhutan
122:                    + "|(bv(/.*)?)" // bv  Bouvet Island
123:                    + "|(bw(/.*)?)" // bw  Botswana
124:                    + "|(by(/.*)?)" // by  Belarus (Byelorussia)
125:                    + "|(bz(/.*)?)" // bz  Belize
126:                    + "|(ca(/.*)?)" // ca  Canada
127:                    + "|(cc(/.*)?)" // cc  Cocos Islands (Keeling)
128:                    + "|(cd(/.*)?)" // cd  Congo, Democratic Republic of the
129:                    + "|(cf(/.*)?)" // cf  Central African Republic
130:                    + "|(cg(/.*)?)" // cg  Congo, Republic of
131:                    + "|(ch(/.*)?)" // ch  Switzerland
132:                    + "|(ci(/.*)?)" // ci  Cote d'Ivoire (Ivory Coast)
133:                    + "|(ck(/.*)?)" // ck  Cook Islands
134:                    + "|(cl(/.*)?)" // cl  Chile
135:                    + "|(cm(/.*)?)" // cm  Cameroon
136:                    + "|(cn(/.*)?)" // cn  China
137:                    + "|(co(/.*)?)" // co  Colombia
138:                    + "|(com(/.*)?)" // com Commercial
139:                    + "|(coop(/.*)?)" // coop Cooperatives
140:                    + "|(cr(/.*)?)" // cr  Costa Rica
141:                    + "|(cs(/.*)?)" // cs  Czechoslovakia
142:                    + "|(cu(/.*)?)" // cu  Cuba
143:                    + "|(cv(/.*)?)" // cv  Cape Verde
144:                    + "|(cx(/.*)?)" // cx  Christmas Island
145:                    + "|(cy(/.*)?)" // cy  Cyprus
146:                    + "|(cz(/.*)?)" // cz  Czech Republic
147:                    + "|(de(/.*)?)" // de  Germany
148:                    + "|(dj(/.*)?)" // dj  Djibouti
149:                    + "|(dk(/.*)?)" // dk  Denmark
150:                    + "|(dm(/.*)?)" // dm  Dominica
151:                    + "|(do(/.*)?)" // do  Dominican Republic
152:                    + "|(dz(/.*)?)" // dz  Algeria
153:                    + "|(ec(/.*)?)" // ec  Ecuador
154:                    + "|(edu(/.*)?)" // edu Educational Institution
155:                    + "|(ee(/.*)?)" // ee  Estonia
156:                    + "|(eg(/.*)?)" // eg  Egypt
157:                    + "|(eh(/.*)?)" // eh  Western Sahara
158:                    + "|(er(/.*)?)" // er  Eritrea
159:                    + "|(es(/.*)?)" // es  Spain
160:                    + "|(et(/.*)?)" // et  Ethiopia
161:                    + "|(fi(/.*)?)" // fi  Finland
162:                    + "|(fj(/.*)?)" // fj  Fiji
163:                    + "|(fk(/.*)?)" // fk  Falkland Islands
164:                    + "|(fm(/.*)?)" // fm  Micronesia, Federal State of
165:                    + "|(fo(/.*)?)" // fo  Faroe Islands
166:                    + "|(fr(/.*)?)" // fr  France
167:                    + "|(ga(/.*)?)" // ga  Gabon
168:                    + "|(gd(/.*)?)" // gd  Grenada
169:                    + "|(ge(/.*)?)" // ge  Georgia
170:                    + "|(gf(/.*)?)" // gf  French Guiana
171:                    + "|(gg(/.*)?)" // gg  Guernsey
172:                    + "|(gh(/.*)?)" // gh  Ghana
173:                    + "|(gi(/.*)?)" // gi  Gibraltar
174:                    + "|(gl(/.*)?)" // gl  Greenland
175:                    + "|(gm(/.*)?)" // gm  Gambia
176:                    + "|(gn(/.*)?)" // gn  Guinea
177:                    + "|(gov(/.*)?)" // gov Government (US)
178:                    + "|(gp(/.*)?)" // gp  Guadeloupe
179:                    + "|(gq(/.*)?)" // gq  Equatorial Guinea
180:                    + "|(gr(/.*)?)" // gr  Greece
181:                    + "|(gs(/.*)?)" // gs  South Georgia and the South Sandwich Islands
182:                    + "|(gt(/.*)?)" // gt  Guatemala
183:                    + "|(gu(/.*)?)" // gu  Guam
184:                    + "|(gw(/.*)?)" // gw  Guinea-Bissau
185:                    + "|(gy(/.*)?)" // gy  Guyana
186:                    + "|(hk(/.*)?)" // hk  Hong Kong
187:                    + "|(hm(/.*)?)" // hm  Heard and McDonald Islands
188:                    + "|(hn(/.*)?)" // hn  Honduras
189:                    + "|(hr(/.*)?)" // hr  Croatia/Hrvatska
190:                    + "|(ht(/.*)?)" // ht  Haiti
191:                    + "|(hu(/.*)?)" // hu  Hungary
192:                    + "|(id(/.*)?)" // id  Indonesia
193:                    + "|(ie(/.*)?)" // ie  Ireland
194:                    + "|(il(/.*)?)" // il  Israel
195:                    + "|(im(/.*)?)" // im  Isle of Man
196:                    + "|(in(/.*)?)" // in  India
197:                    + "|(info(/.*)?)" // info
198:                    + "|(int(/.*)?)" // int Int. Organizations
199:                    + "|(io(/.*)?)" // io  British Indian Ocean Territory
200:                    + "|(iq(/.*)?)" // iq  Iraq
201:                    + "|(ir(/.*)?)" // ir  Iran, Islamic Republic of
202:                    + "|(is(/.*)?)" // is  Iceland
203:                    + "|(it(/.*)?)" // it  Italy
204:                    + "|(je(/.*)?)" // je  Jersey
205:                    + "|(jm(/.*)?)" // jm  Jamaica
206:                    + "|(jo(/.*)?)" // jo  Jordan
207:                    + "|(jp(/.*)?)" // jp  Japan
208:                    + "|(ke(/.*)?)" // ke  Kenya
209:                    + "|(kg(/.*)?)" // kg  Kyrgyzstan
210:                    + "|(kh(/.*)?)" // kh  Cambodia
211:                    + "|(ki(/.*)?)" // ki  Kiribati
212:                    + "|(km(/.*)?)" // km  Comoros
213:                    + "|(kn(/.*)?)" // kn  Saint Kitts and Nevis
214:                    + "|(kp(/.*)?)" // kp  Korea, Democratic People's Republic
215:                    + "|(kr(/.*)?)" // kr  Korea, Republic of
216:                    + "|(kw(/.*)?)" // kw  Kuwait
217:                    + "|(ky(/.*)?)" // ky  Cayman Islands
218:                    + "|(kz(/.*)?)" // kz  Kazakhstan
219:                    + "|(la(/.*)?)" // la  Lao People's Democratic Republic
220:                    + "|(lb(/.*)?)" // lb  Lebanon
221:                    + "|(lc(/.*)?)" // lc  Saint Lucia
222:                    + "|(li(/.*)?)" // li  Liechtenstein
223:                    + "|(lk(/.*)?)" // lk  Sri Lanka
224:                    + "|(lr(/.*)?)" // lr  Liberia
225:                    + "|(ls(/.*)?)" // ls  Lesotho
226:                    + "|(lt(/.*)?)" // lt  Lithuania
227:                    + "|(lu(/.*)?)" // lu  Luxembourg
228:                    + "|(lv(/.*)?)" // lv  Latvia
229:                    + "|(ly(/.*)?)" // ly  Libyan Arab Jamahiriya
230:                    + "|(ma(/.*)?)" // ma  Morocco
231:                    + "|(mc(/.*)?)" // mc  Monaco
232:                    + "|(md(/.*)?)" // md  Moldova, Republic of
233:                    + "|(mg(/.*)?)" // mg  Madagascar
234:                    + "|(mh(/.*)?)" // mh  Marshall Islands
235:                    + "|(mil(/.*)?)" // mil Military (US Dept of Defense)
236:                    + "|(mk(/.*)?)" // mk  Macedonia, Former Yugoslav Republic
237:                    + "|(ml(/.*)?)" // ml  Mali
238:                    + "|(mm(/.*)?)" // mm  Myanmar
239:                    + "|(mn(/.*)?)" // mn  Mongolia
240:                    + "|(mo(/.*)?)" // mo  Macau
241:                    + "|(mp(/.*)?)" // mp  Northern Mariana Islands
242:                    + "|(mq(/.*)?)" // mq  Martinique
243:                    + "|(mr(/.*)?)" // mr  Mauritania
244:                    + "|(ms(/.*)?)" // ms  Montserrat
245:                    + "|(mt(/.*)?)" // mt  Malta
246:                    + "|(mu(/.*)?)" // mu  Mauritius
247:                    + "|(museum(/.*)?)" // museum Museums
248:                    + "|(mv(/.*)?)" // mv  Maldives
249:                    + "|(mw(/.*)?)" // mw  Malawi
250:                    + "|(mx(/.*)?)" // mx  Mexico
251:                    + "|(my(/.*)?)" // my  Malaysia
252:                    + "|(mz(/.*)?)" // mz  Mozambique
253:                    + "|(na(/.*)?)" // na  Namibia
254:                    + "|(name(/.*)?)" // name Individuals
255:                    + "|(nc(/.*)?)" // nc  New Caledonia
256:                    + "|(ne(/.*)?)" // ne  Niger
257:                    + "|(net(/.*)?)" // net networks
258:                    + "|(nf(/.*)?)" // nf  Norfolk Island
259:                    + "|(ng(/.*)?)" // ng  Nigeria
260:                    + "|(ni(/.*)?)" // ni  Nicaragua
261:                    + "|(nl(/.*)?)" // nl  Netherlands
262:                    + "|(no(/.*)?)" // no  Norway
263:                    + "|(np(/.*)?)" // np  Nepal
264:                    + "|(nr(/.*)?)" // nr  Nauru
265:                    + "|(nt(/.*)?)" // nt  Neutral Zone
266:                    + "|(nu(/.*)?)" // nu  Niue
267:                    + "|(nz(/.*)?)" // nz  New Zealand
268:                    + "|(om(/.*)?)" // om  Oman
269:                    + "|(org(/.*)?)" // org Organization (non-profit)
270:                    + "|(pa(/.*)?)" // pa  Panama
271:                    + "|(pe(/.*)?)" // pe  Peru
272:                    + "|(pf(/.*)?)" // pf  French Polynesia
273:                    + "|(pg(/.*)?)" // pg  Papua New Guinea
274:                    + "|(ph(/.*)?)" // ph  Philippines
275:                    + "|(pk(/.*)?)" // pk  Pakistan
276:                    + "|(pl(/.*)?)" // pl  Poland
277:                    + "|(pm(/.*)?)" // pm  St. Pierre and Miquelon
278:                    + "|(pn(/.*)?)" // pn  Pitcairn Island
279:                    + "|(pr(/.*)?)" // pr  Puerto Rico
280:                    + "|(pro(/.*)?)" // pro Accountants, lawyers, and physicians
281:                    + "|(ps(/.*)?)" // ps  Palestinian Territories
282:                    + "|(pt(/.*)?)" // pt  Portugal
283:                    + "|(pw(/.*)?)" // pw  Palau
284:                    + "|(py(/.*)?)" // py  Paraguay
285:                    + "|(qa(/.*)?)" // qa  Qatar
286:                    + "|(re(/.*)?)" // re  Reunion Island
287:                    + "|(ro(/.*)?)" // ro  Romania
288:                    + "|(ru(/.*)?)" // ru  Russian Federation
289:                    + "|(rw(/.*)?)" // rw  Rwanda
290:                    + "|(sa(/.*)?)" // sa  Saudi Arabia
291:                    + "|(sb(/.*)?)" // sb  Solomon Islands
292:                    + "|(sc(/.*)?)" // sc  Seychelles
293:                    + "|(sd(/.*)?)" // sd  Sudan
294:                    + "|(se(/.*)?)" // se  Sweden
295:                    + "|(sg(/.*)?)" // sg  Singapore
296:                    + "|(sh(/.*)?)" // sh  St. Helena
297:                    + "|(si(/.*)?)" // si  Slovenia
298:                    + "|(sj(/.*)?)" // sj  Svalbard and Jan Mayen Islands
299:                    + "|(sk(/.*)?)" // sk  Slovak Republic
300:                    + "|(sl(/.*)?)" // sl  Sierra Leone
301:                    + "|(sm(/.*)?)" // sm  San Marino
302:                    + "|(sn(/.*)?)" // sn  Senegal
303:                    + "|(so(/.*)?)" // so  Somalia
304:                    + "|(sr(/.*)?)" // sr  Suriname
305:                    + "|(sv(/.*)?)" // sv  El Salvador
306:                    + "|(st(/.*)?)" // st  Sao Tome and Principe
307:                    + "|(sy(/.*)?)" // sy  Syrian Arab Republic
308:                    + "|(sz(/.*)?)" // sz  Swaziland
309:                    + "|(tc(/.*)?)" // tc  Turks and Caicos Islands
310:                    + "|(td(/.*)?)" // td  Chad
311:                    + "|(tf(/.*)?)" // tf  French Southern Territories
312:                    + "|(tg(/.*)?)" // tg  Togo
313:                    + "|(th(/.*)?)" // th  Thailand
314:                    + "|(tj(/.*)?)" // tj  Tajikistan
315:                    + "|(tk(/.*)?)" // tk  Tokelau
316:                    + "|(tm(/.*)?)" // tm  Turkmenistan
317:                    + "|(tn(/.*)?)" // tn  Tunisia
318:                    + "|(to(/.*)?)" // to  Tonga
319:                    + "|(tp(/.*)?)" // tp  East Timor
320:                    + "|(tr(/.*)?)" // tr  Turkey
321:                    + "|(tt(/.*)?)" // tt  Trinidad and Tobago
322:                    + "|(tv(/.*)?)" // tv  Tuvalu
323:                    + "|(tw(/.*)?)" // tw  Taiwan
324:                    + "|(tz(/.*)?)" // tz  Tanzania
325:                    + "|(ua(/.*)?)" // ua  Ukraine
326:                    + "|(ug(/.*)?)" // ug  Uganda
327:                    + "|(uk(/.*)?)" // uk  United Kingdom
328:                    + "|(um(/.*)?)" // um  US Minor Outlying Islands
329:                    + "|(us(/.*)?)" // us  United States
330:                    + "|(uy(/.*)?)" // uy  Uruguay
331:                    + "|(uz(/.*)?)" // uz  Uzbekistan
332:                    + "|(va(/.*)?)" // va  Holy See (City Vatican State)
333:                    + "|(vc(/.*)?)" // vc  Saint Vincent and the Grenadines
334:                    + "|(ve(/.*)?)" // ve  Venezuela
335:                    + "|(vg(/.*)?)" // vg  Virgin Islands (British)
336:                    + "|(vi(/.*)?)" // vi  Virgin Islands (USA)
337:                    + "|(vn(/.*)?)" // vn  Vietnam
338:                    + "|(vu(/.*)?)" // vu  Vanuatu
339:                    + "|(wf(/.*)?)" // wf  Wallis and Futuna Islands
340:                    + "|(ws(/.*)?)" // ws  Western Samoa
341:                    + "|(ye(/.*)?)" // ye  Yemen
342:                    + "|(yt(/.*)?)" // yt  Mayotte
343:                    + "|(yu(/.*)?)" // yu  Yugoslavia
344:                    + "|(za(/.*)?)" // za  South Africa
345:                    + "|(zm(/.*)?)" // zm  Zambia
346:                    + "|(zw(/.*)?)" // zw  Zimbabwe
347:            ;
348:
349:            protected long numberOfCURIsHandled = 0; // incremented by extract() for each CrawlURI it actually scans; shown by report()
350:            protected long numberOfLinksExtracted = 0; // incremented by extract() for each link it adds to a CrawlURI; shown by report()
351:
352:            /**
353:             * Constructor
354:             * @param name The name of the module.
355:             */
356:            public ExtractorUniversal(String name) {
357:                super (
358:                        name,
359:                        "Link extraction on unknown file types. A best effort"
360:                                + " extractor that looks at the raw byte code of any file "
361:                                + "that has not been handled by another extractor and tries"
362:                                + " to find URIs. Will only match absolute URIs.");
363:                Type e;
364:                e = addElementToDefinition(new SimpleType(
365:                        ATTR_MAX_DEPTH_BYTES,
366:                        "How deep to look into files for URI strings, in bytes",
367:                        new Long(DEFAULT_MAX_DEPTH_BYTES)));
368:                e.setExpertSetting(true);
369:                e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH,
370:                        "Max length of URIs in bytes", new Long(
371:                                DEFAULT_MAX_URL_LENGTH)));
372:                e.setExpertSetting(true);
373:            }
374:
375:            protected void extract(CrawlURI curi) {
376:                if (!isHttpTransactionContentToProcess(curi)) {
377:                    return;
378:                }
379:
380:                numberOfCURIsHandled++;
381:
382:                try {
383:                    InputStream instream = curi.getHttpRecorder()
384:                            .getRecordedInput().getContentReplayInputStream();
385:                    int ch = instream.read();
386:                    StringBuffer lookat = new StringBuffer();
387:                    long counter = 0;
388:                    long maxdepth = ((Long) getAttribute(ATTR_MAX_DEPTH_BYTES,
389:                            curi)).longValue();
390:                    if (maxdepth <= 0) {
391:                        maxdepth = Long.MAX_VALUE;
392:                    }
393:                    long maxURLLength = ((Long) getAttribute(
394:                            ATTR_MAX_URL_LENGTH, curi)).longValue();
395:                    boolean foundDot = false;
396:                    while (ch != -1 && ++counter <= maxdepth) {
397:                        if (lookat.length() > maxURLLength) {
398:                            //Exceeded maximum length of a URL. Start fresh.
399:                            lookat = new StringBuffer();
400:                            foundDot = false;
401:                        } else if (isURLableChar(ch)) {
402:                            //Add to buffer.
403:                            if (ch == 46) {
404:                                // Current character is a dot '.'
405:                                foundDot = true;
406:                            }
407:                            lookat.append((char) ch);
408:                        } else if (lookat.length() > 3 && foundDot) {
409:                            // It takes a bare mininum of 4 characters to form a URL
410:                            // Since we have at least that many let's try link
411:                            // extraction.
412:                            String newURL = lookat.toString();
413:                            if (looksLikeAnURL(newURL)) {
414:                                // Looks like we found something.
415:
416:                                // Let's start with a little cleanup as we may have
417:                                // junk in front or at the end.
418:                                if (newURL.toLowerCase().indexOf("http") > 0) {
419:                                    // Got garbage in front of the protocol. Remove.
420:                                    newURL = newURL.substring(newURL
421:                                            .toLowerCase().indexOf("http"));
422:                                }
423:                                while (newURL.substring(newURL.length() - 1)
424:                                        .equals(".")) {
425:                                    // URLs can't end with a dot. Strip it off.
426:                                    newURL = newURL.substring(0, newURL
427:                                            .length() - 1);
428:                                }
429:
430:                                // And add the URL to speculative embeds.
431:                                numberOfLinksExtracted++;
432:                                curi.createAndAddLink(newURL,
433:                                        Link.SPECULATIVE_MISC,
434:                                        Link.SPECULATIVE_HOP);
435:                            }
436:                            // Reset lookat for next string.
437:                            lookat = new StringBuffer();
438:                            foundDot = false;
439:                        } else if (lookat.length() > 0) {
440:                            // Didn't get enough chars. Reset lookat for next string.
441:                            lookat = new StringBuffer();
442:                            foundDot = false;
443:                        }
444:                        ch = instream.read();
445:                    }
446:                } catch (IOException e) {
447:                    //TODO: Handle this exception.
448:                    e.printStackTrace();
449:                } catch (AttributeNotFoundException e) {
450:                    // TODO Auto-generated catch block
451:                    e.printStackTrace();
452:                }
453:                // Set flag to indicate that link extraction is completed.
454:                curi.linkExtractorFinished();
455:            }
456:
457:            /**
458:             * This method takes a look at a string and determines if it could be a URL.
459:             * To qualify the string must either begin with "http://" (https would also
460:             * work) followed by something that looks like an IP address or contain
461:             * within the string (possible at the end but not at the beginning) a TLD
462:             * (Top Level Domain) preceded by a dot.
463:             *
464:             * @param lookat The string to examine in an effort to determine if it
465:             * could be a URL
466:             * @return True if the string matches the above criteria for a URL.
467:             */
468:            private boolean looksLikeAnURL(String lookat) {
469:                if (lookat.indexOf("http://") == 0
470:                        || lookat.indexOf("https://") == 0) {
471:                    //Check if the rest of the string looks like an IP address.
472:                    //if so return true. Otherwise continue on.
473:                    Matcher ip = TextUtils.getMatcher(IP_ADDRESS, lookat);
474:                    boolean testVal = ip.matches();
475:                    TextUtils.recycleMatcher(ip);
476:                    if (testVal) {
477:                        return true;
478:                    }
479:                }
480:
481:                int dot = lookat.indexOf(".");
482:                if (dot != 0) {//An URL can't start with a .tld.
483:                    while (dot != -1 && dot < lookat.length()) {
484:                        lookat = lookat.substring(dot + 1);
485:                        if (isTLD(lookat.substring(0,
486:                                lookat.length() <= 6 ? lookat.length() : 6))) {
487:                            return true;
488:                        }
489:                        dot = lookat.indexOf(".");
490:                    }
491:                }
492:
493:                return false;
494:            }
495:
496:            /**
497:             * Checks if a string is equal to known Top Level Domain. The string may
498:             * contain additional characters <i>after</i> the TLD but not before.
499:             * @param potentialTLD The string (usually 2-6 chars) to check if it starts
500:             * with a TLD.
501:             * @return True if the given string starts with the name of a known TLD
502:             *
503:             * @see #TLDs
504:             */
505:            private boolean isTLD(String potentialTLD) {
506:                if (potentialTLD.length() < 2) {
507:                    return false;
508:                }
509:
510:                potentialTLD.toLowerCase();
511:                Matcher uri = TextUtils.getMatcher(TLDs, potentialTLD);
512:                boolean ret = uri.matches();
513:                TextUtils.recycleMatcher(uri);
514:                return ret;
515:            }
516:
517:            /**
518:             * Determines if a char (as represented by an int in the range of 0-255) is
519:             * a character (in the Ansi character set) that can be present in a URL.
520:             * This method takes a <b>strict</b> approach to what characters can be in
521:             * a URL.
522:             * <p>
523:             * The following are considered to be 'URLable'<br>
524:             * <ul>
525:             *  <li> <code># $ % & + , - . /</code> values 35-38,43-47
526:             *  <li> <code>[0-9]</code> values 48-57
527:             *  <li> <code>: ; = ? @</code> value 58-59,61,63-64
528:             *  <li> <code>[A-Z]</code> values 65-90
529:             *  <li> <code>_</code> value 95
530:             *  <li> <code>[a-z]</code> values 97-122
531:             *  <li> <code>~</code> value 126
532:             * </ul>
533:             * <p>
534:             * To summerize, the following ranges are considered URLable:<br>
535:             * 35-38,43-59,61,63-90,95,97-122,126
536:             *
537:             * @param ch The character (represented by an int) to test.
538:             * @return True if it is a URLable character, false otherwise.
539:             */
540:            private boolean isURLableChar(int ch) {
541:                return (ch >= 35 && ch <= 38) || (ch >= 43 && ch <= 59)
542:                        || (ch == 61) || (ch >= 63 && ch <= 90) || (ch == 95)
543:                        || (ch >= 97 && ch <= 122) || (ch == 126);
544:            }
545:
546:            /* (non-Javadoc)
547:             * @see org.archive.crawler.framework.Processor#report()
548:             */
549:            public String report() {
550:                StringBuffer ret = new StringBuffer();
551:                ret.append("Processor: org.archive.crawler.extractor."
552:                        + "ExtractorUniversal\n");
553:                ret
554:                        .append("  Function:          Link extraction on unknown file"
555:                                + " types.\n");
556:                ret.append("  CrawlURIs handled: " + numberOfCURIsHandled
557:                        + "\n");
558:                ret.append("  Links extracted:   " + numberOfLinksExtracted
559:                        + "\n\n");
560:
561:                return ret.toString();
562:            }
563:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.