001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * Created on Jan 15, 2004
020: *
021: */
022: package org.archive.crawler.extractor;
023:
024: import java.io.IOException;
025: import java.io.InputStream;
026: import java.util.regex.Matcher;
027:
028: import javax.management.AttributeNotFoundException;
029:
030: import org.archive.crawler.datamodel.CoreAttributeConstants;
031: import org.archive.crawler.datamodel.CrawlURI;
032: import org.archive.crawler.settings.SimpleType;
033: import org.archive.crawler.settings.Type;
034: import org.archive.net.UURI;
035: import org.archive.util.TextUtils;
036:
037: /**
038: * A last ditch extractor that will look at the raw byte code and try to extract
039: * anything that <i>looks</i> like a link.
040: *
041: * If used, it should always be specified as the last link extractor in the
042: * order file.
043: * <p>
044: * To accomplish this it will scan through the bytecode and try and build up
045: * strings of consecutive bytes that all represent characters that are valid
046: * in a URL (see #isURLableChar(int) for details).
047: * Once it hits the end of such a string (i.e. finds a character that
048: * should not be in a URL) it will try to determine if it has found a URL.
049: * This is done be seeing if the string is an IP address prefixed with
050: * http(s):// or contains a dot followed by a Top Level Domain and end of
051: * string or a slash.
052: *
053: * @author Kristinn Sigurdsson
054: */
055: public class ExtractorUniversal extends Extractor implements
056: CoreAttributeConstants {
057:
058: private static final long serialVersionUID = -7593380118857156939L;
059:
060: // private static final Logger logger =
061: // Logger.getLogger(ExtractorUniversal.class.getName());
062:
063: private static String ATTR_MAX_DEPTH_BYTES = "max-depth-bytes";
064:
065: /** Default value for how far into an unknown document we should scan
066: * - 10k. A value of 0 or lower will disable this.
067: */
068: private static long DEFAULT_MAX_DEPTH_BYTES = 10240;
069:
070: private static String ATTR_MAX_URL_LENGTH = "max-url-length";
071:
072: /** Maximum length for a URI that we try to match.*/
073: private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH;
074:
075: /**
076: * Matches any string that begins with http:// or https:// followed by
077: * something that looks like an ip address (four numbers, none longer then
078: * 3 chars seperated by 3 dots). Does <b>not</b> ensure that the numbers are
079: * each in the range 0-255.
080: */
081: static final String IP_ADDRESS = "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)";
082:
083: /**
084: * Matches any string that begins with a TLD (no .) followed by a '/' slash
085: * or end of string. If followed by slash then nothing after the slash is
086: * of consequence.
087: */
088: public static final String TLDs = "(ac(/.*)?)" // ac Ascension Island
089: + "|(ad(/.*)?)" // ad Andorra
090: + "|(ae(/.*)?)" // ae United Arab Emirates
091: + "|(af(/.*)?)" // af Afghanistan
092: + "|(ag(/.*)?)" // ag Antigua and Barbuda
093: + "|(ai(/.*)?)" // ai Anguilla
094: + "|(al(/.*)?)" // al Albania
095: + "|(am(/.*)?)" // am Armenia
096: + "|(an(/.*)?)" // an Netherlands Antilles
097: + "|(ao(/.*)?)" // ao Angola
098: + "|(aero(/.*)?)" // aero Air-transport industry
099: + "|(aq(/.*)?)" // aq Antarctica
100: + "|(ar(/.*)?)" // ar Argentina
101: + "|(as(/.*)?)" // as American Samoa
102: + "|(at(/.*)?)" // at Austria
103: + "|(au(/.*)?)" // au Australia
104: + "|(aw(/.*)?)" // aw Aruba
105: + "|(az(/.*)?)" // az Azerbaijan
106: + "|(ba(/.*)?)" // ba Bosnia Hercegovina
107: + "|(bb(/.*)?)" // bb Barbados
108: + "|(bd(/.*)?)" // bd Bangladesh
109: + "|(be(/.*)?)" // be Belgium
110: + "|(bf(/.*)?)" // bf Burkina Faso
111: + "|(bg(/.*)?)" // bg Bulgaria
112: + "|(bh(/.*)?)" // bh Bahrain
113: + "|(bi(/.*)?)" // bi Burundi
114: + "|(biz(/.*)?)" // biz Businesses
115: + "|(bj(/.*)?)" // bj Benin
116: + "|(bm(/.*)?)" // bm Bermuda
117: + "|(bn(/.*)?)" // bn Brunei Darussalam
118: + "|(bo(/.*)?)" // bo Bolivia
119: + "|(br(/.*)?)" // br Brazil
120: + "|(bs(/.*)?)" // bs Bahamas
121: + "|(bt(/.*)?)" // bt Bhutan
122: + "|(bv(/.*)?)" // bv Bouvet Island
123: + "|(bw(/.*)?)" // bw Botswana
124: + "|(by(/.*)?)" // by Belarus (Byelorussia)
125: + "|(bz(/.*)?)" // bz Belize
126: + "|(ca(/.*)?)" // ca Canada
127: + "|(cc(/.*)?)" // cc Cocos Islands (Keeling)
128: + "|(cd(/.*)?)" // cd Congo, Democratic Republic of the
129: + "|(cf(/.*)?)" // cf Central African Republic
130: + "|(cg(/.*)?)" // cg Congo, Republic of
131: + "|(ch(/.*)?)" // ch Switzerland
132: + "|(ci(/.*)?)" // ci Cote d'Ivoire (Ivory Coast)
133: + "|(ck(/.*)?)" // ck Cook Islands
134: + "|(cl(/.*)?)" // cl Chile
135: + "|(cm(/.*)?)" // cm Cameroon
136: + "|(cn(/.*)?)" // cn China
137: + "|(co(/.*)?)" // co Colombia
138: + "|(com(/.*)?)" // com Commercial
139: + "|(coop(/.*)?)" // coop Cooperatives
140: + "|(cr(/.*)?)" // cr Costa Rica
141: + "|(cs(/.*)?)" // cs Czechoslovakia
142: + "|(cu(/.*)?)" // cu Cuba
143: + "|(cv(/.*)?)" // cv Cap Verde
144: + "|(cx(/.*)?)" // cx Christmas Island
145: + "|(cy(/.*)?)" // cy Cyprus
146: + "|(cz(/.*)?)" // cz Czech Republic
147: + "|(de(/.*)?)" // de Germany
148: + "|(dj(/.*)?)" // dj Djibouti
149: + "|(dk(/.*)?)" // dk Denmark
150: + "|(dm(/.*)?)" // dm Dominica
151: + "|(do(/.*)?)" // do Dominican Republic
152: + "|(dz(/.*)?)" // dz Algeria
153: + "|(ec(/.*)?)" // ec Ecuador
154: + "|(edu(/.*)?)" // edu Educational Institution
155: + "|(ee(/.*)?)" // ee Estonia
156: + "|(eg(/.*)?)" // eg Egypt
157: + "|(eh(/.*)?)" // eh Western Sahara
158: + "|(er(/.*)?)" // er Eritrea
159: + "|(es(/.*)?)" // es Spain
160: + "|(et(/.*)?)" // et Ethiopia
161: + "|(fi(/.*)?)" // fi Finland
162: + "|(fj(/.*)?)" // fj Fiji
163: + "|(fk(/.*)?)" // fk Falkland Islands
164: + "|(fm(/.*)?)" // fm Micronesia, Federal State of
165: + "|(fo(/.*)?)" // fo Faroe Islands
166: + "|(fr(/.*)?)" // fr France
167: + "|(ga(/.*)?)" // ga Gabon
168: + "|(gd(/.*)?)" // gd Grenada
169: + "|(ge(/.*)?)" // ge Georgia
170: + "|(gf(/.*)?)" // gf French Guiana
171: + "|(gg(/.*)?)" // gg Guernsey
172: + "|(gh(/.*)?)" // gh Ghana
173: + "|(gi(/.*)?)" // gi Gibraltar
174: + "|(gl(/.*)?)" // gl Greenland
175: + "|(gm(/.*)?)" // gm Gambia
176: + "|(gn(/.*)?)" // gn Guinea
177: + "|(gov(/.*)?)" // gov Government (US)
178: + "|(gp(/.*)?)" // gp Guadeloupe
179: + "|(gq(/.*)?)" // gq Equatorial Guinea
180: + "|(gr(/.*)?)" // gr Greece
181: + "|(gs(/.*)?)" // gs South Georgia and the South Sandwich Islands
182: + "|(gt(/.*)?)" // gt Guatemala
183: + "|(gu(/.*)?)" // gu Guam
184: + "|(gw(/.*)?)" // gw Guinea-Bissau
185: + "|(gy(/.*)?)" // gy Guyana
186: + "|(hk(/.*)?)" // hk Hong Kong
187: + "|(hm(/.*)?)" // hm Heard and McDonald Islands
188: + "|(hn(/.*)?)" // hn Honduras
189: + "|(hr(/.*)?)" // hr Croatia/Hrvatska
190: + "|(ht(/.*)?)" // ht Haiti
191: + "|(hu(/.*)?)" // hu Hungary
192: + "|(id(/.*)?)" // id Indonesia
193: + "|(ie(/.*)?)" // ie Ireland
194: + "|(il(/.*)?)" // il Israel
195: + "|(im(/.*)?)" // im Isle of Man
196: + "|(in(/.*)?)" // in India
197: + "|(info(/.*)?)" // info
198: + "|(int(/.*)?)" // int Int. Organizations
199: + "|(io(/.*)?)" // io British Indian Ocean Territory
200: + "|(iq(/.*)?)" // iq Iraq
201: + "|(ir(/.*)?)" // ir Iran, Islamic Republic of
202: + "|(is(/.*)?)" // is Iceland
203: + "|(it(/.*)?)" // it Italy
204: + "|(je(/.*)?)" // je Jersey
205: + "|(jm(/.*)?)" // jm Jamaica
206: + "|(jo(/.*)?)" // jo Jordan
207: + "|(jp(/.*)?)" // jp Japan
208: + "|(ke(/.*)?)" // ke Kenya
209: + "|(kg(/.*)?)" // kg Kyrgyzstan
210: + "|(kh(/.*)?)" // kh Cambodia
211: + "|(ki(/.*)?)" // ki Kiribati
212: + "|(km(/.*)?)" // km Comoros
213: + "|(kn(/.*)?)" // kn Saint Kitts and Nevis
214: + "|(kp(/.*)?)" // kp Korea, Democratic People's Republic
215: + "|(kr(/.*)?)" // kr Korea, Republic of
216: + "|(kw(/.*)?)" // kw Kuwait
217: + "|(ky(/.*)?)" // ky Cayman Islands
218: + "|(kz(/.*)?)" // kz Kazakhstan
219: + "|(la(/.*)?)" // la Lao People's Democratic Republic
220: + "|(lb(/.*)?)" // lb Lebanon
221: + "|(lc(/.*)?)" // lc Saint Lucia
222: + "|(li(/.*)?)" // li Liechtenstein
223: + "|(lk(/.*)?)" // lk Sri Lanka
224: + "|(lr(/.*)?)" // lr Liberia
225: + "|(ls(/.*)?)" // ls Lesotho
226: + "|(lt(/.*)?)" // lt Lithuania
227: + "|(lu(/.*)?)" // lu Luxembourg
228: + "|(lv(/.*)?)" // lv Latvia
229: + "|(ly(/.*)?)" // ly Libyan Arab Jamahiriya
230: + "|(ma(/.*)?)" // ma Morocco
231: + "|(mc(/.*)?)" // mc Monaco
232: + "|(md(/.*)?)" // md Moldova, Republic of
233: + "|(mg(/.*)?)" // mg Madagascar
234: + "|(mh(/.*)?)" // mh Marshall Islands
235: + "|(mil(/.*)?)" // mil Military (US Dept of Defense)
236: + "|(mk(/.*)?)" // mk Macedonia, Former Yugoslav Republic
237: + "|(ml(/.*)?)" // ml Mali
238: + "|(mm(/.*)?)" // mm Myanmar
239: + "|(mn(/.*)?)" // mn Mongolia
240: + "|(mo(/.*)?)" // mo Macau
241: + "|(mp(/.*)?)" // mp Northern Mariana Islands
242: + "|(mq(/.*)?)" // mq Martinique
243: + "|(mr(/.*)?)" // mr Mauritani
244: + "|(ms(/.*)?)" // ms Montserrat
245: + "|(mt(/.*)?)" // mt Malta
246: + "|(mu(/.*)?)" // mu Mauritius
247: + "|(museum(/.*)?)" // museum Museums
248: + "|(mv(/.*)?)" // mv Maldives
249: + "|(mw(/.*)?)" // mw Malawi
250: + "|(mx(/.*)?)" // mx Mexico
251: + "|(my(/.*)?)" // my Malaysia
252: + "|(mz(/.*)?)" // mz Mozambique
253: + "|(na(/.*)?)" // na Namibia
254: + "|(name(/.*)?)" // name Individuals
255: + "|(nc(/.*)?)" // nc New Caledonia
256: + "|(ne(/.*)?)" // ne Niger
257: + "|(net(/.*)?)" // net networks
258: + "|(nf(/.*)?)" // nf Norfolk Island
259: + "|(ng(/.*)?)" // ng Nigeria
260: + "|(ni(/.*)?)" // ni Nicaragua
261: + "|(nl(/.*)?)" // nl Netherlands
262: + "|(no(/.*)?)" // no Norway
263: + "|(np(/.*)?)" // np Nepal
264: + "|(nr(/.*)?)" // nr Nauru
265: + "|(nt(/.*)?)" // nt Neutral Zone
266: + "|(nu(/.*)?)" // nu Niue
267: + "|(nz(/.*)?)" // nz New Zealand
268: + "|(om(/.*)?)" // om Oman
269: + "|(org(/.*)?)" // org Organization (non-profit)
270: + "|(pa(/.*)?)" // pa Panama
271: + "|(pe(/.*)?)" // pe Peru
272: + "|(pf(/.*)?)" // pf French Polynesia
273: + "|(pg(/.*)?)" // pg Papua New Guinea
274: + "|(ph(/.*)?)" // ph Philippines
275: + "|(pk(/.*)?)" // pk Pakistan
276: + "|(pl(/.*)?)" // pl Poland
277: + "|(pm(/.*)?)" // pm St. Pierre and Miquelon
278: + "|(pn(/.*)?)" // pn Pitcairn Island
279: + "|(pr(/.*)?)" // pr Puerto Rico
280: + "|(pro(/.*)?)" // pro Accountants, lawyers, and physicians
281: + "|(ps(/.*)?)" // ps Palestinian Territories
282: + "|(pt(/.*)?)" // pt Portugal
283: + "|(pw(/.*)?)" // pw Palau
284: + "|(py(/.*)?)" // py Paraguay
285: + "|(qa(/.*)?)" // qa Qatar
286: + "|(re(/.*)?)" // re Reunion Island
287: + "|(ro(/.*)?)" // ro Romania
288: + "|(ru(/.*)?)" // ru Russian Federation
289: + "|(rw(/.*)?)" // rw Rwanda
290: + "|(sa(/.*)?)" // sa Saudi Arabia
291: + "|(sb(/.*)?)" // sb Solomon Islands
292: + "|(sc(/.*)?)" // sc Seychelles
293: + "|(sd(/.*)?)" // sd Sudan
294: + "|(se(/.*)?)" // se Sweden
295: + "|(sg(/.*)?)" // sg Singapore
296: + "|(sh(/.*)?)" // sh St. Helena
297: + "|(si(/.*)?)" // si Slovenia
298: + "|(sj(/.*)?)" // sj Svalbard and Jan Mayen Islands
299: + "|(sk(/.*)?)" // sk Slovak Republic
300: + "|(sl(/.*)?)" // sl Sierra Leone
301: + "|(sm(/.*)?)" // sm San Marino
302: + "|(sn(/.*)?)" // sn Senegal
303: + "|(so(/.*)?)" // so Somalia
304: + "|(sr(/.*)?)" // sr Suriname
305: + "|(sv(/.*)?)" // sv El Salvador
306: + "|(st(/.*)?)" // st Sao Tome and Principe
307: + "|(sy(/.*)?)" // sy Syrian Arab Republic
308: + "|(sz(/.*)?)" // sz Swaziland
309: + "|(tc(/.*)?)" // tc Turks and Caicos Islands
310: + "|(td(/.*)?)" // td Chad
311: + "|(tf(/.*)?)" // tf French Southern Territories
312: + "|(tg(/.*)?)" // tg Togo
313: + "|(th(/.*)?)" // th Thailand
314: + "|(tj(/.*)?)" // tj Tajikistan
315: + "|(tk(/.*)?)" // tk Tokelau
316: + "|(tm(/.*)?)" // tm Turkmenistan
317: + "|(tn(/.*)?)" // tn Tunisia
318: + "|(to(/.*)?)" // to Tonga
319: + "|(tp(/.*)?)" // tp East Timor
320: + "|(tr(/.*)?)" // tr Turkey
321: + "|(tt(/.*)?)" // tt Trinidad and Tobago
322: + "|(tv(/.*)?)" // tv Tuvalu
323: + "|(tw(/.*)?)" // tw Taiwan
324: + "|(tz(/.*)?)" // tz Tanzania
325: + "|(ua(/.*)?)" // ua Ukraine
326: + "|(ug(/.*)?)" // ug Uganda
327: + "|(uk(/.*)?)" // uk United Kingdom
328: + "|(um(/.*)?)" // um US Minor Outlying Islands
329: + "|(us(/.*)?)" // us United States
330: + "|(uy(/.*)?)" // uy Uruguay
331: + "|(uz(/.*)?)" // uz Uzbekistan
332: + "|(va(/.*)?)" // va Holy See (City Vatican State)
333: + "|(vc(/.*)?)" // vc Saint Vincent and the Grenadines
334: + "|(ve(/.*)?)" // ve Venezuela
335: + "|(vg(/.*)?)" // vg Virgin Islands (British)
336: + "|(vi(/.*)?)" // vi Virgin Islands (USA)
337: + "|(vn(/.*)?)" // vn Vietnam
338: + "|(vu(/.*)?)" // vu Vanuatu
339: + "|(wf(/.*)?)" // wf Wallis and Futuna Islands
340: + "|(ws(/.*)?)" // ws Western Samoa
341: + "|(ye(/.*)?)" // ye Yemen
342: + "|(yt(/.*)?)" // yt Mayotte
343: + "|(yu(/.*)?)" // yu Yugoslavia
344: + "|(za(/.*)?)" // za South Africa
345: + "|(zm(/.*)?)" // zm Zambia
346: + "|(zw(/.*)?)" // zw Zimbabwe
347: ;
348:
349: protected long numberOfCURIsHandled = 0;
350: protected long numberOfLinksExtracted = 0;
351:
352: /**
353: * Constructor
354: * @param name The name of the module.
355: */
356: public ExtractorUniversal(String name) {
357: super (
358: name,
359: "Link extraction on unknown file types. A best effort"
360: + " extractor that looks at the raw byte code of any file "
361: + "that has not been handled by another extractor and tries"
362: + " to find URIs. Will only match absolute URIs.");
363: Type e;
364: e = addElementToDefinition(new SimpleType(
365: ATTR_MAX_DEPTH_BYTES,
366: "How deep to look into files for URI strings, in bytes",
367: new Long(DEFAULT_MAX_DEPTH_BYTES)));
368: e.setExpertSetting(true);
369: e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH,
370: "Max length of URIs in bytes", new Long(
371: DEFAULT_MAX_URL_LENGTH)));
372: e.setExpertSetting(true);
373: }
374:
375: protected void extract(CrawlURI curi) {
376: if (!isHttpTransactionContentToProcess(curi)) {
377: return;
378: }
379:
380: numberOfCURIsHandled++;
381:
382: try {
383: InputStream instream = curi.getHttpRecorder()
384: .getRecordedInput().getContentReplayInputStream();
385: int ch = instream.read();
386: StringBuffer lookat = new StringBuffer();
387: long counter = 0;
388: long maxdepth = ((Long) getAttribute(ATTR_MAX_DEPTH_BYTES,
389: curi)).longValue();
390: if (maxdepth <= 0) {
391: maxdepth = Long.MAX_VALUE;
392: }
393: long maxURLLength = ((Long) getAttribute(
394: ATTR_MAX_URL_LENGTH, curi)).longValue();
395: boolean foundDot = false;
396: while (ch != -1 && ++counter <= maxdepth) {
397: if (lookat.length() > maxURLLength) {
398: //Exceeded maximum length of a URL. Start fresh.
399: lookat = new StringBuffer();
400: foundDot = false;
401: } else if (isURLableChar(ch)) {
402: //Add to buffer.
403: if (ch == 46) {
404: // Current character is a dot '.'
405: foundDot = true;
406: }
407: lookat.append((char) ch);
408: } else if (lookat.length() > 3 && foundDot) {
409: // It takes a bare mininum of 4 characters to form a URL
410: // Since we have at least that many let's try link
411: // extraction.
412: String newURL = lookat.toString();
413: if (looksLikeAnURL(newURL)) {
414: // Looks like we found something.
415:
416: // Let's start with a little cleanup as we may have
417: // junk in front or at the end.
418: if (newURL.toLowerCase().indexOf("http") > 0) {
419: // Got garbage in front of the protocol. Remove.
420: newURL = newURL.substring(newURL
421: .toLowerCase().indexOf("http"));
422: }
423: while (newURL.substring(newURL.length() - 1)
424: .equals(".")) {
425: // URLs can't end with a dot. Strip it off.
426: newURL = newURL.substring(0, newURL
427: .length() - 1);
428: }
429:
430: // And add the URL to speculative embeds.
431: numberOfLinksExtracted++;
432: curi.createAndAddLink(newURL,
433: Link.SPECULATIVE_MISC,
434: Link.SPECULATIVE_HOP);
435: }
436: // Reset lookat for next string.
437: lookat = new StringBuffer();
438: foundDot = false;
439: } else if (lookat.length() > 0) {
440: // Didn't get enough chars. Reset lookat for next string.
441: lookat = new StringBuffer();
442: foundDot = false;
443: }
444: ch = instream.read();
445: }
446: } catch (IOException e) {
447: //TODO: Handle this exception.
448: e.printStackTrace();
449: } catch (AttributeNotFoundException e) {
450: // TODO Auto-generated catch block
451: e.printStackTrace();
452: }
453: // Set flag to indicate that link extraction is completed.
454: curi.linkExtractorFinished();
455: }
456:
457: /**
458: * This method takes a look at a string and determines if it could be a URL.
459: * To qualify the string must either begin with "http://" (https would also
460: * work) followed by something that looks like an IP address or contain
461: * within the string (possible at the end but not at the beginning) a TLD
462: * (Top Level Domain) preceded by a dot.
463: *
464: * @param lookat The string to examine in an effort to determine if it
465: * could be a URL
466: * @return True if the string matches the above criteria for a URL.
467: */
468: private boolean looksLikeAnURL(String lookat) {
469: if (lookat.indexOf("http://") == 0
470: || lookat.indexOf("https://") == 0) {
471: //Check if the rest of the string looks like an IP address.
472: //if so return true. Otherwise continue on.
473: Matcher ip = TextUtils.getMatcher(IP_ADDRESS, lookat);
474: boolean testVal = ip.matches();
475: TextUtils.recycleMatcher(ip);
476: if (testVal) {
477: return true;
478: }
479: }
480:
481: int dot = lookat.indexOf(".");
482: if (dot != 0) {//An URL can't start with a .tld.
483: while (dot != -1 && dot < lookat.length()) {
484: lookat = lookat.substring(dot + 1);
485: if (isTLD(lookat.substring(0,
486: lookat.length() <= 6 ? lookat.length() : 6))) {
487: return true;
488: }
489: dot = lookat.indexOf(".");
490: }
491: }
492:
493: return false;
494: }
495:
496: /**
497: * Checks if a string is equal to known Top Level Domain. The string may
498: * contain additional characters <i>after</i> the TLD but not before.
499: * @param potentialTLD The string (usually 2-6 chars) to check if it starts
500: * with a TLD.
501: * @return True if the given string starts with the name of a known TLD
502: *
503: * @see #TLDs
504: */
505: private boolean isTLD(String potentialTLD) {
506: if (potentialTLD.length() < 2) {
507: return false;
508: }
509:
510: potentialTLD.toLowerCase();
511: Matcher uri = TextUtils.getMatcher(TLDs, potentialTLD);
512: boolean ret = uri.matches();
513: TextUtils.recycleMatcher(uri);
514: return ret;
515: }
516:
517: /**
518: * Determines if a char (as represented by an int in the range of 0-255) is
519: * a character (in the Ansi character set) that can be present in a URL.
520: * This method takes a <b>strict</b> approach to what characters can be in
521: * a URL.
522: * <p>
523: * The following are considered to be 'URLable'<br>
524: * <ul>
525: * <li> <code># $ % & + , - . /</code> values 35-38,43-47
526: * <li> <code>[0-9]</code> values 48-57
527: * <li> <code>: ; = ? @</code> value 58-59,61,63-64
528: * <li> <code>[A-Z]</code> values 65-90
529: * <li> <code>_</code> value 95
530: * <li> <code>[a-z]</code> values 97-122
531: * <li> <code>~</code> value 126
532: * </ul>
533: * <p>
534: * To summerize, the following ranges are considered URLable:<br>
535: * 35-38,43-59,61,63-90,95,97-122,126
536: *
537: * @param ch The character (represented by an int) to test.
538: * @return True if it is a URLable character, false otherwise.
539: */
540: private boolean isURLableChar(int ch) {
541: return (ch >= 35 && ch <= 38) || (ch >= 43 && ch <= 59)
542: || (ch == 61) || (ch >= 63 && ch <= 90) || (ch == 95)
543: || (ch >= 97 && ch <= 122) || (ch == 126);
544: }
545:
546: /* (non-Javadoc)
547: * @see org.archive.crawler.framework.Processor#report()
548: */
549: public String report() {
550: StringBuffer ret = new StringBuffer();
551: ret.append("Processor: org.archive.crawler.extractor."
552: + "ExtractorUniversal\n");
553: ret
554: .append(" Function: Link extraction on unknown file"
555: + " types.\n");
556: ret.append(" CrawlURIs handled: " + numberOfCURIsHandled
557: + "\n");
558: ret.append(" Links extracted: " + numberOfLinksExtracted
559: + "\n\n");
560:
561: return ret.toString();
562: }
563: }
|