001: // htmlFilterContentScraper.java
002: // -----------------------------
003: // (C) by Michael Peter Christen; mc@anomic.de
004: // first published on http://www.anomic.de
005: // Frankfurt, Germany, 2004
006: //
007: // Contains contributions by Marc Nause [MN]
008: //
009: // $LastChangedDate: 2008-01-22 11:51:43 +0000 (Di, 22 Jan 2008) $
010: // $LastChangedRevision: 4352 $
011: // $LastChangedBy: orbiter $
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026: //
027: // Using this software in any meaning (reading, learning, copying, compiling,
028: // running) means that you agree that the Author(s) is (are) not responsible
029: // for cost, loss of data or any harm that may be caused directly or indirectly
030: // by usage of this softare or this documentation. The usage of this software
031: // is on your own risk. The installation and usage (starting/running) of this
032: // software may allow other people or application to access your computer and
033: // any attached devices and is highly dependent on the configuration of the
034: // software which must be done by the user of the software; the author(s) is
035: // (are) also not responsible for proper configuration and usage of the
036: // software, even if provoked by documentation provided together with
037: // the software.
038: //
039: // Any changes to this file according to the GPL as documented in the file
040: // gpl.txt aside this file in the shipment you received can be done to the
041: // lines that follows this copyright notice here, but changes must not be
042: // done inside the copyright notive above. A re-distribution must contain
043: // the intact and unchanged copyright notice.
044: // Contributions and changes to the program code must be marked as such.
045:
046: package de.anomic.htmlFilter;
047:
048: import java.io.ByteArrayInputStream;
049: import java.io.IOException;
050: import java.io.UnsupportedEncodingException;
051: import java.io.Writer;
052: import java.net.MalformedURLException;
053: import java.text.Collator;
054: import java.util.ArrayList;
055: import java.util.HashMap;
056: import java.util.List;
057: import java.util.Locale;
058: import java.util.Map;
059: import java.util.Properties;
060: import java.util.TreeSet;
061:
062: import javax.swing.event.EventListenerList;
063:
064: import de.anomic.http.httpc;
065: import de.anomic.plasma.plasmaSwitchboard;
066: import de.anomic.server.serverCharBuffer;
067: import de.anomic.server.serverFileUtils;
068: import de.anomic.yacy.yacyURL;
069:
070: public class htmlFilterContentScraper extends htmlFilterAbstractScraper
071: implements htmlFilterScraper {
072:
// statics: the two tag-name sets handed to the htmlFilterAbstractScraper constructor

/** Collator used as the ordering of both tag-name sets (US locale, SECONDARY strength, no decomposition). */
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);

/** Tag names that are processed by {@code scrapeTag0}, i.e. from their attributes only. */
private static TreeSet<String> linkTags0;

/** Tag names that are processed by {@code scrapeTag1}, i.e. together with their enclosed text. */
private static TreeSet<String> linkTags1;

static {
    // configure the collator before any set that sorts with it is filled
    insensitiveCollator.setStrength(Collator.SECONDARY);
    insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);

    // "embed" and "param" were added by [MN]
    final String[] attributeOnlyTags = {
        "img", "base", "frame", "meta", "area", "link", "embed", "param"
    };
    linkTags0 = new TreeSet<String>(insensitiveCollator);
    for (String tag : attributeOnlyTags) {
        linkTags0.add(tag);
    }

    final String[] textCarryingTags = { "a", "h1", "h2", "h3", "h4", "title" };
    linkTags1 = new TreeSet<String>(insensitiveCollator);
    for (String tag : textCarryingTags) {
        linkTags1.add(tag);
    }
}
103:
104: // class variables: collectors for links
105: private HashMap<yacyURL, String> anchors;
106: private TreeSet<htmlFilterImageEntry> images; // String(absolute url)/ImageEntry relation
107: private HashMap<String, String> metas;
108: private String title;
109: //private String headline;
110: private List<String>[] headlines;
111: private serverCharBuffer content;
112: private EventListenerList htmlFilterEventListeners = new EventListenerList();
113:
114: /**
115: * {@link URL} to the favicon that belongs to the document
116: */
117: private yacyURL favicon;
118:
119: /**
120: * The document root {@link URL}
121: */
122: private yacyURL root;
123:
124: @SuppressWarnings("unchecked")
125: public htmlFilterContentScraper(yacyURL root) {
126: // the root value here will not be used to load the resource.
127: // it is only the reference for relative links
128: super (linkTags0, linkTags1);
129: this .root = root;
130: this .anchors = new HashMap<yacyURL, String>();
131: this .images = new TreeSet<htmlFilterImageEntry>();
132: this .metas = new HashMap<String, String>();
133: this .title = "";
134: this .headlines = new ArrayList[4];
135: for (int i = 0; i < 4; i++)
136: headlines[i] = new ArrayList<String>();
137: this .content = new serverCharBuffer(1024);
138: }
139:
140: public final static boolean punctuation(char c) {
141: return (c == '.') || (c == '!') || (c == '?');
142: }
143:
144: public void scrapeText(char[] newtext, String insideTag) {
145: // System.out.println("SCRAPE: " + new String(newtext));
146: serverCharBuffer b = super .stripAll(
147: new serverCharBuffer(newtext, newtext.length + 1))
148: .trim();
149: if ((insideTag != null) && (!(insideTag.equals("a")))) {
150: // texts inside tags sometimes have no punctuation at the line end
151: // this is bad for the text sematics, because it is not possible for the
152: // condenser to distinguish headlines from text beginnings.
153: // to make it easier for the condenser, a dot ('.') is appended in case that
154: // no punctuation is part of the newtext line
155: if ((b.length() != 0)
156: && (!(punctuation(b.charAt(b.length() - 1)))))
157: b.append((int) '.');
158: //System.out.println("*** Appended dot: " + b.toString());
159: }
160: if (b.length() != 0)
161: content.append(b).append(32);
162: }
163:
164: public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
165:
166: public static String[] urlComps(String normalizedURL) {
167: int p = normalizedURL.indexOf("//");
168: if (p > 0)
169: normalizedURL = normalizedURL.substring(p + 2);
170: return normalizedURL.toLowerCase().split(splitrex); // word components of the url
171: }
172:
173: private yacyURL absolutePath(String relativePath) {
174: try {
175: return yacyURL.newURL(root, relativePath);
176: } catch (Exception e) {
177: return null;
178: }
179: }
180:
181: public void scrapeTag0(String tagname, Properties tagopts) {
182: if (tagname.equalsIgnoreCase("img")) {
183: int width = -1, height = -1;
184: try {
185: width = Integer.parseInt(tagopts.getProperty("width",
186: "-1"));
187: height = Integer.parseInt(tagopts.getProperty("height",
188: "-1"));
189: } catch (NumberFormatException e) {
190: }
191: yacyURL url = absolutePath(tagopts.getProperty("src", ""));
192: htmlFilterImageEntry ie = new htmlFilterImageEntry(url,
193: tagopts.getProperty("alt", ""), width, height);
194: images.add(ie);
195: }
196: if (tagname.equalsIgnoreCase("base"))
197: try {
198: root = new yacyURL(tagopts.getProperty("href", ""),
199: null);
200: } catch (MalformedURLException e) {
201: }
202: if (tagname.equalsIgnoreCase("frame")) {
203: anchors.put(absolutePath(tagopts.getProperty("src", "")),
204: tagopts.getProperty("name", ""));
205: }
206: if (tagname.equalsIgnoreCase("meta")) {
207: String name = tagopts.getProperty("name", "");
208: if (name.length() > 0) {
209: metas.put(name.toLowerCase(), tagopts.getProperty(
210: "content", ""));
211: } else {
212: name = tagopts.getProperty("http-equiv", "");
213: if (name.length() > 0) {
214: metas.put(name.toLowerCase(), tagopts.getProperty(
215: "content", ""));
216: }
217: }
218: }
219: if (tagname.equalsIgnoreCase("area")) {
220: String areatitle = cleanLine(tagopts.getProperty("title",
221: ""));
222: //String alt = tagopts.getProperty("alt","");
223: String href = tagopts.getProperty("href", "");
224: if (href.length() > 0)
225: anchors.put(absolutePath(href), areatitle);
226: }
227: if (tagname.equalsIgnoreCase("link")) {
228: yacyURL newLink = absolutePath(tagopts.getProperty("href",
229: ""));
230:
231: if (newLink != null) {
232: String type = tagopts.getProperty("rel", "");
233: String linktitle = tagopts.getProperty("title", "");
234:
235: if (type.equalsIgnoreCase("shortcut icon")) {
236: htmlFilterImageEntry ie = new htmlFilterImageEntry(
237: newLink, linktitle, -1, -1);
238: images.add(ie);
239: this .favicon = newLink;
240: } else if (!type.equalsIgnoreCase("stylesheet")
241: && !type
242: .equalsIgnoreCase("alternate stylesheet")) {
243: anchors.put(newLink, linktitle);
244: }
245: }
246: }
247: //start contrib [MN]
248: if (tagname.equalsIgnoreCase("embed")) {
249: anchors.put(absolutePath(tagopts.getProperty("src", "")),
250: tagopts.getProperty("name", ""));
251: }
252: if (tagname.equalsIgnoreCase("param")) {
253: String name = tagopts.getProperty("name", "");
254: if (name.equalsIgnoreCase("movie")) {
255: anchors.put(absolutePath(tagopts.getProperty("value",
256: "")), name);
257: }
258: }
259: //end contrib [MN]
260:
261: // fire event
262: fireScrapeTag0(tagname, tagopts);
263: }
264:
265: public void scrapeTag1(String tagname, Properties tagopts,
266: char[] text) {
267: // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
268: if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) {
269: String href = tagopts.getProperty("href", "");
270: if (href.length() > 0)
271: anchors.put(absolutePath(href), super .stripAll(
272: new serverCharBuffer(text)).trim().toString());
273: }
274: String h;
275: if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
276: h = cleanLine(super .stripAll(new serverCharBuffer(text))
277: .toString());
278: if (h.length() > 0)
279: headlines[0].add(h);
280: }
281: if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
282: h = cleanLine(super .stripAll(new serverCharBuffer(text))
283: .toString());
284: if (h.length() > 0)
285: headlines[1].add(h);
286: }
287: if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
288: h = cleanLine(super .stripAll(new serverCharBuffer(text))
289: .toString());
290: if (h.length() > 0)
291: headlines[2].add(h);
292: }
293: if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
294: h = cleanLine(super .stripAll(new serverCharBuffer(text))
295: .toString());
296: if (h.length() > 0)
297: headlines[3].add(h);
298: }
299: if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
300: title = cleanLine(super
301: .stripAll(new serverCharBuffer(text)).toString());
302: }
303:
304: // fire event
305: fireScrapeTag1(tagname, tagopts, text);
306: }
307:
308: private static String cleanLine(String s) {
309: /*
310: // may contain too many funny symbols
311: for (int i = 0; i < s.length(); i++)
312: if (s.charAt(i) < ' ') s = s.substring(0, i) + " " + s.substring(i + 1);
313: */
314:
315: int p;
316:
317: // CR/LF entfernen, dabei koennen doppelte Leerzeichen enstehen die aber weiter unten entfernt werden - thq
318: while ((p = s.indexOf("\n")) >= 0)
319: s = s.substring(0, p)
320: + ((p + 1 == s.length()) ? "" : " "
321: + s.substring(p + 1));
322:
323: // remove double-spaces
324: while ((p = s.indexOf(" ")) >= 0)
325: s = s.substring(0, p) + s.substring(p + 1);
326:
327: // we don't accept headlines that are too short
328: s = s.trim();
329: if (s.length() < 4)
330: s = "";
331:
332: // return result
333: return s;
334: }
335:
336: public String getTitle() {
337: // construct a title string, even if the document has no title
338:
339: // some documents have a title tag as meta tag
340: String s = (String) metas.get("title");
341:
342: // try to construct the title with the content of the title tag
343: if (title.length() > 0) {
344: if (s == null) {
345: return title;
346: } else {
347: if ((title.compareToIgnoreCase(s) == 0)
348: || (title.indexOf(s) >= 0))
349: return s;
350: else
351: return title + ": " + s;
352: }
353: } else {
354: if (s != null) {
355: return s;
356: }
357: }
358:
359: // otherwise take any headline
360: for (int i = 0; i < 4; i++) {
361: if (headlines[i].size() > 0)
362: return (String) headlines[i].get(0);
363: }
364:
365: // take description tag
366: s = getDescription();
367: if (s.length() > 0)
368: return s;
369:
370: // extract headline from content
371: if (content.length() > 80) {
372: return cleanLine(new String(content.getChars(), 0, 80));
373: }
374: return cleanLine(content.trim().toString());
375: }
376:
377: public String[] getHeadlines(int i) {
378: assert ((i >= 1) && (i <= 4));
379: String[] s = new String[headlines[i - 1].size()];
380: for (int j = 0; j < headlines[i - 1].size(); j++)
381: s[j] = (String) headlines[i - 1].get(j);
382: return s;
383: }
384:
385: public byte[] getText() {
386: return this .getText("UTF-8");
387: }
388:
389: public byte[] getText(String charSet) {
390: try {
391: return content.toString().getBytes(charSet);
392: } catch (UnsupportedEncodingException e) {
393: return content.toString().getBytes();
394: }
395: }
396:
397: public Map<yacyURL, String> getAnchors() {
398: // returns a url (String) / name (String) relation
399: return anchors;
400: }
401:
402: public TreeSet<htmlFilterImageEntry> getImages() {
403: // this resturns a String(absolute url)/htmlFilterImageEntry - relation
404: return images;
405: }
406:
407: public Map<String, String> getMetas() {
408: return metas;
409: }
410:
411: /**
412: * @return the {@link URL} to the favicon that belongs to the document
413: */
414: public yacyURL getFavicon() {
415: return this .favicon;
416: }
417:
418: /*
419: DC in html example:
420: <meta name="DC.title" lang="en" content="Expressing Dublin Core in HTML/XHTML meta and link elements" />
421: <meta name="DC.creator" content="Andy Powell, UKOLN, University of Bath" />
422: <meta name="DC.identifier" scheme="DCTERMS.URI" content="http://dublincore.org/documents/dcq-html/" />
423: <meta name="DC.format" scheme="DCTERMS.IMT" content="text/html" />
424: <meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
425: */
426:
427: public String getDescription() {
428: String s = metas.get("description");
429: if (s == null)
430: s = metas.get("DC.description");
431: if (s == null)
432: return "";
433: else
434: return s;
435: }
436:
437: public String getContentType() {
438: String s = metas.get("content-type");
439: if (s == null)
440: return "";
441: else
442: return s;
443: }
444:
445: public String getAuthor() {
446: String s = metas.get("author");
447: if (s == null)
448: s = metas.get("copyright");
449: if (s == null)
450: s = metas.get("DC.creator");
451: if (s == null)
452: return "";
453: return s;
454: }
455:
456: public String[] getContentLanguages() {
457: String s = metas.get("content-language");
458: if (s == null)
459: s = metas.get("DC.language");
460: if (s == null)
461: s = "";
462: return s.split(" |,");
463: }
464:
465: public String[] getKeywords() {
466: String s = metas.get("keywords");
467: if (s == null)
468: s = metas.get("DC.description");
469: if (s == null)
470: s = "";
471: if (s.length() == 0) {
472: return getTitle().toLowerCase().split(splitrex);
473: } else {
474: return s.split(" |,");
475: }
476: }
477:
478: public int getRefreshSeconds() {
479: String s = (String) metas.get("refresh");
480: if (s == null)
481: return 9999;
482: else
483: try {
484: int pos = s.indexOf(';');
485: if (pos < 0)
486: return 9999;
487: int i = Integer.parseInt(s.substring(0, pos));
488: return i;
489: } catch (NumberFormatException e) {
490: return 9999;
491: }
492: }
493:
494: public String getRefreshPath() {
495: String s = (String) metas.get("refresh");
496: if (s == null)
497: return "";
498: else {
499: int pos = s.indexOf(';');
500: if (pos < 0)
501: return "";
502: s = s.substring(pos + 1);
503: if (s.toLowerCase().startsWith("url="))
504: return s.substring(4).trim();
505: else
506: return "";
507: }
508: }
509:
510: /*
511: * (non-Javadoc)
512: * @see de.anomic.htmlFilter.htmlFilterScraper#close()
513: */
514: public void close() {
515: // free resources
516: super .close();
517: anchors = null;
518: images = null;
519: title = null;
520: headlines = null;
521: content = null;
522: root = null;
523: }
524:
525: public void print() {
526: System.out.println("TITLE :" + title);
527: for (int i = 0; i < 4; i++) {
528: System.out.println("HEADLINE" + i + ":"
529: + headlines[i].toString());
530: }
531: System.out.println("ANCHORS :" + anchors.toString());
532: System.out.println("IMAGES :" + images.toString());
533: System.out.println("METAS :" + metas.toString());
534: System.out.println("TEXT :" + content.toString());
535: }
536:
537: public void registerHtmlFilterEventListener(
538: htmlFilterEventListener listener) {
539: if (listener != null) {
540: this .htmlFilterEventListeners.add(
541: htmlFilterEventListener.class, listener);
542: }
543: }
544:
545: public void deregisterHtmlFilterEventListener(
546: htmlFilterEventListener listener) {
547: if (listener != null) {
548: this .htmlFilterEventListeners.remove(
549: htmlFilterEventListener.class, listener);
550: }
551: }
552:
553: void fireScrapeTag0(String tagname, Properties tagopts) {
554: Object[] listeners = this .htmlFilterEventListeners
555: .getListenerList();
556: for (int i = 0; i < listeners.length; i += 2) {
557: if (listeners[i] == htmlFilterEventListener.class) {
558: ((htmlFilterEventListener) listeners[i + 1])
559: .scrapeTag0(tagname, tagopts);
560: }
561: }
562: }
563:
564: void fireScrapeTag1(String tagname, Properties tagopts, char[] text) {
565: Object[] listeners = this .htmlFilterEventListeners
566: .getListenerList();
567: for (int i = 0; i < listeners.length; i += 2) {
568: if (listeners[i] == htmlFilterEventListener.class) {
569: ((htmlFilterEventListener) listeners[i + 1])
570: .scrapeTag1(tagname, tagopts, text);
571: }
572: }
573: }
574:
575: public static htmlFilterContentScraper parseResource(
576: yacyURL location) throws IOException {
577: // load page
578: byte[] page = httpc.wget(location, location.getHost(), 10000,
579: null, null,
580: plasmaSwitchboard.getSwitchboard().remoteProxyConfig,
581: null, null);
582: if (page == null)
583: throw new IOException("no response from url "
584: + location.toString());
585:
586: // scrape content
587: htmlFilterContentScraper scraper = new htmlFilterContentScraper(
588: location);
589: Writer writer = new htmlFilterWriter(null, null, scraper, null,
590: false);
591: serverFileUtils.copy(new ByteArrayInputStream(page), writer,
592: "UTF-8");
593:
594: return scraper;
595: }
596: }
|