001: // plasmaSearchImages.java
002: // -----------------------
003: // part of YACY
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2006
007: // Created: 04.04.2006
008: //
009: // This program is free software; you can redistribute it and/or modify
010: // it under the terms of the GNU General Public License as published by
011: // the Free Software Foundation; either version 2 of the License, or
012: // (at your option) any later version.
013: //
014: // This program is distributed in the hope that it will be useful,
015: // but WITHOUT ANY WARRANTY; without even the implied warranty of
016: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: // GNU General Public License for more details.
018: //
019: // You should have received a copy of the GNU General Public License
020: // along with this program; if not, write to the Free Software
021: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: //
023: // Using this software in any meaning (reading, learning, copying, compiling,
024: // running) means that you agree that the Author(s) is (are) not responsible
025: // for cost, loss of data or any harm that may be caused directly or indirectly
026: // by usage of this softare or this documentation. The usage of this software
027: // is on your own risk. The installation and usage (starting/running) of this
028: // software may allow other people or application to access your computer and
029: // any attached devices and is highly dependent on the configuration of the
030: // software which must be done by the user of the software; the author(s) is
031: // (are) also not responsible for proper configuration and usage of the
032: // software, even if provoked by documentation provided together with
033: // the software.
034: //
035: // Any changes to this file according to the GPL as documented in the file
036: // gpl.txt aside this file in the shipment you received can be done to the
037: // lines that follows this copyright notice here, but changes must not be
038: // done inside the copyright notive above. A re-distribution must contain
039: // the intact and unchanged copyright notice.
040: // Contributions and changes to the program code must be marked as such.
041:
042: package de.anomic.plasma;
043:
044: import java.io.InputStream;
045: import java.net.MalformedURLException;
046: import java.util.Iterator;
047: import java.util.TreeSet;
048:
049: import de.anomic.htmlFilter.htmlFilterImageEntry;
050: import de.anomic.plasma.parser.ParserException;
051: import de.anomic.server.serverDate;
052: import de.anomic.yacy.yacyURL;
053:
054: public final class plasmaSearchImages {
055:
056: private TreeSet<htmlFilterImageEntry> images;
057:
058: public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
059: long start = System.currentTimeMillis();
060: this .images = new TreeSet<htmlFilterImageEntry>();
061: if (maxTime > 10) {
062: Object[] resource = plasmaSnippetCache.getResource(url,
063: true, (int) maxTime, false);
064: InputStream res = (InputStream) resource[0];
065: Long resLength = (Long) resource[1];
066: if (res != null) {
067: plasmaParserDocument document = null;
068: try {
069: // parse the document
070: document = plasmaSnippetCache.parseDocument(url,
071: resLength.longValue(), res);
072: } catch (ParserException e) {
073: // parsing failed
074: } finally {
075: try {
076: res.close();
077: } catch (Exception e) {/* ignore this */
078: }
079: }
080: if (document == null)
081: return;
082:
083: // add the image links
084: this .addAll(document.getImages());
085:
086: // add also links from pages one step deeper, if depth > 0
087: if (depth > 0) {
088: Iterator<yacyURL> i = document.getHyperlinks()
089: .keySet().iterator();
090: String nexturlstring;
091: while (i.hasNext()) {
092: try {
093: nexturlstring = i.next().toNormalform(true,
094: true);
095: addAll(new plasmaSearchImages(serverDate
096: .remainingTime(start, maxTime, 10),
097: new yacyURL(nexturlstring, null),
098: depth - 1));
099: } catch (MalformedURLException e1) {
100: e1.printStackTrace();
101: }
102: }
103: }
104: document.close();
105: }
106: }
107: }
108:
109: public void addAll(plasmaSearchImages m) {
110: synchronized (m.images) {
111: addAll(m.images);
112: }
113: }
114:
115: private void addAll(TreeSet<htmlFilterImageEntry> ts) {
116: Iterator<htmlFilterImageEntry> i = ts.iterator();
117: htmlFilterImageEntry ie;
118: while (i.hasNext()) {
119: ie = i.next();
120: if (images.contains(ie)) {
121: if ((ie.height() > 0) && (ie.width() > 0))
122: images.add(ie);
123: } else {
124: images.add(ie);
125: }
126: }
127: }
128:
129: public Iterator<htmlFilterImageEntry> entries() {
130: // returns htmlFilterImageEntry - Objects
131: return images.iterator();
132: }
133:
134: }
|