001: // plasmaNURL.java
002: // -----------------------
003: // part of YaCy
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2004
007: // last major change: 09.08.2004
008: //
009: // This program is free software; you can redistribute it and/or modify
010: // it under the terms of the GNU General Public License as published by
011: // the Free Software Foundation; either version 2 of the License, or
012: // (at your option) any later version.
013: //
014: // This program is distributed in the hope that it will be useful,
015: // but WITHOUT ANY WARRANTY; without even the implied warranty of
016: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: // GNU General Public License for more details.
018: //
019: // You should have received a copy of the GNU General Public License
020: // along with this program; if not, write to the Free Software
021: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: //
023: // Using this software in any meaning (reading, learning, copying, compiling,
024: // running) means that you agree that the Author(s) is (are) not responsible
025: // for cost, loss of data or any harm that may be caused directly or indirectly
026: // by usage of this softare or this documentation. The usage of this software
027: // is on your own risk. The installation and usage (starting/running) of this
028: // software may allow other people or application to access your computer and
029: // any attached devices and is highly dependent on the configuration of the
030: // software which must be done by the user of the software; the author(s) is
031: // (are) also not responsible for proper configuration and usage of the
032: // software, even if provoked by documentation provided together with
033: // the software.
034: //
035: // Any changes to this file according to the GPL as documented in the file
036: // gpl.txt aside this file in the shipment you received can be done to the
037: // lines that follows this copyright notice here, but changes must not be
038: // done inside the copyright notive above. A re-distribution must contain
039: // the intact and unchanged copyright notice.
040: // Contributions and changes to the program code must be marked as such.
041:
042: // NURL - noticed (known but not loaded) URL's
043:
044: package de.anomic.plasma;
045:
046: import java.io.File;
047: import java.io.IOException;
048: import java.util.ArrayList;
049: import java.util.HashSet;
050: import java.util.Iterator;
051:
052: public class plasmaCrawlNURL {
053:
054: public static final int STACK_TYPE_NULL = 0; // do not stack
055: public static final int STACK_TYPE_CORE = 1; // put on local stack
056: public static final int STACK_TYPE_LIMIT = 2; // put on global stack
057: public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled
058: public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack
059: public static final int STACK_TYPE_IMAGE = 11; // put on image stack
060: public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
061: public static final int STACK_TYPE_MUSIC = 13; // put on music stack
062:
063: private static final long minimumLocalDelta = 50; // the minimum time difference between access of the same local domain
064: private static final long minimumGlobalDelta = 500; // the minimum time difference between access of the same global domain
065: private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt
066:
067: private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
068: private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
069: private final plasmaCrawlBalancer remoteStack; // links from remote crawl orders
070:
071: //private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1
072: //private kelondroStack imageStack; // links pointing to image resources
073: //private kelondroStack movieStack; // links pointing to movie resources
074: //private kelondroStack musicStack; // links pointing to music resources
075:
076: public plasmaCrawlNURL(File cachePath) {
077: super ();
078: coreStack = new plasmaCrawlBalancer(cachePath,
079: "urlNoticeCoreStack", false);
080: limitStack = new plasmaCrawlBalancer(cachePath,
081: "urlNoticeLimitStack", false);
082: //overhangStack = new plasmaCrawlBalancer(overhangStackFile);
083: remoteStack = new plasmaCrawlBalancer(cachePath,
084: "urlNoticeRemoteStack", false);
085: }
086:
087: public void close() {
088: coreStack.close();
089: limitStack.close();
090: //overhangStack.close();
091: remoteStack.close();
092: }
093:
094: public boolean notEmpty() {
095: return coreStack.notEmpty() || limitStack.notEmpty()
096: || remoteStack.notEmpty();
097: }
098:
099: public int size() {
100: // this does not count the overhang stack size
101: return coreStack.size() + limitStack.size()
102: + remoteStack.size();
103: }
104:
105: public int stackSize(int stackType) {
106: switch (stackType) {
107: case STACK_TYPE_CORE:
108: return coreStack.size();
109: case STACK_TYPE_LIMIT:
110: return limitStack.size();
111: case STACK_TYPE_OVERHANG:
112: return 0;
113: case STACK_TYPE_REMOTE:
114: return remoteStack.size();
115: default:
116: return -1;
117: }
118: }
119:
120: public boolean existsInStack(String urlhash) {
121: return coreStack.has(urlhash) || limitStack.has(urlhash) ||
122: //overhangStack.has(urlhash) ||
123: remoteStack.has(urlhash);
124: }
125:
126: public void push(int stackType, plasmaCrawlEntry entry) {
127: try {
128: switch (stackType) {
129: case STACK_TYPE_CORE:
130: coreStack.push(entry);
131: break;
132: case STACK_TYPE_LIMIT:
133: limitStack.push(entry);
134: break;
135: case STACK_TYPE_REMOTE:
136: remoteStack.push(entry);
137: break;
138: default:
139: break;
140: }
141: } catch (IOException er) {
142: }
143: }
144:
145: public plasmaCrawlEntry get(String urlhash) {
146: plasmaCrawlEntry entry = null;
147: try {
148: if ((entry = coreStack.get(urlhash)) != null)
149: return entry;
150: } catch (IOException e) {
151: }
152: try {
153: if ((entry = limitStack.get(urlhash)) != null)
154: return entry;
155: } catch (IOException e) {
156: }
157: try {
158: if ((entry = remoteStack.get(urlhash)) != null)
159: return entry;
160: } catch (IOException e) {
161: }
162: return null;
163: }
164:
165: public plasmaCrawlEntry removeByURLHash(String urlhash) {
166: plasmaCrawlEntry entry = null;
167: try {
168: if ((entry = coreStack.remove(urlhash)) != null)
169: return entry;
170: } catch (IOException e) {
171: }
172: try {
173: if ((entry = limitStack.remove(urlhash)) != null)
174: return entry;
175: } catch (IOException e) {
176: }
177: try {
178: if ((entry = remoteStack.remove(urlhash)) != null)
179: return entry;
180: } catch (IOException e) {
181: }
182: return null;
183: }
184:
185: public int removeByProfileHandle(String handle) {
186: int removed = 0;
187: try {
188: removed += coreStack.removeAllByProfileHandle(handle);
189: } catch (IOException e) {
190: }
191: try {
192: removed += limitStack.removeAllByProfileHandle(handle);
193: } catch (IOException e) {
194: }
195: try {
196: removed += remoteStack.removeAllByProfileHandle(handle);
197: } catch (IOException e) {
198: }
199: return removed;
200: }
201:
202: public plasmaCrawlEntry[] top(int stackType, int count) {
203: switch (stackType) {
204: case STACK_TYPE_CORE:
205: return top(coreStack, count);
206: case STACK_TYPE_LIMIT:
207: return top(limitStack, count);
208: case STACK_TYPE_REMOTE:
209: return top(remoteStack, count);
210: default:
211: return null;
212: }
213: }
214:
215: public plasmaCrawlEntry pop(int stackType, boolean delay)
216: throws IOException {
217: switch (stackType) {
218: case STACK_TYPE_CORE:
219: return pop(coreStack, delay);
220: case STACK_TYPE_LIMIT:
221: return pop(limitStack, delay);
222: case STACK_TYPE_REMOTE:
223: return pop(remoteStack, delay);
224: default:
225: return null;
226: }
227: }
228:
229: public void shift(int fromStack, int toStack) {
230: try {
231: plasmaCrawlEntry entry = pop(fromStack, false);
232: if (entry != null)
233: push(toStack, entry);
234: } catch (IOException e) {
235: return;
236: }
237: }
238:
239: public void clear(int stackType) {
240: switch (stackType) {
241: case STACK_TYPE_CORE:
242: coreStack.clear();
243: break;
244: case STACK_TYPE_LIMIT:
245: limitStack.clear();
246: break;
247: case STACK_TYPE_REMOTE:
248: remoteStack.clear();
249: break;
250: default:
251: return;
252: }
253: }
254:
255: private plasmaCrawlEntry pop(plasmaCrawlBalancer balancer,
256: boolean delay) throws IOException {
257: // this is a filo - pop
258: int s;
259: plasmaCrawlEntry entry;
260: synchronized (balancer) {
261: while ((s = balancer.size()) > 0) {
262: entry = balancer
263: .pop((delay) ? minimumLocalDelta : 0,
264: (delay) ? minimumGlobalDelta : 0,
265: maximumDomAge);
266: if (entry == null) {
267: if (s > balancer.size())
268: continue;
269: int aftersize = balancer.size();
270: balancer.clear(); // the balancer is broken and cannot shrink
271: throw new IOException(
272: "entry is null, balancer cannot shrink (bevore pop = "
273: + s + ", after pop = " + aftersize
274: + "); reset of balancer");
275: }
276: return entry;
277: }
278: }
279: throw new IOException("balancer stack is empty");
280: }
281:
282: private plasmaCrawlEntry[] top(plasmaCrawlBalancer balancer,
283: int count) {
284: // this is a filo - top
285: if (count > balancer.size())
286: count = balancer.size();
287: ArrayList<plasmaCrawlEntry> list = new ArrayList<plasmaCrawlEntry>(
288: count);
289: for (int i = 0; i < count; i++) {
290: try {
291: plasmaCrawlEntry entry = balancer.top(i);
292: if (entry == null)
293: break;
294: list.add(entry);
295: } catch (IOException e) {
296: break;
297: }
298: }
299: return (plasmaCrawlEntry[]) list
300: .toArray(new plasmaCrawlEntry[list.size()]);
301: }
302:
303: public Iterator<plasmaCrawlEntry> iterator(int stackType) {
304: // returns an iterator of plasmaCrawlBalancerEntry Objects
305: try {
306: switch (stackType) {
307: case STACK_TYPE_CORE:
308: return coreStack.iterator();
309: case STACK_TYPE_LIMIT:
310: return limitStack.iterator();
311: case STACK_TYPE_REMOTE:
312: return remoteStack.iterator();
313: default:
314: return null;
315: }
316: } catch (IOException e) {
317: return new HashSet<plasmaCrawlEntry>().iterator();
318: }
319: }
320:
321: }
|