001: // plasmaCrawlZURL.java
002: // (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
003: // first published 15.03.2007 on http://www.anomic.de
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.plasma;
028:
029: import java.io.File;
030: import java.io.IOException;
031: import java.util.Date;
032: import java.util.Iterator;
033: import java.util.LinkedList;
034:
035: import de.anomic.kelondro.kelondroBase64Order;
036: import de.anomic.kelondro.kelondroEcoTable;
037: import de.anomic.kelondro.kelondroFlexTable;
038: import de.anomic.kelondro.kelondroIndex;
039: import de.anomic.kelondro.kelondroRow;
040: import de.anomic.kelondro.kelondroRowSet;
041: import de.anomic.yacy.yacyCore;
042: import de.anomic.yacy.yacySeedDB;
043: import de.anomic.yacy.yacyURL;
044:
045: public class plasmaCrawlZURL {
046:
047: private static final int EcoFSBufferSize = 200;
048:
049: public final static kelondroRow rowdef = new kelondroRow(
050: "String urlhash-" + yacySeedDB.commonHashLength
051: + ", "
052: + // the url's hash
053: "String executor-" + yacySeedDB.commonHashLength
054: + ", "
055: + // the crawling executor
056: "Cardinal workdate-8 {b256}, "
057: + // the time when the url was last time tried to load
058: "Cardinal workcount-4 {b256}, "
059: + // number of load retries
060: "String anycause-80, "
061: + // string describing load failure
062: "byte[] entry-"
063: + plasmaCrawlEntry.rowdef.objectsize, // extra space
064: kelondroBase64Order.enhancedCoder, 0);
065:
066: // the class object
067: private kelondroIndex urlIndex = null;
068: private LinkedList<String> stack = new LinkedList<String>(); // strings: url
069:
070: public plasmaCrawlZURL(File cachePath, String tablename,
071: boolean startWithEmptyFile) {
072: // creates a new ZURL in a file
073: cachePath.mkdirs();
074: File f = new File(cachePath, tablename);
075: if (startWithEmptyFile) {
076: if (f.exists()) {
077: if (f.isDirectory())
078: kelondroFlexTable.delete(cachePath, tablename);
079: else
080: f.delete();
081: }
082: }
083: urlIndex = new kelondroEcoTable(f, rowdef,
084: kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
085: //urlIndex = new kelondroFlexTable(cachePath, tablename, -1, rowdef, 0, true);
086: }
087:
088: public plasmaCrawlZURL() {
089: // creates a new ZUR in RAM
090: urlIndex = new kelondroRowSet(rowdef, 0);
091: }
092:
093: public int size() {
094: return urlIndex.size();
095: }
096:
097: public void close() {
098: if (urlIndex != null) {
099: urlIndex.close();
100: urlIndex = null;
101: }
102: }
103:
104: public synchronized Entry newEntry(plasmaCrawlEntry bentry,
105: String executor, Date workdate, int workcount,
106: String anycause) {
107: if ((executor == null)
108: || (executor.length() < yacySeedDB.commonHashLength))
109: executor = yacyURL.dummyHash;
110: if (anycause == null)
111: anycause = "unknown";
112: return new Entry(bentry, executor, workdate, workcount,
113: anycause);
114: }
115:
116: public synchronized Entry newEntry(yacyURL url, String anycause) {
117: return new Entry(url, anycause);
118: }
119:
120: public boolean remove(String hash) {
121: if (hash == null)
122: return false;
123: try {
124: urlIndex.remove(hash.getBytes(), false);
125: return true;
126: } catch (IOException e) {
127: return false;
128: }
129: }
130:
131: public synchronized void push(Entry e) {
132: stack.add(e.hash());
133: }
134:
135: public Entry top(int pos) {
136: String urlhash = (String) stack.get(pos);
137: if (urlhash == null)
138: return null;
139: return getEntry(urlhash);
140: }
141:
142: public synchronized Entry getEntry(String urlhash) {
143: try {
144: kelondroRow.Entry entry = urlIndex.get(urlhash.getBytes());
145: if (entry == null)
146: return null;
147: return new Entry(entry);
148: } catch (IOException e) {
149: e.printStackTrace();
150: return null;
151: }
152: }
153:
154: public boolean exists(String urlHash) {
155: try {
156: return urlIndex.has(urlHash.getBytes());
157: } catch (IOException e) {
158: return false;
159: }
160: }
161:
162: public void clearStack() {
163: stack.clear();
164: }
165:
166: public int stackSize() {
167: return stack.size();
168: }
169:
170: public class Entry {
171:
172: plasmaCrawlEntry bentry; // the balancer entry
173: private String executor; // the crawling initiator
174: private Date workdate; // the time when the url was last time tried to load
175: private int workcount; // number of tryings
176: private String anycause; // string describing reason for load fail
177: private boolean stored;
178:
179: public Entry(yacyURL url, String reason) {
180: this (new plasmaCrawlEntry(url), null, new Date(), 0, reason);
181: }
182:
183: public Entry(plasmaCrawlEntry bentry, String executor,
184: Date workdate, int workcount, String anycause) {
185: // create new entry
186: assert bentry != null;
187: this .bentry = bentry;
188: this .executor = (executor == null) ? yacyCore.seedDB
189: .mySeed().hash : executor;
190: this .workdate = (workdate == null) ? new Date() : workdate;
191: this .workcount = workcount;
192: this .anycause = (anycause == null) ? "" : anycause;
193: stored = false;
194: }
195:
196: public Entry(kelondroRow.Entry entry) throws IOException {
197: insertEntry(entry);
198: this .stored = true;
199: }
200:
201: private void insertEntry(kelondroRow.Entry entry)
202: throws IOException {
203: assert (entry != null);
204: this .executor = entry.getColString(1, "UTF-8");
205: this .workdate = new Date(entry.getColLong(2));
206: this .workcount = (int) entry.getColLong(3);
207: this .anycause = entry.getColString(4, "UTF-8");
208: this .bentry = new plasmaCrawlEntry(plasmaCrawlEntry.rowdef
209: .newEntry(entry.getColBytes(5)));
210: assert ((new String(entry.getColBytes(0))).equals(bentry
211: .url().hash()));
212: return;
213: }
214:
215: public void store() {
216: // stores the values from the object variables into the database
217: if (this .stored)
218: return;
219: if (this .bentry == null)
220: return;
221: kelondroRow.Entry newrow = rowdef.newEntry();
222: newrow.setCol(0, this .bentry.url().hash().getBytes());
223: newrow.setCol(1, this .executor.getBytes());
224: newrow.setCol(2, this .workdate.getTime());
225: newrow.setCol(3, this .workcount);
226: newrow.setCol(4, this .anycause.getBytes());
227: newrow.setCol(5, this .bentry.toRow().bytes());
228: try {
229: urlIndex.put(newrow);
230: this .stored = true;
231: } catch (IOException e) {
232: System.out
233: .println("INTERNAL ERROR AT plasmaEURL:url2hash:"
234: + e.toString());
235: }
236: }
237:
238: public yacyURL url() {
239: return this .bentry.url();
240: }
241:
242: public String initiator() {
243: return this .bentry.initiator();
244: }
245:
246: public String hash() {
247: // return a url-hash, based on the md5 algorithm
248: // the result is a String of 12 bytes within a 72-bit space
249: // (each byte has an 6-bit range)
250: // that should be enough for all web pages on the world
251: return this .bentry.url().hash();
252: }
253:
254: public Date workdate() {
255: return workdate;
256: }
257:
258: public String executor() {
259: // return the creator's hash
260: return executor;
261: }
262:
263: public String anycause() {
264: return anycause;
265: }
266:
267: }
268:
269: public class kiter implements Iterator<Entry> {
270: // enumerates entry elements
271: Iterator<kelondroRow.Entry> i;
272: boolean error = false;
273:
274: public kiter(boolean up, String firstHash) throws IOException {
275: i = urlIndex.rows(up, (firstHash == null) ? null
276: : firstHash.getBytes());
277: error = false;
278: }
279:
280: public boolean hasNext() {
281: if (error)
282: return false;
283: return i.hasNext();
284: }
285:
286: public Entry next() throws RuntimeException {
287: kelondroRow.Entry e = (kelondroRow.Entry) i.next();
288: if (e == null)
289: return null;
290: try {
291: return new Entry(e);
292: } catch (IOException ex) {
293: throw new RuntimeException("error '" + ex.getMessage()
294: + "' for hash " + e.getColString(0, null));
295: }
296: }
297:
298: public void remove() {
299: i.remove();
300: }
301:
302: }
303:
304: public Iterator<Entry> entries(boolean up, String firstHash)
305: throws IOException {
306: // enumerates entry elements
307: return new kiter(up, firstHash);
308: }
309: }
|