001: // plasmaSwitchboardQueueEntry.java
002: // --------------------------------
003: // part of YaCy
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2005
007: //
008: // $LastChangedDate: 2008-01-19 00:40:19 +0000 (Sa, 19 Jan 2008) $
009: // $LastChangedRevision: 4343 $
010: // $LastChangedBy: orbiter $
011: //
012: // This program is free software; you can redistribute it and/or modify
013: // it under the terms of the GNU General Public License as published by
014: // the Free Software Foundation; either version 2 of the License, or
015: // (at your option) any later version.
016: //
017: // This program is distributed in the hope that it will be useful,
018: // but WITHOUT ANY WARRANTY; without even the implied warranty of
019: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: // GNU General Public License for more details.
021: //
022: // You should have received a copy of the GNU General Public License
023: // along with this program; if not, write to the Free Software
024: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: //
026: // Using this software in any meaning (reading, learning, copying, compiling,
027: // running) means that you agree that the Author(s) is (are) not responsible
028: // for cost, loss of data or any harm that may be caused directly or indirectly
029: // by usage of this softare or this documentation. The usage of this software
030: // is on your own risk. The installation and usage (starting/running) of this
031: // software may allow other people or application to access your computer and
032: // any attached devices and is highly dependent on the configuration of the
033: // software which must be done by the user of the software; the author(s) is
034: // (are) also not responsible for proper configuration and usage of the
035: // software, even if provoked by documentation provided together with
036: // the software.
037: //
038: // Any changes to this file according to the GPL as documented in the file
039: // gpl.txt aside this file in the shipment you received can be done to the
040: // lines that follows this copyright notice here, but changes must not be
041: // done inside the copyright notive above. A re-distribution must contain
042: // the intact and unchanged copyright notice.
043: // Contributions and changes to the program code must be marked as such.
044:
045: package de.anomic.plasma;
046:
047: import java.io.File;
048: import java.io.IOException;
049: import java.net.MalformedURLException;
050: import java.util.Date;
051: import java.util.Iterator;
052:
053: import de.anomic.index.indexURLEntry;
054: import de.anomic.kelondro.kelondroBase64Order;
055: import de.anomic.kelondro.kelondroNaturalOrder;
056: import de.anomic.kelondro.kelondroRow;
057: import de.anomic.kelondro.kelondroStack;
058: import de.anomic.plasma.cache.IResourceInfo;
059: import de.anomic.server.logging.serverLog;
060: import de.anomic.yacy.yacySeedDB;
061: import de.anomic.yacy.yacyURL;
062:
063: public class plasmaSwitchboardQueue {
064:
065: private kelondroStack sbQueueStack;
066: private plasmaCrawlProfile profiles;
067: private plasmaCrawlLURL lurls;
068: private File sbQueueStackPath;
069:
070: public plasmaSwitchboardQueue(plasmaCrawlLURL lurls,
071: File sbQueueStackPath, plasmaCrawlProfile profiles) {
072: this .sbQueueStackPath = sbQueueStackPath;
073: this .profiles = profiles;
074: this .lurls = lurls;
075:
076: initQueueStack();
077: }
078:
079: public static final kelondroRow rowdef = new kelondroRow(
080: "String url-256, "
081: + // the url
082: "String refhash-"
083: + yacySeedDB.commonHashLength
084: + ", "
085: + // the url's referrer hash
086: "Cardinal modifiedsince-11 {b64e}, "
087: + // from ifModifiedSince
088: "byte[] flags-1, "
089: + // flags
090: "String initiator-" + yacySeedDB.commonHashLength
091: + ", "
092: + // the crawling initiator
093: "Cardinal depth-2 {b64e}, "
094: + // the prefetch depth so far, starts at 0
095: "String profile-" + yacySeedDB.commonHashLength
096: + ", " + // the name of the prefetch profile handle
097: "String urldescr-80",
098: kelondroNaturalOrder.naturalOrder, 0);
099:
100: private void initQueueStack() {
101: sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef);
102: }
103:
104: /*
105: private void resetQueueStack() {
106: try {sbQueueStack.close();} catch (Exception e) {}
107: if (sbQueueStackPath.exists()) sbQueueStackPath.delete();
108: initQueueStack();
109: }
110: */
111: public int size() {
112: return sbQueueStack.size();
113: }
114:
115: public synchronized void push(Entry entry) throws IOException {
116: if (entry == null)
117: return;
118: sbQueueStack
119: .push(sbQueueStack
120: .row()
121: .newEntry(
122: new byte[][] {
123: entry.url.toString().getBytes(),
124: (entry.referrerHash == null) ? yacyURL.dummyHash
125: .getBytes()
126: : entry.referrerHash
127: .getBytes(),
128: kelondroBase64Order.enhancedCoder
129: .encodeLong(
130: (entry.ifModifiedSince == null) ? 0
131: : entry.ifModifiedSince
132: .getTime(),
133: 11).getBytes(),
134: new byte[] { entry.flags },
135: (entry.initiator == null) ? yacyURL.dummyHash
136: .getBytes()
137: : entry.initiator
138: .getBytes(),
139: kelondroBase64Order.enhancedCoder
140: .encodeLong(
141: (long) entry.depth,
142: rowdef.width(5))
143: .getBytes(),
144: (entry.profileHandle == null) ? yacyURL.dummyHash
145: .getBytes()
146: : entry.profileHandle
147: .getBytes(),
148: (entry.anchorName == null) ? "-"
149: .getBytes("UTF-8")
150: : entry.anchorName
151: .getBytes("UTF-8") }));
152: }
153:
154: public synchronized Entry pop() throws IOException {
155: if (sbQueueStack.size() == 0)
156: return null;
157: kelondroRow.Entry b = sbQueueStack.pot();
158: if (b == null)
159: return null;
160: return new Entry(b);
161: }
162:
163: public synchronized Entry remove(String urlHash) {
164: Iterator<kelondroRow.Entry> i = sbQueueStack
165: .stackIterator(true);
166: kelondroRow.Entry rowentry;
167: Entry entry;
168: while (i.hasNext()) {
169: rowentry = (kelondroRow.Entry) i.next();
170: entry = new Entry(rowentry);
171: if (entry.urlHash().equals(urlHash)) {
172: i.remove();
173: return entry;
174: }
175: }
176: return null;
177: }
178:
179: public void clear() {
180: sbQueueStack = kelondroStack.reset(sbQueueStack);
181: }
182:
183: public void close() {
184: if (sbQueueStack != null) {
185: sbQueueStack.close();
186: }
187: sbQueueStack = null;
188: }
189:
190: protected void finalize() throws Throwable {
191: try {
192: close();
193: } catch (Exception e) {
194: throw new IOException("plasmaSwitchboardQueue.finalize()"
195: + e.getMessage());
196: }
197: super .finalize();
198: }
199:
200: public Iterator<Entry> entryIterator(boolean up) {
201: // iterates the elements in an ordered way.
202: // returns plasmaSwitchboardQueue.Entry - type Objects
203: return new entryIterator(up);
204: }
205:
206: public class entryIterator implements Iterator<Entry> {
207:
208: Iterator<kelondroRow.Entry> rows;
209:
210: public entryIterator(boolean up) {
211: rows = sbQueueStack.stackIterator(up);
212: }
213:
214: public boolean hasNext() {
215: return rows.hasNext();
216: }
217:
218: public Entry next() {
219: return new Entry((kelondroRow.Entry) rows.next());
220: }
221:
222: public void remove() {
223: rows.remove();
224: }
225: }
226:
227: public Entry newEntry(yacyURL url, String referrer,
228: Date ifModifiedSince, boolean requestWithCookie,
229: String initiator, int depth, String profilehandle,
230: String anchorName) {
231: return new Entry(url, referrer, ifModifiedSince,
232: requestWithCookie, initiator, depth, profilehandle,
233: anchorName);
234: }
235:
236: public class Entry {
237: private yacyURL url; // plasmaURL.urlStringLength
238: private String referrerHash; // plasmaURL.urlHashLength
239: private Date ifModifiedSince; // 6
240: private byte flags; // 1
241: private String initiator; // yacySeedDB.commonHashLength
242: private int depth; // plasmaURL.urlCrawlDepthLength
243: private String profileHandle; // plasmaURL.urlCrawlProfileHandleLength
244: private String anchorName; // plasmaURL.urlDescrLength
245:
246: // computed values
247: private plasmaCrawlProfile.entry profileEntry;
248: private IResourceInfo contentInfo;
249: private yacyURL referrerURL;
250:
251: public Entry(yacyURL url, String referrer,
252: Date ifModifiedSince, boolean requestWithCookie,
253: String initiator, int depth, String profileHandle,
254: String anchorName) {
255: this .url = url;
256: this .referrerHash = referrer;
257: this .ifModifiedSince = ifModifiedSince;
258: this .flags = (requestWithCookie) ? (byte) 1 : (byte) 0;
259: this .initiator = initiator;
260: this .depth = depth;
261: this .profileHandle = profileHandle;
262: this .anchorName = (anchorName == null) ? "" : anchorName
263: .trim();
264:
265: this .profileEntry = null;
266: this .contentInfo = null;
267: this .referrerURL = null;
268: }
269:
270: public Entry(kelondroRow.Entry row) {
271: long ims = row.getColLong(2);
272: byte flags = row.getColByte(3);
273: try {
274: this .url = new yacyURL(row.getColString(0, "UTF-8"),
275: null);
276: } catch (MalformedURLException e) {
277: this .url = null;
278: }
279: this .referrerHash = row.getColString(1, "UTF-8");
280: this .ifModifiedSince = (ims == 0) ? null : new Date(ims);
281: this .flags = ((flags & 1) == 1) ? (byte) 1 : (byte) 0;
282: this .initiator = row.getColString(4, "UTF-8");
283: this .depth = (int) row.getColLong(5);
284: this .profileHandle = row.getColString(6, "UTF-8");
285: this .anchorName = row.getColString(7, "UTF-8");
286:
287: this .profileEntry = null;
288: this .contentInfo = null;
289: this .referrerURL = null;
290: }
291:
292: public Entry(byte[][] row) throws IOException {
293: long ims = (row[2] == null) ? 0
294: : kelondroBase64Order.enhancedCoder
295: .decodeLong(new String(row[2], "UTF-8"));
296: byte flags = (row[3] == null) ? 0 : row[3][0];
297: try {
298: this .url = new yacyURL(new String(row[0], "UTF-8"),
299: null);
300: } catch (MalformedURLException e) {
301: this .url = null;
302: }
303: this .referrerHash = (row[1] == null) ? null : new String(
304: row[1], "UTF-8");
305: this .ifModifiedSince = (ims == 0) ? null : new Date(ims);
306: this .flags = ((flags & 1) == 1) ? (byte) 1 : (byte) 0;
307: this .initiator = (row[4] == null) ? null : new String(
308: row[4], "UTF-8");
309: this .depth = (int) kelondroBase64Order.enhancedCoder
310: .decodeLong(new String(row[5], "UTF-8"));
311: this .profileHandle = new String(row[6], "UTF-8");
312: this .anchorName = (row[7] == null) ? null : (new String(
313: row[7], "UTF-8")).trim();
314:
315: this .profileEntry = null;
316: this .contentInfo = null;
317: this .referrerURL = null;
318: }
319:
320: public yacyURL url() {
321: return url;
322: }
323:
324: public String urlHash() {
325: return url.hash();
326: }
327:
328: public boolean requestedWithCookie() {
329: return (flags & 1) == 1;
330: }
331:
332: public File cacheFile() {
333: return plasmaHTCache.getCachePath(url);
334: }
335:
336: public boolean proxy() {
337: return (initiator == null)
338: || (initiator.equals(yacyURL.dummyHash));
339: }
340:
341: public String initiator() {
342: return initiator;
343: }
344:
345: public int depth() {
346: return depth;
347: }
348:
349: public long size() {
350: if (cacheFile().exists())
351: return cacheFile().length();
352: else
353: return 0;
354: }
355:
356: public plasmaCrawlProfile.entry profile() {
357: if (profileEntry == null)
358: profileEntry = profiles.getEntry(profileHandle);
359: return profileEntry;
360: }
361:
362: private IResourceInfo getCachedObjectInfo() {
363: if (this .contentInfo == null)
364: try {
365: this .contentInfo = plasmaHTCache
366: .loadResourceInfo(this .url);
367: } catch (Exception e) {
368: serverLog.logSevere("PLASMA",
369: "responseHeader: failed to get header", e);
370: return null;
371: }
372: return this .contentInfo;
373: }
374:
375: public String getMimeType() {
376: IResourceInfo info = this .getCachedObjectInfo();
377: return (info == null) ? null : info.getMimeType();
378: }
379:
380: public String getCharacterEncoding() {
381: IResourceInfo info = this .getCachedObjectInfo();
382: return (info == null) ? null : info.getCharacterEncoding();
383: }
384:
385: public Date getModificationDate() {
386: IResourceInfo info = this .getCachedObjectInfo();
387: return (info == null) ? new Date() : info
388: .getModificationDate();
389: }
390:
391: public yacyURL referrerURL() {
392: if (referrerURL == null) {
393: if ((referrerHash == null)
394: || (referrerHash.equals(yacyURL.dummyHash)))
395: return null;
396: indexURLEntry entry = lurls.load(referrerHash, null, 0);
397: if (entry == null)
398: referrerURL = null;
399: else
400: referrerURL = entry.comp().url();
401: }
402: return referrerURL;
403: }
404:
405: public String referrerHash() {
406: return referrerHash;
407: }
408:
409: public String anchorName() {
410: return anchorName;
411: }
412:
413: /**
414: * decide upon header information if a specific file should be indexed
415: * this method returns null if the answer is 'YES'!
416: * if the answer is 'NO' (do not index), it returns a string with the reason
417: * to reject the crawling demand in clear text
418: *
419: * This function is used by plasmaSwitchboard#processResourceStack
420: */
421: public final String shallIndexCacheForProxy() {
422: if (profile() == null) {
423: return "shallIndexCacheForProxy: profile() is null !";
424: }
425:
426: // check profile
427: if (!profile().indexText() && !profile().indexMedia()) {
428: return "Indexing_Not_Allowed";
429: }
430:
431: // -CGI access in request
432: // CGI access makes the page very individual, and therefore not usable in caches
433: if (!profile().crawlingQ()) {
434: if (url.isPOST()) {
435: return "Dynamic_(POST)";
436: }
437: if (url.isCGI()) {
438: return "Dynamic_(CGI)";
439: }
440: }
441:
442: // -authorization cases in request
443: // we checked that in shallStoreCache
444:
445: // -ranges in request
446: // we checked that in shallStoreCache
447:
448: // a picture cannot be indexed
449: if (plasmaHTCache.noIndexingURL(url)) {
450: return "Media_Content_(forbidden)";
451: }
452:
453: // -cookies in request
454: // unfortunately, we cannot index pages which have been requested with a cookie
455: // because the returned content may be special for the client
456: if (requestedWithCookie()) {
457: // System.out.println("***not indexed because cookie");
458: return "Dynamic_(Requested_With_Cookie)";
459: }
460:
461: if (getCachedObjectInfo() != null) {
462: return this .getCachedObjectInfo()
463: .shallIndexCacheForProxy();
464: }
465: return null;
466: }
467:
468: /**
469: * decide upon header information if a specific file should be indexed
470: * this method returns null if the answer is 'YES'!
471: * if the answer is 'NO' (do not index), it returns a string with the reason
472: * to reject the crawling demand in clear text
473: *
474: * This function is used by plasmaSwitchboard#processResourceStack
475: */
476: public final String shallIndexCacheForCrawler() {
477: if (profile() == null) {
478: return "shallIndexCacheForCrawler: profile() is null !";
479: }
480:
481: // check profile
482: if (!profile().indexText() && !profile().indexMedia()) {
483: return "Indexing_Not_Allowed";
484: }
485:
486: // -CGI access in request
487: // CGI access makes the page very individual, and therefore not usable in caches
488: if (!profile().crawlingQ()) {
489: if (url().isPOST()) {
490: return "Dynamic_(POST)";
491: }
492: if (url().isCGI()) {
493: return "Dynamic_(CGI)";
494: }
495: }
496:
497: // -authorization cases in request
498: // we checked that in shallStoreCache
499:
500: // -ranges in request
501: // we checked that in shallStoreCache
502:
503: // a picture cannot be indexed
504: if (getCachedObjectInfo() != null) {
505: String status = this .getCachedObjectInfo()
506: .shallIndexCacheForCrawler();
507: if (status != null)
508: return status;
509: }
510: if (plasmaHTCache.noIndexingURL(url())) {
511: return "Media_Content_(forbidden)";
512: }
513:
514: // -if-modified-since in request
515: // if the page is fresh at the very moment we can index it
516: // -> this does not apply for the crawler
517:
518: // -cookies in request
519: // unfortunately, we cannot index pages which have been requested with a cookie
520: // because the returned content may be special for the client
521: // -> this does not apply for a crawler
522:
523: // -set-cookie in response
524: // the set-cookie from the server does not indicate that the content is special
525: // thus we do not care about it here for indexing
526: // -> this does not apply for a crawler
527:
528: // -pragma in cached response
529: // -> in the crawler we ignore this
530:
531: // look for freshnes information
532:
533: // -expires in cached response
534: // the expires value gives us a very easy hint when the cache is stale
535: // sometimes, the expires date is set to the past to prevent that a page is cached
536: // we use that information to see if we should index it
537: // -> this does not apply for a crawler
538:
539: // -lastModified in cached response
540: // this information is too weak to use it to prevent indexing
541: // even if we can apply a TTL heuristic for cache usage
542:
543: // -cache-control in cached response
544: // the cache-control has many value options.
545: // -> in the crawler we ignore this
546:
547: return null;
548: }
549: } // class Entry
550: }
|