001: // plasmaCrawlEntry.java
002: // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003: // first published 14.03.2007 on http://yacy.net
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.plasma;
028:
029: import java.io.IOException;
030: import java.io.UnsupportedEncodingException;
031: import java.util.Date;
032:
033: import de.anomic.kelondro.kelondroBase64Order;
034: import de.anomic.kelondro.kelondroBitfield;
035: import de.anomic.kelondro.kelondroNaturalOrder;
036: import de.anomic.kelondro.kelondroRow;
037: import de.anomic.yacy.yacyCore;
038: import de.anomic.yacy.yacySeedDB;
039: import de.anomic.yacy.yacyURL;
040:
041: public class plasmaCrawlEntry {
042:
043: // row definition for balancer-related NURL-entries
044: public final static kelondroRow rowdef = new kelondroRow(
045: "String urlhash-"
046: + yacySeedDB.commonHashLength
047: + ", "
048: + // the url's hash
049: "String initiator-"
050: + yacySeedDB.commonHashLength
051: + ", "
052: + // the crawling initiator
053: "String urlstring-256, "
054: + // the url as string
055: "String refhash-" + yacySeedDB.commonHashLength
056: + ", "
057: + // the url's referrer hash
058: "String urlname-80, "
059: + // the name of the url, from anchor tag <a>name</a>
060: "Cardinal appdate-8 {b256}, "
061: + // the time when the url was first time appeared
062: "String profile-" + yacySeedDB.commonHashLength
063: + ", " + // the name of the prefetch profile handle
064: "Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0
065: "Cardinal parentbr-3 {b256}, " + // number of anchors of the parent
066: "Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors
067: "byte[] flags-4, " + // flags
068: "String handle-4, " + // extra handle
069: "Cardinal loaddate-8 {b256}," + // time when the file was loaded
070: "Cardinal serverdate-8 {b256}," + // time when that the server returned as document date
071: "Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince
072: kelondroBase64Order.enhancedCoder, 0);
073:
074: private String initiator; // the initiator hash, is NULL or "" if it is the own proxy;
075: // if this is generated by a crawl, the own peer hash in entered
076: private String refhash; // the url's referrer hash
077: private yacyURL url; // the url as string
078: private String name; // the name of the url, from anchor tag <a>name</a>
079: private long appdate; // the time when the url was first time appeared
080: private long loaddate; // the time when the url was loaded
081: private long serverdate; // the document date from the target server
082: private long imsdate; // the time of a ifModifiedSince request
083: private String profileHandle; // the name of the prefetch profile
084: private int depth; // the prefetch depth so far, starts at 0
085: private int anchors; // number of anchors of the parent
086: private int forkfactor; // sum of anchors of all ancestors
087: private kelondroBitfield flags;
088: private int handle;
089: private String status;
090: private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
091:
092: public plasmaCrawlEntry(yacyURL url) {
093: this (yacyCore.seedDB.mySeed().hash, url, null, null,
094: new Date(), null, 0, 0, 0);
095: }
096:
097: /**
098: * @param initiator the hash of the initiator peer
099: * @param url the {@link URL} to crawl
100: * @param referrer the hash of the referrer URL
101: * @param name the name of the document to crawl
102: * @param appdate the time when the url was first time appeared
103: * @param profileHandle the name of the prefetch profile. This must not be null!
104: * @param depth the crawling depth of the entry
105: * @param anchors number of anchors of the parent
106: * @param forkfactor sum of anchors of all ancestors
107: */
108: public plasmaCrawlEntry(String initiator, yacyURL url,
109: String referrerhash, String name, Date appdate,
110: String profileHandle, int depth, int anchors, int forkfactor) {
111: // create new entry and store it into database
112: assert appdate != null;
113: assert url != null;
114: if ((initiator == null) || (initiator.length() == 0))
115: initiator = yacyURL.dummyHash;
116: this .initiator = initiator;
117: this .url = url;
118: this .refhash = (referrerhash == null) ? yacyURL.dummyHash
119: : referrerhash;
120: this .name = (name == null) ? "" : name;
121: this .appdate = (appdate == null) ? 0 : appdate.getTime();
122: this .profileHandle = profileHandle; // must not be null
123: this .depth = depth;
124: this .anchors = anchors;
125: this .forkfactor = forkfactor;
126: this .flags = new kelondroBitfield(rowdef.width(10));
127: this .handle = 0;
128: this .loaddate = 0;
129: this .serverdate = 0;
130: this .imsdate = 0;
131: this .status = "loaded(args)";
132: this .initialHash = url.hashCode();
133: }
134:
135: public plasmaCrawlEntry(kelondroRow.Entry entry) throws IOException {
136: assert (entry != null);
137: insertEntry(entry);
138: }
139:
140: private void insertEntry(kelondroRow.Entry entry)
141: throws IOException {
142: String urlstring = entry.getColString(2, null);
143: if (urlstring == null)
144: throw new IOException("url string is null");
145: this .initiator = entry.getColString(1, null);
146: this .url = new yacyURL(urlstring, entry.getColString(0, null));
147: this .refhash = (entry.empty(3)) ? yacyURL.dummyHash : entry
148: .getColString(3, null);
149: this .name = (entry.empty(4)) ? "" : entry.getColString(4,
150: "UTF-8").trim();
151: this .appdate = entry.getColLong(5);
152: this .profileHandle = (entry.empty(6)) ? null : entry
153: .getColString(6, null).trim();
154: this .depth = (int) entry.getColLong(7);
155: this .anchors = (int) entry.getColLong(8);
156: this .forkfactor = (int) entry.getColLong(9);
157: this .flags = new kelondroBitfield(entry.getColBytes(10));
158: this .handle = Integer
159: .parseInt(entry.getColString(11, null), 16);
160: this .loaddate = entry.getColLong(12);
161: this .serverdate = entry.getColLong(13);
162: this .imsdate = entry.getColLong(14);
163: this .status = "loaded(kelondroRow.Entry)";
164: this .initialHash = url.hashCode();
165: return;
166: }
167:
168: public int hashCode() {
169: // overloads Object.hashCode()
170: return this .initialHash;
171: }
172:
173: public void setStatus(String s) {
174: this .status = s;
175: }
176:
177: public String getStatus() {
178: return this .status;
179: }
180:
181: private static String normalizeHandle(int h) {
182: String d = Integer.toHexString(h);
183: while (d.length() < rowdef.width(11))
184: d = "0" + d;
185: return d;
186: }
187:
188: public kelondroRow.Entry toRow() {
189: byte[] appdatestr = kelondroNaturalOrder.encodeLong(appdate,
190: rowdef.width(5));
191: byte[] loaddatestr = kelondroNaturalOrder.encodeLong(loaddate,
192: rowdef.width(12));
193: byte[] serverdatestr = kelondroNaturalOrder.encodeLong(
194: serverdate, rowdef.width(13));
195: byte[] imsdatestr = kelondroNaturalOrder.encodeLong(imsdate,
196: rowdef.width(14));
197: // store the hash in the hash cache
198: byte[] namebytes;
199: try {
200: namebytes = this .name.getBytes("UTF-8");
201: } catch (UnsupportedEncodingException e) {
202: namebytes = this .name.getBytes();
203: }
204: byte[][] entry = new byte[][] {
205: this .url.hash().getBytes(),
206: (initiator == null) ? "".getBytes() : this .initiator
207: .getBytes(),
208: this .url.toString().getBytes(),
209: this .refhash.getBytes(),
210: namebytes,
211: appdatestr,
212: (this .profileHandle == null) ? null
213: : this .profileHandle.getBytes(),
214: kelondroNaturalOrder.encodeLong(this .depth, rowdef
215: .width(7)),
216: kelondroNaturalOrder.encodeLong(this .anchors, rowdef
217: .width(8)),
218: kelondroNaturalOrder.encodeLong(this .forkfactor, rowdef
219: .width(9)), this .flags.bytes(),
220: normalizeHandle(this .handle).getBytes(), loaddatestr,
221: serverdatestr, imsdatestr };
222: return rowdef.newEntry(entry);
223: }
224:
225: public yacyURL url() {
226: // the url
227: return url;
228: }
229:
230: public void redirectURL(yacyURL redirectedURL) {
231: // replace old URL by new one. This should only be used in case of url redirection
232: this .url = redirectedURL;
233: }
234:
235: public String referrerhash() {
236: // the urlhash of a referer url
237: return this .refhash;
238: }
239:
240: public String initiator() {
241: // returns the hash of the initiating peer
242: if (initiator == null)
243: return null;
244: if (initiator.length() == 0)
245: return null;
246: return initiator;
247: }
248:
249: public boolean proxy() {
250: // true when the url was retrieved using the proxy
251: return (initiator() == null);
252: }
253:
254: public Date appdate() {
255: // the date when the url appeared first
256: return new Date(this .appdate);
257: }
258:
259: public Date loaddate() {
260: // the date when the url was loaded
261: return new Date(this .loaddate);
262: }
263:
264: public Date serverdate() {
265: // the date that the server returned as document date
266: return new Date(this .serverdate);
267: }
268:
269: public Date imsdate() {
270: // the date that the client (browser) send as ifModifiedSince in proxy mode
271: return new Date(this .imsdate);
272: }
273:
274: public String name() {
275: // return the anchor name (text inside <a> tag)
276: return this .name;
277: }
278:
279: public int depth() {
280: // crawl depth where the url appeared
281: return this .depth;
282: }
283:
284: public String profileHandle() {
285: // the handle of the crawl profile
286: return this.profileHandle;
287: }
288: }
|