001: // indexURLEntry.java
002: // (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
003: // first published 2006 on http://www.anomic.de
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.index;
028:
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Properties;

import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverCodings;
import de.anomic.server.serverDate;
import de.anomic.tools.crypt;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyURL;
048:
049: public class indexURLEntry {
050:
051: // this object stores attributes for URL entries
052:
053: public static final kelondroRow rowdef = new kelondroRow(
054: "String hash-12, " + // the url's hash
055: "String comp-360, " + // components: the url, description, author and tags. As 5th element, an ETag is possible
056: "Cardinal mod-4 {b256}, " + // last-modified from the httpd
057: "Cardinal load-4 {b256}, " + // time when the url was loaded
058: "Cardinal fresh-4 {b256}, " + // time until this url is fresh
059: "String referrer-12, " + // (one of) the url's referrer hash(es)
060: "byte[] md5-8, " + // the md5 of the url content (to identify changes)
061: "Cardinal size-6 {b256}, " + // size of file in bytes
062: "Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds
063: "byte[] dt-1, " + // doctype, taken from extension or any other heuristic
064: "Bitfield flags-4, " + // flags; any stuff (see Word-Entity definition)
065: "String lang-2, " + // language
066: "Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width
067: "Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height
068: "Cardinal limage-2 {b256}, " + // # of embedded image links
069: "Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks
070: "Cardinal lvideo-2 {b256}, " + // # of embedded video links
071: "Cardinal lapp-2 {b256}", // # of embedded links to applications
072: kelondroBase64Order.enhancedCoder, 0);
073:
074: /* ===========================================================================
075: * Constants to access the various columns of an URL entry
076: * =========================================================================== */
077: /** the url's hash */
078: private static final int col_hash = 0;
079: /** components: the url, description, author and tags. As 5th element, an ETag is possible */
080: private static final int col_comp = 1;
081: /** components: the url, description, author and tags. As 5th element, an ETag is possible */
082: private static final int col_mod = 2;
083: /** time when the url was loaded */
084: private static final int col_load = 3;
085: /** time until this url is fresh */
086: private static final int col_fresh = 4;
087: /** time when the url was loaded */
088: private static final int col_referrer = 5;
089: /** the md5 of the url content (to identify changes) */
090: private static final int col_md5 = 6;
091: /** size of file in bytes */
092: private static final int col_size = 7;
093: /** size of file by number of words; for video and audio: seconds */
094: private static final int col_wc = 8;
095: /** doctype, taken from extension or any other heuristic */
096: private static final int col_dt = 9;
097: /** flags; any stuff (see Word-Entity definition) */
098: private static final int col_flags = 10;
099: /** language */
100: private static final int col_lang = 11;
101: /** of outlinks to same domain; for video and image: width */
102: private static final int col_llocal = 12;
103: /** of outlinks to outside domain; for video and image: height */
104: private static final int col_lother = 13;
105: /** of embedded image links */
106: private static final int col_limage = 14;
107: /** of embedded audio links; for audio: track number; for video: number of audio tracks */
108: private static final int col_laudio = 15;
109: /** of embedded video links */
110: private static final int col_lvideo = 16;
111: /** of embedded links to applications */
112: private static final int col_lapp = 17;
113:
114: private kelondroRow.Entry entry;
115: private String snippet;
116: private indexRWIRowEntry word; // this is only used if the url is transported via remote search requests
117: private long ranking; // during generation of a search result this value is set
118:
119: public indexURLEntry(yacyURL url, String dc_title,
120: String dc_creator, String dc_subject, String ETag,
121: Date mod, Date load, Date fresh, String referrer,
122: byte[] md5, long size, int wc, char dt,
123: kelondroBitfield flags, String lang, int llocal,
124: int lother, int laudio, int limage, int lvideo, int lapp) {
125: // create new entry and store it into database
126: this .entry = rowdef.newEntry();
127: this .entry.setCol(col_hash, url.hash(), null);
128: this .entry.setCol(col_comp, encodeComp(url, dc_title,
129: dc_creator, dc_subject, ETag));
130: encodeDate(col_mod, mod);
131: encodeDate(col_load, load);
132: encodeDate(col_fresh, fresh);
133: this .entry.setCol(col_referrer, (referrer == null) ? null
134: : referrer.getBytes());
135: this .entry.setCol(col_md5, md5);
136: this .entry.setCol(col_size, size);
137: this .entry.setCol(col_wc, wc);
138: this .entry.setCol(col_dt, new byte[] { (byte) dt });
139: this .entry.setCol(col_flags, flags.bytes());
140: this .entry.setCol(col_lang, lang.getBytes());
141: this .entry.setCol(col_llocal, llocal);
142: this .entry.setCol(col_lother, lother);
143: this .entry.setCol(col_limage, limage);
144: this .entry.setCol(col_laudio, laudio);
145: this .entry.setCol(col_lvideo, lvideo);
146: this .entry.setCol(col_lapp, lapp);
147: //System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
148: this .snippet = null;
149: this .word = null;
150: this .ranking = 0;
151: }
152:
153: private void encodeDate(int col, Date d) {
154: // calculates the number of days since 1.1.1970 and returns this as 4-byte array
155: this .entry.setCol(col, kelondroNaturalOrder.encodeLong(d
156: .getTime() / 86400000, 4));
157: }
158:
159: private Date decodeDate(int col) {
160: return new Date(86400000 * this .entry.getColLong(col));
161: }
162:
163: public static byte[] encodeComp(yacyURL url, String dc_title,
164: String dc_creator, String dc_subject, String ETag) {
165: serverCharBuffer s = new serverCharBuffer(200);
166: s.append(url.toNormalform(false, true)).append(10);
167: s.append(dc_title).append(10);
168: s.append(dc_creator).append(10);
169: s.append(dc_subject).append(10);
170: s.append(ETag).append(10);
171: return s.toString().getBytes();
172: }
173:
174: public indexURLEntry(kelondroRow.Entry entry,
175: indexRWIRowEntry searchedWord, long ranking) {
176: this .entry = entry;
177: this .snippet = null;
178: this .word = searchedWord;
179: this .ranking = ranking;
180: }
181:
182: public indexURLEntry(Properties prop) {
183: // generates an plasmaLURLEntry using the properties from the argument
184: // the property names must correspond to the one from toString
185: //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
186: yacyURL url;
187: try {
188: url = new yacyURL(crypt.simpleDecode(prop.getProperty(
189: "url", ""), null), prop.getProperty("hash"));
190: } catch (MalformedURLException e) {
191: url = null;
192: }
193: String descr = crypt.simpleDecode(
194: prop.getProperty("descr", ""), null);
195: if (descr == null)
196: descr = "";
197: String dc_creator = crypt.simpleDecode(prop.getProperty(
198: "author", ""), null);
199: if (dc_creator == null)
200: dc_creator = "";
201: String tags = crypt.simpleDecode(prop.getProperty("tags", ""),
202: null);
203: if (tags == null)
204: tags = "";
205: String ETag = crypt.simpleDecode(prop.getProperty("ETag", ""),
206: null);
207: if (ETag == null)
208: ETag = "";
209:
210: this .entry = rowdef.newEntry();
211: this .entry.setCol(col_hash, url.hash(), null);
212: this .entry.setCol(col_comp, encodeComp(url, descr, dc_creator,
213: tags, ETag));
214: try {
215: encodeDate(col_mod, serverDate.parseShortDay(prop
216: .getProperty("mod", "20000101")));
217: } catch (ParseException e) {
218: encodeDate(col_mod, new Date());
219: }
220: try {
221: encodeDate(col_load, serverDate.parseShortDay(prop
222: .getProperty("load", "20000101")));
223: } catch (ParseException e) {
224: encodeDate(col_load, new Date());
225: }
226: try {
227: encodeDate(col_fresh, serverDate.parseShortDay(prop
228: .getProperty("fresh", "20000101")));
229: } catch (ParseException e) {
230: encodeDate(col_fresh, new Date());
231: }
232: this .entry.setCol(col_referrer, prop.getProperty("referrer",
233: yacyURL.dummyHash).getBytes());
234: this .entry.setCol(col_md5, serverCodings.decodeHex(prop
235: .getProperty("md5", "")));
236: this .entry.setCol(col_size, Integer.parseInt(prop.getProperty(
237: "size", "0")));
238: this .entry.setCol(col_wc, Integer.parseInt(prop.getProperty(
239: "wc", "0")));
240: this .entry.setCol(col_dt, new byte[] { (byte) prop.getProperty(
241: "dt", "t").charAt(0) });
242: String flags = prop.getProperty("flags", "AAAAAA");
243: this .entry
244: .setCol(
245: col_flags,
246: (flags.length() > 6) ? plasmaSearchQuery.empty_constraint
247: .bytes()
248: : (new kelondroBitfield(4, flags))
249: .bytes());
250: this .entry.setCol(col_lang, prop.getProperty("lang", "uk")
251: .getBytes());
252: this .entry.setCol(col_llocal, Integer.parseInt(prop
253: .getProperty("llocal", "0")));
254: this .entry.setCol(col_lother, Integer.parseInt(prop
255: .getProperty("lother", "0")));
256: this .entry.setCol(col_limage, Integer.parseInt(prop
257: .getProperty("limage", "0")));
258: this .entry.setCol(col_laudio, Integer.parseInt(prop
259: .getProperty("laudio", "0")));
260: this .entry.setCol(col_lvideo, Integer.parseInt(prop
261: .getProperty("lvideo", "0")));
262: this .entry.setCol(col_lapp, Integer.parseInt(prop.getProperty(
263: "lapp", "0")));
264: this .snippet = crypt.simpleDecode(prop.getProperty("snippet",
265: ""), null);
266: this .word = null;
267: if (prop.containsKey("word"))
268: throw new kelondroException(
269: "old database structure is not supported");
270: if (prop.containsKey("wi")) {
271: this .word = new indexRWIRowEntry(
272: kelondroBase64Order.enhancedCoder
273: .decodeString(prop.getProperty("wi", ""),
274: "de.anomic.index.indexURLEntry.indexURLEntry()"));
275: }
276: this .ranking = 0;
277: }
278:
279: private StringBuffer corePropList() {
280: // generate a parseable string; this is a simple property-list
281: indexURLEntry.Components comp = this .comp();
282: final StringBuffer s = new StringBuffer(300);
283: //System.out.println("author=" + comp.author());
284: try {
285: s.append("hash=").append(hash());
286: s.append(",url=").append(
287: crypt.simpleEncode(comp.url().toNormalform(false,
288: true)));
289: s.append(",descr=").append(
290: crypt.simpleEncode(comp.dc_title()));
291: s.append(",author=").append(
292: crypt.simpleEncode(comp.dc_creator()));
293: s.append(",tags=").append(
294: crypt.simpleEncode(comp.dc_subject()));
295: s.append(",ETag=").append(crypt.simpleEncode(comp.ETag()));
296: s.append(",mod=").append(
297: serverDate.formatShortDay(moddate()));
298: s.append(",load=").append(
299: serverDate.formatShortDay(loaddate()));
300: s.append(",fresh=").append(
301: serverDate.formatShortDay(freshdate()));
302: s.append(",referrer=").append(referrerHash());
303: s.append(",md5=").append(md5());
304: s.append(",size=").append(size());
305: s.append(",wc=").append(wordCount());
306: s.append(",dt=").append(doctype());
307: s.append(",flags=").append(flags().exportB64());
308: s.append(",lang=").append(language());
309: s.append(",llocal=").append(llocal());
310: s.append(",lother=").append(lother());
311: s.append(",limage=").append(limage());
312: s.append(",laudio=").append(laudio());
313: s.append(",lvideo=").append(lvideo());
314: s.append(",lapp=").append(lapp());
315:
316: if (this .word != null) {
317: // append also word properties
318: s.append(",wi=").append(
319: kelondroBase64Order.enhancedCoder
320: .encodeString(word.toPropertyForm()));
321: }
322: return s;
323:
324: } catch (Exception e) {
325: // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
326: // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
327: // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
328: e.printStackTrace();
329: return null;
330: }
331: }
332:
333: public kelondroRow.Entry toRowEntry() {
334: return this .entry;
335: }
336:
337: public String hash() {
338: // return a url-hash, based on the md5 algorithm
339: // the result is a String of 12 bytes within a 72-bit space
340: // (each byte has an 6-bit range)
341: // that should be enough for all web pages on the world
342: return this .entry.getColString(col_hash, null);
343: }
344:
345: public long ranking() {
346: return this .ranking;
347: }
348:
349: public indexURLEntry.Components comp() {
350: ArrayList<String> cl = nxTools.strings(this .entry.getCol(
351: "comp", null), "UTF-8");
352: return new indexURLEntry.Components(
353: (cl.size() > 0) ? ((String) cl.get(0)).trim() : "",
354: hash(), (cl.size() > 1) ? ((String) cl.get(1)).trim()
355: : "", (cl.size() > 2) ? ((String) cl.get(2))
356: .trim() : "", (cl.size() > 3) ? ((String) cl
357: .get(3)).trim() : "",
358: (cl.size() > 4) ? ((String) cl.get(4)).trim() : "");
359: }
360:
361: public Date moddate() {
362: return decodeDate(col_mod);
363: }
364:
365: public Date loaddate() {
366: return decodeDate(col_load);
367: }
368:
369: public Date freshdate() {
370: return decodeDate(col_fresh);
371: }
372:
373: public String referrerHash() {
374: // return the creator's hash
375: return entry.getColString(col_referrer, null);
376: }
377:
378: public String md5() {
379: // returns the md5 in hex representation
380: return serverCodings.encodeHex(entry.getColBytes(col_md5));
381: }
382:
383: public char doctype() {
384: return (char) entry.getColByte(col_dt);
385: }
386:
387: public String language() {
388: return this .entry.getColString(col_lang, null);
389: }
390:
391: public int size() {
392: return (int) this .entry.getColLong(col_size);
393: }
394:
395: public kelondroBitfield flags() {
396: return new kelondroBitfield(this .entry.getColBytes(col_flags));
397: }
398:
399: public int wordCount() {
400: return (int) this .entry.getColLong(col_wc);
401: }
402:
403: public int llocal() {
404: return (int) this .entry.getColLong(col_llocal);
405: }
406:
407: public int lother() {
408: return (int) this .entry.getColLong(col_lother);
409: }
410:
411: public int limage() {
412: return (int) this .entry.getColLong(col_limage);
413: }
414:
415: public int laudio() {
416: return (int) this .entry.getColLong(col_laudio);
417: }
418:
419: public int lvideo() {
420: return (int) this .entry.getColLong(col_lvideo);
421: }
422:
423: public int lapp() {
424: return (int) this .entry.getColLong(col_lapp);
425: }
426:
427: public String snippet() {
428: // the snippet may appear here if the url was transported in a remote search
429: // it will not be saved anywhere, but can only be requested here
430: return snippet;
431: }
432:
433: public indexRWIRowEntry word() {
434: return word;
435: }
436:
437: public boolean isOlder(indexURLEntry other) {
438: if (other == null)
439: return false;
440: Date tmoddate = moddate();
441: Date omoddate = other.moddate();
442: if (tmoddate.before(omoddate))
443: return true;
444: if (tmoddate.equals(omoddate)) {
445: Date tloaddate = loaddate();
446: Date oloaddate = other.loaddate();
447: if (tloaddate.before(oloaddate))
448: return true;
449: if (tloaddate.equals(oloaddate))
450: return true;
451: }
452: return false;
453: }
454:
455: public String toString(String snippet) {
456: // add information needed for remote transport
457: final StringBuffer core = corePropList();
458: if (core == null)
459: return null;
460:
461: core.ensureCapacity(core.length() + snippet.length() * 2);
462: core.insert(0, "{");
463: core.append(",snippet=").append(crypt.simpleEncode(snippet));
464: core.append("}");
465:
466: return new String(core);
467: //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
468: }
469:
470: public plasmaCrawlEntry toBalancerEntry() {
471: return new plasmaCrawlEntry(null, comp().url(), referrerHash(),
472: comp().dc_title(), loaddate(), null, 0, 0, 0);
473: }
474:
475: /**
476: * @return the object as String.<br>
477: * This e.g. looks like this:
478: * <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
479: */
480: public String toString() {
481: final StringBuffer core = corePropList();
482: if (core == null)
483: return null;
484:
485: core.insert(0, "{");
486: core.append("}");
487:
488: return new String(core);
489: //return "{" + core + "}";
490: }
491:
492: public class Components {
493: private yacyURL url;
494: private String dc_title, dc_creator, dc_subject, ETag;
495:
496: public Components(String url, String urlhash, String title,
497: String author, String tags, String ETag) {
498: try {
499: this .url = new yacyURL(url, urlhash);
500: } catch (MalformedURLException e) {
501: this .url = null;
502: }
503: this .dc_title = title;
504: this .dc_creator = author;
505: this .dc_subject = tags;
506: this .ETag = ETag;
507: }
508:
509: public Components(yacyURL url, String descr, String author,
510: String tags, String ETag) {
511: this .url = url;
512: this .dc_title = descr;
513: this .dc_creator = author;
514: this .dc_subject = tags;
515: this .ETag = ETag;
516: }
517:
518: public yacyURL url() {
519: return this .url;
520: }
521:
522: public String dc_title() {
523: return this .dc_title;
524: }
525:
526: public String dc_creator() {
527: return this .dc_creator;
528: }
529:
530: public String dc_subject() {
531: return this .dc_subject;
532: }
533:
534: public String ETag() {
535: return this.ETag;
536: }
537: }
538:
539: }
|