001: // indexRWIRowEntry.java
002: // (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003: // first published 20.05.2006 on http://yacy.net
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.index;
028:
029: import de.anomic.kelondro.kelondroBase64Order;
030: import de.anomic.kelondro.kelondroBitfield;
031: import de.anomic.kelondro.kelondroColumn;
032: import de.anomic.kelondro.kelondroRow;
033: import de.anomic.kelondro.kelondroRow.Entry;
034: import de.anomic.plasma.plasmaWordIndex;
035: import de.anomic.yacy.yacySeedDB;
036:
037: public final class indexRWIRowEntry implements indexRWIEntry {
038:
039: // this object stores attributes to URL references inside RWI collections
040:
041: public static kelondroRow urlEntryRow = new kelondroRow(
042: new kelondroColumn[] {
043: new kelondroColumn("h",
044: kelondroColumn.celltype_string,
045: kelondroColumn.encoder_bytes,
046: yacySeedDB.commonHashLength, "urlhash"),
047: new kelondroColumn("a",
048: kelondroColumn.celltype_cardinal,
049: kelondroColumn.encoder_b256, 2,
050: "lastModified"),
051: new kelondroColumn("s",
052: kelondroColumn.celltype_cardinal,
053: kelondroColumn.encoder_b256, 2,
054: "freshUntil"),
055: new kelondroColumn("u",
056: kelondroColumn.celltype_cardinal,
057: kelondroColumn.encoder_b256, 1,
058: "wordsInTitle"),
059: new kelondroColumn("w",
060: kelondroColumn.celltype_cardinal,
061: kelondroColumn.encoder_b256, 2,
062: "wordsInText"),
063: new kelondroColumn("p",
064: kelondroColumn.celltype_cardinal,
065: kelondroColumn.encoder_b256, 2,
066: "phrasesInText"),
067: new kelondroColumn("d",
068: kelondroColumn.celltype_binary,
069: kelondroColumn.encoder_bytes, 1, "doctype"),
070: new kelondroColumn("l",
071: kelondroColumn.celltype_string,
072: kelondroColumn.encoder_bytes, 2, "language"),
073: new kelondroColumn("x",
074: kelondroColumn.celltype_cardinal,
075: kelondroColumn.encoder_b256, 1, "llocal"),
076: new kelondroColumn("y",
077: kelondroColumn.celltype_cardinal,
078: kelondroColumn.encoder_b256, 1, "lother"),
079: new kelondroColumn("m",
080: kelondroColumn.celltype_cardinal,
081: kelondroColumn.encoder_b256, 1, "urlLength"),
082: new kelondroColumn("n",
083: kelondroColumn.celltype_cardinal,
084: kelondroColumn.encoder_b256, 1, "urlComps"),
085: new kelondroColumn("g",
086: kelondroColumn.celltype_binary,
087: kelondroColumn.encoder_bytes, 1,
088: "typeofword"),
089: new kelondroColumn("z",
090: kelondroColumn.celltype_bitfield,
091: kelondroColumn.encoder_bytes, 4, "flags"),
092: new kelondroColumn("c",
093: kelondroColumn.celltype_cardinal,
094: kelondroColumn.encoder_b256, 1, "hitcount"),
095: new kelondroColumn("t",
096: kelondroColumn.celltype_cardinal,
097: kelondroColumn.encoder_b256, 2, "posintext"),
098: new kelondroColumn("r",
099: kelondroColumn.celltype_cardinal,
100: kelondroColumn.encoder_b256, 1,
101: "posinphrase"),
102: new kelondroColumn("o",
103: kelondroColumn.celltype_cardinal,
104: kelondroColumn.encoder_b256, 1,
105: "posofphrase"),
106: new kelondroColumn("i",
107: kelondroColumn.celltype_cardinal,
108: kelondroColumn.encoder_b256, 1,
109: "worddistance"),
110: new kelondroColumn("k",
111: kelondroColumn.celltype_cardinal,
112: kelondroColumn.encoder_b256, 1, "reserve") },
113: kelondroBase64Order.enhancedCoder, 0);
114: // available chars: b,e,j,q
115:
116: // static properties
117: private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
118: private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears
119: private static final int col_freshUntil = 2; // s 2 TTL for the word, so it can be removed easily if the TTL is short
120: private static final int col_wordsInTitle = 3; // u 1 words in description/length (longer are better?)
121: private static final int col_wordsInText = 4; // w 2 total number of words in document
122: private static final int col_phrasesInText = 5; // p 2 total number of phrases in document
123: private static final int col_doctype = 6; // d 1 type of document
124: private static final int col_language = 7; // l 2 (guessed) language of document
125: private static final int col_llocal = 8; // x 1 outlinks to same domain
126: private static final int col_lother = 9; // y 1 outlinks to other domain
127: private static final int col_urlLength = 10; // m 1 byte-length of complete URL
128: private static final int col_urlComps = 11; // n 1 number of path components
129:
130: // dynamic properties
131: private static final int col_typeofword = 12; // g 1 grammatical classification
132: private static final int col_flags = 13; // z 4 b64-encoded appearance flags (24 bit, see definition below)
133: private static final int col_hitcount = 14; // c 1 number of occurrences of this word in text
134: private static final int col_posintext = 15; // t 2 first appearance of word in text
135: private static final int col_posinphrase = 16; // r 1 position of word in its phrase
136: private static final int col_posofphrase = 17; // o 1 number of the phrase where word appears
137: private static final int col_worddistance = 18; // i 1 initial zero; may be used as reserve: is filled during search
138: private static final int col_reserve = 19; // k 1 reserve
139:
140: private kelondroRow.Entry entry;
141:
142: public indexRWIRowEntry(String urlHash, int urlLength, // byte-length of complete URL
143: int urlComps, // number of path components
144: int titleLength, // length of description/length (longer are better?)
145: int hitcount, // how often appears this word in the text
146: int wordcount, // total number of words
147: int phrasecount, // total number of phrases
148: int posintext, // position of word in all words
149: int posinphrase, // position of word in its phrase
150: int posofphrase, // number of the phrase where word appears
151: int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
152: int sizeOfPage, // # of bytes of the page TODO: not needed any more
153: long lastmodified, // last-modified time of the document where word appears
154: long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
155: String language, // (guessed) language of document
156: char doctype, // type of document
157: int outlinksSame, // outlinks to same domain
158: int outlinksOther, // outlinks to other domain
159: kelondroBitfield flags // attributes to the url and to the word according the url
160: ) {
161:
162: assert (urlHash.length() == 12) : "urlhash = " + urlHash;
163: if ((language == null)
164: || (language.length() != urlEntryRow
165: .width(col_language)))
166: language = "uk";
167: this .entry = urlEntryRow.newEntry();
168: int mddlm = plasmaWordIndex.microDateDays(lastmodified);
169: int mddct = plasmaWordIndex.microDateDays(updatetime);
170: this .entry.setCol(col_urlhash, urlHash, null);
171: this .entry.setCol(col_lastModified, mddlm);
172: this .entry.setCol(col_freshUntil, Math.max(0, mddlm
173: + (mddct - mddlm) * 2)); // TTL computation
174: this .entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
175: this .entry.setCol(col_wordsInText, wordcount);
176: this .entry.setCol(col_phrasesInText, phrasecount);
177: this .entry.setCol(col_doctype, new byte[] { (byte) doctype });
178: this .entry.setCol(col_language, language, null);
179: this .entry.setCol(col_llocal, outlinksSame);
180: this .entry.setCol(col_lother, outlinksOther);
181: this .entry.setCol(col_urlLength, urlLength);
182: this .entry.setCol(col_urlComps, urlComps);
183: this .entry.setCol(col_typeofword, new byte[] { (byte) 0 }); // TODO: grammatical classification
184: this .entry.setCol(col_flags, flags.bytes());
185: this .entry.setCol(col_hitcount, hitcount);
186: this .entry.setCol(col_posintext, posintext);
187: this .entry.setCol(col_posinphrase, posinphrase);
188: this .entry.setCol(col_posofphrase, posofphrase);
189: this .entry.setCol(col_worddistance, worddistance);
190: this .entry.setCol(col_reserve, 0);
191: }
192:
193: public indexRWIRowEntry(String urlHash, String code) {
194: // the code is the external form of the row minus the leading urlHash entry
195: this .entry = urlEntryRow.newEntry((urlHash + code).getBytes());
196: }
197:
198: public indexRWIRowEntry(String external) {
199: this .entry = urlEntryRow.newEntry(external, true);
200: }
201:
202: public indexRWIRowEntry(byte[] row) {
203: this .entry = urlEntryRow.newEntry(row);
204: }
205:
206: public indexRWIRowEntry(byte[] row, int offset, boolean clone) {
207: this .entry = urlEntryRow.newEntry(row, offset, clone);
208: }
209:
210: public indexRWIRowEntry(kelondroRow.Entry rentry) {
211: // FIXME: see if cloning is necessary
212: this .entry = rentry;
213: }
214:
215: public static int days(long time) {
216: // calculates the number of days since 1.1.1970 and returns this as 4-byte array
217: return (int) (time / 86400000);
218: }
219:
220: public Object clone() {
221: byte[] b = new byte[urlEntryRow.objectsize];
222: System
223: .arraycopy(entry.bytes(), 0, b, 0,
224: urlEntryRow.objectsize);
225: return new indexRWIRowEntry(b);
226: }
227:
228: public String toPropertyForm() {
229: return entry.toPropertyForm(true, true, false);
230: }
231:
232: public Entry toKelondroEntry() {
233: return this .entry;
234: }
235:
236: public String urlHash() {
237: return this .entry.getColString(col_urlhash, null);
238: }
239:
240: public int quality() {
241: return 0; // not used any more
242: }
243:
244: public int virtualAge() {
245: return (int) this .entry.getColLong(col_lastModified); // this is the time in MicoDateDays format
246: }
247:
248: public long lastModified() {
249: return plasmaWordIndex.reverseMicroDateDays((int) this .entry
250: .getColLong(col_lastModified));
251: }
252:
253: public long freshUntil() {
254: return plasmaWordIndex.reverseMicroDateDays((int) this .entry
255: .getColLong(col_freshUntil));
256: }
257:
258: public int hitcount() {
259: return (int) this .entry.getColLong(col_hitcount);
260: }
261:
262: public int posintext() {
263: return (int) this .entry.getColLong(col_posintext);
264: }
265:
266: public int posinphrase() {
267: return (int) this .entry.getColLong(col_posinphrase);
268: }
269:
270: public int posofphrase() {
271: return (int) this .entry.getColLong(col_posofphrase);
272: }
273:
274: public int wordsintext() {
275: return (int) this .entry.getColLong(col_wordsInText);
276: }
277:
278: public int phrasesintext() {
279: return (int) this .entry.getColLong(col_phrasesInText);
280: }
281:
282: public String getLanguage() {
283: return this .entry.getColString(col_language, null);
284: }
285:
286: public char getType() {
287: return (char) this .entry.getColByte(col_doctype);
288: }
289:
290: public int wordsintitle() {
291: return (int) this .entry.getColLong(col_wordsInTitle);
292: }
293:
294: public int llocal() {
295: return (int) this .entry.getColLong(col_llocal);
296: }
297:
298: public int lother() {
299: return (int) this .entry.getColLong(col_lother);
300: }
301:
302: public int urllength() {
303: return (int) this .entry.getColLong(col_urlLength);
304: }
305:
306: public int urlcomps() {
307: return (int) this .entry.getColLong(col_urlComps);
308: }
309:
310: public kelondroBitfield flags() {
311: return new kelondroBitfield(this .entry.getColBytes(col_flags));
312: }
313:
314: public double termFrequency() {
315: return (((double) this .hitcount()) / ((double) (this
316: .wordsintext()
317: + this .wordsintitle() + 1)));
318: }
319:
320: public String toString() {
321: return toPropertyForm();
322: }
323:
324: public static indexRWIEntry join(indexRWIRowEntry ie1,
325: indexRWIEntry ie2) {
326: // returns a modified entry of the first argument
327:
328: // combine the distance
329: ie1.entry.setCol(col_worddistance, ie1.worddistance()
330: + ie2.worddistance()
331: + Math.abs(ie1.posintext() - ie2.posintext()));
332: ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2
333: .posintext()));
334: ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2
335: .posofphrase()) ? Math.min(ie1.posinphrase(), ie2
336: .posinphrase()) : 0 /*unknown*/);
337: ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(),
338: ie2.posofphrase()));
339:
340: // combine term frequency
341: ie1.entry.setCol(col_wordsInText, ie1.wordsintext()
342: + ie2.wordsintext());
343: return ie1;
344: }
345:
346: public void join(indexRWIEntry oe) {
347: join(this , oe);
348: }
349:
350: public int worddistance() {
351: return (int) this .entry.getColLong(col_worddistance);
352: }
353:
354: public boolean isNewer(indexRWIEntry other) {
355: if (other == null)
356: return true;
357: if (this .lastModified() > other.lastModified())
358: return true;
359: if (this .lastModified() == other.lastModified()) {
360: if (this .quality() > other.quality())
361: return true;
362: }
363: return false;
364: }
365:
366: public boolean isOlder(indexRWIEntry other) {
367: if (other == null)
368: return false;
369: if (this .lastModified() < other.lastModified())
370: return true;
371: if (this .lastModified() == other.lastModified()) {
372: if (this .quality() < other.quality())
373: return true;
374: }
375: return false;
376: }
377:
378: }
|