Source Code Cross Referenced for indexRWIRowEntry.java in  » Search-Engine » yacy » de » anomic » index » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » yacy » de.anomic.index 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        // indexRWIRowEntry.java
002:        // (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003:        // first published 20.05.2006 on http://yacy.net
004:        //
005:        // This is a part of YaCy, a peer-to-peer based web search engine
006:        //
007:        // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008:        // $LastChangedRevision: 1986 $
009:        // $LastChangedBy: orbiter $
010:        //
011:        // LICENSE
012:        // 
013:        // This program is free software; you can redistribute it and/or modify
014:        // it under the terms of the GNU General Public License as published by
015:        // the Free Software Foundation; either version 2 of the License, or
016:        // (at your option) any later version.
017:        //
018:        // This program is distributed in the hope that it will be useful,
019:        // but WITHOUT ANY WARRANTY; without even the implied warranty of
020:        // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
021:        // GNU General Public License for more details.
022:        //
023:        // You should have received a copy of the GNU General Public License
024:        // along with this program; if not, write to the Free Software
025:        // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026:
027:        package de.anomic.index;
028:
029:        import de.anomic.kelondro.kelondroBase64Order;
030:        import de.anomic.kelondro.kelondroBitfield;
031:        import de.anomic.kelondro.kelondroColumn;
032:        import de.anomic.kelondro.kelondroRow;
033:        import de.anomic.kelondro.kelondroRow.Entry;
034:        import de.anomic.plasma.plasmaWordIndex;
035:        import de.anomic.yacy.yacySeedDB;
036:
037:        public final class indexRWIRowEntry implements  indexRWIEntry {
038:
039:            // this object stores attributes to URL references inside RWI collections
040:
041:            public static kelondroRow urlEntryRow = new kelondroRow(
042:                    new kelondroColumn[] {
043:                            new kelondroColumn("h",
044:                                    kelondroColumn.celltype_string,
045:                                    kelondroColumn.encoder_bytes,
046:                                    yacySeedDB.commonHashLength, "urlhash"),
047:                            new kelondroColumn("a",
048:                                    kelondroColumn.celltype_cardinal,
049:                                    kelondroColumn.encoder_b256, 2,
050:                                    "lastModified"),
051:                            new kelondroColumn("s",
052:                                    kelondroColumn.celltype_cardinal,
053:                                    kelondroColumn.encoder_b256, 2,
054:                                    "freshUntil"),
055:                            new kelondroColumn("u",
056:                                    kelondroColumn.celltype_cardinal,
057:                                    kelondroColumn.encoder_b256, 1,
058:                                    "wordsInTitle"),
059:                            new kelondroColumn("w",
060:                                    kelondroColumn.celltype_cardinal,
061:                                    kelondroColumn.encoder_b256, 2,
062:                                    "wordsInText"),
063:                            new kelondroColumn("p",
064:                                    kelondroColumn.celltype_cardinal,
065:                                    kelondroColumn.encoder_b256, 2,
066:                                    "phrasesInText"),
067:                            new kelondroColumn("d",
068:                                    kelondroColumn.celltype_binary,
069:                                    kelondroColumn.encoder_bytes, 1, "doctype"),
070:                            new kelondroColumn("l",
071:                                    kelondroColumn.celltype_string,
072:                                    kelondroColumn.encoder_bytes, 2, "language"),
073:                            new kelondroColumn("x",
074:                                    kelondroColumn.celltype_cardinal,
075:                                    kelondroColumn.encoder_b256, 1, "llocal"),
076:                            new kelondroColumn("y",
077:                                    kelondroColumn.celltype_cardinal,
078:                                    kelondroColumn.encoder_b256, 1, "lother"),
079:                            new kelondroColumn("m",
080:                                    kelondroColumn.celltype_cardinal,
081:                                    kelondroColumn.encoder_b256, 1, "urlLength"),
082:                            new kelondroColumn("n",
083:                                    kelondroColumn.celltype_cardinal,
084:                                    kelondroColumn.encoder_b256, 1, "urlComps"),
085:                            new kelondroColumn("g",
086:                                    kelondroColumn.celltype_binary,
087:                                    kelondroColumn.encoder_bytes, 1,
088:                                    "typeofword"),
089:                            new kelondroColumn("z",
090:                                    kelondroColumn.celltype_bitfield,
091:                                    kelondroColumn.encoder_bytes, 4, "flags"),
092:                            new kelondroColumn("c",
093:                                    kelondroColumn.celltype_cardinal,
094:                                    kelondroColumn.encoder_b256, 1, "hitcount"),
095:                            new kelondroColumn("t",
096:                                    kelondroColumn.celltype_cardinal,
097:                                    kelondroColumn.encoder_b256, 2, "posintext"),
098:                            new kelondroColumn("r",
099:                                    kelondroColumn.celltype_cardinal,
100:                                    kelondroColumn.encoder_b256, 1,
101:                                    "posinphrase"),
102:                            new kelondroColumn("o",
103:                                    kelondroColumn.celltype_cardinal,
104:                                    kelondroColumn.encoder_b256, 1,
105:                                    "posofphrase"),
106:                            new kelondroColumn("i",
107:                                    kelondroColumn.celltype_cardinal,
108:                                    kelondroColumn.encoder_b256, 1,
109:                                    "worddistance"),
110:                            new kelondroColumn("k",
111:                                    kelondroColumn.celltype_cardinal,
112:                                    kelondroColumn.encoder_b256, 1, "reserve") },
113:                    kelondroBase64Order.enhancedCoder, 0);
114:            // available chars: b,e,j,q
115:
116:            // static properties
117:            private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
118:            private static final int col_lastModified = 1; // a  2 last-modified time of the document where word appears
119:            private static final int col_freshUntil = 2; // s  2 TTL for the word, so it can be removed easily if the TTL is short
120:            private static final int col_wordsInTitle = 3; // u  1 words in description/length (longer are better?)
121:            private static final int col_wordsInText = 4; // w  2 total number of words in document
122:            private static final int col_phrasesInText = 5; // p  2 total number of phrases in document
123:            private static final int col_doctype = 6; // d  1 type of document
124:            private static final int col_language = 7; // l  2 (guessed) language of document
125:            private static final int col_llocal = 8; // x  1 outlinks to same domain
126:            private static final int col_lother = 9; // y  1 outlinks to other domain
127:            private static final int col_urlLength = 10; // m  1 byte-length of complete URL
128:            private static final int col_urlComps = 11; // n  1 number of path components
129:
130:            // dynamic properties    
131:            private static final int col_typeofword = 12; // g  1 grammatical classification
132:            private static final int col_flags = 13; // z  4 b64-encoded appearance flags (24 bit, see definition below)
133:            private static final int col_hitcount = 14; // c  1 number of occurrences of this word in text
134:            private static final int col_posintext = 15; // t  2 first appearance of word in text
135:            private static final int col_posinphrase = 16; // r  1 position of word in its phrase
136:            private static final int col_posofphrase = 17; // o  1 number of the phrase where word appears
137:            private static final int col_worddistance = 18; // i  1 initial zero; may be used as reserve: is filled during search
138:            private static final int col_reserve = 19; // k  1 reserve
139:
140:            private kelondroRow.Entry entry;
141:
142:            public indexRWIRowEntry(String urlHash, int urlLength, // byte-length of complete URL
143:                    int urlComps, // number of path components
144:                    int titleLength, // length of description/length (longer are better?)
145:                    int hitcount, // how often appears this word in the text
146:                    int wordcount, // total number of words
147:                    int phrasecount, // total number of phrases
148:                    int posintext, // position of word in all words
149:                    int posinphrase, // position of word in its phrase
150:                    int posofphrase, // number of the phrase where word appears
151:                    int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
152:                    int sizeOfPage, // # of bytes of the page TODO: not needed any more
153:                    long lastmodified, // last-modified time of the document where word appears
154:                    long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
155:                    String language, // (guessed) language of document
156:                    char doctype, // type of document
157:                    int outlinksSame, // outlinks to same domain
158:                    int outlinksOther, // outlinks to other domain
159:                    kelondroBitfield flags // attributes to the url and to the word according the url
160:            ) {
161:
162:                assert (urlHash.length() == 12) : "urlhash = " + urlHash;
163:                if ((language == null)
164:                        || (language.length() != urlEntryRow
165:                                .width(col_language)))
166:                    language = "uk";
167:                this .entry = urlEntryRow.newEntry();
168:                int mddlm = plasmaWordIndex.microDateDays(lastmodified);
169:                int mddct = plasmaWordIndex.microDateDays(updatetime);
170:                this .entry.setCol(col_urlhash, urlHash, null);
171:                this .entry.setCol(col_lastModified, mddlm);
172:                this .entry.setCol(col_freshUntil, Math.max(0, mddlm
173:                        + (mddct - mddlm) * 2)); // TTL computation
174:                this .entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
175:                this .entry.setCol(col_wordsInText, wordcount);
176:                this .entry.setCol(col_phrasesInText, phrasecount);
177:                this .entry.setCol(col_doctype, new byte[] { (byte) doctype });
178:                this .entry.setCol(col_language, language, null);
179:                this .entry.setCol(col_llocal, outlinksSame);
180:                this .entry.setCol(col_lother, outlinksOther);
181:                this .entry.setCol(col_urlLength, urlLength);
182:                this .entry.setCol(col_urlComps, urlComps);
183:                this .entry.setCol(col_typeofword, new byte[] { (byte) 0 }); // TODO: grammatical classification
184:                this .entry.setCol(col_flags, flags.bytes());
185:                this .entry.setCol(col_hitcount, hitcount);
186:                this .entry.setCol(col_posintext, posintext);
187:                this .entry.setCol(col_posinphrase, posinphrase);
188:                this .entry.setCol(col_posofphrase, posofphrase);
189:                this .entry.setCol(col_worddistance, worddistance);
190:                this .entry.setCol(col_reserve, 0);
191:            }
192:
193:            public indexRWIRowEntry(String urlHash, String code) {
194:                // the code is the external form of the row minus the leading urlHash entry
195:                this .entry = urlEntryRow.newEntry((urlHash + code).getBytes());
196:            }
197:
198:            public indexRWIRowEntry(String external) {
199:                this .entry = urlEntryRow.newEntry(external, true);
200:            }
201:
202:            public indexRWIRowEntry(byte[] row) {
203:                this .entry = urlEntryRow.newEntry(row);
204:            }
205:
206:            public indexRWIRowEntry(byte[] row, int offset, boolean clone) {
207:                this .entry = urlEntryRow.newEntry(row, offset, clone);
208:            }
209:
210:            public indexRWIRowEntry(kelondroRow.Entry rentry) {
211:                // FIXME: see if cloning is necessary
212:                this .entry = rentry;
213:            }
214:
215:            public static int days(long time) {
216:                // calculates the number of days since 1.1.1970 and returns this as 4-byte array
217:                return (int) (time / 86400000);
218:            }
219:
220:            public Object clone() {
221:                byte[] b = new byte[urlEntryRow.objectsize];
222:                System
223:                        .arraycopy(entry.bytes(), 0, b, 0,
224:                                urlEntryRow.objectsize);
225:                return new indexRWIRowEntry(b);
226:            }
227:
228:            public String toPropertyForm() {
229:                return entry.toPropertyForm(true, true, false);
230:            }
231:
232:            public Entry toKelondroEntry() {
233:                return this .entry;
234:            }
235:
236:            public String urlHash() {
237:                return this .entry.getColString(col_urlhash, null);
238:            }
239:
240:            public int quality() {
241:                return 0; // not used any more
242:            }
243:
244:            public int virtualAge() {
245:                return (int) this .entry.getColLong(col_lastModified); // this is the time in MicoDateDays format
246:            }
247:
248:            public long lastModified() {
249:                return plasmaWordIndex.reverseMicroDateDays((int) this .entry
250:                        .getColLong(col_lastModified));
251:            }
252:
253:            public long freshUntil() {
254:                return plasmaWordIndex.reverseMicroDateDays((int) this .entry
255:                        .getColLong(col_freshUntil));
256:            }
257:
258:            public int hitcount() {
259:                return (int) this .entry.getColLong(col_hitcount);
260:            }
261:
262:            public int posintext() {
263:                return (int) this .entry.getColLong(col_posintext);
264:            }
265:
266:            public int posinphrase() {
267:                return (int) this .entry.getColLong(col_posinphrase);
268:            }
269:
270:            public int posofphrase() {
271:                return (int) this .entry.getColLong(col_posofphrase);
272:            }
273:
274:            public int wordsintext() {
275:                return (int) this .entry.getColLong(col_wordsInText);
276:            }
277:
278:            public int phrasesintext() {
279:                return (int) this .entry.getColLong(col_phrasesInText);
280:            }
281:
282:            public String getLanguage() {
283:                return this .entry.getColString(col_language, null);
284:            }
285:
286:            public char getType() {
287:                return (char) this .entry.getColByte(col_doctype);
288:            }
289:
290:            public int wordsintitle() {
291:                return (int) this .entry.getColLong(col_wordsInTitle);
292:            }
293:
294:            public int llocal() {
295:                return (int) this .entry.getColLong(col_llocal);
296:            }
297:
298:            public int lother() {
299:                return (int) this .entry.getColLong(col_lother);
300:            }
301:
302:            public int urllength() {
303:                return (int) this .entry.getColLong(col_urlLength);
304:            }
305:
306:            public int urlcomps() {
307:                return (int) this .entry.getColLong(col_urlComps);
308:            }
309:
310:            public kelondroBitfield flags() {
311:                return new kelondroBitfield(this .entry.getColBytes(col_flags));
312:            }
313:
314:            public double termFrequency() {
315:                return (((double) this .hitcount()) / ((double) (this 
316:                        .wordsintext()
317:                        + this .wordsintitle() + 1)));
318:            }
319:
320:            public String toString() {
321:                return toPropertyForm();
322:            }
323:
324:            public static indexRWIEntry join(indexRWIRowEntry ie1,
325:                    indexRWIEntry ie2) {
326:                // returns a modified entry of the first argument
327:
328:                // combine the distance
329:                ie1.entry.setCol(col_worddistance, ie1.worddistance()
330:                        + ie2.worddistance()
331:                        + Math.abs(ie1.posintext() - ie2.posintext()));
332:                ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2
333:                        .posintext()));
334:                ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2
335:                        .posofphrase()) ? Math.min(ie1.posinphrase(), ie2
336:                        .posinphrase()) : 0 /*unknown*/);
337:                ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(),
338:                        ie2.posofphrase()));
339:
340:                // combine term frequency
341:                ie1.entry.setCol(col_wordsInText, ie1.wordsintext()
342:                        + ie2.wordsintext());
343:                return ie1;
344:            }
345:
346:            public void join(indexRWIEntry oe) {
347:                join(this , oe);
348:            }
349:
350:            public int worddistance() {
351:                return (int) this .entry.getColLong(col_worddistance);
352:            }
353:
354:            public boolean isNewer(indexRWIEntry other) {
355:                if (other == null)
356:                    return true;
357:                if (this .lastModified() > other.lastModified())
358:                    return true;
359:                if (this .lastModified() == other.lastModified()) {
360:                    if (this .quality() > other.quality())
361:                        return true;
362:                }
363:                return false;
364:            }
365:
366:            public boolean isOlder(indexRWIEntry other) {
367:                if (other == null)
368:                    return false;
369:                if (this .lastModified() < other.lastModified())
370:                    return true;
371:                if (this .lastModified() == other.lastModified()) {
372:                    if (this .quality() < other.quality())
373:                        return true;
374:                }
375:                return false;
376:            }
377:
378:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.