001: // plasmaCRProcess.java
002: // -----------------------
003: // part of YaCy
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2005
007: // Created 15.11.2005
008: //
009: // $LastChangedDate: 2008-01-29 22:13:30 +0000 (Di, 29 Jan 2008) $
010: // $LastChangedRevision: 4417 $
011: // $LastChangedBy: orbiter $
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026: //
027: // Using this software in any meaning (reading, learning, copying, compiling,
028: // running) means that you agree that the Author(s) is (are) not responsible
029: // for cost, loss of data or any harm that may be caused directly or indirectly
030: // by usage of this softare or this documentation. The usage of this software
031: // is on your own risk. The installation and usage (starting/running) of this
032: // software may allow other people or application to access your computer and
033: // any attached devices and is highly dependent on the configuration of the
034: // software which must be done by the user of the software; the author(s) is
035: // (are) also not responsible for proper configuration and usage of the
036: // software, even if provoked by documentation provided together with
037: // the software.
038: //
039: // Any changes to this file according to the GPL as documented in the file
040: // gpl.txt aside this file in the shipment you received can be done to the
041: // lines that follows this copyright notice here, but changes must not be
042: // done inside the copyright notive above. A re-distribution must contain
043: // the intact and unchanged copyright notice.
044: // Contributions and changes to the program code must be marked as such.
045:
046: package de.anomic.plasma;
047:
048: import java.io.File;
049: import java.io.IOException;
050: import java.util.Iterator;
051:
052: import de.anomic.kelondro.kelondroAttrSeq;
053: import de.anomic.kelondro.kelondroBase64Order;
054: import de.anomic.kelondro.kelondroBitfield;
055: import de.anomic.kelondro.kelondroCollectionIndex;
056: import de.anomic.kelondro.kelondroFlexTable;
057: import de.anomic.kelondro.kelondroIndex;
058: import de.anomic.kelondro.kelondroRow;
059: import de.anomic.kelondro.kelondroRowSet;
060: import de.anomic.server.serverDate;
061: import de.anomic.server.serverFileUtils;
062: import de.anomic.server.serverMemory;
063:
064: public class plasmaRankingCRProcess {
065:
066: /*
067: header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
068: header.append("# Created=" + System.currentTimeMillis()); header.append((char) 13); header.append((char) 10);
069: header.append("# Structure=<Referee-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-" + ((type.equals("crl")) ? "6" : "12") + ">"); header.append((char) 13); header.append((char) 10);
070: header.append("# ---"); header.append((char) 13); header.append((char) 10);
071: */
072:
073: public static final kelondroRow CRG_accrow = new kelondroRow(
074: "byte[] Referee-12,"
075: + "Cardinal UDate-3 {b64e}, Cardinal VDate-3 {b64e}, "
076: + "Cardinal LCount-2 {b64e}, Cardinal GCount-2 {b64e}, Cardinal ICount-2 {b64e}, Cardinal DCount-2 {b64e}, Cardinal TLength-3 {b64e}, "
077: + "Cardinal WACount-3 {b64e}, Cardinal WUCount-3 {b64e}, Cardinal Flags-1 {b64e}, "
078: + "Cardinal FUDate-3 {b64e}, Cardinal FDDate-3 {b64e}, Cardinal LUDate-3 {b64e}, "
079: + "Cardinal UCount-2 {b64e}, Cardinal PCount-2 {b64e}, Cardinal ACount-2 {b64e}, Cardinal VCount-2 {b64e}, Cardinal Vita-2 {b64e}",
080: kelondroBase64Order.enhancedCoder, 0);
081: public static final kelondroRow CRG_colrow = new kelondroRow(
082: "byte[] Anchor-12", kelondroBase64Order.enhancedCoder, 0);
083: public static final String CRG_accname = "CRG-a-attr";
084: public static final String CRG_seqname = "CRG-a-coli";
085: public static final kelondroRow RCI_coli = new kelondroRow(
086: "byte[] RefereeDom-6", kelondroBase64Order.enhancedCoder, 0);
087: public static final String RCI_colname = "RCI-a-coli";
088:
089: private static boolean accumulate_upd(File f, kelondroAttrSeq acc) {
090: // open file
091: kelondroAttrSeq source_cr = null;
092: try {
093: source_cr = new kelondroAttrSeq(f, false);
094: } catch (IOException e) {
095: return false;
096: }
097:
098: // put elements in accumulator file
099: Iterator<String> el = source_cr.keys();
100: String key;
101: kelondroAttrSeq.Entry new_entry, acc_entry;
102: int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
103: kelondroBitfield acc_flags, new_flags;
104: while (el.hasNext()) {
105: key = (String) el.next();
106: new_entry = source_cr.getEntry(key);
107: new_flags = new kelondroBitfield(
108: kelondroBase64Order.enhancedCoder.encodeLong(
109: (long) new_entry.getAttr("Flags", 0), 1)
110: .getBytes());
111: // enrich information with additional values
112: if ((acc_entry = acc.getEntry(key)) != null) {
113: FUDate = (int) acc_entry.getAttr("FUDate", 0);
114: FDDate = (int) acc_entry.getAttr("FDDate", 0);
115: LUDate = (int) acc_entry.getAttr("LUDate", 0);
116: UCount = (int) acc_entry.getAttr("UCount", 0);
117: PCount = (int) acc_entry.getAttr("PCount", 0);
118: ACount = (int) acc_entry.getAttr("ACount", 0);
119: VCount = (int) acc_entry.getAttr("VCount", 0);
120: Vita = (int) acc_entry.getAttr("Vita", 0);
121:
122: // update counters and dates
123: acc_entry.setSeq(new_entry.getSeqSet()); // need to be checked
124:
125: UCount++; // increase update counter
126: PCount += (new_flags.get(1)) ? 1 : 0;
127: ACount += (new_flags.get(2)) ? 1 : 0;
128: VCount += (new_flags.get(3)) ? 1 : 0;
129:
130: // 'OR' the flags
131: acc_flags = new kelondroBitfield(
132: kelondroBase64Order.enhancedCoder
133: .encodeLong(
134: (long) acc_entry.getAttr(
135: "Flags", 0), 1)
136: .getBytes());
137: for (int i = 0; i < 6; i++) {
138: if (new_flags.get(i))
139: acc_flags.set(i, true);
140: }
141: acc_entry.setAttr("Flags",
142: (int) kelondroBase64Order.enhancedCoder
143: .decodeLong(acc_flags.exportB64()));
144: } else {
145: // initialize counters and dates
146: acc_entry = acc.newEntry(key, new_entry.getAttrs(),
147: new_entry.getSeqSet());
148: FUDate = plasmaWordIndex.microDateHoursInt(System
149: .currentTimeMillis()); // first update date
150: FDDate = plasmaWordIndex.microDateHoursInt(System
151: .currentTimeMillis()); // very difficult to compute; this is only a quick-hack
152: LUDate = (int) new_entry.getAttr("VDate", 0);
153: UCount = 0;
154: PCount = (new_flags.get(1)) ? 1 : 0;
155: ACount = (new_flags.get(2)) ? 1 : 0;
156: VCount = (new_flags.get(3)) ? 1 : 0;
157: Vita = 0;
158: }
159: // make plausibility check?
160:
161: // insert into accumulator
162: acc_entry.setAttr("FUDate", (long) FUDate);
163: acc_entry.setAttr("FDDate", (long) FDDate);
164: acc_entry.setAttr("LUDate", (long) LUDate);
165: acc_entry.setAttr("UCount", (long) UCount);
166: acc_entry.setAttr("PCount", (long) PCount);
167: acc_entry.setAttr("ACount", (long) ACount);
168: acc_entry.setAttr("VCount", (long) VCount);
169: acc_entry.setAttr("Vita", (long) Vita);
170: acc.putEntrySmall(acc_entry);
171: }
172:
173: return true;
174: }
175:
176: private static boolean accumulate_upd(File f, kelondroIndex acc,
177: kelondroCollectionIndex seq) throws IOException {
178: // open file
179: kelondroAttrSeq source_cr = null;
180: try {
181: source_cr = new kelondroAttrSeq(f, false);
182: } catch (IOException e) {
183: return false;
184: }
185:
186: // put elements in accumulator file
187: Iterator<String> el = source_cr.keys();
188: String key;
189: kelondroAttrSeq.Entry new_entry;
190: kelondroRow.Entry acc_entry;
191: int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
192: kelondroBitfield acc_flags, new_flags;
193: while (el.hasNext()) {
194: key = (String) el.next();
195: new_entry = source_cr.getEntry(key);
196: new_flags = new kelondroBitfield(
197: kelondroBase64Order.enhancedCoder.encodeLong(
198: (long) new_entry.getAttr("Flags", 0), 1)
199: .getBytes());
200: // enrich information with additional values
201: if ((acc_entry = acc.get(key.getBytes())) != null) {
202: FUDate = (int) acc_entry.getColLong("FUDate", 0);
203: FDDate = (int) acc_entry.getColLong("FDDate", 0);
204: LUDate = (int) acc_entry.getColLong("LUDate", 0);
205: UCount = (int) acc_entry.getColLong("UCount", 0);
206: PCount = (int) acc_entry.getColLong("PCount", 0);
207: ACount = (int) acc_entry.getColLong("ACount", 0);
208: VCount = (int) acc_entry.getColLong("VCount", 0);
209: Vita = (int) acc_entry.getColLong("Vita", 0);
210:
211: // update counters and dates
212: seq.put(key.getBytes(), new_entry.getSeqCollection()); // FIXME: old and new collection must be joined
213:
214: UCount++; // increase update counter
215: PCount += (new_flags.get(1)) ? 1 : 0;
216: ACount += (new_flags.get(2)) ? 1 : 0;
217: VCount += (new_flags.get(3)) ? 1 : 0;
218:
219: // 'OR' the flags
220: acc_flags = new kelondroBitfield(
221: kelondroBase64Order.enhancedCoder.encodeLong(
222: acc_entry.getColLong("Flags", 0), 1)
223: .getBytes());
224: for (int i = 0; i < 6; i++) {
225: if (new_flags.get(i))
226: acc_flags.set(i, true);
227: }
228: acc_entry.setCol("Flags",
229: (int) kelondroBase64Order.enhancedCoder
230: .decodeLong(acc_flags.exportB64()));
231: } else {
232: // initialize counters and dates
233: acc_entry = acc.row().newEntry();
234: acc_entry.setCol("Referee", key, null);
235: for (int i = 1; i < acc.row().columns(); i++) {
236: acc_entry.setCol(i, new_entry.getAttr(acc.row()
237: .column(i).nickname, 0));
238: }
239: seq.put(key.getBytes(), new_entry.getSeqCollection());
240: FUDate = plasmaWordIndex.microDateHoursInt(System
241: .currentTimeMillis()); // first update date
242: FDDate = plasmaWordIndex.microDateHoursInt(System
243: .currentTimeMillis()); // very difficult to compute; this is only a quick-hack
244: LUDate = (int) new_entry.getAttr("VDate", 0);
245: UCount = 0;
246: PCount = (new_flags.get(1)) ? 1 : 0;
247: ACount = (new_flags.get(2)) ? 1 : 0;
248: VCount = (new_flags.get(3)) ? 1 : 0;
249: Vita = 0;
250: }
251: // make plausibility check?
252:
253: // insert into accumulator
254: acc_entry.setCol("FUDate", (long) FUDate);
255: acc_entry.setCol("FDDate", (long) FDDate);
256: acc_entry.setCol("LUDate", (long) LUDate);
257: acc_entry.setCol("UCount", (long) UCount);
258: acc_entry.setCol("PCount", (long) PCount);
259: acc_entry.setCol("ACount", (long) ACount);
260: acc_entry.setCol("VCount", (long) VCount);
261: acc_entry.setCol("Vita", (long) Vita);
262: acc.put(acc_entry);
263: }
264:
265: return true;
266: }
267:
268: public static void accumulate(File from_dir, File tmp_dir,
269: File err_dir, File bkp_dir, File to_file, int max_files,
270: boolean newdb) throws IOException {
271: if (!(from_dir.isDirectory())) {
272: System.out.println("source path " + from_dir
273: + " is not a directory.");
274: return;
275: }
276: if (!(tmp_dir.isDirectory())) {
277: System.out.println("temporary path " + tmp_dir
278: + " is not a directory.");
279: return;
280: }
281: if (!(err_dir.isDirectory())) {
282: System.out.println("error path " + err_dir
283: + " is not a directory.");
284: return;
285: }
286: if (!(bkp_dir.isDirectory())) {
287: System.out.println("back-up path " + bkp_dir
288: + " is not a directory.");
289: return;
290: }
291:
292: // open target file
293: kelondroAttrSeq acc = null;
294: kelondroIndex newacc = null;
295: kelondroCollectionIndex newseq = null;
296: if (newdb) {
297: File path = to_file.getParentFile(); // path to storage place
298: newacc = new kelondroFlexTable(path, CRG_accname, -1,
299: CRG_accrow, 0, false);
300: newseq = new kelondroCollectionIndex(path, CRG_seqname, 12,
301: kelondroBase64Order.enhancedCoder, -1, 2, 9,
302: CRG_colrow);
303: } else {
304: if (!(to_file.exists())) {
305: acc = new kelondroAttrSeq(
306: "Global Ranking Accumulator File",
307: "<Referee-12>,'=',"
308: + "<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,"
309: + "<FUDate-3>,<FDDate-3>,<LUDate-3>,<UCount-2>,<PCount-2>,<ACount-2>,<VCount-2>,<Vita-2>,"
310: + "'|',*<Anchor-12>", false);
311: acc.toFile(to_file);
312: }
313: acc = new kelondroAttrSeq(to_file, false);
314: }
315: // collect source files
316: File source_file = null;
317: String[] files = from_dir.list();
318: if (files.length < max_files)
319: max_files = files.length;
320: for (int i = 0; i < max_files; i++) {
321: // open file
322: source_file = new File(from_dir, files[i]);
323: if (newdb) {
324: if (accumulate_upd(source_file, newacc, newseq)) {
325: // move CR file to temporary folder
326: source_file.renameTo(new File(tmp_dir, files[i]));
327: } else {
328: // error case: the CR-file is not valid; move to error path
329: source_file.renameTo(new File(err_dir, files[i]));
330: }
331: } else {
332: if (accumulate_upd(source_file, acc)) {
333: // move CR file to temporary folder
334: source_file.renameTo(new File(tmp_dir, files[i]));
335: } else {
336: // error case: the CR-file is not valid; move to error path
337: source_file.renameTo(new File(err_dir, files[i]));
338: }
339: }
340: }
341:
342: try {
343: if (newdb) {
344: newacc.close();
345: newseq.close();
346: } else {
347: // save accumulator to temporary file
348: File tmp_file;
349: if (to_file.toString().endsWith(".gz")) {
350: tmp_file = new File(to_file.toString() + "."
351: + (System.currentTimeMillis() % 1000)
352: + ".tmp.gz");
353: } else {
354: tmp_file = new File(to_file.toString() + "."
355: + (System.currentTimeMillis() % 1000)
356: + ".tmp");
357: }
358: // store the file
359: acc.toFile(tmp_file);
360: // since this was successful, we remove the old file and move the new file to it
361: to_file.delete();
362: tmp_file.renameTo(to_file);
363: }
364: serverFileUtils.moveAll(tmp_dir, bkp_dir);
365: } catch (IOException e) {
366: // move previously processed files back
367: e.printStackTrace();
368: serverFileUtils.moveAll(tmp_dir, from_dir);
369: }
370:
371: }
372:
373: public static int genrci(File cr_in, File rci_out)
374: throws IOException {
375: if (!(cr_in.exists()))
376: return 0;
377: kelondroAttrSeq cr = new kelondroAttrSeq(cr_in, false);
378: //if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing)
379: if (!(rci_out.exists())) {
380: kelondroAttrSeq rcix = new kelondroAttrSeq(
381: "Global Ranking Reverse Citation Index",
382: "<AnchorDom-6>,'='," + "<UDate-3>,"
383: + "'|',*<Referee-12>", false);
384: rcix.toFile(rci_out);
385: }
386: final kelondroAttrSeq rci = new kelondroAttrSeq(rci_out, false);
387:
388: // loop over all referees
389: int count = 0;
390: int size = cr.size();
391: long start = System.currentTimeMillis();
392: long l;
393: final Iterator<String> i = cr.keys();
394: String referee, anchor, anchorDom;
395: kelondroAttrSeq.Entry cr_entry, rci_entry;
396: long cr_UDate, rci_UDate;
397: while (i.hasNext()) {
398: referee = i.next();
399: cr_entry = cr.getEntry(referee);
400: cr_UDate = cr_entry.getAttr("UDate", 0);
401:
402: // loop over all anchors
403: Iterator<String> j = cr_entry.getSeqSet().iterator();
404: while (j.hasNext()) {
405: // get domain of anchors
406: anchor = j.next();
407: if (anchor.length() == 6)
408: anchorDom = anchor;
409: else
410: anchorDom = anchor.substring(6);
411:
412: // update domain-specific entry
413: rci_entry = rci.getEntry(anchorDom);
414: if (rci_entry == null)
415: rci_entry = rci.newEntry(anchorDom, false);
416: rci_entry.addSeq(referee);
417:
418: // update Update-Date
419: rci_UDate = rci_entry.getAttr("UDate", 0);
420: if (cr_UDate > rci_UDate)
421: rci_entry.setAttr("UDate", cr_UDate);
422:
423: // insert entry
424: rci.putEntry(rci_entry);
425: }
426: count++;
427: if ((count % 1000) == 0) {
428: l = java.lang.Math.max(1,
429: (System.currentTimeMillis() - start) / 1000);
430: System.out.println("processed " + count
431: + " citations, " + (count / l)
432: + " per second, rci.size = " + rci.size()
433: + ", " + ((size - count) / (count / l))
434: + " seconds remaining; mem = "
435: + serverMemory.available());
436: }
437: i.remove();
438: }
439:
440: // finished. write to file
441: cr = null;
442: cr_in = null;
443: serverMemory.gc(1000, "plasmaRankingCRProcess.genrci(...)"); // thq
444: rci.toFile(rci_out);
445: return count;
446: }
447:
448: public static int genrcix(File cr_path_in, File rci_path_out)
449: throws IOException {
450: //kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true);
451: kelondroCollectionIndex seq = new kelondroCollectionIndex(
452: cr_path_in, CRG_seqname, 12,
453: kelondroBase64Order.enhancedCoder, -1, 2, 9, CRG_colrow);
454: kelondroCollectionIndex rci = new kelondroCollectionIndex(
455: rci_path_out, RCI_colname, 6,
456: kelondroBase64Order.enhancedCoder, -1, 2, 9, RCI_coli);
457:
458: // loop over all referees
459: int count = 0;
460: int size = seq.size();
461: long start = System.currentTimeMillis();
462: long l;
463: final Iterator<Object[]> i = seq.keycollections(null, null,
464: false);
465: Object[] keycollection;
466: String referee, refereeDom, anchor, anchorDom;
467: kelondroRowSet cr_entry, rci_entry;
468: while (i.hasNext()) {
469: keycollection = i.next();
470: referee = new String((byte[]) keycollection[0]);
471: if (referee.length() == 6)
472: refereeDom = referee;
473: else
474: refereeDom = referee.substring(6);
475: cr_entry = (kelondroRowSet) keycollection[1];
476:
477: // loop over all anchors
478: Iterator<kelondroRow.Entry> j = cr_entry.rows();
479: kelondroRow.Entry entry;
480: while (j.hasNext()) {
481: // get domain of anchors
482: entry = j.next();
483: anchor = (String) entry.getColString(0, null);
484: if (anchor.length() == 6)
485: anchorDom = anchor;
486: else
487: anchorDom = anchor.substring(6);
488:
489: // update domain-specific entry
490: rci_entry = rci.get(anchorDom.getBytes());
491: if (rci_entry == null)
492: rci_entry = new kelondroRowSet(RCI_coli, 0);
493: rci_entry.add(refereeDom.getBytes());
494:
495: // insert entry
496: rci.put(anchorDom.getBytes(), rci_entry);
497: }
498: count++;
499: if ((count % 1000) == 0) {
500: l = java.lang.Math.max(1,
501: (System.currentTimeMillis() - start) / 1000);
502: System.out.println("processed " + count
503: + " citations, " + (count / l)
504: + " per second, rci.size = " + rci.size()
505: + ", " + ((size - count) / (count / l) / 60)
506: + " minutes remaining; mem = "
507: + Runtime.getRuntime().freeMemory());
508: }
509: }
510:
511: // finished. write to file
512: seq.close();
513: rci.close();
514: return count;
515: }
516:
517: public static void main(String[] args) {
518: // java -classpath source de.anomic.plasma.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr
519: try {
520: if ((args.length == 5) && (args[0].equals("-accumulate"))) {
521: accumulate(new File(args[1]), new File(args[2]),
522: new File(args[3]), new File(args[4]), new File(
523: args[5]), Integer.parseInt(args[6]),
524: true);
525: }
526: if ((args.length == 2) && (args[0].equals("-accumulate"))) {
527: File root_path = new File(args[1]);
528: File from_dir = new File(root_path,
529: "DATA/RANKING/GLOBAL/014_othercr");
530: File ready_dir = new File(root_path,
531: "DATA/RANKING/GLOBAL/015_ready");
532: File tmp_dir = new File(root_path,
533: "DATA/RANKING/GLOBAL/016_tmp");
534: File err_dir = new File(root_path,
535: "DATA/RANKING/GLOBAL/017_err");
536: File acc_dir = new File(root_path,
537: "DATA/RANKING/GLOBAL/018_acc");
538: String filename = "CRG-a-"
539: + new serverDate().toShortString(true)
540: + ".cr.gz";
541: File to_file = new File(root_path,
542: "DATA/RANKING/GLOBAL/020_con0/" + filename);
543: if (!(ready_dir.exists()))
544: ready_dir.mkdirs();
545: if (!(tmp_dir.exists()))
546: tmp_dir.mkdirs();
547: if (!(err_dir.exists()))
548: err_dir.mkdirs();
549: if (!(acc_dir.exists()))
550: acc_dir.mkdirs();
551: if (!(to_file.getParentFile().exists()))
552: to_file.getParentFile().mkdirs();
553: serverFileUtils.moveAll(from_dir, ready_dir);
554: long start = System.currentTimeMillis();
555: int files = ready_dir.list().length;
556: accumulate(ready_dir, tmp_dir, err_dir, acc_dir,
557: to_file, 1000, true);
558: long seconds = java.lang.Math.max(1, (System
559: .currentTimeMillis() - start) / 1000);
560: System.out.println("Finished accumulate for " + files
561: + " files in " + seconds + " seconds ("
562: + (files / seconds) + " files/second)");
563: }
564: if ((args.length == 3) && (args[0].equals("-recycle"))) {
565: File root_path = new File(args[1]);
566: int max_age_hours = Integer.parseInt(args[2]);
567: File own_dir = new File(root_path,
568: "DATA/RANKING/GLOBAL/010_owncr");
569: File acc_dir = new File(root_path,
570: "DATA/RANKING/GLOBAL/018_acc");
571: File bkp_dir = new File(root_path,
572: "DATA/RANKING/GLOBAL/019_bkp");
573: if (!(own_dir.exists()))
574: return;
575: if (!(acc_dir.exists()))
576: return;
577: if (!(bkp_dir.exists()))
578: bkp_dir.mkdirs();
579: String[] list = acc_dir.list();
580: long start = System.currentTimeMillis();
581: int files = list.length;
582: long d;
583: File f;
584: for (int i = 0; i < list.length; i++) {
585: f = new File(acc_dir, list[i]);
586: try {
587: d = (System.currentTimeMillis() - (new kelondroAttrSeq(
588: f, false)).created()) / 3600000;
589: if (d > max_age_hours) {
590: // file is considered to be too old, it is not recycled
591: System.out
592: .println("file "
593: + f.getName()
594: + " is old ("
595: + d
596: + " hours) and not recycled, only moved to backup");
597: f.renameTo(new File(bkp_dir, list[i]));
598: } else {
599: // file is fresh, it is duplicated and moved to be transferred to other peers again
600: System.out
601: .println("file "
602: + f.getName()
603: + " is fresh ("
604: + d
605: + " hours old), recycled and moved to backup");
606: serverFileUtils.copy(f, new File(own_dir,
607: list[i]));
608: f.renameTo(new File(bkp_dir, list[i]));
609: }
610: } catch (IOException e) {
611: // there is something wrong with this file; delete it
612: System.out.println("file " + f.getName()
613: + " is corrupted and deleted");
614: f.delete();
615: }
616: }
617: long seconds = java.lang.Math.max(1, (System
618: .currentTimeMillis() - start) / 1000);
619: System.out.println("Finished recycling of " + files
620: + " files in " + seconds + " seconds ("
621: + (files / seconds) + " files/second)");
622: }
623: if ((args.length == 2) && (args[0].equals("-genrci"))) {
624: File root_path = new File(args[1]);
625: File cr_filedir = new File(root_path,
626: "DATA/RANKING/GLOBAL/020_con0");
627: File rci_filedir = new File(root_path,
628: "DATA/RANKING/GLOBAL/030_rci0");
629: rci_filedir.mkdirs();
630: long start = System.currentTimeMillis();
631: int count = genrcix(cr_filedir, rci_filedir);
632: long seconds = java.lang.Math.max(1, (System
633: .currentTimeMillis() - start) / 1000);
634: System.out.println("Completed RCI generation: " + count
635: + " citation references in " + seconds
636: + " seconds (" + (count / seconds)
637: + " CR-records/second)");
638: }
639: /*
640: if ((args.length == 2) && (args[0].equals("-genrci"))) {
641: File root_path = new File(args[1]);
642: File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0");
643: File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
644: rci_file.getParentFile().mkdirs();
645: String[] cr_filenames = cr_filedir.list();
646: for (int i = 0; i < cr_filenames.length; i++) {
647: long start = System.currentTimeMillis();
648: int count = genrci(new File(cr_filedir, cr_filenames[i]), rci_file);
649: long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
650: System.out.println("Completed RCI generation for input file " + cr_filenames[i] + ": " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
651: }
652: }
653: */
654: } catch (IOException e) {
655: e.printStackTrace();
656: }
657: }
658:
659: /*
660: Class-A File format:
661:
662: UDate : latest update timestamp of the URL (as virtual date, hours since epoch)
663: VDate : last visit timestamp of the URL (as virtual date, hours since epoch)
664: LCount : count of links to local resources
665: GCount : count of links to global resources
666: ICount : count of links to images (in document)
667: DCount : count of links to other documents
668: TLength: length of the plain text content (bytes)
669: WACount: total number of all words in content
670: WUCount: number of unique words in content (removed doubles)
671: Flags : Flags (0=update, 1=popularity, 2=attention, 3=vote)
672:
673: Class-a File format is an extension of Class-A plus the following attributes
674: FUDate : first update timestamp of the URL
675: FDDate : first update timestamp of the domain
676: LUDate : latest update timestamp of the URL
677: UCount : Update Counter (of 'latest update timestamp')
678: PCount : Popularity Counter (proxy clicks)
679: ACount : Attention Counter (search result clicks)
680: VCount : Votes
681: Vita : Vitality (normed number of updates per time)
682: */
683: }
|