001: // kelondroAttrSeq.java
002: // -----------------------
003: // part of YaCy
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2005
007: // Created 15.11.2005
008: //
009: // $LastChangedDate: 2005-10-22 15:28:04 +0200 (Sat, 22 Oct 2005) $
010: // $LastChangedRevision: 968 $
011: // $LastChangedBy: theli $
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026: //
027: // Using this software in any meaning (reading, learning, copying, compiling,
028: // running) means that you agree that the Author(s) is (are) not responsible
029: // for cost, loss of data or any harm that may be caused directly or indirectly
030: // by usage of this softare or this documentation. The usage of this software
031: // is on your own risk. The installation and usage (starting/running) of this
032: // software may allow other people or application to access your computer and
033: // any attached devices and is highly dependent on the configuration of the
034: // software which must be done by the user of the software; the author(s) is
035: // (are) also not responsible for proper configuration and usage of the
036: // software, even if provoked by documentation provided together with
037: // the software.
038: //
039: // Any changes to this file according to the GPL as documented in the file
040: // gpl.txt aside this file in the shipment you received can be done to the
041: // lines that follows this copyright notice here, but changes must not be
042: // done inside the copyright notive above. A re-distribution must contain
043: // the intact and unchanged copyright notice.
044: // Contributions and changes to the program code must be marked as such.
045:
046: package de.anomic.kelondro;
047:
048: import java.io.BufferedReader;
049: import java.io.File;
050: import java.io.FileInputStream;
051: import java.io.IOException;
052: import java.io.InputStreamReader;
053: import java.util.ArrayList;
054: import java.util.HashMap;
055: import java.util.HashSet;
056: import java.util.Iterator;
057: import java.util.Map;
058: import java.util.Set;
059: import java.util.StringTokenizer;
060: import java.util.TreeMap;
061: import java.util.TreeSet;
062: import java.util.logging.Logger;
063: import java.util.zip.GZIPInputStream;
064:
065: import de.anomic.server.serverFileUtils;
066: import de.anomic.server.serverMemory;
067:
068: public class kelondroAttrSeq {
069:
070: // class objects
071: private File file;
072: private Map<String, Object> entries; // value may be of type String or of type Entry
073: protected Structure structure;
074: private String name;
075: private long created;
076:
077: // optional logger
078: protected Logger theLogger = null;
079:
/**
 * Creates an attribute sequence by loading it from a file.
 * The header values (structure, name, creation time) start out unset and are
 * filled in by {@code readAttrFile}.
 *
 * @param file the file to load; a name ending in ".gz" is read as GZIP
 * @param tree {@code true} to keep the entries in a {@link TreeMap},
 *             {@code false} to keep them in a {@link HashMap}
 * @throws IOException if reading fails or a required header tag is missing
 */
public kelondroAttrSeq(File file, boolean tree) throws IOException {
    if (tree) {
        this.entries = new TreeMap<String, Object>();
    } else {
        this.entries = new HashMap<String, Object>();
    }
    this.name = "";
    this.created = -1;
    this.structure = null;
    this.file = file;
    readAttrFile(file);
}
089:
090: public kelondroAttrSeq(String name, String struct, boolean tree) {
091: this .file = null;
092: this .structure = new Structure(struct);
093: this .created = System.currentTimeMillis();
094: this .name = name;
095: this .entries = (tree) ? (Map<String, Object>) new TreeMap<String, Object>()
096: : (Map<String, Object>) new HashMap<String, Object>();
097: }
098:
099: public void setLogger(Logger newLogger) {
100: this .theLogger = newLogger;
101: }
102:
103: public void logInfo(String message) {
104: if (this .theLogger == null)
105: System.err.println("ATTRSEQ INFO for file " + this .file
106: + ": " + message);
107: else
108: this .theLogger.info("ATTRSEQ INFO for file " + this .file
109: + ": " + message);
110: }
111:
112: public void logWarning(String message) {
113: if (this .theLogger == null)
114: System.err.println("ATTRSEQ WARNING for file " + this .file
115: + ": " + message);
116: else
117: this .theLogger.warning("ATTRSEQ WARNING for file "
118: + this .file + ": " + message);
119: }
120:
121: private void readAttrFile(File loadfile) throws IOException {
122: BufferedReader br = null;
123: int p;
124: if (loadfile.toString().endsWith(".gz")) {
125: br = new BufferedReader(new InputStreamReader(
126: new GZIPInputStream(new FileInputStream(loadfile))));
127: } else {
128: br = new BufferedReader(new InputStreamReader(
129: new FileInputStream(loadfile)));
130: }
131: String line, key, oldvalue, newvalue;
132: while ((line = br.readLine()) != null) {
133: line = line.trim();
134: if (line.length() == 0)
135: continue;
136: if (line.startsWith("#")) {
137: if (line.startsWith("# Structure=")) {
138: structure = new Structure(line.substring(12));
139: }
140: if (line.startsWith("# Name=")) {
141: name = line.substring(7);
142: }
143: if (line.startsWith("# Created=")) {
144: created = Long.parseLong(line.substring(10));
145: }
146: continue;
147: }
148: if ((p = line.indexOf('=')) > 0) {
149: key = line.substring(0, p).trim();
150: newvalue = line.substring(p + 1).trim();
151: oldvalue = (String) entries.get(key);
152: if (oldvalue != null) {
153: if (newvalue.equals(oldvalue)) {
154: //logWarning("key " + key + ": double occurrence. values are equal. second appearance is ignored");
155: } else {
156: if (newvalue.length() < oldvalue.length()) {
157: if (oldvalue
158: .substring(0, newvalue.length())
159: .equals(newvalue)) {
160: logWarning("key "
161: + key
162: + ": double occurrence. new value is subset of old value. second appearance is ignored");
163: } else {
164: logWarning("key "
165: + key
166: + ": double occurrence. new value is shorter than old value, but not a subsequence. old = "
167: + oldvalue + ", new = "
168: + newvalue);
169: }
170: } else if (newvalue.length() > oldvalue
171: .length()) {
172: if (newvalue
173: .substring(0, oldvalue.length())
174: .equals(oldvalue)) {
175: logWarning("key "
176: + key
177: + ": double occurrence. old value is subset of new value. first appearance is ignored");
178: } else {
179: logWarning("key "
180: + key
181: + ": double occurrence. old value is shorter than new value, but not a subsequence. old = "
182: + oldvalue + ", new = "
183: + newvalue);
184: }
185: entries.put(key, newvalue);
186: } else {
187: logWarning("key "
188: + key
189: + ": double occurrence. old and new value have equal length but are not equal. old = "
190: + oldvalue + ", new = " + newvalue);
191: //entries.put(key, newvalue);
192: }
193: }
194: } else {
195: entries.put(key, newvalue);
196: }
197: }
198: }
199: br.close();
200: if (structure == null)
201: throw new IOException("file contains no structure tag");
202: if (name == null)
203: throw new IOException("file contains no name tag");
204: if (created == -1)
205: throw new IOException("file contains no created tag");
206: }
207:
208: public int size() {
209: return entries.size();
210: }
211:
212: public long created() {
213: return this .created;
214: }
215:
216: public void toFile(File out) throws IOException {
217: // generate header
218: StringBuffer sb = new StringBuffer(2000);
219: sb.append("# Name=" + this .name);
220: sb.append((char) 13);
221: sb.append((char) 10);
222: sb.append("# Created=" + this .created);
223: sb.append((char) 13);
224: sb.append((char) 10);
225: sb.append("# Structure=" + this .structure.toString());
226: sb.append((char) 13);
227: sb.append((char) 10);
228: sb.append("# ---");
229: sb.append((char) 13);
230: sb.append((char) 10);
231: Iterator<Map.Entry<String, Object>> i = entries.entrySet()
232: .iterator();
233: Map.Entry<String, Object> entry;
234: String k;
235: Object v;
236: while (i.hasNext()) {
237: entry = i.next();
238: k = (String) entry.getKey();
239: v = entry.getValue();
240: sb.append(k);
241: sb.append('=');
242: if (v instanceof String)
243: sb.append((String) v);
244: if (v instanceof Entry)
245: sb.append(((Entry) v).toString());
246: sb.append((char) 13);
247: sb.append((char) 10);
248: }
249: if (out.toString().endsWith(".gz")) {
250: serverFileUtils.writeAndGZip((new String(sb)).getBytes(),
251: out);
252: } else {
253: serverFileUtils.write((new String(sb)).getBytes(), out);
254: }
255: }
256:
257: public Iterator<String> keys() {
258: return entries.keySet().iterator();
259: }
260:
261: public Entry newEntry(String pivot, boolean tree) {
262: return new Entry(pivot, new HashMap<String, Long>(),
263: (tree) ? (Set<String>) new TreeSet<String>()
264: : (Set<String>) new HashSet<String>());
265: }
266:
267: public Entry newEntry(String pivot, HashMap<String, Long> props,
268: Set<String> seq) {
269: return new Entry(pivot, props, seq);
270: }
271:
272: /*
273: public void putEntry(String pivot, String attrseq) {
274: entries.put(pivot, attrseq);
275: }
276: */
277:
278: public void putEntry(Entry entry) {
279: if (shortmem())
280: entries.put(entry.pivot, entry.toString());
281: else
282: entries.put(entry.pivot, entry);
283: }
284:
285: public void putEntrySmall(Entry entry) {
286: entries.put(entry.pivot, entry.toString());
287: }
288:
289: public Entry getEntry(String pivot) {
290: Object e = entries.get(pivot);
291: if (e == null)
292: return null;
293: if (e instanceof String)
294: return new Entry(pivot, (String) e, false);
295: if (e instanceof Entry)
296: return (Entry) e;
297: return null;
298: }
299:
300: public Entry removeEntry(String pivot) {
301: Object e = entries.remove(pivot);
302: if (e == null)
303: return null;
304: if (e instanceof String)
305: return new Entry(pivot, (String) e, false);
306: if (e instanceof Entry)
307: return (Entry) e;
308: return null;
309: }
310:
311: public class Structure {
312:
313: protected String pivot_name = null;
314: protected int pivot_len = -1;
315: protected String[] prop_names = null;
316: protected int[] prop_len = null, prop_pos = null;
317: protected String[] seq_names = null;
318: protected int[] seq_len = null, seq_pos = null;
319: protected kelondroRow seqrow;
320:
321: // example:
322: //# Structure=<pivot-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-12>
323:
324: public Structure(String structure) {
325: // parse a structure string
326:
327: // parse pivot definition:
328: int p = structure.indexOf(",'='");
329: if (p < 0)
330: return;
331: String pivot = structure.substring(0, p);
332: structure = structure.substring(p + 5);
333: kelondroColumn a = new kelondroColumn(pivot);
334: pivot_name = a.nickname;
335: pivot_len = a.cellwidth;
336:
337: // parse property part definition:
338: p = structure.indexOf(",'|'");
339: if (p < 0)
340: return;
341: ArrayList<kelondroColumn> l = new ArrayList<kelondroColumn>();
342: String attr = structure.substring(0, p);
343: String seqs = structure.substring(p + 5);
344: StringTokenizer st = new StringTokenizer(attr, ",");
345: while (st.hasMoreTokens()) {
346: a = new kelondroColumn(st.nextToken());
347: if (a == null)
348: break;
349: l.add(a);
350: }
351: prop_names = new String[l.size()];
352: prop_len = new int[l.size()];
353: prop_pos = new int[l.size()];
354: p = 0;
355: for (int i = 0; i < l.size(); i++) {
356: a = (kelondroColumn) l.get(i);
357: prop_names[i] = a.nickname;
358: prop_len[i] = a.cellwidth;
359: prop_pos[i] = p;
360: p += prop_len[i];
361: }
362:
363: // parse sequence definition:
364: if (seqs.startsWith("*"))
365: seqs = seqs.substring(1);
366: l = new ArrayList<kelondroColumn>();
367: st = new StringTokenizer(seqs, ",");
368: while (st.hasMoreTokens()) {
369: a = new kelondroColumn(st.nextToken());
370: if (a == null)
371: break;
372: l.add(a);
373: }
374: seq_names = new String[l.size()];
375: seq_len = new int[l.size()];
376: seq_pos = new int[l.size()];
377: p = 0;
378: for (int i = 0; i < l.size(); i++) {
379: a = (kelondroColumn) l.get(i);
380: seq_names[i] = a.nickname;
381: seq_len[i] = a.cellwidth;
382: seq_pos[i] = p;
383: p += seq_len[i];
384: }
385:
386: // generate rowdef for seq row definition
387: StringBuffer rowdef = new StringBuffer();
388: rowdef.append("byte[] ");
389: rowdef.append(seq_names[0]);
390: rowdef.append('-');
391: rowdef.append(seq_len[0]);
392:
393: for (int i = 1; i < seq_names.length; i++) {
394: rowdef.append(", byte[] ");
395: rowdef.append(seq_names[i]);
396: rowdef.append('-');
397: rowdef.append(seq_len[i]);
398: }
399: seqrow = new kelondroRow(new String(rowdef), null, 0);
400: }
401:
402: public String toString() {
403: StringBuffer sb = new StringBuffer(100);
404: sb.append('<');
405: sb.append(pivot_name);
406: sb.append('-');
407: sb.append(Integer.toString(pivot_len));
408: sb.append(">,'=',");
409: if (prop_names.length > 0) {
410: for (int i = 0; i < prop_names.length; i++) {
411: sb.append('<');
412: sb.append(prop_names[i]);
413: sb.append('-');
414: sb.append(Integer.toString(prop_len[i]));
415: sb.append(">,");
416: }
417: }
418: sb.append("'|'");
419: if (seq_names.length > 0) {
420: for (int i = 0; i < seq_names.length; i++) {
421: sb.append(",<");
422: sb.append(seq_names[i]);
423: sb.append('-');
424: sb.append(Integer.toString(seq_len[i]));
425: sb.append('>');
426: }
427: }
428: return new String(sb);
429: }
430: }
431:
432: public class Entry {
433: String pivot;
434: HashMap<String, Long> attrs;
435: Set<String> seq;
436:
437: public Entry(String pivot, HashMap<String, Long> attrs,
438: Set<String> seq) {
439: this .pivot = pivot;
440: this .attrs = attrs;
441: this .seq = seq;
442: }
443:
444: public Entry(String pivot, String attrseq, boolean tree) {
445: this .pivot = pivot;
446: attrs = new HashMap<String, Long>();
447: seq = (tree) ? (Set<String>) new TreeSet<String>()
448: : (Set<String>) new HashSet<String>();
449: for (int i = 0; i < structure.prop_names.length; i++) {
450: attrs
451: .put(
452: structure.prop_names[i],
453: new Long(
454: kelondroBase64Order.enhancedCoder
455: .decodeLong(attrseq
456: .substring(
457: structure.prop_pos[i],
458: structure.prop_pos[i]
459: + structure.prop_len[i]))));
460: }
461:
462: int p = attrseq.indexOf('|') + 1;
463: //long[] seqattrs = new long[structure.seq_names.length - 1];
464: String seqname;
465: while (p + structure.seq_len[0] <= attrseq.length()) {
466: seqname = attrseq
467: .substring(p, p + structure.seq_len[0]);
468: p += structure.seq_len[0];
469: for (int i = 1; i < structure.seq_names.length; i++) {
470: //seqattrs[i - 1] = kelondroBase64Order.enhancedCoder.decodeLong(attrseq.substring(p, p + structure.seq_len[i]));
471: p += structure.seq_len[i];
472: }
473: seq.add(seqname/*, seqattrs*/);
474: }
475: }
476:
477: public HashMap<String, Long> getAttrs() {
478: return attrs;
479: }
480:
481: public long getAttr(String key, long dflt) {
482: Long i = (Long) attrs.get(key);
483: if (i == null)
484: return dflt;
485: return i.longValue();
486: }
487:
488: public void setAttr(String key, long attr) {
489: attrs.put(key, new Long(attr));
490: }
491:
492: public Set<String> getSeqSet() {
493: return seq;
494: }
495:
496: public kelondroRowCollection getSeqCollection() {
497: kelondroRowCollection collection = new kelondroRowCollection(
498: structure.seqrow, seq.size());
499: Iterator<String> i = seq.iterator();
500: while (i.hasNext()) {
501: collection.addUnique(structure.seqrow.newEntry(i.next()
502: .getBytes()));
503: }
504: return collection;
505: }
506:
507: public void setSeq(Set<String> seq) {
508: this .seq = seq;
509: }
510:
511: public void addSeq(String s/*, long[] seqattrs*/) {
512: this .seq.add(s/*, seqattrs*/);
513: }
514:
515: public String toString() {
516: // creates only the attribute field and the sequence, not the pivot
517: StringBuffer sb = new StringBuffer(100
518: + structure.seq_len[0] * seq.size());
519: Long val;
520: for (int i = 0; i < structure.prop_names.length; i++) {
521: val = (Long) attrs.get(structure.prop_names[i]);
522: sb.append(kelondroBase64Order.enhancedCoder
523: .encodeLongSmart((val == null) ? 0 : val
524: .longValue(), structure.prop_len[i]));
525: }
526: sb.append('|');
527: Iterator<String> q = seq.iterator();
528: //long[] seqattrs;
529: while (q.hasNext()) {
530: sb.append((String) q.next());
531: //seqattrs = (long[]) entry.getValue();
532: /*
533: for (int i = 1; i < structure.seq_names.length; i++) {
534: sb.append(kelondroBase64Order.enhancedCoder.encodeLong(seqattrs[i - 1], structure.seq_len[i]));
535: }
536: */
537: }
538: return new String(sb);
539: }
540: }
541:
542: private static final long cc = 0;
543: private static boolean shortmemstate = false;
544:
545: private static boolean shortmem() {
546: if ((cc % 300) == 0) {
547: shortmemstate = (serverMemory.available() < 20000000L);
548: }
549: return shortmemstate;
550: }
551:
552: public static void transcode(File from_file, File to_file)
553: throws IOException {
554: kelondroAttrSeq crp = new kelondroAttrSeq(from_file, true);
555: //crp.toFile(new File(args[1]));
556: kelondroAttrSeq cro = new kelondroAttrSeq(crp.name
557: + "/Transcoded from " + crp.file.getName(),
558: crp.structure.toString(), true);
559: Iterator<String> i = crp.entries.keySet().iterator();
560: while (i.hasNext()) {
561: cro.putEntry(crp.getEntry(i.next()));
562: }
563: cro.toFile(to_file);
564: }
565:
566: public static void main(String[] args) {
567: // java -classpath source de.anomic.kelondro.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr
568: try {
569: if ((args.length == 3) && (args[0].equals("-transcode"))) {
570: transcode(new File(args[1]), new File(args[2]));
571: }
572: } catch (IOException e) {
573: e.printStackTrace();
574: }
575: }
576:
577: }
|