0001: // kelondroRowCollection.java
0002: // (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
0003: // first published 12.01.2006 on http://www.anomic.de
0004: //
0005: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
0006: // $LastChangedRevision: 1986 $
0007: // $LastChangedBy: orbiter $
0008: //
0009: // LICENSE
0010: //
0011: // This program is free software; you can redistribute it and/or modify
0012: // it under the terms of the GNU General Public License as published by
0013: // the Free Software Foundation; either version 2 of the License, or
0014: // (at your option) any later version.
0015: //
0016: // This program is distributed in the hope that it will be useful,
0017: // but WITHOUT ANY WARRANTY; without even the implied warranty of
0018: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0019: // GNU General Public License for more details.
0020: //
0021: // You should have received a copy of the GNU General Public License
0022: // along with this program; if not, write to the Free Software
0023: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0024:
0025: package de.anomic.kelondro;
0026:
0027: import java.io.File;
0028: import java.io.IOException;
0029: import java.util.ArrayList;
0030: import java.util.Iterator;
0031: import java.util.List;
0032: import java.util.Random;
0033: import java.util.Set;
0034:
0035: import de.anomic.server.serverFileUtils;
0036: import de.anomic.server.serverMemory;
0037: import de.anomic.server.logging.serverLog;
0038: import de.anomic.yacy.yacySeedDB;
0039:
0040: public class kelondroRowCollection {
0041:
0042: public static final double growfactor = 1.4;
0043: private static final int isortlimit = 20;
0044:
0045: protected byte[] chunkcache;
0046: protected int chunkcount;
0047: protected long lastTimeRead, lastTimeWrote;
0048: protected kelondroRow rowdef;
0049: protected int sortBound;
0050:
0051: private static final int exp_chunkcount = 0;
0052: private static final int exp_last_read = 1;
0053: private static final int exp_last_wrote = 2;
0054: private static final int exp_order_type = 3;
0055: private static final int exp_order_col = 4;
0056: private static final int exp_order_bound = 5;
0057: private static final int exp_collection = 6;
0058:
0059: private static int processors = Runtime.getRuntime()
0060: .availableProcessors();
0061:
0062: public kelondroRowCollection(kelondroRowCollection rc) {
0063: this .rowdef = rc.rowdef;
0064: this .chunkcache = rc.chunkcache;
0065: this .chunkcount = rc.chunkcount;
0066: this .sortBound = rc.sortBound;
0067: this .lastTimeRead = rc.lastTimeRead;
0068: this .lastTimeWrote = rc.lastTimeWrote;
0069: }
0070:
0071: public kelondroRowCollection(kelondroRow rowdef, int objectCount) {
0072: this .rowdef = rowdef;
0073: this .chunkcache = new byte[objectCount * rowdef.objectsize];
0074: this .chunkcount = 0;
0075: this .sortBound = 0;
0076: this .lastTimeRead = System.currentTimeMillis();
0077: this .lastTimeWrote = System.currentTimeMillis();
0078: }
0079:
0080: public kelondroRowCollection(kelondroRow rowdef, int objectCount,
0081: byte[] cache, int sortBound) {
0082: this .rowdef = rowdef;
0083: this .chunkcache = cache;
0084: this .chunkcount = objectCount;
0085: this .sortBound = sortBound;
0086: this .lastTimeRead = System.currentTimeMillis();
0087: this .lastTimeWrote = System.currentTimeMillis();
0088: }
0089:
0090: public kelondroRowCollection(kelondroRow rowdef,
0091: kelondroRow.Entry exportedCollectionRowEnvironment,
0092: int columnInEnvironment) {
0093: this .rowdef = rowdef;
0094: int chunkcachelength = exportedCollectionRowEnvironment
0095: .cellwidth(columnInEnvironment)
0096: - exportOverheadSize;
0097: kelondroRow.Entry exportedCollection = exportRow(
0098: chunkcachelength).newEntry(
0099: exportedCollectionRowEnvironment, columnInEnvironment);
0100: this .chunkcount = (int) exportedCollection
0101: .getColLong(exp_chunkcount);
0102: //assert (this.chunkcount <= chunkcachelength / rowdef.objectsize) : "chunkcount = " + this.chunkcount + ", chunkcachelength = " + chunkcachelength + ", rowdef.objectsize = " + rowdef.objectsize;
0103: if ((this .chunkcount > chunkcachelength / rowdef.objectsize)) {
0104: serverLog.logWarning("RowCollection",
0105: "corrected wrong chunkcount; chunkcount = "
0106: + this .chunkcount + ", chunkcachelength = "
0107: + chunkcachelength
0108: + ", rowdef.objectsize = "
0109: + rowdef.objectsize);
0110: this .chunkcount = chunkcachelength / rowdef.objectsize; // patch problem
0111: }
0112: this .lastTimeRead = (exportedCollection
0113: .getColLong(exp_last_read) + 10957)
0114: * day;
0115: this .lastTimeWrote = (exportedCollection
0116: .getColLong(exp_last_wrote) + 10957)
0117: * day;
0118: String sortOrderKey = exportedCollection.getColString(
0119: exp_order_type, null);
0120: kelondroByteOrder oldOrder = null;
0121: if ((sortOrderKey == null) || (sortOrderKey.equals("__"))) {
0122: oldOrder = null;
0123: } else {
0124: oldOrder = kelondroNaturalOrder.bySignature(sortOrderKey);
0125: if (oldOrder == null)
0126: oldOrder = kelondroBase64Order
0127: .bySignature(sortOrderKey);
0128: }
0129: if ((rowdef.objectOrder != null)
0130: && (oldOrder != null)
0131: && (!(rowdef.objectOrder.signature().equals(oldOrder
0132: .signature()))))
0133: throw new kelondroException(
0134: "old collection order does not match with new order; objectOrder.signature = "
0135: + rowdef.objectOrder.signature()
0136: + ", oldOrder.signature = "
0137: + oldOrder.signature());
0138: if (rowdef.primaryKeyIndex != (int) exportedCollection
0139: .getColLong(exp_order_col))
0140: throw new kelondroException(
0141: "old collection primary key does not match with new primary key");
0142: this .sortBound = (int) exportedCollection
0143: .getColLong(exp_order_bound);
0144: //assert (sortBound <= chunkcount) : "sortBound = " + sortBound + ", chunkcount = " + chunkcount;
0145: if (sortBound > chunkcount) {
0146: serverLog.logWarning("RowCollection",
0147: "corrected wrong sortBound; sortBound = "
0148: + sortBound + ", chunkcount = "
0149: + chunkcount);
0150: this .sortBound = chunkcount;
0151: }
0152: this .chunkcache = exportedCollection
0153: .getColBytes(exp_collection);
0154: }
0155:
0156: public void reset() {
0157: this .chunkcache = new byte[0];
0158: this .chunkcount = 0;
0159: this .sortBound = 0;
0160: }
0161:
0162: private static final kelondroRow exportMeasureRow = exportRow(0 /* no relevance */);
0163:
0164: protected static final int sizeOfExportedCollectionRows(
0165: kelondroRow.Entry exportedCollectionRowEnvironment,
0166: int columnInEnvironment) {
0167: kelondroRow.Entry exportedCollectionEntry = exportMeasureRow
0168: .newEntry(exportedCollectionRowEnvironment,
0169: columnInEnvironment);
0170: int chunkcount = (int) exportedCollectionEntry
0171: .getColLong(exp_chunkcount);
0172: return chunkcount;
0173: }
0174:
0175: private static final long day = 1000 * 60 * 60 * 24;
0176:
0177: public static int daysSince2000(long time) {
0178: return (int) (time / day) - 10957;
0179: }
0180:
0181: private static kelondroRow exportRow(int chunkcachelength) {
0182: // find out the size of this collection
0183: return new kelondroRow("int size-4 {b256},"
0184: + "short lastread-2 {b256},"
0185: + // as daysSince2000
0186: "short lastwrote-2 {b256},"
0187: + // as daysSince2000
0188: "byte[] orderkey-2," + "short ordercol-2 {b256},"
0189: + "short orderbound-2 {b256}," + "byte[] collection-"
0190: + chunkcachelength, kelondroNaturalOrder.naturalOrder,
0191: 0);
0192: }
0193:
0194: public static final int exportOverheadSize = 14;
0195:
0196: public synchronized byte[] exportCollection() {
0197: // returns null if the collection is empty
0198: trim(false);
0199: kelondroRow row = exportRow(chunkcache.length);
0200: kelondroRow.Entry entry = row.newEntry();
0201: assert (sortBound <= chunkcount) : "sortBound = " + sortBound
0202: + ", chunkcount = " + chunkcount;
0203: assert (this .chunkcount <= chunkcache.length
0204: / rowdef.objectsize) : "chunkcount = "
0205: + this .chunkcount + ", chunkcache.length = "
0206: + chunkcache.length + ", rowdef.objectsize = "
0207: + rowdef.objectsize;
0208: entry.setCol(exp_chunkcount, this .chunkcount);
0209: entry.setCol(exp_last_read, daysSince2000(this .lastTimeRead));
0210: entry.setCol(exp_last_wrote, daysSince2000(this .lastTimeWrote));
0211: entry.setCol(exp_order_type,
0212: (this .rowdef.objectOrder == null) ? "__".getBytes()
0213: : this .rowdef.objectOrder.signature()
0214: .getBytes());
0215: entry.setCol(exp_order_col, this .rowdef.primaryKeyIndex);
0216: entry.setCol(exp_order_bound, this .sortBound);
0217: entry.setCol(exp_collection, this .chunkcache);
0218: return entry.bytes();
0219: }
0220:
0221: public void saveCollection(File file) throws IOException {
0222: serverFileUtils.write(exportCollection(), file);
0223: }
0224:
0225: public kelondroRow row() {
0226: return this .rowdef;
0227: }
0228:
0229: private final void ensureSize(int elements) {
0230: int needed = elements * rowdef.objectsize;
0231: if (chunkcache.length >= needed)
0232: return;
0233: byte[] newChunkcache = new byte[(int) (needed * growfactor)]; // increase space
0234: System.arraycopy(chunkcache, 0, newChunkcache, 0,
0235: chunkcache.length);
0236: chunkcache = newChunkcache;
0237: newChunkcache = null;
0238: }
0239:
0240: public final long memoryNeededForGrow() {
0241: return (long) ((((long) (chunkcount + 1)) * ((long) rowdef.objectsize)) * growfactor);
0242: }
0243:
0244: public synchronized void trim(boolean plusGrowFactor) {
0245: if (chunkcache.length == 0)
0246: return;
0247: int needed = chunkcount * rowdef.objectsize;
0248: if (plusGrowFactor)
0249: needed = (int) (needed * growfactor);
0250: if (needed >= chunkcache.length)
0251: return; // in case that the growfactor causes that the cache would
0252: // grow instead of shrink, simply ignore the growfactor
0253: if (serverMemory.available() + 1000 < needed)
0254: return; // if the swap buffer is not available, we must give up.
0255: // This is not critical. Othervise we provoke a serious
0256: // problem with OOM
0257: byte[] newChunkcache = new byte[needed];
0258: System.arraycopy(chunkcache, 0, newChunkcache, 0, Math.min(
0259: chunkcache.length, newChunkcache.length));
0260: chunkcache = newChunkcache;
0261: newChunkcache = null;
0262: }
0263:
0264: public final long lastRead() {
0265: return lastTimeRead;
0266: }
0267:
0268: public final long lastWrote() {
0269: return lastTimeWrote;
0270: }
0271:
0272: public synchronized final byte[] getKey(int index) {
0273: assert (index >= 0) : "get: access with index " + index
0274: + " is below zero";
0275: assert (index < chunkcount) : "get: access with index " + index
0276: + " is above chunkcount " + chunkcount
0277: + "; sortBound = " + sortBound;
0278: assert (index * rowdef.objectsize < chunkcache.length);
0279: if ((chunkcache == null) || (rowdef == null))
0280: return null; // case may appear during shutdown
0281: if (index >= chunkcount)
0282: return null;
0283: if (index * rowdef.objectsize >= chunkcache.length)
0284: return null;
0285: this .lastTimeRead = System.currentTimeMillis();
0286: byte[] b = new byte[this .rowdef.width(0)];
0287: System.arraycopy(chunkcache, index * rowdef.objectsize, b, 0,
0288: b.length);
0289: return b;
0290: }
0291:
0292: public synchronized final kelondroRow.Entry get(int index) {
0293: assert (index >= 0) : "get: access with index " + index
0294: + " is below zero";
0295: assert (index < chunkcount) : "get: access with index " + index
0296: + " is above chunkcount " + chunkcount
0297: + "; sortBound = " + sortBound;
0298: assert (index * rowdef.objectsize < chunkcache.length);
0299: if ((chunkcache == null) || (rowdef == null))
0300: return null; // case may appear during shutdown
0301: if (index >= chunkcount)
0302: return null;
0303: if (index * rowdef.objectsize >= chunkcache.length)
0304: return null;
0305: this .lastTimeRead = System.currentTimeMillis();
0306: return rowdef.newEntry(chunkcache, index * rowdef.objectsize,
0307: true);
0308: }
0309:
0310: public synchronized final void set(int index, kelondroRow.Entry a) {
0311: assert (index >= 0) : "set: access with index " + index
0312: + " is below zero";
0313: ensureSize(index + 1);
0314: a.writeToArray(chunkcache, index * rowdef.objectsize);
0315: if (index >= chunkcount)
0316: chunkcount = index + 1;
0317: this .lastTimeWrote = System.currentTimeMillis();
0318: }
0319:
0320: public final void insertUnique(int index, kelondroRow.Entry a) {
0321: assert (a != null);
0322:
0323: if (index < chunkcount) {
0324: // make room
0325: ensureSize(chunkcount + 1);
0326: System.arraycopy(chunkcache, rowdef.objectsize * index,
0327: chunkcache, rowdef.objectsize * (index + 1),
0328: (chunkcount - index) * rowdef.objectsize);
0329: chunkcount++;
0330: }
0331: // insert entry into gap
0332: set(index, a);
0333: }
0334:
0335: public synchronized void addUnique(kelondroRow.Entry row) {
0336: byte[] r = row.bytes();
0337: addUnique(r, 0, r.length);
0338: }
0339:
0340: public synchronized void addUniqueMultiple(
0341: List<kelondroRow.Entry> rows) {
0342: assert this .sortBound == 0 : "sortBound = " + this .sortBound
0343: + ", chunkcount = " + this .chunkcount;
0344: Iterator<kelondroRow.Entry> i = rows.iterator();
0345: while (i.hasNext())
0346: addUnique(i.next());
0347: }
0348:
0349: public synchronized void add(byte[] a) {
0350: addUnique(a, 0, a.length);
0351: }
0352:
0353: private final void addUnique(byte[] a, int astart, int alength) {
0354: assert (a != null);
0355: assert (astart >= 0) && (astart < a.length) : " astart = " + a;
0356: assert (!(serverLog.allZero(a, astart, alength))) : "a = "
0357: + serverLog.arrayList(a, astart, alength);
0358: assert (alength > 0);
0359: assert (astart + alength <= a.length);
0360: if (bugappearance(a, astart, alength)) {
0361: System.out.println("*** DEBUG: patched wrong a = "
0362: + serverLog.arrayList(a, astart, alength));
0363: return; // TODO: this is temporary; remote peers may still submit bad entries
0364: }
0365: assert (!(bugappearance(a, astart, alength))) : "a = "
0366: + serverLog.arrayList(a, astart, alength);
0367: int l = Math.min(rowdef.objectsize, Math.min(alength, a.length
0368: - astart));
0369: ensureSize(chunkcount + 1);
0370: System.arraycopy(a, astart, chunkcache, rowdef.objectsize
0371: * chunkcount, l);
0372: chunkcount++;
0373: this .lastTimeWrote = System.currentTimeMillis();
0374: }
0375:
0376: private static boolean bugappearance(byte[] a, int astart,
0377: int alength) {
0378: // check strange appearances of '@[B', which is not a b64-value or any other hash fragment
0379: if (astart + 3 > alength)
0380: return false;
0381: loop: for (int i = astart; i <= alength - 3; i++) {
0382: if (a[i] != 64)
0383: continue loop;
0384: if (a[i + 1] != 91)
0385: continue loop;
0386: if (a[i + 2] != 66)
0387: continue loop;
0388: return true;
0389: }
0390: return false;
0391: }
0392:
0393: public synchronized final void addAllUnique(kelondroRowCollection c) {
0394: if (c == null)
0395: return;
0396: assert (rowdef.objectsize == c.rowdef.objectsize);
0397: ensureSize(chunkcount + c.size());
0398: System.arraycopy(c.chunkcache, 0, chunkcache, rowdef.objectsize
0399: * chunkcount, rowdef.objectsize * c.size());
0400: chunkcount += c.size();
0401: }
0402:
0403: /**
0404: * This method removes the entry at position p ensuring the order of the remaining
0405: * entries if specified by keepOrder.
0406: * Note: Keeping the order is expensive. If you want to remove more than one element in
0407: * a batch with this method, it'd be better to do the removes without order keeping and doing
0408: * the sort after all the removes are done.
0409: *
0410: * @param p element at this position will be removed
0411: * @param keepOrder keep the order of remaining entries
0412: */
0413: protected synchronized final void removeRow(int p, boolean keepOrder) {
0414: assert p >= 0 : "p = " + p;
0415: assert p < chunkcount : "p = " + p + ", chunkcount = "
0416: + chunkcount;
0417: assert chunkcount > 0 : "chunkcount = " + chunkcount;
0418: assert sortBound <= chunkcount : "sortBound = " + sortBound
0419: + ", chunkcount = " + chunkcount;
0420: if (keepOrder && (p < sortBound)) {
0421: // remove by shift (quite expensive for big collections)
0422: System.arraycopy(chunkcache, (p + 1)
0423: * this .rowdef.objectsize, chunkcache, p
0424: * this .rowdef.objectsize, (chunkcount - p - 1)
0425: * this .rowdef.objectsize);
0426: sortBound--;
0427: } else {
0428: // remove by copying the top-element to the remove position
0429: if (p != chunkcount - 1) {
0430: System.arraycopy(chunkcache, (chunkcount - 1)
0431: * this .rowdef.objectsize, chunkcache, p
0432: * this .rowdef.objectsize,
0433: this .rowdef.objectsize);
0434: }
0435: // we moved the last element to the remove position: (p+1)st element
0436: // only the first p elements keep their order (element p is already outside the order)
0437: if (sortBound >= p)
0438: sortBound = p;
0439: }
0440: chunkcount--;
0441: this .lastTimeWrote = System.currentTimeMillis();
0442: }
0443:
0444: public synchronized kelondroRow.Entry removeOne() {
0445: // removes the last entry from the collection
0446: if (chunkcount == 0)
0447: return null;
0448: kelondroRow.Entry r = get(chunkcount - 1);
0449: if (chunkcount == sortBound)
0450: sortBound--;
0451: chunkcount--;
0452: this .lastTimeWrote = System.currentTimeMillis();
0453: return r;
0454: }
0455:
0456: public synchronized void clear() {
0457: if (this .chunkcache.length == 0)
0458: return;
0459: this .chunkcache = new byte[0];
0460: this .chunkcount = 0;
0461: this .sortBound = 0;
0462: this .lastTimeWrote = System.currentTimeMillis();
0463: }
0464:
0465: public int size() {
0466: return chunkcount;
0467: }
0468:
0469: public synchronized Iterator<byte[]> keys() {
0470: // iterates byte[] - type entries
0471: return new keyIterator();
0472: }
0473:
0474: /**
0475: * Iterator for kelondroRowCollection.
0476: * It supports remove() though it doesn't contain the order of the underlying
0477: * collection during removes.
0478: *
0479: */
0480: public class keyIterator implements Iterator<byte[]> {
0481:
0482: private int p;
0483:
0484: public keyIterator() {
0485: p = 0;
0486: }
0487:
0488: public boolean hasNext() {
0489: return p < chunkcount;
0490: }
0491:
0492: public byte[] next() {
0493: return getKey(p++);
0494: }
0495:
0496: public void remove() {
0497: p--;
0498: removeRow(p, false);
0499: }
0500: }
0501:
0502: public synchronized Iterator<kelondroRow.Entry> rows() {
0503: // iterates kelondroRow.Entry - type entries
0504: return new rowIterator();
0505: }
0506:
0507: /**
0508: * Iterator for kelondroRowCollection.
0509: * It supports remove() though it doesn't contain the order of the underlying
0510: * collection during removes.
0511: *
0512: */
0513: public class rowIterator implements Iterator<kelondroRow.Entry> {
0514:
0515: private int p;
0516:
0517: public rowIterator() {
0518: p = 0;
0519: }
0520:
0521: public boolean hasNext() {
0522: return p < chunkcount;
0523: }
0524:
0525: public kelondroRow.Entry next() {
0526: return get(p++);
0527: }
0528:
0529: public void remove() {
0530: p--;
0531: removeRow(p, false);
0532: }
0533: }
0534:
0535: public synchronized void select(Set<String> keys) {
0536: // removes all entries but the ones given by urlselection
0537: if ((keys == null) || (keys.size() == 0))
0538: return;
0539: Iterator<kelondroRow.Entry> i = rows();
0540: kelondroRow.Entry row;
0541: while (i.hasNext()) {
0542: row = (kelondroRow.Entry) i.next();
0543: if (!(keys.contains(row.getColString(0, null))))
0544: i.remove();
0545: }
0546: }
0547:
0548: public synchronized final void sort() {
0549: assert (this .rowdef.objectOrder != null);
0550: if (this .sortBound == this .chunkcount)
0551: return; // this is already sorted
0552: if (this .chunkcount < isortlimit) {
0553: isort(0, this .chunkcount, new byte[this .rowdef.objectsize]);
0554: this .sortBound = this .chunkcount;
0555: assert this .isSorted();
0556: return;
0557: }
0558: byte[] swapspace = new byte[this .rowdef.objectsize];
0559: int p = partition(0, this .chunkcount, this .sortBound, swapspace);
0560: if ((processors > 1) && (this .chunkcount >= 10000)) {
0561: // sort this using multi-threading; use one second thread
0562: qsortthread qs = new qsortthread(0, p, 0);
0563: qs.start();
0564: qsort(p, this .chunkcount, 0, swapspace);
0565: try {
0566: qs.join();
0567: } catch (InterruptedException e) {
0568: e.printStackTrace();
0569: }
0570: } else {
0571: qsort(0, p, 0, swapspace);
0572: qsort(p, this .chunkcount, 0, swapspace);
0573: }
0574: this .sortBound = this .chunkcount;
0575: //assert this.isSorted();
0576: }
0577:
0578: private class qsortthread extends Thread {
0579: private int sl, sr, sb;
0580:
0581: public qsortthread(int L, int R, int S) {
0582: this .sl = L;
0583: this .sr = R;
0584: this .sb = S;
0585: }
0586:
0587: public void run() {
0588: qsort(sl, sr, sb, new byte[rowdef.objectsize]);
0589: }
0590: }
0591:
0592: private final void qsort(int L, int R, int S, byte[] swapspace) {
0593: if (R - L < isortlimit) {
0594: isort(L, R, swapspace);
0595: return;
0596: }
0597: assert R > L;
0598: int p = partition(L, R, S, swapspace);
0599: assert p > L;
0600: assert p < R;
0601: qsort(L, p, 0, swapspace);
0602: qsort(p, R, 0, swapspace);
0603: }
0604:
0605: private final int partition(int L, int R, int S, byte[] swapspace) {
0606: // returns {partition-point, new-S}
0607: assert (L < R - 1);
0608: assert (R - L >= isortlimit);
0609:
0610: int p = L;
0611: int q = R - 1;
0612: int pivot = (L + R - 1) / 2;
0613: int oldpivot = -1;
0614: byte[] compiledPivot = null;
0615: if (this .rowdef.objectOrder instanceof kelondroBase64Order) {
0616: while (p <= q) {
0617: // wenn pivot < S: pivot befindet sich in sortierter Sequenz von L bis S - 1
0618: // d.h. alle Werte von L bis pivot sind kleiner als das pivot
0619: // zu finden ist ein minimales p <= q so dass chunk[p] >= pivot
0620: if (oldpivot != pivot) {
0621: compiledPivot = compilePivot(pivot);
0622: oldpivot = pivot;
0623: }
0624: if ((pivot < S) && (p < pivot)) {
0625: //System.out.println("+++ saved " + (pivot - p) + " comparisments");
0626: p = pivot;
0627: S = 0;
0628: } else {
0629: while (comparePivot(compiledPivot, p) == 1)
0630: p++; // chunkAt[p] < pivot
0631: }
0632: // nun gilt chunkAt[p] >= pivot
0633: while (comparePivot(compiledPivot, q) == -1)
0634: q--; // chunkAt[q] > pivot
0635: if (p <= q) {
0636: oldpivot = pivot;
0637: pivot = swap(p, q, pivot, swapspace);
0638: p++;
0639: q--;
0640: }
0641: }
0642: } else {
0643: while (p <= q) {
0644: if ((pivot < S) && (p < pivot)) {
0645: p = pivot;
0646: S = 0;
0647: } else {
0648: while (compare(pivot, p) == 1)
0649: p++; // chunkAt[p] < pivot
0650: }
0651: while (compare(pivot, q) == -1)
0652: q--; // chunkAt[q] > pivot
0653: if (p <= q) {
0654: pivot = swap(p, q, pivot, swapspace);
0655: p++;
0656: q--;
0657: }
0658: }
0659: }
0660: return p;
0661: }
0662:
0663: private final void isort(int L, int R, byte[] swapspace) {
0664: for (int i = L + 1; i < R; i++)
0665: for (int j = i; j > L && compare(j - 1, j) > 0; j--)
0666: swap(j, j - 1, 0, swapspace);
0667: }
0668:
0669: private final int swap(int i, int j, int p, byte[] swapspace) {
0670: if (i == j)
0671: return p;
0672: System.arraycopy(chunkcache, this .rowdef.objectsize * i,
0673: swapspace, 0, this .rowdef.objectsize);
0674: System.arraycopy(chunkcache, this .rowdef.objectsize * j,
0675: chunkcache, this .rowdef.objectsize * i,
0676: this .rowdef.objectsize);
0677: System.arraycopy(swapspace, 0, chunkcache,
0678: this .rowdef.objectsize * j, this .rowdef.objectsize);
0679: if (i == p)
0680: return j;
0681: else if (j == p)
0682: return i;
0683: else
0684: return p;
0685: }
0686:
0687: public synchronized void uniq() {
0688: assert (this .rowdef.objectOrder != null);
0689: // removes double-occurrences of chunks
0690: // this works only if the collection was ordered with sort before
0691: // if the collection is large and the number of deletions is also large,
0692: // then this method may run a long time with 100% CPU load which is caused
0693: // by the large number of memory movements.
0694: if (chunkcount < 2)
0695: return;
0696: int i = chunkcount - 2;
0697: long t = System.currentTimeMillis(); // for time-out
0698: int d = 0;
0699: boolean u = true;
0700: try {
0701: while (i >= 0) {
0702: if (compare(i, i + 1) == 0) {
0703: removeRow(i + 1, false);
0704: d++;
0705: if (i + 1 < chunkcount - 1)
0706: u = false;
0707: }
0708: i--;
0709: if (System.currentTimeMillis() - t > 10000) {
0710: throw new RuntimeException("uniq() time-out at "
0711: + i + " (backwards) from " + chunkcount
0712: + " elements after "
0713: + (System.currentTimeMillis() - t)
0714: + " milliseconds; " + d
0715: + " deletions so far");
0716: }
0717: }
0718: } catch (RuntimeException e) {
0719: serverLog.logWarning("kelondroRowCollection", e
0720: .getMessage(), e);
0721: } finally {
0722: if (!u)
0723: this .sort();
0724: }
0725: }
0726:
0727: public synchronized ArrayList<kelondroRowSet> removeDoubles() {
0728: assert (this .rowdef.objectOrder != null);
0729: // removes double-occurrences of chunks
0730: // in contrast to uniq() this removes also the remaining, non-double entry that had a double-occurrance to the others
0731: // all removed chunks are returned in an array
0732: this .sort();
0733: ArrayList<kelondroRowSet> report = new ArrayList<kelondroRowSet>();
0734: if (chunkcount < 2)
0735: return report;
0736: int i = chunkcount - 2;
0737: int d = 0;
0738: boolean u = true;
0739: kelondroRowSet collection = new kelondroRowSet(this .rowdef, 2);
0740: try {
0741: while (i >= 0) {
0742: if (compare(i, i + 1) == 0) {
0743: collection.addUnique(get(i + 1));
0744: removeRow(i + 1, false);
0745: d++;
0746: if (i + 1 < chunkcount - 1)
0747: u = false;
0748: } else if (collection.size() > 0) {
0749: // finish collection of double occurrences
0750: collection.addUnique(get(i + 1));
0751: removeRow(i + 1, false);
0752: d++;
0753: if (i + 1 < chunkcount - 1)
0754: u = false;
0755: collection.trim(false);
0756: report.add(collection);
0757: collection = new kelondroRowSet(this .rowdef, 2);
0758: }
0759: i--;
0760: }
0761: } catch (RuntimeException e) {
0762: serverLog.logWarning("kelondroRowCollection", e
0763: .getMessage(), e);
0764: } finally {
0765: if (!u)
0766: this .sort();
0767: }
0768: return report;
0769: }
0770:
0771: public synchronized boolean isSorted() {
0772: assert (this .rowdef.objectOrder != null);
0773: if (chunkcount <= 1)
0774: return true;
0775: if (chunkcount != this .sortBound)
0776: return false;
0777: for (int i = 0; i < chunkcount - 1; i++) {
0778: //System.out.println("*" + new String(get(i).getColBytes(0)));
0779: if (compare(i, i + 1) > 0) {
0780: System.out.println("?"
0781: + new String(get(i + 1).getColBytes(0)));
0782: return false;
0783: }
0784: }
0785: return true;
0786: }
0787:
0788: public synchronized String toString() {
0789: StringBuffer s = new StringBuffer();
0790: Iterator<kelondroRow.Entry> i = rows();
0791: if (i.hasNext())
0792: s.append(i.next().toString());
0793: while (i.hasNext())
0794: s.append(", " + ((kelondroRow.Entry) i.next()).toString());
0795: return new String(s);
0796: }
0797:
0798: private final int compare(int i, int j) {
0799: assert (chunkcount * this .rowdef.objectsize <= chunkcache.length) : "chunkcount = "
0800: + chunkcount
0801: + ", objsize = "
0802: + this .rowdef.objectsize
0803: + ", chunkcache.length = " + chunkcache.length;
0804: assert (i >= 0) && (i < chunkcount) : "i = " + i
0805: + ", chunkcount = " + chunkcount;
0806: assert (j >= 0) && (j < chunkcount) : "j = " + j
0807: + ", chunkcount = " + chunkcount;
0808: assert (this .rowdef.objectOrder != null);
0809: if (i == j)
0810: return 0;
0811: assert (this .rowdef.primaryKeyIndex == 0) : "this.sortColumn = "
0812: + this .rowdef.primaryKeyIndex;
0813: int colstart = (this .rowdef.primaryKeyIndex < 0) ? 0
0814: : this .rowdef.colstart[this .rowdef.primaryKeyIndex];
0815: assert (!bugappearance(chunkcache, i * this .rowdef.objectsize
0816: + colstart, this .rowdef.primaryKeyLength));
0817: assert (!bugappearance(chunkcache, j * this .rowdef.objectsize
0818: + colstart, this .rowdef.primaryKeyLength));
0819: int c = this .rowdef.objectOrder.compare(chunkcache, i
0820: * this .rowdef.objectsize + colstart,
0821: this .rowdef.primaryKeyLength, chunkcache, j
0822: * this .rowdef.objectsize + colstart,
0823: this .rowdef.primaryKeyLength);
0824: return c;
0825: }
0826:
0827: protected final byte[] compilePivot(int i) {
0828: assert (i >= 0) && (i < chunkcount) : "i = " + i
0829: + ", chunkcount = " + chunkcount;
0830: assert (this .rowdef.objectOrder != null);
0831: assert (this .rowdef.objectOrder instanceof kelondroBase64Order);
0832: assert (this .rowdef.primaryKeyIndex == 0) : "this.sortColumn = "
0833: + this .rowdef.primaryKeyIndex;
0834: int colstart = (this .rowdef.primaryKeyIndex < 0) ? 0
0835: : this .rowdef.colstart[this .rowdef.primaryKeyIndex];
0836: assert (!bugappearance(chunkcache, i * this .rowdef.objectsize
0837: + colstart, this .rowdef.primaryKeyLength));
0838: return ((kelondroBase64Order) this .rowdef.objectOrder)
0839: .compilePivot(chunkcache, i * this .rowdef.objectsize
0840: + colstart, this .rowdef.primaryKeyLength);
0841: }
0842:
0843: protected final byte[] compilePivot(byte[] a, int astart,
0844: int alength) {
0845: assert (this .rowdef.objectOrder != null);
0846: assert (this .rowdef.objectOrder instanceof kelondroBase64Order);
0847: assert (this .rowdef.primaryKeyIndex == 0) : "this.sortColumn = "
0848: + this .rowdef.primaryKeyIndex;
0849: return ((kelondroBase64Order) this .rowdef.objectOrder)
0850: .compilePivot(a, astart, alength);
0851: }
0852:
0853: protected final int comparePivot(byte[] compiledPivot, int j) {
0854: assert (chunkcount * this .rowdef.objectsize <= chunkcache.length) : "chunkcount = "
0855: + chunkcount
0856: + ", objsize = "
0857: + this .rowdef.objectsize
0858: + ", chunkcache.length = " + chunkcache.length;
0859: assert (j >= 0) && (j < chunkcount) : "j = " + j
0860: + ", chunkcount = " + chunkcount;
0861: assert (this .rowdef.objectOrder != null);
0862: assert (this .rowdef.objectOrder instanceof kelondroBase64Order);
0863: assert (this .rowdef.primaryKeyIndex == 0) : "this.sortColumn = "
0864: + this .rowdef.primaryKeyIndex;
0865: int colstart = (this .rowdef.primaryKeyIndex < 0) ? 0
0866: : this .rowdef.colstart[this .rowdef.primaryKeyIndex];
0867: assert (!bugappearance(chunkcache, j * this .rowdef.objectsize
0868: + colstart, this .rowdef.primaryKeyLength));
0869: int c = ((kelondroBase64Order) this .rowdef.objectOrder)
0870: .comparePivot(compiledPivot, chunkcache, j
0871: * this .rowdef.objectsize + colstart,
0872: this .rowdef.primaryKeyLength);
0873: return c;
0874: }
0875:
0876: protected synchronized int compare(byte[] a, int astart,
0877: int alength, int chunknumber) {
0878: assert (chunknumber < chunkcount);
0879: int l = Math.min(this .rowdef.primaryKeyLength, Math.min(
0880: a.length - astart, alength));
0881: return rowdef.objectOrder
0882: .compare(
0883: a,
0884: astart,
0885: l,
0886: chunkcache,
0887: chunknumber
0888: * this .rowdef.objectsize
0889: + ((rowdef.primaryKeyIndex < 0) ? 0
0890: : this .rowdef.colstart[rowdef.primaryKeyIndex]),
0891: this .rowdef.primaryKeyLength);
0892: }
0893:
0894: protected synchronized boolean match(byte[] a, int astart,
0895: int alength, int chunknumber) {
0896: if (chunknumber >= chunkcount)
0897: return false;
0898: int i = 0;
0899: int p = chunknumber
0900: * this .rowdef.objectsize
0901: + ((rowdef.primaryKeyIndex < 0) ? 0
0902: : this .rowdef.colstart[rowdef.primaryKeyIndex]);
0903: final int len = Math.min(this .rowdef.primaryKeyLength, Math
0904: .min(alength, a.length - astart));
0905: while (i < len)
0906: if (a[astart + i++] != chunkcache[p++])
0907: return false;
0908: return ((len == this .rowdef.primaryKeyLength) || (chunkcache[len] == 0));
0909: }
0910:
0911: public synchronized void close() {
0912: chunkcache = null;
0913: }
0914:
0915: private static long d(long a, long b) {
0916: if (b == 0)
0917: return a;
0918: else
0919: return a / b;
0920: }
0921:
0922: private static Random random;
0923:
0924: private static String randomHash() {
0925: return kelondroBase64Order.enhancedCoder.encodeLong(random
0926: .nextLong(), 4)
0927: + kelondroBase64Order.enhancedCoder.encodeLong(random
0928: .nextLong(), 4)
0929: + kelondroBase64Order.enhancedCoder.encodeLong(random
0930: .nextLong(), 4);
0931: }
0932:
0933: public static void test(int testsize) {
0934: kelondroRow r = new kelondroRow(
0935: new kelondroColumn[] { new kelondroColumn("hash",
0936: kelondroColumn.celltype_string,
0937: kelondroColumn.encoder_bytes,
0938: yacySeedDB.commonHashLength, "hash") },
0939: kelondroBase64Order.enhancedCoder, 0);
0940:
0941: kelondroRowCollection a = new kelondroRowCollection(r, testsize);
0942: a.add("AAAAAAAAAAAA".getBytes());
0943: a.add("BBBBBBBBBBBB".getBytes());
0944: a.add("BBBBBBBBBBBB".getBytes());
0945: a.add("BBBBBBBBBBBB".getBytes());
0946: a.add("CCCCCCCCCCCC".getBytes());
0947: ArrayList<kelondroRowSet> del = a.removeDoubles();
0948: System.out.println(del + "rows double");
0949: Iterator<kelondroRow.Entry> j = a.rows();
0950: while (j.hasNext())
0951: System.out.println(new String(j.next().bytes()));
0952:
0953: System.out.println("kelondroRowCollection test with size = "
0954: + testsize);
0955: a = new kelondroRowCollection(r, testsize);
0956: long t0 = System.currentTimeMillis();
0957: random = new Random(0);
0958: for (int i = 0; i < testsize; i++)
0959: a.add(randomHash().getBytes());
0960: random = new Random(0);
0961: for (int i = 0; i < testsize; i++)
0962: a.add(randomHash().getBytes());
0963: a.sort();
0964: a.uniq();
0965: long t1 = System.currentTimeMillis();
0966: System.out.println("create a : " + (t1 - t0)
0967: + " milliseconds, " + d(testsize, (t1 - t0))
0968: + " entries/millisecond; a.size() = " + a.size());
0969:
0970: kelondroRowCollection c = new kelondroRowCollection(r, testsize);
0971: random = new Random(0);
0972: t0 = System.currentTimeMillis();
0973: for (int i = 0; i < testsize; i++) {
0974: c.add(randomHash().getBytes());
0975: }
0976: t1 = System.currentTimeMillis();
0977: System.out.println("create c : " + (t1 - t0)
0978: + " milliseconds, " + d(testsize, (t1 - t0))
0979: + " entries/millisecond");
0980: kelondroRowCollection d = new kelondroRowCollection(r, testsize);
0981: for (int i = 0; i < testsize; i++) {
0982: d.add(c.get(i).getColBytes(0));
0983: }
0984: long t2 = System.currentTimeMillis();
0985: System.out.println("copy c -> d: " + (t2 - t1)
0986: + " milliseconds, " + d(testsize, (t2 - t1))
0987: + " entries/millisecond");
0988: processors = 1;
0989: c.sort();
0990: long t3 = System.currentTimeMillis();
0991: System.out.println("sort c (1) : " + (t3 - t2)
0992: + " milliseconds, " + d(testsize, (t3 - t2))
0993: + " entries/millisecond");
0994: processors = 2;
0995: d.sort();
0996: long t4 = System.currentTimeMillis();
0997: System.out.println("sort d (2) : " + (t4 - t3)
0998: + " milliseconds, " + d(testsize, (t4 - t3))
0999: + " entries/millisecond");
1000: c.uniq();
1001: long t5 = System.currentTimeMillis();
1002: System.out.println("uniq c : " + (t5 - t4)
1003: + " milliseconds, " + d(testsize, (t5 - t4))
1004: + " entries/millisecond");
1005: d.uniq();
1006: long t6 = System.currentTimeMillis();
1007: System.out.println("uniq d : " + (t6 - t5)
1008: + " milliseconds, " + d(testsize, (t6 - t5))
1009: + " entries/millisecond");
1010: random = new Random(0);
1011: kelondroRowSet e = new kelondroRowSet(r, testsize);
1012: for (int i = 0; i < testsize; i++) {
1013: e.put(r.newEntry(randomHash().getBytes()));
1014: }
1015: long t7 = System.currentTimeMillis();
1016: System.out.println("create e : " + (t7 - t6)
1017: + " milliseconds, " + d(testsize, (t7 - t6))
1018: + " entries/millisecond");
1019: e.sort();
1020: long t8 = System.currentTimeMillis();
1021: System.out.println("sort e (2) : " + (t8 - t7)
1022: + " milliseconds, " + d(testsize, (t8 - t7))
1023: + " entries/millisecond");
1024: e.uniq();
1025: long t9 = System.currentTimeMillis();
1026: System.out.println("uniq e : " + (t9 - t8)
1027: + " milliseconds, " + d(testsize, (t9 - t8))
1028: + " entries/millisecond");
1029: boolean cis = c.isSorted();
1030: long t10 = System.currentTimeMillis();
1031: System.out.println("c isSorted = " + ((cis) ? "true" : "false")
1032: + ": " + (t10 - t9) + " milliseconds");
1033: boolean dis = d.isSorted();
1034: long t11 = System.currentTimeMillis();
1035: System.out.println("d isSorted = " + ((dis) ? "true" : "false")
1036: + ": " + (t11 - t10) + " milliseconds");
1037: boolean eis = e.isSorted();
1038: long t12 = System.currentTimeMillis();
1039: System.out.println("e isSorted = " + ((eis) ? "true" : "false")
1040: + ": " + (t12 - t11) + " milliseconds");
1041: random = new Random(0);
1042: boolean allfound = true;
1043: for (int i = 0; i < testsize; i++) {
1044: if (e.get(randomHash().getBytes()) == null) {
1045: allfound = false;
1046: break;
1047: }
1048: }
1049: long t13 = System.currentTimeMillis();
1050: System.out.println("e allfound = "
1051: + ((allfound) ? "true" : "false") + ": " + (t13 - t12)
1052: + " milliseconds");
1053: boolean noghosts = true;
1054: for (int i = 0; i < testsize; i++) {
1055: if (e.get(randomHash().getBytes()) != null) {
1056: noghosts = false;
1057: break;
1058: }
1059: }
1060: long t14 = System.currentTimeMillis();
1061: System.out.println("e noghosts = "
1062: + ((noghosts) ? "true" : "false") + ": " + (t14 - t13)
1063: + " milliseconds");
1064: System.out.println("Result size: c = " + c.size() + ", d = "
1065: + d.size() + ", e = " + e.size());
1066: System.out.println();
1067: }
1068:
1069: public static void main(String[] args) {
1070: //test(1000);
1071: test(10000);
1072: test(100000);
1073: //test(1000000);
1074:
1075: /*
1076: System.out.println(new java.util.Date(10957 * day));
1077: System.out.println(new java.util.Date(0));
1078: System.out.println(daysSince2000(System.currentTimeMillis()));
1079: */
1080: }
1081: }
|