001: package it.unimi.dsi.mg4j.index.cluster;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.mg4j.index.BitStreamIndex;
025: import it.unimi.dsi.mg4j.index.IndexIterator;
026: import it.unimi.dsi.mg4j.index.IndexReader;
027: import it.unimi.dsi.util.StringMap;
028: import it.unimi.dsi.Util;
029:
030: import java.io.IOException;
031:
032: import org.apache.log4j.Logger;
033:
034: /** Static utility methods for lexical strategies.
035: *
036: * @author Alessandro Arrabito
037: * @author Sebastiano Vigna
038: */
039:
040: public class LexicalStrategies {
041: private final static Logger LOGGER = Util
042: .getLogger(LexicalStrategies.class);
043:
044: protected LexicalStrategies() {
045: }
046:
047: /** Creates an {@linkplain ContiguousLexicalStrategy contiguous lexical strategy} in which
048: * all local indices have approximately the same number of documents.
049: * @param numberOfLocalIndices the number of local indices.
050: * @param index the global index to be partitioned.
051: * @return a {@link ContiguousLexicalStrategy} that will partition in <code>index</code> in
052: * <code>numberOfLocalIndices</code> local indices of approximately equal size.
053: */
054: public static ContiguousLexicalStrategy uniform(
055: final int numberOfLocalIndices, final BitStreamIndex index)
056: throws IOException {
057: final int[] cutPoint = new int[numberOfLocalIndices + 1];
058: final CharSequence[] cutPointTerm = new CharSequence[numberOfLocalIndices + 1];
059: final int terms = index.numberOfTerms;
060:
061: LOGGER
062: .info("Computing a contiguous lexical strategy to partition "
063: + index
064: + " into "
065: + numberOfLocalIndices
066: + " parts...");
067:
068: /* If we have positions, we partition following the number of occurrences; otherwise,
069: * we partition following the number of document pointers. */
070: long total = index.hasPositions ? index.numberOfOccurrences
071: : index.numberOfPostings;
072:
073: final StringMap<? extends CharSequence> termMap = index.termMap;
074: if (termMap == null)
075: throw new IllegalStateException("Index " + index
076: + " has no term map");
077:
078: cutPointTerm[0] = termMap.list().get(0);
079: cutPointTerm[numberOfLocalIndices] = "\uFFFF";
080:
081: int k, blockSizeDivisor = numberOfLocalIndices;
082:
083: do {
084: int frequency;
085: int left = 0; // The left extreme of the current block
086: long count = 0; // Number of documents/occurrences in the current block
087:
088: final IndexReader indexReader = index.getReader();
089: long blockSize = total / blockSizeDivisor++; // The approximate size of a block
090: IndexIterator indexIterator;
091:
092: for (int i = k = 0; i < terms; i++) {
093: indexIterator = indexReader.nextIterator();
094: frequency = indexIterator.frequency();
095: if (!index.hasPositions)
096: count += frequency;
097: for (int j = frequency; j-- != 0;) {
098: indexIterator.nextDocument();
099: if (index.hasPositions)
100: count += indexIterator.count();
101: }
102:
103: if (i == terms - 1)
104: i++; // To fool the next
105: if (count >= blockSize && k < numberOfLocalIndices - 1
106: || i == terms) {
107: LOGGER.info("New term interval [" + left + ".." + i
108: + "] (" + termMap.list().get(left) + " -> "
109: + (i == terms ? "" : termMap.list().get(i))
110: + ")");
111: cutPoint[++k] = i;
112: if (i != terms)
113: cutPointTerm[k] = termMap.list().get(i);
114: left = i;
115: count = 0;
116: }
117: }
118: indexReader.close();
119: // In case we did not generate enough blocks, we try again with a smaller block size.
120: } while (k < numberOfLocalIndices);
121:
122: return new ContiguousLexicalStrategy(cutPoint, cutPointTerm);
123: }
124: }
|