001: package it.unimi.dsi.mg4j.tool;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntIterator;
025: import it.unimi.dsi.mg4j.index.Index;
026: import it.unimi.dsi.mg4j.index.IndexIterator;
027: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
028: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
029: import it.unimi.dsi.io.OutputBitStream;
030:
031: import java.io.Closeable;
032: import java.io.IOException;
033: import java.lang.reflect.InvocationTargetException;
034: import java.net.URISyntaxException;
035: import java.util.Map;
036:
037: import org.apache.commons.configuration.ConfigurationException;
038:
039: import com.martiansoftware.jsap.JSAPException;
040:
041: /** Concatenates several indices.
042: *
043: * <p>This implementation of {@link it.unimi.dsi.mg4j.tool.Combine} concatenates
044: * the involved indices: document 0 of the first index is document 0 of the
045: * final collection, but document 0 of the second index is numbered after
046: * the number of documents in the first index, and so on. The resulting
047: * index is exactly what you would obtain by concatenating the document
048: * sequences at the origin of each index.
049: *
050: * <p>Note that this class can be used also with a single index, making it possible to recompress easily
051: * an index using different compression flags.
052: *
053: * @author Sebastiano Vigna
054: * @since 1.0
055: *
056: */
057:
058: final public class Concatenate extends Combine {
059:
060: public Concatenate(final String outputBasename,
061: final String[] inputBasename, final boolean metadataOnly,
062: final int bufferSize,
063: final Map<Component, Coding> writerFlags,
064: final boolean interleaved, final boolean skips,
065: final int quantum, final int height,
066: final int skipBufferSize, final long logInterval)
067: throws IOException, ConfigurationException,
068: URISyntaxException, ClassNotFoundException,
069: SecurityException, InstantiationException,
070: IllegalAccessException, InvocationTargetException,
071: NoSuchMethodException {
072: super (outputBasename, inputBasename, metadataOnly, bufferSize,
073: writerFlags, interleaved, skips, quantum, height,
074: skipBufferSize, logInterval);
075: }
076:
077: protected int combineNumberOfDocuments() {
078: int n = 0;
079: for (int i = 0; i < numIndices; i++)
080: n += index[i].numberOfDocuments;
081: return n;
082: }
083:
084: protected int combineSizes() throws IOException {
085: int currDoc = 0, maxDocSize = 0;
086: for (int i = 0; i < numIndices; i++) {
087: final IntIterator sizes = sizes(i);
088: int s = 0;
089: int j = index[i].numberOfDocuments;
090: while (j-- != 0) {
091: s = (size[currDoc++] += sizes.nextInt());
092: if (s > maxDocSize)
093: maxDocSize = s;
094: }
095: if (sizes instanceof Closeable)
096: ((Closeable) sizes).close();
097: }
098: return maxDocSize;
099: }
100:
101: protected int combine(final int numUsedIndices) throws IOException {
102: int currIndex, numPrevDocs = 0, currDoc, count, totalFrequency;
103: OutputBitStream obs;
104: Index i;
105: IndexIterator ii;
106:
107: // We gather the frequencies from the subindices and just add up.
108: totalFrequency = 0;
109: for (int k = numUsedIndices; k-- != 0;)
110: totalFrequency += (frequency[usedIndex[k]] = indexIterator[usedIndex[k]]
111: .frequency());
112:
113: indexWriter.newInvertedList();
114: indexWriter.writeFrequency(totalFrequency);
115:
116: for (int k = currIndex = 0; k < numUsedIndices; k++) { // We can just concatenated posting lists.
117:
118: // We must update the number of previously seen documents, possibly adding those in skipped indices.
119: while (currIndex < usedIndex[k])
120: numPrevDocs += index[currIndex++].numberOfDocuments;
121:
122: i = index[currIndex];
123: ii = indexIterator[currIndex];
124:
125: for (int j = frequency[currIndex]; j-- != 0;) {
126: obs = indexWriter.newDocumentRecord();
127: currDoc = ii.nextDocument() + numPrevDocs;
128: indexWriter.writeDocumentPointer(obs, currDoc);
129:
130: if (i.hasPayloads)
131: indexWriter.writePayload(obs, ii.payload());
132:
133: if (i.hasCounts) {
134: count = ii.count();
135: if (hasCounts)
136: indexWriter.writePositionCount(obs, count);
137: if (i.hasPositions && hasPositions)
138: indexWriter.writeDocumentPositions(obs, ii
139: .positionArray(), 0, count,
140: size != null ? size[currDoc] : -1);
141: }
142: }
143: }
144:
145: return totalFrequency;
146: }
147:
148: public static void main(String arg[])
149: throws ConfigurationException, SecurityException,
150: JSAPException, IOException, URISyntaxException,
151: ClassNotFoundException, InstantiationException,
152: IllegalAccessException, InvocationTargetException,
153: NoSuchMethodException {
154: Combine.main(arg, Concatenate.class);
155: }
156:
157: }
|