Source Code Cross Referenced for PartitionLexically.java in » Search-Engine » mg4j » it » unimi » dsi » mg4j » tool » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » mg4j » it.unimi.dsi.mg4j.tool 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        package it.unimi.dsi.mg4j.tool;
002:
003:        /*		 
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2006-2007 Sebastiano Vigna 
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.fastutil.io.BinIO;
025:        import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
026:        import it.unimi.dsi.mg4j.index.BitStreamHPIndex;
027:        import it.unimi.dsi.mg4j.index.BitStreamIndex;
028:        import it.unimi.dsi.mg4j.index.DiskBasedIndex;
029:        import it.unimi.dsi.mg4j.index.Index;
030:        import it.unimi.dsi.mg4j.index.cluster.ContiguousLexicalStrategy;
031:        import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;
032:        import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
033:        import it.unimi.dsi.mg4j.index.cluster.LexicalCluster;
034:        import it.unimi.dsi.mg4j.index.cluster.LexicalPartitioningStrategy;
035:        import it.unimi.dsi.mg4j.index.cluster.LexicalStrategies;
036:        import it.unimi.dsi.io.FastBufferedReader;
037:        import it.unimi.dsi.io.InputBitStream;
038:        import it.unimi.dsi.io.OutputBitStream;
039:        import it.unimi.dsi.mg4j.search.score.BM25Scorer;
040:        import it.unimi.dsi.Util;
041:        import it.unimi.dsi.lang.MutableString;
042:        import it.unimi.dsi.logging.ProgressLogger;
043:        import it.unimi.dsi.sux4j.util.ShiftAddXorSignedStringMap;
044:        import it.unimi.dsi.util.BloomFilter;
045:        import it.unimi.dsi.util.ImmutableExternalPrefixMap;
046:        import it.unimi.dsi.util.PrefixMap;
047:        import it.unimi.dsi.util.Properties;
048:        import it.unimi.dsi.util.StringMap;
049:
050:        import java.io.File;
051:        import java.io.FileInputStream;
052:        import java.io.FileOutputStream;
053:        import java.io.IOException;
054:        import java.io.InputStream;
055:        import java.io.InputStreamReader;
056:        import java.io.OutputStream;
057:        import java.io.OutputStreamWriter;
058:        import java.io.PrintWriter;
059:
060:        import org.apache.commons.configuration.ConfigurationException;
061:        import org.apache.commons.configuration.ConfigurationMap;
062:        import org.apache.commons.io.IOUtils;
063:        import org.apache.log4j.Logger;
064:
065:        import com.martiansoftware.jsap.FlaggedOption;
066:        import com.martiansoftware.jsap.JSAP;
067:        import com.martiansoftware.jsap.JSAPException;
068:        import com.martiansoftware.jsap.JSAPResult;
069:        import com.martiansoftware.jsap.Parameter;
070:        import com.martiansoftware.jsap.SimpleJSAP;
071:        import com.martiansoftware.jsap.Switch;
072:        import com.martiansoftware.jsap.UnflaggedOption;
073:
074:        /** Partitions an index lexically.
075:         * 
076:         * <p>A global index is partitioned lexically by providing a {@link LexicalPartitioningStrategy}
077:         * that specifies a destination local index for each term, and a local term number. The global index
078:         * is read directly at the bit level, and the posting lists are divided among the 
079:         * local indices using the provided strategy. For instance,
080:         * a {@link ContiguousLexicalStrategy} divides an index into 
081:         * contiguous blocks (of terms) specified by the given strategy.
082:         * 
083:         * <p>By choice, document pointers are not remapped. Thus, it may happen that one of the local indices 
084:         * contains <em>no</em> posting with a certain document. However, computing the subset of documents contained
085:         * in each local index to remap them in a contiguous interval is not a good idea, as usually the subset
086:         * of documents appearing in the postings of each local index is large.
087:         *
088:         * <p>To speed up the search of the right local index of a not-so-frequent term (in
089:         * particular with a {@linkplain it.unimi.dsi.mg4j.index.cluster.ChainedLexicalClusteringStrategy chained strategy}), 
090:         * after partitioning an index you can create {@linkplain BloomFilter Bloom filters} that will be used to try to avoid
091:         * inquiring indices that do not contain a term. The filters will be automatically loaded
092:         * by {@link it.unimi.dsi.mg4j.index.cluster.IndexCluster#getInstance(CharSequence, boolean, boolean)}.
093:         * 
094:         * <p>Note that the size file is the same for each local index: it <em>is copied verbatim</em> from the
095:         * global index next to each local index. If disk space is a concern, you can afterwards replace the copies
096:         * using standard operating system features such as symbolic links. 
097:         * 
098:         * <p>If you plan to {@linkplain LexicalCluster cluster} the partitioned indices and you need document sizes 
099:         * (e.g., for {@linkplain BM25Scorer BM25 scoring}), you can use the index property 
100:         * {@link it.unimi.dsi.mg4j.index.Index.UriKeys#SIZES} to load the original size file.  
101:         * 
102:         * If you plan on partitioning an index requiring
103:         * document sizes, you should consider a custom index loading scheme 
104:         * that shares the {@linkplain it.unimi.dsi.mg4j.index.BitStreamIndex#sizes size list}
105:         * among all local indices.
106:         *
107:         * <strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps} 
108:         * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g.,
109:         * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}).
110:         *
111:         * <h2>Write-once output and distributed index partitioning</h2>
112:         * 
113:         * <p>The partitioning process writes each index file sequentially exactly once, so index partitioning
114:         * can output its results to <em>pipes</em>, which in
115:         * turn can spill their content, for instance, through the network. In other words, albeit this
116:         * class theoretically creates a number of local indices on disk, those indices can be
117:         * substituted with suitable pipes creating remote local indices without affecting the partitioning process.
118:         * For instance, the following <samp>bash</samp> code creates three sets of pipes:
119:         * <pre style="margin: 1em 0">
120:         * for i in 0 1 2; do
121:         *   for e in frequencies globcounts index offsets properties sizes terms; do 
122:         *     mkfifo pipe-$i.$e
123:         *   done
124:         * done
125:         * </pre> 
126:         * 
127:         * <p>Each pipe must be emptied elsewhere, for instance (assuming
128:         * you want local indices <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> on <samp>example.com</samp>):
129:         * <pre style="margin: 1em 0">
130:         * for i in 0 1 2; do 
131:         *   for e in frequencies globcounts index offsets properties sizes terms; do 
132:         *     (cat pipe-$i.$e | ssh -x example.com "cat >index-$i.$e" &)
133:         *   done
134:         * done
135:         * </pre> 
136:         * <p>If we now start a partitioning process generating three local indices named <samp>pipe-0</samp>,
137:         * <samp>pipe-1</samp> and <samp>pipe-2</samp>
138:         * all pipes will be written to by the process, and the data will create remotely
139:         * indices <samp>index-0</samp>, <samp>index-1</samp> and <samp>index-2</samp>.
140:         *
141:         * @author Sebastiano Vigna
142:         * 
143:         * @since 1.0.1
144:         */
145:
146:        public class PartitionLexically {
147:            private static final Logger LOGGER = Util
148:                    .getLogger(PartitionLexically.class);
149:
150:            /**  The default buffer size for all involved indices. */
151:            public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;
152:
153:            /** The number of local indices. */
154:            private final int numIndices;
155:            /** The output basenames. */
156:            private final String outputBasename;
157:            /** The array of local output basenames. */
158:            private final String[] localBasename;
159:            /** The input basename. */
160:            private final String inputBasename;
161:            /** The size of I/O buffers. */
162:            private final int bufferSize;
163:            /** The filename of the strategy used to partition the index. */
164:            private final String strategyFilename;
165:            /** The strategy used to partition the index. */
166:            private final LexicalPartitioningStrategy strategy;
167:            /** The additional local properties of each local index. */
168:            private final Properties[] strategyProperties;
169:            /** The logging interval. */
170:            private final long logInterval;
171:
172:            public PartitionLexically(final String inputBasename,
173:                    final String outputBasename,
174:                    final LexicalPartitioningStrategy strategy,
175:                    final String strategyFilename, final int bufferSize,
176:                    final long logInterval) {
177:
178:                this .inputBasename = inputBasename;
179:                this .outputBasename = outputBasename;
180:                this .strategy = strategy;
181:                this .strategyFilename = strategyFilename;
182:                this .bufferSize = bufferSize;
183:                this .logInterval = logInterval;
184:                numIndices = strategy.numberOfLocalIndices();
185:                strategyProperties = strategy.properties();
186:                localBasename = new String[numIndices];
187:                for (int i = 0; i < numIndices; i++)
188:                    localBasename[i] = outputBasename + "-" + i;
189:            }
190:
191:            public void runTermsOnly() throws IOException {
192:                final ProgressLogger pl = new ProgressLogger(LOGGER,
193:                        logInterval);
194:
195:                final PrintWriter[] localTerms = new PrintWriter[numIndices];
196:                final int numTerms[] = new int[numIndices];
197:                final FastBufferedReader terms = new FastBufferedReader(
198:                        new InputStreamReader(new FileInputStream(inputBasename
199:                                + DiskBasedIndex.TERMS_EXTENSION), "UTF-8"));
200:
201:                for (int i = 0; i < numIndices; i++)
202:                    localTerms[i] = new PrintWriter(new OutputStreamWriter(
203:                            new FastBufferedOutputStream(new FileOutputStream(
204:                                    localBasename[i]
205:                                            + DiskBasedIndex.TERMS_EXTENSION)),
206:                            "UTF-8"));
207:
208:                // The current term
209:                final MutableString currTerm = new MutableString();
210:
211:                pl.itemsName = "terms";
212:                pl.logInterval = logInterval;
213:                pl.start("Partitioning index terms...");
214:
215:                int termNumber = 0, k;
216:
217:                while (terms.readLine(currTerm) != null) {
218:                    k = strategy.localIndex(termNumber); // The local index for this term
219:                    if (numTerms[k] != strategy.localNumber(termNumber))
220:                        throw new IllegalStateException();
221:                    numTerms[k]++;
222:                    currTerm.println(localTerms[k]);
223:                    pl.update();
224:                    termNumber++;
225:                }
226:
227:                terms.close();
228:                for (int i = 0; i < numIndices; i++)
229:                    localTerms[i].close();
230:
231:                pl.done();
232:            }
233:
234:            public void run() throws ConfigurationException, IOException,
235:                    ClassNotFoundException {
236:                final ProgressLogger pl = new ProgressLogger(LOGGER,
237:                        logInterval);
238:                final byte[] buffer = new byte[bufferSize];
239:
240:                final OutputBitStream[] localIndexStream = new OutputBitStream[numIndices];
241:                final OutputBitStream[] localPositionsStream = new OutputBitStream[numIndices];
242:                final OutputBitStream[] localOffsets = new OutputBitStream[numIndices];
243:                final OutputBitStream[] localFrequencies = new OutputBitStream[numIndices];
244:                final OutputBitStream[] localGlobCounts = new OutputBitStream[numIndices];
245:                final PrintWriter[] localTerms = new PrintWriter[numIndices];
246:                final int numTerms[] = new int[numIndices];
247:                final long numberOfOccurrences[] = new long[numIndices];
248:                final long numberOfPostings[] = new long[numIndices];
249:
250:                final boolean isHighPerformance = BitStreamHPIndex.class
251:                        .isAssignableFrom(Class.forName(new Properties(
252:                                inputBasename
253:                                        + DiskBasedIndex.PROPERTIES_EXTENSION)
254:                                .getString(Index.PropertyKeys.INDEXCLASS)));
255:
256:                final InputBitStream globalIndex = new InputBitStream(
257:                        inputBasename + DiskBasedIndex.INDEX_EXTENSION,
258:                        bufferSize);
259:                final InputBitStream globalPositions = isHighPerformance ? new InputBitStream(
260:                        inputBasename + DiskBasedIndex.POSITIONS_EXTENSION,
261:                        bufferSize)
262:                        : null;
263:                final FastBufferedReader terms = new FastBufferedReader(
264:                        new InputStreamReader(new FileInputStream(inputBasename
265:                                + DiskBasedIndex.TERMS_EXTENSION), "UTF-8"));
266:                final InputBitStream offsets = new InputBitStream(inputBasename
267:                        + DiskBasedIndex.OFFSETS_EXTENSION);
268:                final InputBitStream frequencies = new InputBitStream(
269:                        inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION);
270:                final InputBitStream globCounts = new InputBitStream(
271:                        inputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION);
272:                offsets.readGamma();
273:
274:                for (int i = 0; i < numIndices; i++) {
275:                    localIndexStream[i] = new OutputBitStream(localBasename[i]
276:                            + DiskBasedIndex.INDEX_EXTENSION, bufferSize);
277:                    if (isHighPerformance)
278:                        localPositionsStream[i] = new OutputBitStream(
279:                                localBasename[i]
280:                                        + DiskBasedIndex.POSITIONS_EXTENSION,
281:                                bufferSize);
282:                    localFrequencies[i] = new OutputBitStream(localBasename[i]
283:                            + DiskBasedIndex.FREQUENCIES_EXTENSION);
284:                    localGlobCounts[i] = new OutputBitStream(localBasename[i]
285:                            + DiskBasedIndex.GLOBCOUNTS_EXTENSION);
286:                    localTerms[i] = new PrintWriter(new OutputStreamWriter(
287:                            new FastBufferedOutputStream(new FileOutputStream(
288:                                    localBasename[i]
289:                                            + DiskBasedIndex.TERMS_EXTENSION)),
290:                            "UTF-8"));
291:                    localOffsets[i] = new OutputBitStream(localBasename[i]
292:                            + DiskBasedIndex.OFFSETS_EXTENSION);
293:                    localOffsets[i].writeGamma(0);
294:                }
295:
296:                // The current term
297:                final MutableString currTerm = new MutableString();
298:
299:                pl.expectedUpdates = (new File(inputBasename
300:                        + DiskBasedIndex.INDEX_EXTENSION).length() + (isHighPerformance ? new File(
301:                        inputBasename + DiskBasedIndex.POSITIONS_EXTENSION)
302:                        .length()
303:                        : 0)) * 8;
304:                pl.itemsName = "bits";
305:                pl.logInterval = logInterval;
306:                pl.start("Partitioning index...");
307:
308:                int termNumber = 0, k, prevK = -1, previousHeaderLength = 0, newHeaderLength = 0;
309:                long length, count, positionsOffset = 0;
310:                int res, frequency;
311:
312:                while (terms.readLine(currTerm) != null) {
313:                    k = strategy.localIndex(termNumber); // The local index for this term
314:                    if (numTerms[k] != strategy.localNumber(termNumber))
315:                        throw new IllegalStateException();
316:                    numTerms[k]++;
317:
318:                    if (isHighPerformance) {
319:                        final long temp = globalIndex.readBits();
320:                        positionsOffset = globalIndex.readLongDelta();
321:                        previousHeaderLength = (int) (globalIndex.readBits() - temp);
322:                        if (prevK != -1) {
323:                            length = positionsOffset
324:                                    - globalPositions.readBits();
325:                            pl.count += length;
326:                            while (length > 0) {
327:                                res = (int) Math.min(bufferSize * 8, length);
328:                                globalPositions.read(buffer, res);
329:                                localPositionsStream[prevK].write(buffer, res);
330:                                length -= res;
331:                            }
332:                        }
333:                        newHeaderLength = localIndexStream[k]
334:                                .writeLongDelta(localPositionsStream[k]
335:                                        .writtenBits());
336:                    }
337:
338:                    frequency = frequencies.readGamma();
339:                    localFrequencies[k].writeGamma(frequency);
340:                    numberOfPostings[k] += frequency;
341:
342:                    count = globCounts.readLongGamma();
343:                    numberOfOccurrences[k] += count;
344:                    localGlobCounts[k].writeLongGamma(count);
345:
346:                    currTerm.println(localTerms[k]);
347:
348:                    length = offsets.readLongGamma() - previousHeaderLength;
349:                    localOffsets[k].writeLongGamma(length + newHeaderLength);
350:                    pl.count += length + previousHeaderLength - 1;
351:
352:                    while (length > 0) {
353:                        res = (int) Math.min(bufferSize * 8, length);
354:                        globalIndex.read(buffer, res);
355:                        localIndexStream[k].write(buffer, res);
356:                        length -= res;
357:                    }
358:
359:                    pl.update();
360:                    prevK = k;
361:                    termNumber++;
362:                }
363:
364:                // We pour the last piece of positions
365:                if (isHighPerformance) {
366:                    if (prevK != -1) {
367:                        length = positionsOffset - globalPositions.readBits();
368:                        while (length > 0) {
369:                            res = (int) Math.min(bufferSize * 8, length);
370:                            globalIndex.read(buffer, res);
371:                            localPositionsStream[prevK].write(buffer, res);
372:                            length -= res;
373:                        }
374:                    }
375:                }
376:
377:                pl.done();
378:
379:                terms.close();
380:                offsets.close();
381:                frequencies.close();
382:                globCounts.close();
383:                globalIndex.close();
384:                if (isHighPerformance)
385:                    globalPositions.close();
386:
387:                // We copy the relevant properties from the original 
388:                Properties properties = new Properties(inputBasename
389:                        + DiskBasedIndex.PROPERTIES_EXTENSION);
390:                Properties globalProperties = new Properties();
391:                if (strategyFilename != null)
392:                    globalProperties.setProperty(
393:                            IndexCluster.PropertyKeys.STRATEGY,
394:                            strategyFilename);
395:                globalProperties.setProperty(
396:                        DocumentalCluster.PropertyKeys.BLOOM, false);
397:                globalProperties.setProperty(Index.PropertyKeys.INDEXCLASS,
398:                        LexicalCluster.class.getName());
399:                for (int i = 0; i < numIndices; i++)
400:                    globalProperties.addProperty(
401:                            IndexCluster.PropertyKeys.LOCALINDEX,
402:                            localBasename[i]);
403:                globalProperties.setProperty(Index.PropertyKeys.FIELD,
404:                        properties.getProperty(Index.PropertyKeys.FIELD));
405:                globalProperties.setProperty(Index.PropertyKeys.POSTINGS,
406:                        properties.getProperty(Index.PropertyKeys.POSTINGS));
407:                globalProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
408:                        properties.getProperty(Index.PropertyKeys.OCCURRENCES));
409:                globalProperties.setProperty(Index.PropertyKeys.DOCUMENTS,
410:                        properties.getProperty(Index.PropertyKeys.DOCUMENTS));
411:                globalProperties.setProperty(Index.PropertyKeys.TERMS,
412:                        properties.getProperty(Index.PropertyKeys.TERMS));
413:                globalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
414:                        properties
415:                                .getProperty(Index.PropertyKeys.TERMPROCESSOR));
416:                globalProperties.setProperty(Index.PropertyKeys.MAXCOUNT,
417:                        properties.getProperty(Index.PropertyKeys.MAXCOUNT));
418:                globalProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
419:                        properties.getProperty(Index.PropertyKeys.MAXDOCSIZE));
420:                globalProperties.save(outputBasename
421:                        + DiskBasedIndex.PROPERTIES_EXTENSION);
422:                LOGGER.debug("Properties for clustered index " + outputBasename
423:                        + ": " + new ConfigurationMap(globalProperties));
424:
425:                for (int i = 0; i < numIndices; i++) {
426:                    localIndexStream[i].close();
427:                    if (isHighPerformance)
428:                        localPositionsStream[i].close();
429:                    localOffsets[i].close();
430:                    localFrequencies[i].close();
431:                    localGlobCounts[i].close();
432:                    localTerms[i].close();
433:                    final InputStream input = new FileInputStream(inputBasename
434:                            + DiskBasedIndex.SIZES_EXTENSION);
435:                    final OutputStream output = new FileOutputStream(
436:                            localBasename[i] + DiskBasedIndex.SIZES_EXTENSION);
437:                    IOUtils.copy(input, output);
438:                    input.close();
439:                    output.close();
440:                    Properties localProperties = new Properties();
441:                    localProperties.addAll(globalProperties);
442:                    localProperties.setProperty(Index.PropertyKeys.TERMS,
443:                            numTerms[i]);
444:                    localProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
445:                            numberOfOccurrences[i]);
446:                    localProperties.setProperty(Index.PropertyKeys.POSTINGS,
447:                            numberOfPostings[i]);
448:                    localProperties.setProperty(Index.PropertyKeys.POSTINGS,
449:                            numberOfPostings[i]);
450:                    localProperties
451:                            .setProperty(
452:                                    Index.PropertyKeys.INDEXCLASS,
453:                                    properties
454:                                            .getProperty(Index.PropertyKeys.INDEXCLASS));
455:                    localProperties.addProperties(Index.PropertyKeys.CODING,
456:                            properties
457:                                    .getStringArray(Index.PropertyKeys.CODING));
458:                    localProperties
459:                            .setProperty(
460:                                    BitStreamIndex.PropertyKeys.SKIPQUANTUM,
461:                                    properties
462:                                            .getProperty(BitStreamIndex.PropertyKeys.SKIPQUANTUM));
463:                    localProperties
464:                            .setProperty(
465:                                    BitStreamIndex.PropertyKeys.SKIPHEIGHT,
466:                                    properties
467:                                            .getProperty(BitStreamIndex.PropertyKeys.SKIPHEIGHT));
468:                    if (strategyProperties[i] != null)
469:                        localProperties.addAll(strategyProperties[i]);
470:                    localProperties.save(localBasename[i]
471:                            + DiskBasedIndex.PROPERTIES_EXTENSION);
472:                    LOGGER.debug("Post-partitioning properties for index "
473:                            + localBasename[i] + ": "
474:                            + new ConfigurationMap(localProperties));
475:                }
476:            }
477:
478:            public static void main(final String[] arg) throws JSAPException,
479:                    ConfigurationException, IOException,
480:                    ClassNotFoundException, SecurityException,
481:                    InstantiationException, IllegalAccessException {
482:
483:                SimpleJSAP jsap = new SimpleJSAP(
484:                        PartitionLexically.class.getName(),
485:                        "Partitions an index lexically.",
486:                        new Parameter[] {
487:                                new FlaggedOption(
488:                                        "bufferSize",
489:                                        JSAP.INTSIZE_PARSER,
490:                                        Util
491:                                                .formatBinarySize(DEFAULT_BUFFER_SIZE),
492:                                        JSAP.NOT_REQUIRED, 'b', "buffer-size",
493:                                        "The size of an I/O buffer."),
494:                                new FlaggedOption(
495:                                        "logInterval",
496:                                        JSAP.LONG_PARSER,
497:                                        Long
498:                                                .toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
499:                                        JSAP.NOT_REQUIRED, 'l', "log-interval",
500:                                        "The minimum time interval between activity logs in milliseconds."),
501:                                new FlaggedOption("strategy",
502:                                        JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
503:                                        JSAP.NOT_REQUIRED, 's', "strategy",
504:                                        "A serialised lexical partitioning strategy."),
505:                                new FlaggedOption("uniformStrategy",
506:                                        JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
507:                                        JSAP.NOT_REQUIRED, 'u', "uniform",
508:                                        "Requires a uniform partitioning in the given number of parts."),
509:                                new Switch("termsOnly", 't', "terms-only",
510:                                        "Just partition the term list."),
511:                                new UnflaggedOption("inputBasename",
512:                                        JSAP.STRING_PARSER, JSAP.REQUIRED,
513:                                        "The basename of the global index."),
514:                                new UnflaggedOption("outputBasename",
515:                                        JSAP.STRING_PARSER, JSAP.REQUIRED,
516:                                        "The basename of the local indices.") });
517:
518:                JSAPResult jsapResult = jsap.parse(arg);
519:                if (jsap.messagePrinted())
520:                    return;
521:                String inputBasename = jsapResult.getString("inputBasename");
522:                String outputBasename = jsapResult.getString("outputBasename");
523:                String strategyFilename = jsapResult.getString("strategy");
524:                LexicalPartitioningStrategy strategy = null;
525:
526:                if (jsapResult.userSpecified("uniformStrategy")) {
527:                    strategy = LexicalStrategies.uniform(jsapResult
528:                            .getInt("uniformStrategy"), DiskBasedIndex
529:                            .getInstance(inputBasename, false, false, true));
530:                    BinIO.storeObject(strategy,
531:                            strategyFilename = outputBasename
532:                                    + IndexCluster.STRATEGY_DEFAULT_EXTENSION);
533:                } else if (strategyFilename != null)
534:                    strategy = (LexicalPartitioningStrategy) BinIO
535:                            .loadObject(strategyFilename);
536:                else
537:                    throw new IllegalArgumentException(
538:                            "You must specify a splitting strategy");
539:
540:                final PartitionLexically partitionLexically = new PartitionLexically(
541:                        inputBasename, outputBasename, strategy,
542:                        strategyFilename, jsapResult.getInt("bufferSize"),
543:                        jsapResult.getLong("logInterval"));
544:
545:                if (jsapResult.getBoolean("termsOnly"))
546:                    partitionLexically.runTermsOnly();
547:                else
548:                    partitionLexically.run();
549:            }
550:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.