001: package it.unimi.dsi.mg4j.tool;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Paolo Boldi
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.bits.Utf16TransformationStrategy;
025: import it.unimi.dsi.fastutil.io.BinIO;
026: import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
027: import it.unimi.dsi.io.FastBufferedReader;
028: import it.unimi.dsi.io.FileLinesCollection;
029: import it.unimi.dsi.io.LineIterator;
030: import it.unimi.dsi.lang.MutableString;
031: import it.unimi.dsi.logging.ProgressLogger;
032: import it.unimi.dsi.mg4j.document.Document;
033: import it.unimi.dsi.sux4j.mph.LcpMinimalPerfectMonotoneHash;
034: import it.unimi.dsi.sux4j.util.ShiftAddXorSignedStringMap;
035: import it.unimi.dsi.util.BloomFilter;
036: import it.unimi.dsi.util.StringMap;
037:
038: import java.io.File;
039: import java.io.FileInputStream;
040: import java.io.FileOutputStream;
041: import java.io.IOException;
042: import java.io.InputStreamReader;
043: import java.net.URI;
044: import java.net.URISyntaxException;
045: import java.util.ArrayList;
046: import java.util.Collection;
047:
048: import org.apache.commons.lang.RandomStringUtils;
049: import org.apache.log4j.Logger;
050:
051: import com.martiansoftware.jsap.FlaggedOption;
052: import com.martiansoftware.jsap.JSAP;
053: import com.martiansoftware.jsap.JSAPException;
054: import com.martiansoftware.jsap.JSAPResult;
055: import com.martiansoftware.jsap.Parameter;
056: import com.martiansoftware.jsap.SimpleJSAP;
057: import com.martiansoftware.jsap.UnflaggedOption;
058:
059: /** A virtual-document resolver based on document URIs.
060: *
061: * <p>Instances of this class store in a {@link StringMap} instances
062: * all URIs from a collection, and consider a virtual-document specification a (possibly relative) URI. The
063: * virtual-document specification is resolved against the document URI, and then the perfect hash is used
064: * to retrieve the corresponding document.
065: *
066: * <p>This class provides a main method that helps in building serialised resolvers from URI lists.
067: * In case of pathological document collections with duplicate URIs (most notably, the GOV2 collection
068: * used for TREC evaluations), an option makes it possible to add random noise to duplicates, so that
069: * minimal perfect hash construction does not go into an infinite loop. It is a rather crude solution, but it
070: * is nonsensical to have duplicate URIs in the first place.
071: */
072:
073: public class URLMPHVirtualDocumentResolver implements
074: VirtualDocumentResolver {
075: private static final long serialVersionUID = 1L;
076:
077: private static final Logger LOGGER = Logger
078: .getLogger(URLMPHVirtualDocumentResolver.class);
079:
080: /** The term map used by this resolver to associated URI strings to numbers. */
081: private final StringMap<? extends CharSequence> url2DocumentPointer;
082: /** The cached URI of the last argument to {@link #context(Document)}. */
083: private transient URI documentURI;
084:
085: public URLMPHVirtualDocumentResolver(
086: final StringMap<? extends CharSequence> url2DocumentPointer) {
087: this .url2DocumentPointer = url2DocumentPointer;
088: }
089:
090: public void context(final Document document) {
091: try {
092: documentURI = new URI(document.uri().toString())
093: .normalize();
094: } catch (URISyntaxException e) {
095: documentURI = null;
096: }
097: }
098:
099: public int resolve(final CharSequence virtualDocumentSpec) {
100: try {
101: URI virtualURI = URI.create(virtualDocumentSpec.toString())
102: .normalize();
103: if (!virtualURI.isAbsolute()) {
104: if (documentURI == null)
105: return -1;
106: virtualURI = documentURI.resolve(virtualURI);
107: }
108:
109: // TODO discard opaque?
110: return (int) url2DocumentPointer.getLong(virtualURI
111: .toString());
112: } catch (Exception e) {
113: return -1;
114: }
115: }
116:
117: public int numberOfDocuments() {
118: return url2DocumentPointer.size();
119: }
120:
121: private static void makeUnique(final BloomFilter filter,
122: final MutableString uri) {
123: while (!filter.add(uri)) {
124: LOGGER.debug("Duplicate URI " + uri);
125: uri.append('/').append(
126: RandomStringUtils.randomAlphanumeric(32));
127: }
128: }
129:
130: @SuppressWarnings("unchecked")
131: public static void main(final String[] arg) throws JSAPException,
132: IOException {
133: final SimpleJSAP jsap = new SimpleJSAP(
134: URLMPHVirtualDocumentResolver.class.getName(),
135: "Builds a URL document resolver from a sequence of URIs, extracted typically using ScanMetadata.",
136: new Parameter[] {
137: new FlaggedOption("bufferSize",
138: JSAP.INTSIZE_PARSER, "64Ki",
139: JSAP.NOT_REQUIRED, 'b', "buffer-size",
140: "The size of the I/O buffer used to read terms."),
141: //new FlaggedOption( "class", MG4JClassParser.getParser(), ShiftAddXorSignedMinimalPerfectHash.class.getName(), JSAP.NOT_REQUIRED, 'c', "class", "A subclass of MinimalPerfectHash to be used when creating the table." ),
142: new FlaggedOption(
143: "termFile",
144: JSAP.STRING_PARSER,
145: JSAP.NO_DEFAULT,
146: JSAP.NOT_REQUIRED,
147: 'o',
148: "offline",
149: "Read terms from this file (without loading them into core memory) instead of standard input."),
150: new FlaggedOption(
151: "uniqueUris",
152: JSAP.INTSIZE_PARSER,
153: JSAP.NO_DEFAULT,
154: JSAP.NOT_REQUIRED,
155: 'U',
156: "unique-uris",
157: "Force URIs to be unique by adding random garbage at the end of duplicates; the argument is an upper bound for the number of URIs that will be read, and will be used to create a Bloom filter."),
158: new UnflaggedOption("resolver",
159: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
160: JSAP.REQUIRED, JSAP.NOT_GREEDY,
161: "The filename for the resolver.") });
162:
163: JSAPResult jsapResult = jsap.parse(arg);
164: if (jsap.messagePrinted())
165: return;
166:
167: final int bufferSize = jsapResult.getInt("bufferSize");
168: final String resolverName = jsapResult.getString("resolver");
169: //final Class<?> tableClass = jsapResult.getClass( "class" );
170: String termFile = jsapResult.getString("termFile");
171:
172: BloomFilter filter = null;
173: final boolean uniqueURIs = jsapResult
174: .userSpecified("uniqueUris");
175: if (uniqueURIs)
176: filter = new BloomFilter(jsapResult.getInt("uniqueUris"));
177:
178: final Collection<? extends CharSequence> collection;
179: if (termFile == null) {
180: ArrayList<MutableString> termList = new ArrayList<MutableString>();
181: final ProgressLogger pl = new ProgressLogger();
182: pl.itemsName = "URIs";
183: final LineIterator termIterator = new LineIterator(
184: new FastBufferedReader(new InputStreamReader(
185: System.in, "UTF-8"), bufferSize), pl);
186:
187: pl.start("Reading URIs...");
188: MutableString uri;
189: while (termIterator.hasNext()) {
190: uri = termIterator.next();
191: if (uniqueURIs)
192: makeUnique(filter, uri);
193: termList.add(uri.copy());
194: }
195: pl.done();
196:
197: collection = termList;
198: } else {
199: if (uniqueURIs) {
200: // Create temporary file with unique URIs
201: final ProgressLogger pl = new ProgressLogger();
202: pl.itemsName = "URIs";
203: pl.start("Copying URIs...");
204: final LineIterator termIterator = new LineIterator(
205: new FastBufferedReader(new InputStreamReader(
206: new FileInputStream(termFile)),
207: bufferSize), pl);
208: File temp = File.createTempFile(
209: URLMPHVirtualDocumentResolver.class.getName(),
210: ".uniqueuris");
211: temp.deleteOnExit();
212: termFile = temp.toString();
213: final FastBufferedOutputStream outputStream = new FastBufferedOutputStream(
214: new FileOutputStream(termFile), bufferSize);
215: MutableString uri;
216: while (termIterator.hasNext()) {
217: uri = termIterator.next();
218: makeUnique(filter, uri);
219: uri.writeUTF8(outputStream);
220: outputStream.write('\n');
221: }
222: pl.done();
223: outputStream.close();
224: }
225: collection = new FileLinesCollection(termFile, "UTF-8");
226: }
227: LOGGER.debug("Building minimal perfect hash table...");
228: BinIO
229: .storeObject(
230: new URLMPHVirtualDocumentResolver(
231: new ShiftAddXorSignedStringMap(
232: collection.iterator(),
233: new LcpMinimalPerfectMonotoneHash<CharSequence>(
234: collection,
235: new Utf16TransformationStrategy()))),
236: resolverName);
237: LOGGER.debug(" done.");
238: }
239:
240: }
|