001: package it.unimi.dsi.mg4j.tool;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
025: import it.unimi.dsi.mg4j.document.Document;
026: import it.unimi.dsi.mg4j.document.DocumentCollection;
027: import it.unimi.dsi.mg4j.document.DocumentIterator;
028: import it.unimi.dsi.mg4j.document.DocumentSequence;
029: import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;
030: import it.unimi.dsi.Util;
031: import it.unimi.dsi.mg4j.util.MG4JClassParser;
032: import it.unimi.dsi.lang.MutableString;
033: import it.unimi.dsi.logging.ProgressLogger;
034:
035: import java.io.FileOutputStream;
036: import java.io.IOException;
037: import java.lang.reflect.InvocationTargetException;
038:
039: import org.apache.log4j.Logger;
040:
041: import com.martiansoftware.jsap.FlaggedOption;
042: import com.martiansoftware.jsap.JSAP;
043: import com.martiansoftware.jsap.JSAPException;
044: import com.martiansoftware.jsap.JSAPResult;
045: import com.martiansoftware.jsap.Parameter;
046: import com.martiansoftware.jsap.SimpleJSAP;
047:
048: /** Scans a document sequence and prints on standard output the corresponding URIs.
049: *
050: * <p>This tool is a necessary intermediate step for the construction of an index with
051: * virtual fields.
052: *
053: * @author Sebastiano Vigna
054: * @since 1.1
055: */
056:
057: public class ScanMetadata {
058: private final static Logger LOGGER = Util
059: .getLogger(ScanMetadata.class);
060:
061: private static final char[] LINE_TERMINATORS = new char[] { '\n',
062: '\r' };
063: private static final char[] SPACES = new char[] { ' ', ' ' };
064:
065: public static void main(final String[] arg) throws JSAPException,
066: InvocationTargetException, NoSuchMethodException,
067: ClassNotFoundException, IOException,
068: IllegalAccessException, InstantiationException {
069:
070: SimpleJSAP jsap = new SimpleJSAP(
071: ScanMetadata.class.getName(),
072: "Scans and prints to standard output metadata of a collection. All line terminators in the metadata will be substituted with spaces.",
073: new Parameter[] {
074: new FlaggedOption("sequence",
075: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
076: JSAP.NOT_REQUIRED, 'S', "sequence",
077: "A serialised document sequence that will be used instead of stdin."),
078: new FlaggedOption(
079: "delimiter",
080: JSAP.INTEGER_PARSER,
081: Integer
082: .toString(Scan.DEFAULT_DELIMITER),
083: JSAP.NOT_REQUIRED, 'd', "delimiter",
084: "The document delimiter."),
085: new FlaggedOption(
086: "factory",
087: MG4JClassParser.getParser(),
088: IdentityDocumentFactory.class.getName(),
089: JSAP.NOT_REQUIRED, 'f', "factory",
090: "A document factory with a standard constructor."),
091: new FlaggedOption("property",
092: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
093: JSAP.NOT_REQUIRED, 'p', "property",
094: "A 'key=value' specification, or the name of a property file")
095: .setAllowMultipleDeclarations(true),
096: new FlaggedOption("renumber",
097: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
098: JSAP.NOT_REQUIRED, 'r', "renumber",
099: "The filename of a document renumbering."),
100: new FlaggedOption(
101: "logInterval",
102: JSAP.LONG_PARSER,
103: Long
104: .toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
105: JSAP.NOT_REQUIRED, 'l', "log-interval",
106: "The minimum time interval between activity logs in milliseconds."),
107: new FlaggedOption("titles", JSAP.STRING_PARSER,
108: JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
109: 't', "titles",
110: "The resulting document titles."),
111: new FlaggedOption("uris", JSAP.STRING_PARSER,
112: JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
113: 'u', "uris",
114: "The resulting document URIs."), });
115:
116: JSAPResult jsapResult = jsap.parse(arg);
117: if (jsap.messagePrinted())
118: return;
119:
120: DocumentSequence documentSequence = Scan.getSequence(jsapResult
121: .getString("sequence"), jsapResult.getClass("factory"),
122: jsapResult.getStringArray("property"), jsapResult
123: .getInt("delimiter"), LOGGER);
124:
125: if (!jsapResult.userSpecified("uris")
126: && !jsapResult.userSpecified("titles"))
127: throw new IllegalArgumentException(
128: "You specify either a title or a URI output file");
129:
130: Util.ensureLog4JIsConfigured();
131:
132: final DocumentIterator documentIterator = documentSequence
133: .iterator();
134:
135: Document document;
136: FastBufferedOutputStream uriStream = null, titleStream = null;
137:
138: if (jsapResult.userSpecified("uris"))
139: uriStream = new FastBufferedOutputStream(
140: new FileOutputStream(jsapResult.getString("uris")));
141: if (jsapResult.userSpecified("titles"))
142: titleStream = new FastBufferedOutputStream(
143: new FileOutputStream(jsapResult.getString("titles")));
144:
145: MutableString s = new MutableString();
146:
147: ProgressLogger progressLogger = new ProgressLogger(LOGGER,
148: jsapResult.getLong("logInterval"), "documents");
149: if (documentSequence instanceof DocumentCollection)
150: progressLogger.expectedUpdates = ((DocumentCollection) documentSequence)
151: .size();
152: progressLogger.start("Scanning...");
153:
154: while ((document = documentIterator.nextDocument()) != null) {
155: if (uriStream != null) {
156: s.replace(document.uri());
157: s.replace(LINE_TERMINATORS, SPACES);
158: s.writeUTF8(uriStream);
159: uriStream.write('\n');
160: }
161: if (titleStream != null) {
162: s.replace(document.title());
163: s.replace(LINE_TERMINATORS, SPACES);
164: s.writeUTF8(titleStream);
165: titleStream.write('\n');
166: }
167: progressLogger.lightUpdate();
168: }
169:
170: progressLogger.done();
171: if (uriStream != null)
172: uriStream.close();
173: if (titleStream != null)
174: titleStream.close();
175: }
176: }
|