001: package org.apache.lucene.benchmark.utils;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.xml.sax.Attributes;
021: import org.xml.sax.InputSource;
022: import org.xml.sax.XMLReader;
023: import org.xml.sax.helpers.DefaultHandler;
024: import org.xml.sax.helpers.XMLReaderFactory;
025:
026: import javax.xml.parsers.SAXParser;
027: import javax.xml.parsers.SAXParserFactory;
028: import java.io.File;
029: import java.io.FileInputStream;
030: import java.io.FileWriter;
031: import java.io.IOException;
032:
033: /**
034: * Extract the downloaded Wikipedia dump into separate files for indexing.
035: */
public class ExtractWikipedia {

  /** The Wikipedia XML dump that is read by {@link #extract()}. */
  private final File wikipedia;

  /** Root directory under which the extracted article files are written. */
  private final File outputDir;

  /**
   * Creates an extractor and deletes the entries found directly inside
   * <code>outputDir</code>.
   *
   * @param wikipedia the XML dump file to parse
   * @param outputDir the directory that receives the extracted files
   */
  public ExtractWikipedia(File wikipedia, File outputDir) {
    this.wikipedia = wikipedia;
    this.outputDir = outputDir;
    System.out.println("Deleting all files in " + outputDir);
    File[] files = outputDir.listFiles();
    // listFiles() returns null when outputDir does not exist or is not a
    // directory; in that case there is nothing to delete.
    if (files != null) {
      for (int i = 0; i < files.length; i++) {
        // Best effort: the result of delete() is deliberately ignored.
        files[i].delete();
      }
    }
  }

  /** Number of article files created so far; shared by all instances. */
  static public int count = 0;

  /** Three-letter month abbreviations, indexed by month number minus one. */
  static String[] months = { "JAN", "FEB", "MAR", "APR", "MAY",
      "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };

  /**
   * SAX handler that collects the title, id, timestamp and text of each
   * <code>page</code> element and writes one file per page whose text is not
   * a redirect.
   */
  public class Parser extends DefaultHandler {

    public Parser() {
    }

    /** Character data accumulated for the element currently being read. */
    StringBuffer contents = new StringBuffer();

    /** Appends the reported character data to {@link #contents}. */
    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    String title;
    String id;
    String body;
    String time;

    /** Radix used to spread the output files over nested directories. */
    static final int BASE = 10;

    /**
     * Resets the per-page state when a <code>page</code> starts and clears the
     * character buffer when one of the captured elements starts.
     */
    public void startElement(String namespace, String simple,
        String qualified, Attributes attributes) {
      if (qualified.equals("page")) {
        title = null;
        id = null;
        body = null;
        time = null;
      } else if (qualified.equals("text")
          || qualified.equals("timestamp")
          || qualified.equals("title")
          || qualified.equals("id")) {
        contents.setLength(0);
      }
    }

    /**
     * Computes the directory for the file with the given sequence number.
     * Each recursion step adds two path levels, the leading power of
     * {@link #BASE} and the leading digit, and then continues with the
     * remainder; e.g. 123 maps to <code>100/1/10/2</code> below the root.
     *
     * @param count sequence number of the file
     * @param directory directory resolved so far, or <code>null</code> to
     *        start from the output directory
     * @return the directory that should contain the file
     */
    public File directory(int count, File directory) {
      if (directory == null) {
        directory = outputDir;
      }
      if (count < BASE) {
        return directory;
      }
      // Find the smallest power of BASE that is strictly greater than count.
      int base = BASE;
      while (base <= count) {
        base *= BASE;
      }
      int magnitude = base / BASE;
      directory = new File(directory, Integer.toString(magnitude));
      directory = new File(directory, Integer.toString(count / magnitude));
      return directory(count % magnitude, directory);
    }

    /**
     * Writes one article to <code>&lt;id&gt;.txt</code> in the directory
     * chosen by {@link #directory(int, File)} and increments
     * {@link ExtractWikipedia#count}. The file holds the time, the title and
     * the body, separated by blank lines.
     *
     * @throws RuntimeException wrapping the IOException if the file cannot
     *         be written
     */
    public void create(String id, String title, String time,
        String body) {

      File d = directory(count++, null);
      d.mkdirs();
      File f = new File(d, id + ".txt");

      // Assembled in a buffer first so that a null field is written as the
      // text "null" instead of failing inside Writer.write(String).
      StringBuffer document = new StringBuffer();
      document.append(time);
      document.append("\n\n");
      document.append(title);
      document.append("\n\n");
      document.append(body);
      document.append("\n");

      try {
        FileWriter writer = new FileWriter(f);
        try {
          writer.write(document.toString());
        } finally {
          // Release the file handle even when the write fails.
          writer.close();
        }
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }

    /**
     * Reformats a timestamp laid out as <code>yyyy-MM-ddTHH:mm:ss...</code>
     * (only the first 19 characters are read) into
     * <code>dd-MON-yyyy HH:mm:ss.000</code>.
     */
    String time(String original) {
      StringBuffer buffer = new StringBuffer();

      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");

      return buffer.toString();
    }

    /**
     * Stores the buffered text of a finished element and, when a
     * <code>page</code> ends with a non-null body, writes the article file.
     */
    public void endElement(String namespace, String simple,
        String qualified) {
      if (qualified.equals("title")) {
        title = contents.toString();
      } else if (qualified.equals("text")) {
        body = contents.toString();
        // Redirect pages are skipped: a null body suppresses create().
        if (body.startsWith("#REDIRECT")
            || body.startsWith("#redirect")) {
          body = null;
        }
      } else if (qualified.equals("timestamp")) {
        time = time(contents.toString());
      } else if (qualified.equals("id") && id == null) {
        // Only the first id element inside a page is kept; later ones
        // (presumably nested revision/contributor ids) are ignored.
        id = contents.toString();
      } else if (qualified.equals("page")) {
        if (body != null) {
          create(id, title, time, body);
        }
      }
    }
  }

  /**
   * Parses the dump with the Xerces SAX reader and writes the article files.
   *
   * @throws RuntimeException wrapping any exception raised while creating
   *         the reader, opening the dump or parsing it
   */
  public void extract() {
    try {
      Parser parser = new Parser();
      XMLReader reader = XMLReaderFactory
          .createXMLReader("org.apache.xerces.parsers.SAXParser");
      reader.setContentHandler(parser);
      reader.setErrorHandler(parser);
      FileInputStream in = new FileInputStream(wikipedia);
      try {
        reader.parse(new InputSource(in));
      } finally {
        // Close the dump whether or not parsing succeeded.
        in.close();
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Command-line entry point. Expects the path of the Wikipedia XML file and
   * the output path; prints the usage message and returns when the argument
   * count is wrong or the XML file does not exist.
   */
  public static void main(String[] args) {
    if (args.length != 2) {
      printUsage();
      // Stop here: args[0] / args[1] are not both available.
      return;
    }

    File wikipedia = new File(args[0]);
    if (!wikipedia.exists()) {
      printUsage();
      return;
    }

    File outputDir = new File(args[1]);
    outputDir.mkdirs();
    ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
    extractor.extract();
  }

  /** Prints the command-line usage message to standard error. */
  private static void printUsage() {
    System.err
        .println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
  }

}
|