001: package org.apache.lucene.benchmark.byTask.feeds;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.xml.sax.XMLReader;
021: import org.xml.sax.Attributes;
022: import org.xml.sax.InputSource;
023: import org.xml.sax.SAXException;
024: import org.xml.sax.helpers.DefaultHandler;
025: import org.xml.sax.helpers.XMLReaderFactory;
026:
027: import java.io.IOException;
028: import java.io.FileInputStream;
029:
030: import org.apache.lucene.document.Document;
031:
032: /**
033: * A LineDocMaker which reads the uncompressed english wikipedia dump.
034: */
035: public class EnwikiDocMaker extends LineDocMaker {
036:
037: static final int TITLE = 0;
038: static final int DATE = TITLE + 1;
039: static final int BODY = DATE + 1;
040: static final int ID = BODY + 1;
041: static final int LENGTH = ID + 1;
042:
043: static final String[] months = { "JAN", "FEB", "MAR", "APR", "MAY",
044: "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };
045:
046: class Parser extends DefaultHandler implements Runnable {
047:
048: Thread t;
049: boolean threadDone;
050:
051: public void run() {
052:
053: try {
054: XMLReader reader = XMLReaderFactory
055: .createXMLReader("org.apache.xerces.parsers.SAXParser");
056: reader.setContentHandler(this );
057: reader.setErrorHandler(this );
058: while (true) {
059: final FileInputStream localFileIS = fileIS;
060: try {
061: InputSource is = new InputSource(localFileIS);
062: reader.parse(is);
063: } catch (IOException ioe) {
064: synchronized (EnwikiDocMaker.this ) {
065: if (localFileIS != fileIS) {
066: // fileIS was closed on us, so, just fall
067: // through
068: } else
069: // Exception is real
070: throw ioe;
071: }
072: }
073: synchronized (this ) {
074: if (!forever) {
075: nmde = new NoMoreDataException();
076: notify();
077: return;
078: } else if (localFileIS == fileIS) {
079: // If file is not already re-opened then
080: // re-open it now
081: openFile();
082: }
083: }
084: }
085: } catch (SAXException sae) {
086: throw new RuntimeException(sae);
087: } catch (IOException ioe) {
088: throw new RuntimeException(ioe);
089: } finally {
090: synchronized (this ) {
091: threadDone = true;
092: notify();
093: }
094: }
095: }
096:
097: String[] tuple;
098: NoMoreDataException nmde;
099:
100: String[] next() throws NoMoreDataException {
101: if (t == null) {
102: threadDone = false;
103: t = new Thread(this );
104: t.setDaemon(true);
105: t.start();
106: }
107: String[] result;
108: synchronized (this ) {
109: while (tuple == null && nmde == null && !threadDone) {
110: try {
111: wait();
112: } catch (InterruptedException ie) {
113: }
114: }
115: if (nmde != null) {
116: // Set to null so we will re-start thread in case
117: // we are re-used:
118: t = null;
119: throw nmde;
120: }
121: if (t != null && threadDone)
122: // The thread has exited yet did not hit end of
123: // data, so this means it hit an exception. We
124: // throw NoMorDataException here to force
125: // benchmark to stop the current alg:
126: throw new NoMoreDataException();
127: result = tuple;
128: tuple = null;
129: notify();
130: }
131: return result;
132: }
133:
134: StringBuffer contents = new StringBuffer();
135:
136: public void characters(char[] ch, int start, int length) {
137: contents.append(ch, start, length);
138: }
139:
140: String title;
141: String body;
142: String time;
143: String id;
144:
145: public void startElement(String namespace, String simple,
146: String qualified, Attributes attributes) {
147: if (qualified.equals("page")) {
148: title = null;
149: body = null;
150: time = null;
151: id = null;
152: } else if (qualified.equals("text")) {
153: contents.setLength(0);
154: } else if (qualified.equals("timestamp")) {
155: contents.setLength(0);
156: } else if (qualified.equals("title")) {
157: contents.setLength(0);
158: } else if (qualified.equals("id")) {
159: contents.setLength(0);
160: }
161: }
162:
163: String time(String original) {
164: StringBuffer buffer = new StringBuffer();
165:
166: buffer.append(original.substring(8, 10));
167: buffer.append('-');
168: buffer.append(months[Integer.valueOf(
169: original.substring(5, 7)).intValue() - 1]);
170: buffer.append('-');
171: buffer.append(original.substring(0, 4));
172: buffer.append(' ');
173: buffer.append(original.substring(11, 19));
174: buffer.append(".000");
175:
176: return buffer.toString();
177: }
178:
179: public void create(String title, String time, String body,
180: String id) {
181: String[] t = new String[LENGTH];
182: t[TITLE] = title.replace('\t', ' ');
183: t[DATE] = time.replace('\t', ' ');
184: t[BODY] = body.replaceAll("[\t\n]", " ");
185: t[ID] = id;
186: synchronized (this ) {
187: while (tuple != null) {
188: try {
189: wait();
190: } catch (InterruptedException ie) {
191: }
192: }
193: tuple = t;
194: notify();
195: }
196: }
197:
198: public void endElement(String namespace, String simple,
199: String qualified) throws SAXException {
200: if (qualified.equals("title")) {
201: title = contents.toString();
202: } else if (qualified.equals("text")) {
203: body = contents.toString();
204: if (body.startsWith("#REDIRECT")
205: || body.startsWith("#redirect")) {
206: body = null;
207: }
208: } else if (qualified.equals("timestamp")) {
209: time = time(contents.toString());
210: } else if (qualified.equals("id") && id == null) {//just get the first id
211: id = contents.toString();
212: } else if (qualified.equals("page")) {
213: if (body != null) {
214: create(title, time, body, id);
215: }
216: }
217: }
218: }
219:
220: Parser parser = new Parser();
221:
222: class DocState extends LineDocMaker.DocState {
223: public Document setFields(String[] tuple) {
224: titleField.setValue(tuple[TITLE]);
225: dateField.setValue(tuple[DATE]);
226: bodyField.setValue(tuple[BODY]);
227: idField.setValue(tuple[ID]);
228: return doc;
229: }
230: }
231:
232: private DocState getDocState() {
233: DocState ds = (DocState) docState.get();
234: if (ds == null) {
235: ds = new DocState();
236: docState.set(ds);
237: }
238: return ds;
239: }
240:
241: public Document makeDocument() throws Exception {
242: String[] tuple = parser.next();
243: return getDocState().setFields(tuple);
244: }
245:
246: }
|