01: package org.apache.lucene.benchmark.byTask.feeds;
02:
03: /**
04: * Licensed to the Apache Software Foundation (ASF) under one or more
05: * contributor license agreements. See the NOTICE file distributed with
06: * this work for additional information regarding copyright ownership.
07: * The ASF licenses this file to You under the Apache License, Version 2.0
08: * (the "License"); you may not use this file except in compliance with
09: * the License. You may obtain a copy of the License at
10: *
11: * http://www.apache.org/licenses/LICENSE-2.0
12: *
13: * Unless required by applicable law or agreed to in writing, software
14: * distributed under the License is distributed on an "AS IS" BASIS,
15: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16: * See the License for the specific language governing permissions and
17: * limitations under the License.
18: */
19:
20: import java.io.IOException;
21: import java.io.Reader;
22: import java.io.StringReader;
23: import java.text.DateFormat;
24: import java.text.ParseException;
25: import java.util.Date;
26: import java.util.Properties;
27:
28: /**
29: * HTML Parser that is based on Lucene's demo HTML parser.
30: */
31: public class DemoHTMLParser implements
32: org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
33:
34: public DemoHTMLParser() {
35: }
36:
37: /*
38: * (non-Javadoc)
39: * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.io.Reader, java.text.DateFormat)
40: */
41: public DocData parse(String name, Date date, Reader reader,
42: DateFormat dateFormat) throws IOException,
43: InterruptedException {
44: org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(
45: reader);
46:
47: // title
48: String title = p.getTitle();
49: // properties
50: Properties props = p.getMetaTags();
51: // body
52: Reader r = p.getReader();
53: char c[] = new char[1024];
54: StringBuffer bodyBuf = new StringBuffer();
55: int n;
56: while ((n = r.read(c)) >= 0) {
57: if (n > 0) {
58: bodyBuf.append(c, 0, n);
59: }
60: }
61: r.close();
62: if (date == null && props.getProperty("date") != null) {
63: try {
64: date = dateFormat.parse(props.getProperty("date")
65: .trim());
66: } catch (ParseException e) {
67: // do not fail test just because a date could not be parsed
68: System.out
69: .println("ignoring date parse exception (assigning 'now') for: "
70: + props.getProperty("date"));
71: date = new Date(); // now
72: }
73: }
74:
75: return new DocData(name, bodyBuf.toString(), title, props, date);
76: }
77:
78: /*
79: * (non-Javadoc)
80: * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.lang.StringBuffer, java.text.DateFormat)
81: */
82: public DocData parse(String name, Date date,
83: StringBuffer inputText, DateFormat dateFormat)
84: throws IOException, InterruptedException {
85: return parse(name, date,
86: new StringReader(inputText.toString()), dateFormat);
87: }
88:
89: }
|