001: package org.apache.lucene.ant;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.document.Field;
021: import org.w3c.dom.Element;
022: import org.w3c.dom.Node;
023: import org.w3c.dom.NodeList;
024: import org.w3c.dom.Text;
025: import org.w3c.tidy.Tidy;
026:
027: import java.io.BufferedReader;
028: import java.io.File;
029: import java.io.FileInputStream;
030: import java.io.FileReader;
031: import java.io.IOException;
032: import java.io.InputStream;
033: import java.io.StringWriter;
034:
035: /**
036: * The <code>HtmlDocument</code> class creates a Lucene {@link
037: * org.apache.lucene.document.Document} from an HTML document. <P>
038: *
039: * It does this by using JTidy package. It can take input input
040: * from {@link java.io.File} or {@link java.io.InputStream}.
041: *
042: *@author Erik Hatcher
043: */
044: public class HtmlDocument {
045: private Element rawDoc;
046:
047: //-------------------------------------------------------------
048: // Constructors
049: //-------------------------------------------------------------
050:
051: /**
052: * Constructs an <code>HtmlDocument</code> from a {@link
053: * java.io.File}.
054: *
055: *@param file the <code>File</code> containing the
056: * HTML to parse
057: *@exception IOException if an I/O exception occurs
058: */
059: public HtmlDocument(File file) throws IOException {
060: Tidy tidy = new Tidy();
061: tidy.setQuiet(true);
062: tidy.setShowWarnings(false);
063: org.w3c.dom.Document root = tidy.parseDOM(new FileInputStream(
064: file), null);
065: rawDoc = root.getDocumentElement();
066: }
067:
068: /**
069: * Constructs an <code>HtmlDocument</code> from an {@link
070: * java.io.InputStream}.
071: *
072: *@param is the <code>InputStream</code>
073: * containing the HTML
074: */
075: public HtmlDocument(InputStream is) {
076: Tidy tidy = new Tidy();
077: tidy.setQuiet(true);
078: tidy.setShowWarnings(false);
079: org.w3c.dom.Document root = tidy.parseDOM(is, null);
080: rawDoc = root.getDocumentElement();
081: }
082:
083: /**
084: * Creates a Lucene <code>Document</code> from an {@link
085: * java.io.InputStream}.
086: *
087: *@param is
088: */
089: public static org.apache.lucene.document.Document getDocument(
090: InputStream is) {
091: HtmlDocument htmlDoc = new HtmlDocument(is);
092: org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
093:
094: luceneDoc.add(new Field("title", htmlDoc.getTitle(),
095: Field.Store.YES, Field.Index.TOKENIZED));
096: luceneDoc.add(new Field("contents", htmlDoc.getBody(),
097: Field.Store.YES, Field.Index.TOKENIZED));
098:
099: return luceneDoc;
100: }
101:
102: //-------------------------------------------------------------
103: // Public methods
104: //-------------------------------------------------------------
105:
106: /**
107: * Creates a Lucene <code>Document</code> from a {@link
108: * java.io.File}.
109: *
110: *@param file
111: *@exception IOException
112: */
113: public static org.apache.lucene.document.Document Document(File file)
114: throws IOException {
115: HtmlDocument htmlDoc = new HtmlDocument(file);
116: org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
117:
118: luceneDoc.add(new Field("title", htmlDoc.getTitle(),
119: Field.Store.YES, Field.Index.TOKENIZED));
120: luceneDoc.add(new Field("contents", htmlDoc.getBody(),
121: Field.Store.YES, Field.Index.TOKENIZED));
122:
123: String contents = null;
124: BufferedReader br = new BufferedReader(new FileReader(file));
125: StringWriter sw = new StringWriter();
126: String line = br.readLine();
127: while (line != null) {
128: sw.write(line);
129: line = br.readLine();
130: }
131: br.close();
132: contents = sw.toString();
133: sw.close();
134:
135: luceneDoc.add(new Field("rawcontents", contents,
136: Field.Store.YES, Field.Index.NO));
137:
138: return luceneDoc;
139: }
140:
141: //-------------------------------------------------------------
142: // Private methods
143: //-------------------------------------------------------------
144:
145: /**
146: * Runs <code>HtmlDocument</code> on the files specified on
147: * the command line.
148: *
149: *@param args Command line arguments
150: *@exception Exception Description of Exception
151: */
152: public static void main(String args[]) throws Exception {
153: // HtmlDocument doc = new HtmlDocument(new File(args[0]));
154: // System.out.println("Title = " + doc.getTitle());
155: // System.out.println("Body = " + doc.getBody());
156:
157: HtmlDocument doc = new HtmlDocument(new FileInputStream(
158: new File(args[0])));
159: System.out.println("Title = " + doc.getTitle());
160: System.out.println("Body = " + doc.getBody());
161: }
162:
163: /**
164: * Gets the title attribute of the <code>HtmlDocument</code>
165: * object.
166: *
167: *@return the title value
168: */
169: public String getTitle() {
170: if (rawDoc == null) {
171: return null;
172: }
173:
174: String title = "";
175:
176: NodeList nl = rawDoc.getElementsByTagName("title");
177: if (nl.getLength() > 0) {
178: Element titleElement = ((Element) nl.item(0));
179: Text text = (Text) titleElement.getFirstChild();
180: if (text != null) {
181: title = text.getData();
182: }
183: }
184: return title;
185: }
186:
187: /**
188: * Gets the bodyText attribute of the
189: * <code>HtmlDocument</code> object.
190: *
191: *@return the bodyText value
192: */
193: public String getBody() {
194: if (rawDoc == null) {
195: return null;
196: }
197:
198: String body = "";
199: NodeList nl = rawDoc.getElementsByTagName("body");
200: if (nl.getLength() > 0) {
201: body = getBodyText(nl.item(0));
202: }
203: return body;
204: }
205:
206: /**
207: * Gets the bodyText attribute of the
208: * <code>HtmlDocument</code> object.
209: *
210: *@param node a DOM Node
211: *@return The bodyText value
212: */
213: private String getBodyText(Node node) {
214: NodeList nl = node.getChildNodes();
215: StringBuffer buffer = new StringBuffer();
216: for (int i = 0; i < nl.getLength(); i++) {
217: Node child = nl.item(i);
218: switch (child.getNodeType()) {
219: case Node.ELEMENT_NODE:
220: buffer.append(getBodyText(child));
221: buffer.append(" ");
222: break;
223: case Node.TEXT_NODE:
224: buffer.append(((Text) child).getData());
225: break;
226: }
227: }
228: return buffer.toString();
229: }
230: }
|