01: package org.apache.lucene.demo;
02:
03: /**
04: * Licensed to the Apache Software Foundation (ASF) under one or more
05: * contributor license agreements. See the NOTICE file distributed with
06: * this work for additional information regarding copyright ownership.
07: * The ASF licenses this file to You under the Apache License, Version 2.0
08: * (the "License"); you may not use this file except in compliance with
09: * the License. You may obtain a copy of the License at
10: *
11: * http://www.apache.org/licenses/LICENSE-2.0
12: *
13: * Unless required by applicable law or agreed to in writing, software
14: * distributed under the License is distributed on an "AS IS" BASIS,
15: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16: * See the License for the specific language governing permissions and
17: * limitations under the License.
18: */
19:
20: import java.io.*;
21: import org.apache.lucene.document.*;
22: import org.apache.lucene.demo.html.HTMLParser;
23:
24: /** A utility for making Lucene Documents for HTML documents. */
25:
26: public class HTMLDocument {
27: static char dirSep = System.getProperty("file.separator").charAt(0);
28:
29: public static String uid(File f) {
30: // Append path and date into a string in such a way that lexicographic
31: // sorting gives the same results as a walk of the file hierarchy. Thus
32: // null (\u0000) is used both to separate directory components and to
33: // separate the path from the date.
34: return f.getPath().replace(dirSep, '\u0000')
35: + "\u0000"
36: + DateTools.timeToString(f.lastModified(),
37: DateTools.Resolution.SECOND);
38: }
39:
40: public static String uid2url(String uid) {
41: String url = uid.replace('\u0000', '/'); // replace nulls with slashes
42: return url.substring(0, url.lastIndexOf('/')); // remove date from end
43: }
44:
45: public static Document Document(File f) throws IOException,
46: InterruptedException {
47: // make a new, empty document
48: Document doc = new Document();
49:
50: // Add the url as a field named "path". Use a field that is
51: // indexed (i.e. searchable), but don't tokenize the field into words.
52: doc.add(new Field("path", f.getPath().replace(dirSep, '/'),
53: Field.Store.YES, Field.Index.UN_TOKENIZED));
54:
55: // Add the last modified date of the file a field named "modified".
56: // Use a field that is indexed (i.e. searchable), but don't tokenize
57: // the field into words.
58: doc.add(new Field("modified", DateTools.timeToString(f
59: .lastModified(), DateTools.Resolution.MINUTE),
60: Field.Store.YES, Field.Index.UN_TOKENIZED));
61:
62: // Add the uid as a field, so that index can be incrementally maintained.
63: // This field is not stored with document, it is indexed, but it is not
64: // tokenized prior to indexing.
65: doc.add(new Field("uid", uid(f), Field.Store.NO,
66: Field.Index.UN_TOKENIZED));
67:
68: FileInputStream fis = new FileInputStream(f);
69: HTMLParser parser = new HTMLParser(fis);
70:
71: // Add the tag-stripped contents as a Reader-valued Text field so it will
72: // get tokenized and indexed.
73: doc.add(new Field("contents", parser.getReader()));
74:
75: // Add the summary as a field that is stored and returned with
76: // hit documents for display.
77: doc.add(new Field("summary", parser.getSummary(),
78: Field.Store.YES, Field.Index.NO));
79:
80: // Add the title as a field that it can be searched and that is stored.
81: doc.add(new Field("title", parser.getTitle(), Field.Store.YES,
82: Field.Index.TOKENIZED));
83:
84: // return the document
85: return doc;
86: }
87:
88: private HTMLDocument() {
89: }
90: }
|