001: package vqwiki.utils.lucene;
002:
003: /* ====================================================================
004: * The Apache Software License, Version 1.1
005: *
006: * Copyright (c) 2001 The Apache Software Foundation. All rights
007: * reserved.
008: *
009: * Redistribution and use in source and binary forms, with or without
010: * modification, are permitted provided that the following conditions
011: * are met:
012: *
013: * 1. Redistributions of source code must retain the above copyright
014: * notice, this list of conditions and the following disclaimer.
015: *
016: * 2. Redistributions in binary form must reproduce the above copyright
017: * notice, this list of conditions and the following disclaimer in
018: * the documentation and/or other materials provided with the
019: * distribution.
020: *
021: * 3. The end-user documentation included with the redistribution,
022: * if any, must include the following acknowledgment:
023: * "This product includes software developed by the
024: * Apache Software Foundation (http://www.apache.org/)."
025: * Alternately, this acknowledgment may appear in the software itself,
026: * if and wherever such third-party acknowledgments normally appear.
027: *
028: * 4. The names "Apache" and "Apache Software Foundation" and
029: * "Apache Lucene" must not be used to endorse or promote products
030: * derived from this software without prior written permission. For
031: * written permission, please contact apache@apache.org.
032: *
033: * 5. Products derived from this software may not be called "Apache",
034: * "Apache Lucene", nor may "Apache" appear in their name, without
035: * prior written permission of the Apache Software Foundation.
036: *
037: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
038: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
039: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
040: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
041: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
042: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
043: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
044: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
045: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
046: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
047: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
048: * SUCH DAMAGE.
049: * ====================================================================
050: *
051: * This software consists of voluntary contributions made by many
052: * individuals on behalf of the Apache Software Foundation. For more
053: * information on the Apache Software Foundation, please see
054: * <http://www.apache.org/>.
055: */
056:
057: import org.apache.lucene.document.DateField;
058: import org.apache.lucene.document.Document;
059: import org.apache.lucene.document.Field;
060:
061: import java.io.File;
062: import java.io.IOException;
063:
064: /**
065: * A utility for making Lucene Documents for HTML documents.
066: *
067: * @version $Id: HTMLDocument.java 365 2003-10-05 05:07:32Z garethc $
068: */
069:
070: public class HTMLDocument {
071: /**
072: * TODO: Document this field.
073: */
074: static char dirSep = System.getProperty("file.separator").charAt(0);
075:
076: /**
077: *Creates a new HTMLDocument.
078: */
079: private HTMLDocument() {
080: }
081:
082: /**
083: * TODO: Document this method.
084: *
085: * @param f TODO: Document this parameter.
086: * @return TODO: Document the result.
087: * @exception IOException TODO: Document this exception.
088: * @exception InterruptedException TODO: Document this exception.
089: */
090: public static Document Document(File f) throws IOException,
091: InterruptedException {
092: // make a new, empty document
093: Document doc = new Document();
094:
095: // Add the url as a field named "url". Use an UnIndexed field, so
096: // that the url is just stored with the document, but is not searchable.
097: doc.add(Field
098: .UnIndexed("url", f.getPath().replace(dirSep, '/')));
099:
100: // Add the last modified date of the file a field named "modified". Use a
101: // Keyword field, so that it's searchable, but so that no attempt is made
102: // to tokenize the field into words.
103: doc.add(Field.Keyword("modified", DateField.timeToString(f
104: .lastModified())));
105:
106: // Add the uid as a field, so that index can be incrementally maintained.
107: // This field is not stored with document, it is indexed, but it is not
108: // tokenized prior to indexing.
109: doc.add(new Field("uid", uid(f), false, true, false));
110:
111: HTMLParser parser = new HTMLParser(f);
112:
113: // Add the tag-stripped contents as a Reader-valued Text field so it will
114: // get tokenized and indexed.
115: doc.add(Field.Text("contents", parser.getReader()));
116:
117: // Add the summary as an UnIndexed field, so that it is stored and returned
118: // with hit documents for display.
119: doc.add(Field.UnIndexed("summary", parser.getSummary()));
120:
121: // Add the title as a separate Text field, so that it can be searched
122: // separately.
123: doc.add(Field.Text("title", parser.getTitle()));
124:
125: // return the document
126: return doc;
127: }
128:
129: /**
130: * TODO: Document this method.
131: *
132: * @param f TODO: Document this parameter.
133: * @return TODO: Document the result.
134: */
135: public static String uid(File f) {
136: // Append path and date into a string in such a way that lexicographic
137: // sorting gives the same results as a walk of the file hierarchy. Thus
138: // null (\u0000) is used both to separate directory components and to
139: // separate the path from the date.
140: return f.getPath().replace(dirSep, '\u0000') + "\u0000"
141: + DateField.timeToString(f.lastModified());
142: }
143:
144: /**
145: * TODO: Document this method.
146: *
147: * @param uid TODO: Document this parameter.
148: * @return TODO: Document the result.
149: */
150: public static String uid2url(String uid) {
151: String url = uid.replace('\u0000', '/');
152: // replace nulls with slashes
153: return url.substring(0, url.lastIndexOf('/'));
154: // remove date from end
155: }
156: }
|