001: package org.apache.lucene.demo;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.standard.StandardAnalyzer;
021: import org.apache.lucene.document.Document;
022: import org.apache.lucene.index.IndexReader;
023: import org.apache.lucene.index.IndexWriter;
024: import org.apache.lucene.index.Term;
025: import org.apache.lucene.index.TermEnum;
026: import java.io.File;
027: import java.util.Date;
028: import java.util.Arrays;
029:
030: /** Indexer for HTML files. */
031: public class IndexHTML {
032: private IndexHTML() {
033: }
034:
035: private static boolean deleting = false; // true during deletion pass
036: private static IndexReader reader; // existing index
037: private static IndexWriter writer; // new index being built
038: private static TermEnum uidIter; // document id iterator
039:
040: /** Indexer for HTML files.*/
041: public static void main(String[] argv) {
042: try {
043: String index = "index";
044: boolean create = false;
045: File root = null;
046:
047: String usage = "IndexHTML [-create] [-index <index>] <root_directory>";
048:
049: if (argv.length == 0) {
050: System.err.println("Usage: " + usage);
051: return;
052: }
053:
054: for (int i = 0; i < argv.length; i++) {
055: if (argv[i].equals("-index")) { // parse -index option
056: index = argv[++i];
057: } else if (argv[i].equals("-create")) { // parse -create option
058: create = true;
059: } else if (i != argv.length - 1) {
060: System.err.println("Usage: " + usage);
061: return;
062: } else
063: root = new File(argv[i]);
064: }
065:
066: Date start = new Date();
067:
068: if (!create) { // delete stale docs
069: deleting = true;
070: indexDocs(root, index, create);
071: }
072: writer = new IndexWriter(index, new StandardAnalyzer(),
073: create);
074: writer.setMaxFieldLength(1000000);
075: indexDocs(root, index, create); // add new docs
076:
077: System.out.println("Optimizing index...");
078: writer.optimize();
079: writer.close();
080:
081: Date end = new Date();
082:
083: System.out.print(end.getTime() - start.getTime());
084: System.out.println(" total milliseconds");
085:
086: } catch (Exception e) {
087: System.out.println(" caught a " + e.getClass()
088: + "\n with message: " + e.getMessage());
089: }
090: }
091:
092: /* Walk directory hierarchy in uid order, while keeping uid iterator from
093: /* existing index in sync. Mismatches indicate one of: (a) old documents to
094: /* be deleted; (b) unchanged documents, to be left alone; or (c) new
095: /* documents, to be indexed.
096: */
097:
098: private static void indexDocs(File file, String index,
099: boolean create) throws Exception {
100: if (!create) { // incrementally update
101:
102: reader = IndexReader.open(index); // open existing index
103: uidIter = reader.terms(new Term("uid", "")); // init uid iterator
104:
105: indexDocs(file);
106:
107: if (deleting) { // delete rest of stale docs
108: while (uidIter.term() != null
109: && uidIter.term().field() == "uid") {
110: System.out.println("deleting "
111: + HTMLDocument.uid2url(uidIter.term()
112: .text()));
113: reader.deleteDocuments(uidIter.term());
114: uidIter.next();
115: }
116: deleting = false;
117: }
118:
119: uidIter.close(); // close uid iterator
120: reader.close(); // close existing index
121:
122: } else
123: // don't have exisiting
124: indexDocs(file);
125: }
126:
127: private static void indexDocs(File file) throws Exception {
128: if (file.isDirectory()) { // if a directory
129: String[] files = file.list(); // list its files
130: Arrays.sort(files); // sort the files
131: for (int i = 0; i < files.length; i++)
132: // recursively index them
133: indexDocs(new File(file, files[i]));
134:
135: } else if (file.getPath().endsWith(".html") || // index .html files
136: file.getPath().endsWith(".htm") || // index .htm files
137: file.getPath().endsWith(".txt")) { // index .txt files
138:
139: if (uidIter != null) {
140: String uid = HTMLDocument.uid(file); // construct uid for doc
141:
142: while (uidIter.term() != null
143: && uidIter.term().field() == "uid"
144: && uidIter.term().text().compareTo(uid) < 0) {
145: if (deleting) { // delete stale docs
146: System.out.println("deleting "
147: + HTMLDocument.uid2url(uidIter.term()
148: .text()));
149: reader.deleteDocuments(uidIter.term());
150: }
151: uidIter.next();
152: }
153: if (uidIter.term() != null
154: && uidIter.term().field() == "uid"
155: && uidIter.term().text().compareTo(uid) == 0) {
156: uidIter.next(); // keep matching docs
157: } else if (!deleting) { // add new docs
158: Document doc = HTMLDocument.Document(file);
159: System.out.println("adding " + doc.get("path"));
160: writer.addDocument(doc);
161: }
162: } else { // creating a new index
163: Document doc = HTMLDocument.Document(file);
164: System.out.println("adding " + doc.get("path"));
165: writer.addDocument(doc); // add docs unconditionally
166: }
167: }
168: }
169: }
|