001: /*
002: * Copyright 2004-2008 H2 Group. Licensed under the H2 License, Version 1.0
003: * (license2)
004: * Initial Developer: H2 Group
005: */
006: package org.h2.tools.indexer;
007:
008: import java.io.File;
009: import java.io.FileInputStream;
010: import java.io.FileWriter;
011: import java.io.PrintWriter;
012: import java.util.ArrayList;
013: import java.util.Collections;
014: import java.util.Comparator;
015: import java.util.HashMap;
016: import java.util.HashSet;
017: import java.util.StringTokenizer;
018:
019: import org.h2.util.IOUtils;
020: import org.h2.util.StringUtils;
021:
022: /**
023: * The indexer creates the fulltext index of the HTML documentation.
024: * It is used for the built-in HTML javascript search.
025: */
026: public class Indexer {
027:
028: ArrayList pages = new ArrayList();
029: HashMap words = new HashMap();
030: HashSet noIndex = new HashSet();
031: ArrayList wordList;
032: int totalAllWeights;
033: PrintWriter output;
034:
035: Page page;
036: boolean title;
037: boolean heading;
038:
039: private static final int MIN_WORD_SIZE = 3;
040: private static final int MAX_RELATIONS = 20;
041:
042: public static void main(String[] args) throws Exception {
043: new Indexer().run(args);
044: }
045:
046: void run(String[] args) throws Exception {
047: System.out.println(getClass().getName());
048: String dir = "docs";
049: String destDir = "docs/html";
050: for (int i = 0; i < args.length; i++) {
051: if (args[i].equals("-dir")) {
052: dir = args[++i];
053: } else if (args[i].equals("-destDir")) {
054: destDir = args[++i];
055: }
056: }
057: File file = new File(dir);
058: System.out.println("indexing " + file.getCanonicalPath());
059: setNoIndex(new String[] { "index.html", "html/header.html",
060: "html/search.html", "html/frame.html",
061: "javadoc/index.html", "javadoc/classes.html",
062: "javadoc/allclasses-frame.html",
063: "javadoc/allclasses-noframe.html",
064: "javadoc/constant-values.html",
065: "javadoc/overview-frame.html",
066: "javadoc/overview-summary.html",
067: "javadoc/serialized-form.html" });
068: output = new PrintWriter(new FileWriter(destDir + "/index.js"));
069: readPages("", file, 0);
070: output.println("var pages=new Array();");
071: output.println("var ref=new Array();");
072: output
073: .println("function Page(title, file) { this.title=title; this.file=file; }");
074: output.println("function load() {");
075: sortWords();
076: removeOverflowRelations();
077: sortPages();
078: listPages();
079: listWords();
080: output.println("}");
081: output.close();
082: }
083:
084: private void setNoIndex(String[] strings) {
085: for (int i = 0; i < strings.length; i++) {
086: noIndex.add(strings[i]);
087: }
088: }
089:
090: void sortWords() {
091: wordList = new ArrayList(words.values());
092: // TODO support ignored keywords (to shrink the index)
093: // String ignored = "";
094: // for(int i=0; i<wordList.size(); i++) {
095: // Word word = (Word) wordList.get(i);
096: // if(word.pages.size() >= pages.size()/4) {
097: // wordList.remove(i);
098: // if(ignored.length()==0) {
099: // ignored += ",";
100: // }
101: // ignored += word.name;
102: // i--;
103: // }
104: // }
105: // output.println("var ignored = '" + convertUTF(ignored) + "'");
106: // TODO support A, B, C,... class links in the index file and use them
107: // for combined AND searches
108: Collections.sort(wordList, new Comparator() {
109: public int compare(Object o0, Object o1) {
110: Word w0 = (Word) o0;
111: Word w1 = (Word) o1;
112: return w0.name.compareToIgnoreCase(w1.name);
113: }
114: });
115: }
116:
117: void removeOverflowRelations() {
118: for (int i = 0; i < wordList.size(); i++) {
119: Word word = (Word) wordList.get(i);
120: ArrayList weights = word.getSortedWeights();
121: int max = MAX_RELATIONS;
122: if (weights.size() > max) {
123: while (max < weights.size()) {
124: Weight weight = (Weight) weights.get(max);
125: if (weight.value < Weight.HEADER) {
126: break;
127: }
128: max++;
129: }
130: }
131: while (max < weights.size()) {
132: Weight weight = (Weight) weights.get(max);
133: weights.remove(max);
134: weight.page.relations--;
135: }
136: }
137: }
138:
139: void sortPages() {
140: Collections.sort(pages, new Comparator() {
141: public int compare(Object o0, Object o1) {
142: Page p0 = (Page) o0;
143: Page p1 = (Page) o1;
144: return p0.relations == p1.relations ? 0
145: : p0.relations < p1.relations ? 1 : -1;
146: }
147: });
148: for (int i = 0; i < pages.size(); i++) {
149: Page page = (Page) pages.get(i);
150: page.id = i;
151: }
152: }
153:
154: void listPages() {
155: for (int i = 0; i < pages.size(); i++) {
156: Page page = (Page) pages.get(i);
157: output.println("pages[" + page.id + "]=new Page('"
158: + convertUTF(page.title) + "', '" + page.fileName
159: + "');");
160: }
161: }
162:
163: void readPages(String dir, File file, int level) throws Exception {
164: String name = file.getName();
165: String fileName = dir.length() > 0 ? dir + "/" + name
166: : level > 0 ? name : "";
167: if (file.isDirectory()) {
168: File[] list = file.listFiles();
169: for (int i = 0; i < list.length; i++) {
170: readPages(fileName, list[i], level + 1);
171: }
172: return;
173: }
174: String lower = StringUtils.toLowerEnglish(name);
175: if (!lower.endsWith(".html") && !lower.endsWith(".htm")) {
176: return;
177: }
178: if (lower.indexOf("_ja.") >= 0) {
179: return;
180: }
181: if (!noIndex.contains(fileName)) {
182: page = new Page(pages.size(), fileName);
183: pages.add(page);
184: readPage(file);
185: }
186: }
187:
188: void listWords() {
189: output.println("// words: " + wordList.size());
190: StringBuffer buff = new StringBuffer();
191: String first = "";
192: int firstLen = 1;
193: int totalRelations = 0;
194: for (int i = 0; i < wordList.size(); i++) {
195: Word word = (Word) wordList.get(i);
196: ArrayList weights = word.getSortedWeights();
197: String lower = StringUtils.toLowerEnglish(word.name);
198: if (!first.equals(lower.substring(0, firstLen))) {
199: if (buff.length() > 0) {
200: output.println("ref['" + convertUTF(first) + "']='"
201: + buff.toString() + "';");
202: buff = new StringBuffer();
203: }
204: first = lower.substring(0, firstLen);
205: }
206: if (buff.length() > 0) {
207: buff.append(';');
208: }
209: buff.append(convertUTF(word.name));
210: buff.append('=');
211: String weightString = "r";
212: totalRelations += weights.size();
213: for (int j = 0; j < weights.size(); j++) {
214: Weight weight = (Weight) weights.get(j);
215: Page page = weight.page;
216: if (j > 0) {
217: buff.append(",");
218: }
219: String ws;
220: if (weight.value >= Weight.TITLE) {
221: ws = "t";
222: } else if (weight.value >= Weight.HEADER) {
223: ws = "h";
224: } else {
225: ws = "r";
226: }
227: if (ws != weightString) {
228: weightString = ws;
229: buff.append(ws);
230: }
231: buff.append(page.id);
232: // TODO compress weight
233: // buff.append(",");
234: // buff.append(weight.value);
235: }
236: }
237: // TODO optimization: could support "a name=" and go to _first_
238: // occurrence, or scan page and mark
239: output.println("ref['" + convertUTF(first) + "']='"
240: + buff.toString() + "';");
241: output.println("// totalRelations: " + totalRelations);
242: }
243:
244: private void readPage(File file) throws Exception {
245: byte[] data = IOUtils.readBytesAndClose(new FileInputStream(
246: file), 0);
247: String text = new String(data, "UTF-8");
248: StringTokenizer t = new StringTokenizer(text, "<> \r\n", true);
249: boolean inTag = false;
250: title = false;
251: heading = false;
252: while (t.hasMoreTokens()) {
253: String token = t.nextToken();
254: if (token.length() == 1) {
255: char c = token.charAt(0);
256: switch (c) {
257: case '<': {
258: if (inTag) {
259: process("???");
260: }
261: inTag = true;
262: if (!t.hasMoreTokens()) {
263: break;
264: }
265: token = t.nextToken();
266: if (token.startsWith("/")) {
267: title = false;
268: heading = false;
269: } else if (token.equalsIgnoreCase("title")) {
270: title = true;
271: } else if (token.length() == 2
272: && Character.toLowerCase(token.charAt(0)) == 'h'
273: && Character.isDigit(token.charAt(1))) {
274: heading = true;
275: }
276: // TODO maybe skip script tags?
277: break;
278: }
279: case '>': {
280: if (!inTag) {
281: process("???");
282: }
283: inTag = false;
284: break;
285: }
286: case '\r':
287: case '\n':
288: case ' ':
289: break;
290: default:
291: if (!inTag) {
292: process(token);
293: }
294: }
295: } else {
296: if (!inTag) {
297: process(token);
298: }
299: }
300: }
301:
302: if (page.title == null || page.title.trim().length() == 0) {
303: System.out.println("Error: not title found in "
304: + file.getName());
305: page.title = file.getName();
306: }
307: page.title = page.title.trim();
308: }
309:
310: void process(String text) {
311: text = HtmlConverter.convertHtmlToString(text);
312: if (title) {
313: if (page.title == null) {
314: page.title = text;
315: } else {
316: page.title = page.title + " " + text;
317: }
318: }
319: int weight;
320: if (title) {
321: weight = Weight.TITLE;
322: } else if (heading) {
323: weight = Weight.HEADER;
324: } else {
325: weight = Weight.PARAGRAPH;
326: }
327: // this list of constants needs to be the same in search.js
328: // (char) 160: nbsp
329: StringTokenizer t = new StringTokenizer(
330: text,
331: " \t\r\n\"'.,:;!&/\\?%@`[]{}()+-=<>|*^~#$" + (char) 160,
332: false);
333: while (t.hasMoreTokens()) {
334: String token = t.nextToken();
335: if (token.length() < MIN_WORD_SIZE) {
336: continue;
337: }
338: if (Character.isDigit(token.charAt(0))) {
339: continue;
340: }
341: String lower = StringUtils.toLowerEnglish(token);
342: Word word = (Word) words.get(lower);
343: if (word == null) {
344: word = new Word(token);
345: words.put(lower, word);
346: } else if (!word.name.equals(token)) {
347: word.name = token.compareTo(word.name) > 0 ? token
348: : word.name;
349: }
350: page.totalWeight += weight;
351: totalAllWeights += weight;
352: word.addPage(page, weight);
353: }
354: }
355:
356: String convertUTF(String s) {
357: s = StringUtils.quoteJavaString(s);
358: s = s.substring(1, s.length() - 1);
359: return s;
360: }
361:
362: }
|