001: /*
002: * Copyright 2004-2008 H2 Group. Licensed under the H2 License, Version 1.0
003: * (license2)
004: * Initial Developer: H2 Group
005: */
006: package org.h2.tools.doc;
007:
008: import java.io.File;
009: import java.io.FileReader;
010: import java.io.IOException;
011: import java.io.Reader;
012: import java.io.StringWriter;
013: import java.util.Arrays;
014: import java.util.HashMap;
015: import java.util.HashSet;
016: import java.util.Iterator;
017: import java.util.StringTokenizer;
018:
019: import org.h2.util.IOUtils;
020:
/**
 * The spell checker makes sure that each word used in the source code
 * is spelled correctly, by comparing the words with a word list.
 * Camel case and uppercase words are checked as well.
 * HTTP links are not checked; however they may not end with a dot.
 * <p>
 * The word list is read from a file whose name ends with
 * <code>dictionary.txt</code>; every other supported file below the
 * source directory is checked against it. Unknown words and links ending
 * with a dot are printed to <code>System.out</code> and counted as errors.
 */
public class SpellChecker {

    /** File suffixes that are spell checked ("" means no suffix). */
    private static final String[] SUFFIX = new String[] { "html",
            "java", "sql", "txt", "xml", "jsp", "css", "bat", "csv",
            "xml", "js", "Driver", "properties", "task", "" };

    /** File suffixes that are skipped without an error. */
    private static final String[] IGNORE = new String[] { "dev", "nsi",
            "gif", "png", "odg", "ico", "sxd", "zip", "bz2", "rc",
            "layout", "res", "dll", "jar" };

    /** Lower case words starting with this prefix are never reported. */
    private static final String PREFIX_IGNORE = "abc";

    /** Files whose path ends with this name are skipped. */
    private static final String IGNORE_FILE = "mainWeb.html";

    /** The start of a link that is removed from the text before checking. */
    private static final String LINK_PREFIX = "http://";

    /** Non-identifier characters that may still be part of a link. */
    private static final String LINK_CHARS = ".#/?&=%+_-:";

    /** All known words, in lower case. */
    private HashSet dictionary = new HashSet();

    /** All dictionary words that were found in a checked file. */
    private HashSet used = new HashSet();

    /** Unknown word -> number of occurrences (Integer). */
    private HashMap unknown = new HashMap();

    /** If set, the tokens and words of checked files are printed. */
    private boolean debug;

    /** If set, the list of used dictionary words is printed at the end. */
    private boolean printDictionary = false;

    /** True while the file being scanned is the dictionary itself. */
    private boolean addToDictionary;

    /** The number of problems found so far. */
    private int errorCount;

    /**
     * Runs the spell checker on the directory "src", relative to the
     * current working directory.
     *
     * @param args the command line arguments (ignored)
     * @throws IOException if a file can not be read, has an unsupported
     *             suffix, or if at least one problem was found
     */
    public static void main(String[] args) throws IOException {
        String dir = "src";
        new SpellChecker().run("tools/org/h2/tools/doc/dictionary.txt",
                dir);
    }

    /**
     * Loads the dictionary, checks all files below the directory, prints
     * the report, and fails if there was a problem.
     *
     * @param dictionary the dictionary file name, relative to dir
     * @param dir the directory to check
     * @throws IOException if a file can not be processed, or if the error
     *             count is larger than zero at the end
     */
    private void run(String dictionary, String dir) throws IOException {
        // the dictionary must be loaded first, so that all words are known
        // before the first regular file is checked
        process(new File(dir + "/" + dictionary));
        process(new File(dir));
        if (printDictionary) {
            printUsedWords();
        }
        if (unknown.size() > 0) {
            System.out.println();
            System.out.println("UNKNOWN WORDS");
            for (Iterator it = unknown.keySet().iterator(); it.hasNext();) {
                String s = (String) it.next();
                System.out.print(s + " ");
                // each distinct unknown word counts as one error
                errorCount++;
            }
            System.out.println();
            System.out.println();
        }
        if (errorCount > 0) {
            throw new IOException(errorCount + " error found");
        }
    }

    /**
     * Prints the sorted list of used dictionary words. A new line is
     * started when adding the next word to a non-empty line would make it
     * longer than 80 characters.
     */
    private void printUsedWords() {
        System.out.println("USED WORDS");
        String[] list = new String[used.size()];
        used.toArray(list);
        Arrays.sort(list);
        StringBuffer buff = new StringBuffer();
        for (int i = 0; i < list.length; i++) {
            String s = list[i];
            if (buff.length() > 0) {
                if (buff.length() + s.length() > 80) {
                    System.out.println(buff.toString());
                    buff.setLength(0);
                } else {
                    buff.append(' ');
                }
            }
            buff.append(s);
        }
        System.out.println(buff.toString());
    }

    /**
     * Processes a file, or all files of a directory (recursively).
     * Regular files are read completely and then scanned.
     *
     * @param file the file or directory
     * @throws IOException if a directory can not be listed, a file can
     *             not be read, or a file has an unsupported suffix
     */
    private void process(File file) throws IOException {
        String name = file.getCanonicalPath();
        if (name.endsWith(".svn")) {
            return;
        }
        // NOTE(review): this looks at the complete canonical path, so an
        // underscore in any parent directory matches as well - confirm
        // that this is intended before narrowing it to the file name
        if (name.indexOf("_") > 0 && name.indexOf("_en") < 0) {
            return;
        }
        if (file.isDirectory()) {
            File[] list = file.listFiles();
            if (list == null) {
                // listFiles returns null if the directory can not be read
                throw new IOException("Can not list directory: " + name);
            }
            for (int i = 0; i < list.length; i++) {
                process(list[i]);
            }
            return;
        }
        String fileName = file.getAbsolutePath();
        // only the last path element is used, so that a dot in a directory
        // name is not mistaken for the start of the suffix
        String suffix = getSuffix(file.getName());
        if (contains(IGNORE, suffix) || fileName.endsWith(IGNORE_FILE)) {
            return;
        }
        if (!contains(SUFFIX, suffix)) {
            throw new IOException("Unsupported suffix: " + suffix
                    + " for file: " + fileName);
        }
        // readStringAndClose closes the reader, also if reading fails
        String text = readStringAndClose(new FileReader(file), -1);
        addToDictionary = fileName.endsWith("dictionary.txt");
        scan(fileName, text);
    }

    /**
     * Gets the part of the name after the last dot.
     *
     * @param name the file name (without directory)
     * @return the suffix, or an empty string if the name contains no dot
     */
    private static String getSuffix(String name) {
        int idx = name.lastIndexOf('.');
        return idx < 0 ? "" : name.substring(idx + 1);
    }

    /**
     * Checks if the list contains the given value (case sensitive).
     *
     * @param list the list
     * @param value the value to search
     * @return true if the value is in the list
     */
    private static boolean contains(String[] list, String value) {
        for (int i = 0; i < list.length; i++) {
            if (list[i].equals(value)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits the text into tokens and checks each of them. If unknown
     * words were found, the file name and the words are printed.
     *
     * @param fileName the file name (used for messages)
     * @param text the content of the file
     */
    private void scan(String fileName, String text) {
        HashSet notFound = new HashSet();
        text = removeLinks(fileName, text);
        StringTokenizer tokenizer = new StringTokenizer(text,
                "\r\n \t+\"*%&/()='[]{},.-;:_<>\\!?$@#|~^`");
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            // tokens starting with a digit are not words
            if (Character.isDigit(token.charAt(0))) {
                continue;
            }
            boolean print = !addToDictionary && debug;
            if (print) {
                System.out.print(token + " ");
            }
            scanCombinedToken(notFound, token);
            if (print) {
                System.out.println();
            }
        }
        if (notFound.isEmpty()) {
            return;
        }
        System.out.println("file: " + fileName);
        for (Iterator it = notFound.iterator(); it.hasNext();) {
            String s = (String) it.next();
            System.out.print(s + " ");
        }
        System.out.println();
    }

    /**
     * Removes all links starting with "http://" from the text. A link
     * that ends with a dot is reported and counted as an error.
     *
     * @param fileName the file name (used for messages)
     * @param text the text
     * @return the text without the links
     */
    private String removeLinks(String fileName, String text) {
        int length = text.length();
        StringBuffer buff = new StringBuffer(length);
        int pos = 0, last = 0;
        while (true) {
            pos = text.indexOf(LINK_PREFIX, pos);
            if (pos < 0) {
                break;
            }
            int start = pos;
            buff.append(text.substring(last, start));
            pos += LINK_PREFIX.length();
            // the link may be the very last thing in the text, so the end
            // of the text terminates the link as well
            while (pos < length) {
                char c = text.charAt(pos);
                if (!Character.isJavaIdentifierPart(c)
                        && LINK_CHARS.indexOf(c) < 0) {
                    break;
                }
                pos++;
            }
            String link = text.substring(start, pos);
            if (link.endsWith(".")) {
                System.out.println("Link ending with dot in "
                        + fileName + ": " + link);
                errorCount++;
            }
            last = pos;
        }
        buff.append(text.substring(last));
        return buff.toString();
    }

    /**
     * Splits a token at the boundaries lower case / upper case (the upper
     * case character starts the next part) and upper case / lower case
     * (the upper case character starts the next part as well), and checks
     * each part.
     *
     * @param notFound the set where unknown words are added
     * @param token the token
     */
    private void scanCombinedToken(HashSet notFound, String token) {
        for (int i = 1; i < token.length(); i++) {
            char charLeft = token.charAt(i - 1);
            char charRight = token.charAt(i);
            if (Character.isLowerCase(charLeft)
                    && Character.isUpperCase(charRight)) {
                scanToken(notFound, token.substring(0, i));
                token = token.substring(i);
                // together with the loop increment, this skips the first
                // pair of the remaining token
                i = 1;
            } else if (Character.isUpperCase(charLeft)
                    && Character.isLowerCase(charRight)) {
                scanToken(notFound, token.substring(0, i - 1));
                token = token.substring(i - 1);
                // the remaining token now starts with this upper / lower
                // pair; it must be skipped, otherwise it would be split
                // again and again
                i = 1;
            }
        }
        scanToken(notFound, token);
    }

    /**
     * Checks one word. Words shorter than three characters (before or
     * after removing trailing digits) and words that contain a digit are
     * ignored. The word is converted to lower case, and then either added
     * to the dictionary, or looked up in the dictionary.
     *
     * @param notFound the set where unknown words are added
     * @param token the word
     */
    private void scanToken(HashSet notFound, String token) {
        if (token.length() < 3) {
            return;
        }
        // remove trailing digits; the length check protects against a
        // token that consists of digits only
        int end = token.length();
        while (end > 0 && Character.isDigit(token.charAt(end - 1))) {
            end--;
        }
        token = token.substring(0, end);
        if (token.length() < 3) {
            return;
        }
        for (int i = 0; i < token.length(); i++) {
            if (Character.isDigit(token.charAt(i))) {
                return;
            }
        }
        token = token.toLowerCase();
        if (!addToDictionary && debug) {
            System.out.print(token + " ");
        }
        if (token.startsWith(PREFIX_IGNORE)) {
            return;
        }
        if (addToDictionary) {
            dictionary.add(token);
        } else if (dictionary.contains(token)) {
            used.add(token);
        } else {
            notFound.add(token);
            increment(unknown, token);
        }
    }

    /**
     * Increments the counter stored for the key. The first call for a key
     * stores 1.
     *
     * @param map the map of key to counter (Integer)
     * @param key the key
     */
    private void increment(HashMap map, String key) {
        Integer value = (Integer) map.get(key);
        int count = value == null ? 1 : value.intValue() + 1;
        // this file does not use any Java 5 API, so the constructor is
        // used instead of Integer.valueOf
        map.put(key, new Integer(count));
    }

    /**
     * Reads characters from the reader into a string and closes the
     * reader. The reader is closed even if reading fails.
     *
     * @param in the reader
     * @param length the maximum number of characters to read, or zero or
     *            a negative value to read until the end
     * @return the characters read
     * @throws IOException if reading fails, or if closing the reader
     *             fails after all data was read
     */
    public static String readStringAndClose(Reader in, int length)
            throws IOException {
        if (length <= 0) {
            length = Integer.MAX_VALUE;
        }
        int block = Math.min(4096, length);
        StringWriter out = new StringWriter(
                length == Integer.MAX_VALUE ? block : length);
        char[] buff = new char[block];
        boolean completed = false;
        try {
            while (length > 0) {
                int len = Math.min(block, length);
                len = in.read(buff, 0, len);
                if (len < 0) {
                    break;
                }
                out.write(buff, 0, len);
                length -= len;
            }
            completed = true;
        } finally {
            if (!completed) {
                try {
                    in.close();
                } catch (IOException e) {
                    // ignore, so that the exception thrown while reading
                    // is the one the caller gets
                }
            }
        }
        in.close();
        return out.toString();
    }
}
|