001: /*
002: ***********************************************************************
003: * Copyright (C) 2005-2006, International Business Machines *
004: * Corporation and others. All Rights Reserved. *
005: ***********************************************************************
006: *
007: */
008:
009: package com.ibm.icu.dev.tool.charsetdet.sbcs;
010:
011: import java.io.*;
012: import java.util.Arrays;
013: import java.util.ArrayList;
014: import java.util.Collections;
015: import java.util.Iterator;
016: import java.util.List;
017:
018: import com.ibm.icu.impl.Utility;
019:
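/**
 * Command-line tool that gathers n-gram statistics from text files for the
 * single-byte character set detector. For each input file it writes a
 * ".raw" listing of every n-gram with its frequency, and a ".dat" file
 * holding Java source for the most frequent n-grams and a byte map for the
 * file's encoding. It can optionally run the checker and a detection test
 * across all of the input files.
 *
 * @author emader
 */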
026: public class StatisticsTool implements NGramParser.NGramParserClient,
027: NGramList.NGramKeyMapper {
028: /* TODO Make this usage string more sane. */
029: private static final String usageString = "\nUsage: StatisticsTool [OPTIONS] [FILES]\n\n"
030: + "This program will read in a Unicode text file of text in a particular language\n"
031: + "and compute the statistics needed to detected that language and character set.\n "
032: + "Options:\n"
033: + "-e specify the target encoding\n"
034: + "-h or -? print this usage text.\n"
035: + "-v also generate statistics for visual order.\n"
036: + "-l only generate statistics for logical order (cancel -v)."
037: + "-c run the checker.\n"
038: + "-t run the encoding test.\n"
039: + "example: com.ibm.icu.dev.tool.charset.StatisticsTool -e 8859-1 Spanish.txt";
040:
041: private static final int BUFFER_SIZE = 1024;
042:
043: private char[] buffer;
044: private int bufIndex;
045: private int bufMax;
046:
047: private InputFile inputFile;
048:
049: private NGramList ngrams;
050:
051: private static byte[] allBytes = { (byte) 0x00, (byte) 0x01,
052: (byte) 0x02, (byte) 0x03, (byte) 0x04, (byte) 0x05,
053: (byte) 0x06, (byte) 0x07, (byte) 0x08, (byte) 0x09,
054: (byte) 0x0A, (byte) 0x0B, (byte) 0x0C, (byte) 0x0D,
055: (byte) 0x0E, (byte) 0x0F, (byte) 0x10, (byte) 0x11,
056: (byte) 0x12, (byte) 0x13, (byte) 0x14, (byte) 0x15,
057: (byte) 0x16, (byte) 0x17, (byte) 0x18, (byte) 0x19,
058: (byte) 0x1A, (byte) 0x1B, (byte) 0x1C, (byte) 0x1D,
059: (byte) 0x1E, (byte) 0x1F, (byte) 0x20, (byte) 0x21,
060: (byte) 0x22, (byte) 0x23, (byte) 0x24, (byte) 0x25,
061: (byte) 0x26, (byte) 0x27, (byte) 0x28, (byte) 0x29,
062: (byte) 0x2A, (byte) 0x2B, (byte) 0x2C, (byte) 0x2D,
063: (byte) 0x2E, (byte) 0x2F, (byte) 0x30, (byte) 0x31,
064: (byte) 0x32, (byte) 0x33, (byte) 0x34, (byte) 0x35,
065: (byte) 0x36, (byte) 0x37, (byte) 0x38, (byte) 0x39,
066: (byte) 0x3A, (byte) 0x3B, (byte) 0x3C, (byte) 0x3D,
067: (byte) 0x3E, (byte) 0x3F, (byte) 0x40, (byte) 0x41,
068: (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45,
069: (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49,
070: (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D,
071: (byte) 0x4E, (byte) 0x4F, (byte) 0x50, (byte) 0x51,
072: (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55,
073: (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59,
074: (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D,
075: (byte) 0x5E, (byte) 0x5F, (byte) 0x60, (byte) 0x61,
076: (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65,
077: (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69,
078: (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D,
079: (byte) 0x6E, (byte) 0x6F, (byte) 0x70, (byte) 0x71,
080: (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75,
081: (byte) 0x76, (byte) 0x77, (byte) 0x78, (byte) 0x79,
082: (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D,
083: (byte) 0x7E, (byte) 0x7F, (byte) 0x80, (byte) 0x81,
084: (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85,
085: (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89,
086: (byte) 0x8A, (byte) 0x8B, (byte) 0x8C, (byte) 0x8D,
087: (byte) 0x8E, (byte) 0x8F, (byte) 0x90, (byte) 0x91,
088: (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95,
089: (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99,
090: (byte) 0x9A, (byte) 0x9B, (byte) 0x9C, (byte) 0x9D,
091: (byte) 0x9E, (byte) 0x9F, (byte) 0xA0, (byte) 0xA1,
092: (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5,
093: (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9,
094: (byte) 0xAA, (byte) 0xAB, (byte) 0xAC, (byte) 0xAD,
095: (byte) 0xAE, (byte) 0xAF, (byte) 0xB0, (byte) 0xB1,
096: (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5,
097: (byte) 0xB6, (byte) 0xB7, (byte) 0xB8, (byte) 0xB9,
098: (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0xBD,
099: (byte) 0xBE, (byte) 0xBF, (byte) 0xC0, (byte) 0xC1,
100: (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5,
101: (byte) 0xC6, (byte) 0xC7, (byte) 0xC8, (byte) 0xC9,
102: (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD,
103: (byte) 0xCE, (byte) 0xCF, (byte) 0xD0, (byte) 0xD1,
104: (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5,
105: (byte) 0xD6, (byte) 0xD7, (byte) 0xD8, (byte) 0xD9,
106: (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD,
107: (byte) 0xDE, (byte) 0xDF, (byte) 0xE0, (byte) 0xE1,
108: (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5,
109: (byte) 0xE6, (byte) 0xE7, (byte) 0xE8, (byte) 0xE9,
110: (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED,
111: (byte) 0xEE, (byte) 0xEF, (byte) 0xF0, (byte) 0xF1,
112: (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5,
113: (byte) 0xF6, (byte) 0xF7, (byte) 0xF8, (byte) 0xF9,
114: (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD,
115: (byte) 0xFE, (byte) 0xFF };
116:
117: /**
118: *
119: */
120: public StatisticsTool() {
121: buffer = new char[BUFFER_SIZE];
122:
123: buffer[0] = ' ';
124: bufIndex = 0;
125: bufMax = 1;
126: }
127:
128: private static void usage() {
129: System.out.println(usageString);
130: }
131:
132: private static void exceptionError(Exception e) {
133: System.err.println("ioError: " + e.toString());
134: }
135:
136: private int nextBuffer(InputFile inputFile) {
137: bufIndex = 0;
138:
139: return inputFile.read(buffer);
140: }
141:
142: public char nextChar() {
143: if (bufIndex >= bufMax) {
144: bufMax = nextBuffer(inputFile);
145: }
146:
147: if (bufMax < 0) {
148: return 0;
149: }
150:
151: return buffer[bufIndex++];
152: }
153:
154: public void handleNGram(String key) {
155: ngrams.put(key);
156: }
157:
158: public Object mapKey(String key) {
159: return key;
160: }
161:
162: private NGramList dumpNGrams() {
163: String filename = inputFile.getPath();
164: int extension = filename.lastIndexOf(".");
165: String outputFileName = filename.substring(0, extension)
166: + ".raw" + filename.substring(extension);
167: PrintStream output;
168: double cumulative = 0;
169:
170: try {
171: output = new PrintStream(new FileOutputStream(
172: outputFileName), true, "UTF8");
173: } catch (IOException e) {
174: System.out.println("? Could not open " + outputFileName
175: + " for writing.");
176: return null;
177: }
178:
179: System.out.println(inputFile.getFilename() + ": "
180: + ngrams.getUniqueNGrams() + "/"
181: + ngrams.getTotalNGrams());
182:
183: ArrayList array = new ArrayList(ngrams.values());
184:
185: Collections.sort(array);
186:
187: NGramList stats = new NGramList(inputFile);
188: int count = 0;
189: int totalNGrams = ngrams.getTotalNGrams();
190:
191: for (Iterator it = array.iterator(); it.hasNext(); count += 1) {
192: NGramList.NGram ngram = (NGramList.NGram) it.next();
193: String value = ngram.getValue();
194: int refCount = ngram.getRefCount();
195: double ratio = (double) refCount / totalNGrams * 100.0;
196:
197: cumulative += ratio;
198:
199: // TODO check should be count < max && cumulative < maxPercent
200: if (count < 64) {
201: stats.put(value);
202: }
203:
204: output.println(value + "\t" + refCount + "\t" + ratio
205: + "%\t" + cumulative + "%");
206: }
207:
208: output.close();
209:
210: return stats;
211: }
212:
213: private void writeStatistics(ArrayList keyList, boolean visual) {
214: String filename = inputFile.getPath();
215: int extension = filename.lastIndexOf(".");
216: String outputFileName = filename.substring(0, extension) + "-"
217: + inputFile.getEncoding()
218: + (visual ? "-visual.dat" : ".dat");
219: PrintStream output;
220:
221: try {
222: output = new PrintStream(new FileOutputStream(
223: outputFileName), true, "ASCII");
224: } catch (IOException e) {
225: System.out.println("? Could not open " + outputFileName
226: + " for writing.");
227: return;
228: }
229:
230: int i = 0;
231:
232: output.print(" private static int[] ngrams = {");
233:
234: for (Iterator it = keyList.iterator(); it.hasNext(); i += 1) {
235: Integer ngram = (Integer) it.next();
236:
237: if (i % 16 == 0) {
238: output.print("\n ");
239: }
240:
241: output
242: .print("0x" + Utility.hex(ngram.intValue(), 6)
243: + ", ");
244: }
245:
246: output.println("\n };\n");
247:
248: /*
249: * Generate the byte map
250: */
251: char[] unicodes = inputFile.decode(allBytes);
252:
253: for (int b = 0; b < 256; b += 1) {
254: char unicode = unicodes[b];
255: int charClass = NGramParser.getCharClass(unicode);
256:
257: switch (charClass) {
258: case NGramParser.C_LETTER:
259: unicodes[b] = Character.toLowerCase(unicode);
260: break;
261:
262: case NGramParser.C_PUNCT:
263: unicodes[b] = ' ';
264: break;
265:
266: case NGramParser.C_IGNORE:
267: default:
268: unicodes[b] = '\0';
269: }
270: }
271:
272: byte[] byteMap = inputFile.encode(unicodes);
273:
274: output.print(" private static byte[] byteMap = {");
275:
276: for (int b = 0; b < 256; b += 1) {
277: if (b % 8 == 0) {
278: output.print("\n ");
279: }
280:
281: output.print("(byte) 0x"
282: + Utility.hex(byteMap[b] & 0xFF, 2) + ", ");
283: }
284:
285: output.println("\n };");
286: }
287:
288: public NGramList collectStatistics(InputFile file) {
289: if (!file.open()) {
290: return null;
291: }
292:
293: inputFile = file;
294:
295: NGramParser parser = new NGramParser(this );
296:
297: ngrams = new NGramList(this );
298: parser.parse();
299:
300: file.close();
301:
302: NGramList stats = dumpNGrams();
303: ArrayList statKeys = new ArrayList(stats.keys());
304:
305: Collections.sort(statKeys);
306: writeStatistics(statKeys, false);
307:
308: if (inputFile.getVisualOrder()) {
309: ArrayList reversed = new ArrayList(statKeys.size());
310:
311: for (Iterator it = statKeys.iterator(); it.hasNext();) {
312: Integer key = (Integer) it.next();
313: int k = key.intValue();
314: int r = 0;
315:
316: while (k != 0) {
317: r = (r << 8) | (k & 0xFF);
318: k >>= 8;
319: }
320:
321: reversed.add(new Integer(r));
322: }
323:
324: Collections.sort(reversed);
325: writeStatistics(reversed, true);
326: }
327:
328: return stats;
329: }
330:
331: public static void main(String[] args) {
332: List list = Arrays.asList(args);
333: InputFile[] input_files = new InputFile[args.length];
334: int file_count = 0;
335: String encoding = null;
336: boolean run_checker = false;
337: boolean encoding_test = false;
338: boolean visual_order = false;
339:
340: for (Iterator it = list.iterator(); it.hasNext(); /*anything?*/) {
341: String arg = (String) it.next();
342:
343: if (arg.equals("-v")) {
344: visual_order = true;
345: } else if (arg.equals("-l")) {
346: visual_order = false;
347: } else if (arg.equals("-c")) {
348: run_checker = true;
349: } else if (arg.equals("-t")) {
350: encoding_test = true;
351: } else if (arg.equals("-e")) {
352: if (it.hasNext()) {
353: encoding = (String) it.next();
354: } else {
355: System.err.println("Error: missing encoding.");
356: }
357: } else if (arg.startsWith("-")) {
358: if (!(arg.equals("-h") || arg.equals("-?"))) {
359: System.err.println("Error: unknown option " + arg);
360: }
361:
362: usage();
363: } else {
364: input_files[file_count++] = new InputFile(arg,
365: encoding, visual_order);
366: }
367: }
368:
369: if (file_count == 0) {
370: System.err.println("Error: there are no files to process.");
371: usage();
372: }
373:
374: StatisticsTool tool = new StatisticsTool();
375: Checker[] checkers = new Checker[file_count];
376:
377: for (int i = 0; i < file_count; i += 1) {
378: InputFile file = input_files[i];
379:
380: checkers[i] = new Checker(tool.collectStatistics(file),
381: file);
382: }
383:
384: System.out.println();
385:
386: /**
387: * Checkers
388: */
389: if (run_checker) {
390: for (int c = 0; c < file_count; c += 1) {
391: Checker checker = checkers[c];
392:
393: for (int f = 0; f < file_count; f += 1) {
394: checker.check(input_files[f]);
395: }
396: }
397:
398: }
399:
400: /*
401: * Detection test
402: */
403: if (encoding_test) {
404: char[] buffer = new char[128];
405:
406: System.out.println("Detection test");
407:
408: for (int f = 0; f < file_count; f += 1) {
409: InputFile file = input_files[f];
410: int[] histogram = new int[file_count];
411: int charCount, misses = 0;
412:
413: System.out.println(file.getFilename() + "("
414: + file.getEncoding() + "):");
415: file.open();
416:
417: for (int c = 0; c < file_count; c += 1) {
418: checkers[c].setMapper(file);
419: }
420:
421: // for each buffer
422: // for each checker
423: // call checkBuffer, save score
424: // find highest score, update histogram for that checker
425: // show checker histogram
426:
427: while ((charCount = file.read(buffer)) > 0) {
428: int[] scores = new int[file_count];
429: int bestFit = -1, maxScore = 0;
430:
431: for (int c = 0; c < file_count; c += 1) {
432: scores[c] = checkers[c].checkBuffer(buffer,
433: charCount);
434: }
435:
436: for (int c = 0; c < file_count; c += 1) {
437: int score = scores[c];
438:
439: if (score > maxScore) {
440: maxScore = score;
441: bestFit = c;
442: }
443: }
444:
445: if (bestFit >= 0) {
446: histogram[bestFit] += 1;
447: } else {
448: misses += 1;
449: }
450: }
451:
452: for (int c = 0; c < file_count; c += 1) {
453: System.out.println(" "
454: + checkers[c].getLanguage() + ": "
455: + histogram[c]);
456: }
457:
458: if (misses > 0) {
459: System.out.println(" NONE: " + misses);
460: }
461:
462: System.out.println();
463: }
464: }
465: }
466: }
|