001: /*
002: ***********************************************************************
003: *
004: * Copyright (C) 2006, International Business Machines Corporation and
005: * others. All Rights Reserved.
006: *
007: ***********************************************************************
008: *
009: * BIG5Tool
010: *
011: * This tool produces the character usage frequency statistics for the Big5
012: * Chinese charset, for use by the ICU charset detectors.
013: *
* usage: java BIG5Tool [-d] [-sjis] [directory path]
*
* -d: Produce the data in a form to be exported to the ICU implementation.
* Default is to produce an informative dump.
*
* -sjis: Do Shift_JIS. The structure of Shift_JIS is very similar to Big5.
*
* directory path:
* Source directory for the text files to be analyzed.
* Defaults to the current directory when omitted.
* All files in the specified directory must be in the Big5 encoding
* (or in Shift_JIS when -sjis is given).
024: *
025: */
026:
027: package com.ibm.icu.dev.tool.charsetdet.mbcs;
028:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
035:
/**
 * Command-line tool that collects usage-frequency statistics for the two-byte
 * characters found in a directory of Big5 (or, with -sjis, Shift_JIS) text files.
 *
 * In the default verbose mode it prints per-file counts followed by a frequency
 * table. With -d it prints the most frequent character codes as a brace-enclosed,
 * comma-separated list suitable for pasting into a Java array initializer.
 */
public class BIG5Tool {

    // The file buffer and file data length are instance fields so that the
    // scanning code lifted from charset detection (iteratedChar / nextChar)
    // can see them conveniently.
    // Only the first buf.length bytes of each file are examined.
    byte[] buf = new byte[1000000];
    int fileSize;

    boolean option_d = false; // data option. Produce exportable data
    boolean option_v = true;  // verbose informational output.
    boolean sjis = false;     // True if input text files are Shift_JIS encoded.

    /** Program entry point; delegates to an instance so the scanner can use instance state. */
    public static void main(String[] args) {
        BIG5Tool tool = new BIG5Tool();
        tool.Main(args);
    }

    /**
     * Parses the command line, checks the source directory and processes it.
     * Exits the JVM with status -1 on an unknown option, on more than one
     * directory argument, or when the path is not a directory.
     *
     * @param args command line arguments: [-d] [-sjis] [directory path]
     */
    void Main(String[] args) {
        //
        // Command Line Option Handling
        //
        String dirName = null;
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.equals("-d")) {
                // Data mode replaces the verbose dump.
                option_d = true;
                option_v = false;
                continue;
            }
            if (arg.equals("-sjis")) {
                sjis = true;
                continue;
            }
            if (arg.startsWith("-")) {
                System.err.println("Unrecognized option: " + arg);
                System.exit(-1);
            }
            if (dirName != null) {
                // Only one directory is accepted. Report the surplus argument
                // itself, not the directory that was already accepted.
                System.err.println("Unrecognized option: " + arg);
                System.exit(-1);
            }
            dirName = arg;
        }
        if (dirName == null) {
            dirName = ".";
        }

        //
        // Verify that the specified directory exists.
        //
        File dir = new File(dirName);
        if (!dir.isDirectory()) {
            System.err.println("\"" + dirName + "\" is not a directory");
            System.exit(-1);
        }
        processDir(dir);
    }

    /**
     * Collects statistics from all ordinary files in the given directory and
     * prints them according to option_v / option_d.
     *
     * A failure while handling one file is reported on stderr and the remaining
     * files are still processed.
     *
     * @param dir directory whose immediate ordinary files are scanned
     */
    void processDir(File dir) {
        HashMap counts = new HashMap(10000);

        System.out.println(dir.getName());
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read.
            System.err.println("Unable to list the contents of \"" + dir.getPath() + "\"");
            files = new File[0];
        }
        for (int i = 0; i < files.length; i++) {
            try {
                if (files[i].isFile()) {
                    processFile(files[i], counts);
                }
            } catch (Exception e) {
                // Best effort: report and carry on with the next file.
                System.err.println("Exception:" + e);
            }
        }

        //
        // We've processed through all of the files.
        // Sort (most frequent first) and dump out the frequency statistics.
        //
        Object[] encounteredChars = counts.values().toArray();
        Arrays.sort(encounteredChars);

        // Derive the grand total from the map itself so that it always agrees
        // with the per-character counts. Accumulate in a long so that the
        // percentage computation cannot overflow on a large corpus.
        long totalMbcsChars = 0;
        for (int i = 0; i < encounteredChars.length; i++) {
            totalMbcsChars += ((ChEl) encounteredChars[i]).occurences;
        }

        if (option_v) {
            dumpFrequencies(encounteredChars, totalMbcsChars);
        }
        if (option_d) {
            dumpExportData(encounteredChars, totalMbcsChars);
        }
    }

    // Scans one file, adds its two-byte characters to counts, and prints the
    // per-file summary in verbose mode.
    private void processFile(File file, HashMap counts) throws IOException {
        fileSize = readIntoBuffer(file);
        if (option_v) {
            System.out.println(file.getPath());
            System.out.println(" " + fileSize + " bytes.");
        }

        iteratedChar ichar = new iteratedChar();
        int fileChars = 0;
        int fileMbcsChars = 0;
        int errs = 0;

        while (nextChar(ichar)) {
            if (ichar.error) {
                errs++;
                continue;
            }
            fileChars++;
            if (ichar.charValue <= 255) {
                // Don't keep occurrence statistics for the single byte range
                continue;
            }
            fileMbcsChars++;

            //
            // Frequency of occurrence statistics are accumulated in a map.
            // Each element serves as both key and value.
            //
            ChEl keyEl = new ChEl(ichar.charValue, 0);
            ChEl valEl = (ChEl) counts.get(keyEl);
            if (valEl == null) {
                counts.put(keyEl, keyEl);
                valEl = keyEl;
            }
            valEl.occurences++;
        }

        if (option_v) {
            System.out.println(" " + fileChars + " Chars");
            System.out.println(" " + fileMbcsChars + " mbcs Chars");
            System.out.println(" " + errs + " errors");
            System.out.println("\n");
        }
    }

    // Fills buf from the start of the file and returns the number of bytes stored
    // (0 for an empty file). Loops because a single read() call is allowed to
    // return fewer bytes than requested. The stream is always closed.
    private int readIntoBuffer(File file) throws IOException {
        FileInputStream in = new FileInputStream(file);
        try {
            int total = 0;
            while (total < buf.length) {
                int n = in.read(buf, total, buf.length - total);
                if (n < 0) {
                    break;
                }
                total += n;
            }
            return total;
        } finally {
            in.close();
        }
    }

    // Prints the verbose table: rank, char code (hex), occurrences, cumulative percent.
    private static void dumpFrequencies(Object[] sortedByFrequency, long totalMbcsChars) {
        System.out.println("# <char code> <occurences> <Cumulative %>");
        long cumulativeChars = 0;
        for (int i = 0; i < sortedByFrequency.length; i++) {
            ChEl c = (ChEl) sortedByFrequency[i];
            cumulativeChars += c.occurences;
            // totalMbcsChars is > 0 here: every map entry has at least one occurrence.
            long cumulativePercent = cumulativeChars * 100 / totalMbcsChars;
            System.out.println(i + " "
                    + Integer.toHexString(c.charCode) + " "
                    + c.occurences + " "
                    + cumulativePercent);
        }
    }

    // Prints the most frequent characters formatted for pasting into a Java
    // source code array initializer. Characters are taken in frequency order
    // until 100 have been taken or they cover at least 50% of all occurrences,
    // then re-sorted by character code value.
    private static void dumpExportData(Object[] sortedByFrequency, long totalMbcsChars) {
        int[] codes = new int[Math.min(100, sortedByFrequency.length)];
        int count = 0;
        long cumulativeChars = 0;
        long cumulativePercent = 0;
        while (count < codes.length && cumulativePercent < 50) {
            ChEl c = (ChEl) sortedByFrequency[count];
            cumulativeChars += c.occurences;
            cumulativePercent = cumulativeChars * 100 / totalMbcsChars;
            codes[count++] = c.charCode;
        }
        Arrays.sort(codes, 0, count);

        System.out.print(" {");
        for (int i = 0; i < count; i++) {
            if (i != 0) {
                System.out.print(", ");
                if (i % 10 == 0) {
                    // Ten values per output line.
                    System.out.print("\n ");
                }
            }
            System.out.print("0x" + Integer.toHexString(codes[i]));
        }
        System.out.println("};");
    }

    //
    // This is a little class containing a
    // multi-byte character value and an occurrence count for that char.
    // Instances of this class are kept in the collection that accumulates statistics
    //
    // WARNING: this class's natural ordering (from Comparable) and equals()
    // are inconsistent: equals() looks at the char code only, compareTo()
    // looks at the occurrence count only.
    //
    static class ChEl implements Comparable {
        int charCode;
        int occurences;

        ChEl(int c, int o) {
            charCode = c;
            occurences = o;
        }

        // Equals needs to work with a map, with the charCode as the key.
        // For insertion/lookup, we care about the char code only, not the occurrence count.
        // Null and objects of other types are simply unequal.
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof ChEl)) {
                return false;
            }
            return ((ChEl) other).charCode == this.charCode;
        }

        // Hashcode needs to be compatible with equals
        // We're using this in a hashMap!
        public int hashCode() {
            return charCode;
        }

        // We want to be able to sort the results by frequency of occurrence
        // Compare backwards. We want most frequent chars first.
        public int compareTo(Object other) {
            ChEl o = (ChEl) other;
            if (this.occurences > o.occurences) {
                return -1;
            }
            return (this.occurences == o.occurences) ? 0 : 1;
        }
    }

    //
    // iteratedChar is copied and slightly hacked from the similar class in CharsetRecog_mbcs.
    // It holds the state of an iteration over the enclosing tool's buf / fileSize.
    // nextChar() pulls out one logical char per call using the one-or-two byte
    // structure of Big5 (or Shift_JIS when sjis is set).
    //
    class iteratedChar {
        int charValue = 0;     // The char value is a value from the encoding: the
                               // single byte, or (first byte << 8) | second byte.
        int index = 0;         // Buffer offset at which the current char starts.
        int nextIndex = 0;     // Buffer offset of the next unread byte.
        boolean error = false; // Set by nextChar() when the second byte is not valid.
        boolean done = false;  // Set once the end of the data has been reached.

        void reset() {
            charValue = 0;
            index = -1;
            nextIndex = 0;
            error = false;
            done = false;
        }

        // Returns the next byte as an unsigned value (0..255), or -1 (and sets
        // done) when the end of the file data has been reached.
        int nextByte() {
            if (nextIndex >= fileSize) {
                done = true;
                return -1;
            }
            return buf[nextIndex++] & 0x00ff;
        }
    }

    /**
     * Advances the iterator to the next logical character in buf.
     *
     * On return, it.charValue holds the character value and it.error tells
     * whether the second byte of a two-byte sequence was invalid. Each invalid
     * sequence is also reported on System.out.
     *
     * @param it iteration state, updated in place
     * @return true if a character (valid or in error) was produced; false once
     *         the end of the data has been reached, including the case where the
     *         data ends in the middle of a two-byte character
     */
    boolean nextChar(iteratedChar it) {
        it.index = it.nextIndex;
        it.error = false;

        int firstByte = it.charValue = it.nextByte();
        if (firstByte < 0) {
            // Ran off the end of the input data
            it.done = true;
            return false;
        }

        boolean singleByte = firstByte <= 0x0080
                || (sjis && firstByte >= 0x00a0 && firstByte < 0x00e0)
                || (sjis && firstByte >= 0x00fd && firstByte <= 0x00ff);
        if (singleByte) {
            return !it.done;
        }

        // Two-byte char. If the data ends here, secondByte is -1, which sets
        // it.done and is also rejected by the range check below.
        int secondByte = it.nextByte();
        it.charValue = (it.charValue << 8) | secondByte;

        if (secondByte < 0x40
                || secondByte == 0x007f
                || secondByte == 0x00ff
                || (sjis && secondByte >= 0x00fd)) {
            it.error = true;
            System.out.println("Error "
                    + Integer.toHexString(firstByte) + " "
                    + Integer.toHexString(secondByte));
        }

        return !it.done;
    }

}
|