001: /*
002: ***********************************************************************
003: *
004: * Copyright (C) 2005-2006, International Business Machines Corporation and
005: * others. All Rights Reserved.
006: *
007: ***********************************************************************
008: *
009: * euc_tool
010: *
011: * This tool produces the character usage frequency statistics for the EUC family
012: * of charsets, for use by the ICU charset detectors.
013: *
 * usage:  java EUCTool [-d] [directory path]
015: *
016: * -d: Produce the data in a form to be exported to the ICU implementation
017: * Default is to produce an informative dump.
018: *
019: * directory path
020: * Source directory for the files to be analyzed.
021: * Default is the current directory.
022: * There should be three subdirectories under the specified directory, one
023: * each for EUC_JP, EUC_CN and EUC_KR. Within each of these subdirectories
024: * should be text files in the specified encoding.
025: *
026: */
027:
028: package com.ibm.icu.dev.tool.charsetdet.mbcs;
029:
030: import java.util.*;
031: import java.io.*;
032:
033: public class EUCTool {
034:
035: // The file buffer and file data length need to be out in class member variables
036: // so that the code lifted from charSet detection for scanning the multi-byte chars
037: // can see them conveniently.
038: byte[] buf = new byte[1000000];
039: int fileSize;
040:
041: boolean option_d = false; // data option. Produce exportable data
042: boolean option_v = true; // verbose informaional output.
043:
044: public static void main(String[] args) {
045: EUCTool This = new EUCTool();
046: This.Main(args);
047: }
048:
049: void Main(String[] args) {
050: int i;
051:
052: //
053: // Command Line Option Handling
054: //
055: String dirName = ".";
056: for (i = 0; i < args.length; i++) {
057: if (args[i].equals("-d")) {
058: option_d = true;
059: option_v = false;
060: continue;
061: }
062: if (args[i].startsWith("-")) {
063: System.err.println("Unrecongized option: " + args[i]);
064: System.exit(-1);
065: }
066: dirName = args[i];
067: }
068:
069: //
070: // Verify that the specified directory exists.
071: //
072: File dir = new File(dirName);
073: if (dir.isDirectory() == false) {
074: System.err
075: .println("\"" + dirName + "\" is not a directory");
076: System.exit(-1);
077: }
078:
079: //
080: // Do each subdirectory of the specified directory. There should be
081: // one per each encoding - euc-kr, euc-cn, euc-jp
082: //
083: File[] dirs = dir.listFiles();
084: for (i = 0; i < dirs.length; i++) {
085: if (dirs[i].isDirectory()) {
086: String nam = dirs[i].getName();
087: if (nam.equalsIgnoreCase("CVS")) {
088: continue;
089: }
090: processDir(dirs[i]);
091: }
092: }
093: }
094:
095: //
096: // Collect statistics from all ordinary files in a specified directory.
097: //
098: void processDir(File dir) {
099: int totalMbcsChars = 0;
100: HashMap m = new HashMap(10000);
101: int i;
102:
103: System.out.println(dir.getName());
104: File[] files = dir.listFiles();
105: for (i = 0; i < files.length; i++) {
106: try {
107: if (files[i].isFile()) {
108: FileInputStream is = new FileInputStream(files[i]);
109: fileSize = is.read(buf);
110: if (option_v) {
111: System.out.println(files[i].getPath());
112: System.out.println(" " + fileSize + " bytes.");
113: }
114: iteratedChar ichar = new iteratedChar();
115: int fileChars = 0;
116: int fileMbcsChars = 0;
117: int errs = 0;
118:
119: while (nextChar(ichar)) {
120: if (ichar.error == true) {
121: errs++;
122: continue;
123: }
124: fileChars++;
125: if (ichar.charValue > 255) {
126: fileMbcsChars++;
127: totalMbcsChars++;
128: }
129: if (ichar.charValue <= 255) {
130: // Don't keep occurence statistics for the single byte range
131: continue;
132: }
133:
134: //
135: // Frequency of occurence statistics are accumulated in a map.
136: //
137: ChEl keyEl = new ChEl(ichar.charValue, 0);
138: ChEl valEl = (ChEl) m.get(keyEl);
139: if (valEl == null) {
140: m.put(keyEl, keyEl);
141: valEl = keyEl;
142: }
143: valEl.occurences++;
144: }
145: if (option_v) {
146: System.out.println(" " + fileChars + " Chars");
147: System.out.println(" " + fileMbcsChars
148: + " mbcs Chars");
149: System.out.println(" " + errs + " errors");
150: System.out.println("\n");
151: }
152: }
153: } catch (Exception e) {
154: System.err.println("Exception:" + e);
155:
156: }
157: }
158:
159: //
160: // We've processed through all of the files.
161: // sort and dump out the frequency statistics.
162: //
163: Object[] encounteredChars = m.values().toArray();
164: Arrays.sort(encounteredChars);
165: int cumulativeChars = 0;
166: int cumulativePercent = 0;
167: if (option_v) {
168: System.out
169: .println("# <char code> <occurences> <Cumulative %>");
170: for (i = 0; i < encounteredChars.length; i++) {
171: ChEl c = (ChEl) encounteredChars[i];
172: cumulativeChars += c.occurences;
173: cumulativePercent = cumulativeChars * 100
174: / totalMbcsChars;
175: System.out.println(i + " "
176: + Integer.toHexString(c.charCode) + " "
177: + c.occurences + " "
178: + cumulativePercent);
179: }
180: }
181: if (option_d) {
182: //
183: // Output the list of characters formatted for pasting into a
184: // Java source code array initializer.
185: // Resort into order based on the character code value, not
186: // on frequency of occurence.
187: //
188: List charList = new ArrayList();
189:
190: for (i = 0; i < 100 && cumulativePercent < 50; i++) {
191: ChEl c = (ChEl) encounteredChars[i];
192: cumulativeChars += c.occurences;
193: cumulativePercent = cumulativeChars * 100
194: / totalMbcsChars;
195: charList.add(new Integer(c.charCode));
196: }
197: Object[] sortedChars = charList.toArray();
198: Arrays.sort(sortedChars);
199:
200: System.out.print(" {");
201: for (i = 0; i < sortedChars.length; i++) {
202: if (i != 0) {
203: System.out.print(", ");
204: if ((i) % 10 == 0) {
205: System.out.print("\n ");
206: }
207: }
208: int cp = ((Integer) sortedChars[i]).intValue();
209: System.out.print("0x" + Integer.toHexString(cp));
210: }
211: System.out.println("};");
212: }
213: }
214:
215: //
216: // This is a little class containing a
217: // multi-byte character value and an occurence count for that char.
218: // Instances of this class are kept in the collection that accumulates statistics
219: //
220: // WARNING: this class's natural ordering (from Comparable) and equals()
221: // are inconsistent.
222:
223: static class ChEl implements Comparable {
224: int charCode;
225: int occurences;
226:
227: ChEl(int c, int o) {
228: charCode = c;
229: occurences = o;
230: }
231:
232: // Equals needs to work with a map, with the charCode as the key.
233: // For insertion/lookup, we care about the char code only, not the occurence count.
234: public boolean equals(Object other) {
235: ChEl o = (ChEl) other;
236: return o.charCode == this .charCode;
237: }
238:
239: // Hashcode needs to be compatible with equals
240: // We're using this in a hashMap!
241: public int hashCode() {
242: return charCode;
243: }
244:
245: // We want to be able to sort the results by frequency of occurence
246: // Compare backwards. We want most frequent chars first.
247: public int compareTo(Object other) {
248: ChEl o = (ChEl) other;
249: return (this .occurences > o.occurences ? -1
250: : (this .occurences == o.occurences ? 0 : 1));
251: }
252:
253: }
254:
255: //
256: // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs
257: // Pulls out one logical char according to the rules of EUC encoding.
258: //
259: class iteratedChar {
260: int charValue = 0; // The char value is a value from the encoding.
261: // It's meaning is not well defined, other than
262: // different encodings
263: int index = 0;
264: int nextIndex = 0;
265: boolean error = false;
266: boolean done = false;
267:
268: void reset() {
269: charValue = 0;
270: index = -1;
271: nextIndex = 0;
272: error = false;
273: done = false;
274: }
275:
276: int nextByte() {
277: if (nextIndex >= fileSize) {
278: done = true;
279: return -1;
280: }
281: int byteValue = (int) buf[nextIndex++] & 0x00ff;
282: return byteValue;
283: }
284: }
285:
286: boolean nextChar(iteratedChar it) {
287: it.index = it.nextIndex;
288: it.error = false;
289: int firstByte = 0;
290: int secondByte = 0;
291: int thirdByte = 0;
292: int fourthByte = 0;
293:
294: buildChar: {
295: firstByte = it.charValue = it.nextByte();
296: if (firstByte < 0) {
297: // Ran off the end of the input data
298: it.done = true;
299: break buildChar;
300: }
301: if (firstByte <= 0x8d) {
302: // single byte char
303: break buildChar;
304: }
305:
306: secondByte = it.nextByte();
307: it.charValue = (it.charValue << 8) | secondByte;
308:
309: if (firstByte >= 0xA1 && firstByte <= 0xfe) {
310: // Two byte Char
311: if (secondByte < 0xa1) {
312: it.error = true;
313: }
314: break buildChar;
315: }
316: if (firstByte == 0x8e) {
317: // Code Set 2.
318: // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
319: // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
320: // We don't know which we've got.
321: // Treat it like EUC-JP. If the data really was EUC-TW, the following two
322: // bytes will look like a well formed 2 byte char.
323: if (secondByte < 0xa1) {
324: it.error = true;
325: }
326: break buildChar;
327: }
328:
329: if (firstByte == 0x8f) {
330: // Code set 3.
331: // Three byte total char size, two bytes of actual char value.
332: thirdByte = it.nextByte();
333: it.charValue = (it.charValue << 8) | thirdByte;
334: if (thirdByte < 0xa1) {
335: it.error = true;
336: }
337: }
338:
339: }
340: if (it.error) {
341: System.out.println("Error "
342: + Integer.toHexString(firstByte) + " "
343: + Integer.toHexString(secondByte) + " "
344: + Integer.toHexString(thirdByte) + " "
345: + Integer.toHexString(fourthByte));
346: }
347: return (it.done == false);
348: }
349: }
|