001: /*
002: ***********************************************************************
003: * Copyright (C) 2005-2006, International Business Machines *
004: * Corporation and others. All Rights Reserved. *
005: ***********************************************************************
006: *
007: */
008:
009: package com.ibm.icu.dev.tool.charsetdet.sbcs;
010:
011: /**
012: * @author emader
013: *
014: * TODO To change the template for this generated type comment go to
015: * Window - Preferences - Java - Code Style - Code Templates
016: */
017: public class Checker implements NGramParser.NGramParserClient {
018: private NGramList ngrams;
019: private int totalNGrams;
020: private int totalHits;
021:
022: private String language;
023: private String encoding;
024:
025: private int[] histogram;
026:
027: private static final int BUFFER_SIZE = 1024;
028:
029: private char[] buffer;
030: private int bufIndex;
031: private int bufMax;
032:
033: private NGramParser parser;
034:
035: /**
036: * TODO This should take cumulative percent and the name...
037: */
038: public Checker(NGramList list, InputFile dataFile) {
039: ngrams = list;
040: ngrams.setMapper(dataFile);
041:
042: language = languageName(dataFile.getFilename());
043: encoding = dataFile.getEncoding();
044:
045: buffer = new char[BUFFER_SIZE];
046: parser = new NGramParser(this );
047: resetCounts();
048:
049: histogram = new int[100];
050: resetHistogram();
051: }
052:
053: public void handleNGram(String key) {
054: NGramList.NGram ngram = ngrams.get(key);
055:
056: totalNGrams += 1;
057:
058: if (ngram != null) {
059: totalHits += 1;
060: //ngram.incrementRefCount();
061: }
062: }
063:
064: private void resetCounts() {
065: bufIndex = 0;
066: totalNGrams = totalHits = 0;
067: }
068:
069: private void resetHistogram() {
070: for (int i = 0; i < 100; i += 1) {
071: histogram[i] = 0;
072: }
073:
074: }
075:
076: private static void exceptionError(Exception e) {
077: System.err.println("ioError: " + e.toString());
078: }
079:
080: private static String languageName(String filename) {
081: return filename.substring(0, filename.indexOf('.'));
082: }
083:
084: private boolean nextBuffer(InputFile inputFile) {
085: try {
086: bufMax = inputFile.read(buffer);
087: } catch (Exception e) {
088: bufMax = -1;
089: exceptionError(e);
090:
091: return false;
092: }
093:
094: bufIndex = 0;
095:
096: return bufMax >= 0;
097: }
098:
099: private void parseBuffer() {
100: resetCounts();
101: parser.reset();
102: parser.parse();
103: }
104:
105: public char nextChar() {
106: if (bufIndex >= bufMax) {
107: return 0;
108: }
109:
110: return buffer[bufIndex++];
111: }
112:
113: public String getLanguage() {
114: return language;
115: }
116:
117: public void setMapper(InputFile file) {
118: ngrams.setMapper(file);
119: }
120:
121: public int checkBuffer(char[] theBuffer, int charCount) {
122: buffer = theBuffer;
123: bufMax = charCount;
124:
125: parseBuffer();
126:
127: return totalHits;
128: }
129:
130: public void check(InputFile dataFile) {
131: int minHist = 101, maxHist = -1;
132:
133: dataFile.open();
134:
135: String dataFilename = dataFile.getFilename();
136: String fileEncoding = dataFile.getEncoding();
137:
138: System.out.println(language + "(" + encoding + ") stats, "
139: + languageName(dataFilename) + "(" + fileEncoding
140: + ") data:");
141:
142: setMapper(dataFile);
143: resetHistogram();
144:
145: while (nextBuffer(dataFile)) {
146: parseBuffer();
147:
148: double percentHits = (double) totalHits / totalNGrams
149: * 100.0;
150: int ph = (int) percentHits;
151:
152: if (ph < minHist) {
153: minHist = ph;
154: }
155:
156: if (ph > maxHist) {
157: maxHist = ph;
158: }
159:
160: histogram[ph] += 1;
161: }
162:
163: for (int ph = minHist; ph <= maxHist; ph += 1) {
164: System.out.println(ph + "\t" + histogram[ph]);
165: }
166:
167: System.out.println();
168:
169: dataFile.close();
170:
171: return;
172: }
173: }
|