001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.dev.tool.rbbi;
008:
009: import com.ibm.icu.util.CompactByteArray;
010: import com.ibm.icu.text.UnicodeSet;
011: import com.ibm.icu.impl.Utility;
012: import java.io.*;
013: import java.util.Vector;
014:
015: public class BuildDictionaryFile {
016: public static void main(String args[])
017: throws FileNotFoundException, UnsupportedEncodingException,
018: IOException {
019: String filename = args[0];
020: String encoding = "";
021: String outputFile = "";
022: String listingFile = "";
023:
024: if (args.length >= 2)
025: encoding = args[1];
026:
027: if (args.length >= 3)
028: outputFile = args[2];
029:
030: if (args.length >= 4)
031: listingFile = args[3];
032:
033: BuildDictionaryFile dictionary = new BuildDictionaryFile();
034: dictionary.build(filename, encoding);
035:
036: DataOutputStream out = null;
037: if (outputFile.length() != 0) {
038: out = new DataOutputStream(new FileOutputStream(outputFile));
039: dictionary.writeDictionaryFile(out);
040: }
041:
042: PrintWriter listing = null;
043: if (listingFile.length() != 0) {
044: listing = new PrintWriter(new OutputStreamWriter(
045: new FileOutputStream(listingFile), "UnicodeLittle"));
046: dictionary.printWordList("", 0, listing);
047: listing.close();
048: }
049: }
050:
051: public BuildDictionaryFile() {
052: }
053:
054: public void build(String filename, String encoding)
055: throws FileNotFoundException, UnsupportedEncodingException,
056: IOException {
057: FileInputStream file = new FileInputStream(filename);
058: InputStreamReader in;
059: if (encoding.length() == 0)
060: in = new InputStreamReader(file);
061: else
062: in = new InputStreamReader(file, encoding);
063:
064: buildColumnMap(in);
065:
066: file = new FileInputStream(filename);
067: if (encoding.length() == 0)
068: in = new InputStreamReader(file);
069: else
070: in = new InputStreamReader(file, encoding);
071:
072: buildStateTable(in);
073: //printTable();
074: }
075:
076: public void buildColumnMap(InputStreamReader in) throws IOException {
077: System.out.println("Building column map...");
078: UnicodeSet charsInFile = new UnicodeSet();
079: int c = in.read();
080: int totalChars = 0;
081: while (c >= 0) {
082: ++totalChars;
083: if (totalChars > 0 && totalChars % 5000 == 0)
084: System.out.println("Read " + totalChars
085: + " characters...");
086: if (c > ' ')
087: charsInFile.add((char) c);
088: c = in.read();
089: }
090: // Test.debugPrintln(charsInFile.toString());
091:
092: StringBuffer tempReverseMap = new StringBuffer();
093: tempReverseMap.append(' ');
094:
095: columnMap = new CompactByteArray();
096: int n = charsInFile.getRangeCount();
097: byte p = 1;
098: for (int i = 0; i < n; ++i) {
099: char start = (char) charsInFile.getRangeStart(i);
100: char end = (char) charsInFile.getRangeEnd(i);
101: for (char ch = start; ch <= end; ch++) {
102: if (columnMap.elementAt(Character.toLowerCase(ch)) == 0) {
103: columnMap.setElementAt(Character.toUpperCase(ch),
104: Character.toUpperCase(ch), p);
105: columnMap.setElementAt(Character.toLowerCase(ch),
106: Character.toLowerCase(ch), p);
107: ++p;
108: tempReverseMap.append(ch);
109: }
110: }
111: }
112: //System.out.println("Compacting...");
113: columnMap.compact();
114:
115: //System.out.println(tempReverseMap.toString());
116: reverseColumnMap = new char[p];
117: Utility.getChars(tempReverseMap, 0, p, reverseColumnMap, 0);
118:
119: System.out.println("total columns = " + p);
120: numCols = p;
121: numColGroups = (numCols >> 5) + 1;
122:
123: /*
124: short[] index = columnMap.getIndexArray();
125: System.out.println("Index:");
126: for (int i = 0; i < index.length; i++) {
127: if (i % 16 == 0) {
128: System.out.println();
129: System.out.print(" " + Integer.toHexString(i * 128) + ":");
130: }
131: System.out.print("\t" + Integer.toHexString(index[i]));
132: }
133: System.out.println();
134: byte[] data = columnMap.getStringArray();
135: System.out.print("Values:");
136: for (int i = 0; i < data.length; i++) {
137: if (i % 16 == 0) {
138: System.out.println();
139: System.out.print(" " + Integer.toHexString(i) + ":");
140: }
141: if (data[i] == 0)
142: System.out.print("\t.");
143: else
144: System.out.print("\t" + Integer.toString(data[i]));
145: }
146: System.out.println();
147: */
148: }
149:
150: public void buildStateTable(InputStreamReader in)
151: throws IOException {
152: Vector tempTable = new Vector();
153: tempTable.addElement(new int[numCols + 1]);
154: int state = 0;
155:
156: int c = in.read();
157: int[] row = null;
158: int charsInWord = 0;
159: while (c >= 0) {
160: charsInWord++;
161: short column = columnMap.elementAt((char) c);
162:
163: row = (int[]) (tempTable.elementAt(state));
164: if (column != 0) {
165: if (row[column] == 0) {
166: row[column] = tempTable.size();
167: ++row[numCols];
168: state = (tempTable.size());
169: tempTable.addElement(new int[numCols + 1]);
170: } else
171: state = row[column];
172: } else if (state != 0) {
173: if (row[0] != -1) {
174: row[0] = -1;
175: ++row[numCols];
176: uniqueWords++;
177: totalUniqueWordChars += charsInWord;
178: }
179: totalWords++;
180: if (totalWords % 5000 == 0)
181: System.out.println("Read " + totalWords
182: + " words, " + tempTable.size()
183: + " rows...");
184: charsInWord = 0;
185: state = 0;
186: }
187: c = in.read();
188: }
189: if (state != 0) {
190: row = (int[]) (tempTable.elementAt(state));
191: if (row[0] != -1) {
192: row[0] = -1;
193: uniqueWords++;
194: totalUniqueWordChars += charsInWord;
195: }
196: totalWords++;
197: }
198:
199: compress(tempTable);
200:
201: table = new short[numCols * tempTable.size()];
202: for (int i = 0; i < tempTable.size(); i++) {
203: row = (int[]) tempTable.elementAt(i);
204: for (int j = 0; j < numCols; j++)
205: table[i * numCols + j] = (short) row[j];
206: }
207: }
208:
209: private void compress(Vector tempTable) {
210: System.out.println("Before compression:");
211: System.out.println(" Number of rows = " + tempTable.size());
212: System.out.println(" Number of columns = " + numCols);
213: System.out.println(" Number of cells = " + tempTable.size()
214: * numCols);
215: deleteDuplicateRows(tempTable);
216: System.out.println("After removing duplicate rows:");
217: System.out.println(" Number of rows = " + tempTable.size());
218: System.out.println(" Number of columns = " + numCols);
219: System.out.println(" Number of cells = " + tempTable.size()
220: * numCols);
221: stackRows(tempTable);
222: if (tempTable.size() > 32767)
223: throw new IllegalArgumentException(
224: "Too many rows in table!");
225: System.out.println("After doubling up on rows:");
226: System.out.println(" Number of rows = " + tempTable.size());
227: System.out.println(" Number of columns = " + numCols);
228: System.out.println(" Number of cells = " + tempTable.size()
229: * numCols);
230: }
231:
232: /*
233: experimental...
234: private void deleteDuplicateRows(Vector tempTable) {
235: int[] rowNumMap = new int[tempTable.size()];
236: for (int i = 0; i < rowNumMap.length; i++)
237: rowNumMap[i] = i;
238:
239: int nextClass = numCols;
240: int currentClass;
241: int lastClass;
242: boolean split;
243: int[] row1, row2, tempRow;
244: int tempCat;
245:
246: do {
247: System.out.println("Making a pass (" + nextClass + " classes)...");
248: currentClass = 0;
249: lastClass = nextClass;
250: while (currentClass < nextClass) {
251: System.out.println(" currentClass = " + currentClass);
252: split = false;
253: row1 = row2 = null;
254: for (int i = 0; i < tempTable.size(); i++) {
255: tempRow = (int[])tempTable.elementAt(i);
256: if (tempRow[numCols] == currentClass) {
257: if (row1 == null) {
258: row1 = (int[])tempTable.elementAt(i);
259: }
260: else {
261: row2 = (int[])tempTable.elementAt(i);
262: for (int j = 0; j < numCols; j++) {
263: if ((row1[j] == 0) != (row2[j] == 0) ||
264: (row1[j] == -1) != (row2[j] == -1)) {
265: row2[numCols] = nextClass;
266: split = true;
267: break;
268: }
269: else if (row1[j] != 0 && row2[j] != 0 && row1[j] != -1
270: && row2[j] != -1) {
271: tempRow = (int[])tempTable.elementAt(row1[j]);
272: tempCat = tempRow[numCols];
273: tempRow = (int[])tempTable.elementAt(row2[j]);
274: if (tempCat != tempRow[numCols]) {
275: row2[numCols] = nextClass;
276: split = true;
277: break;
278: }
279: }
280: }
281: }
282: }
283: }
284: if (split)
285: ++nextClass;
286: ++currentClass;
287: //System.out.println();
288: }
289: } while (lastClass != nextClass);
290:
291: int[] representatives = new int[nextClass];
292: for (int i = 1; i < tempTable.size(); i++) {
293: tempRow = (int[])tempTable.elementAt(i);
294: if (representatives[tempRow[numCols]] == 0)
295: representatives[tempRow[numCols]] = i;
296: else
297: rowNumMap[i] = representatives[tempRow[numCols]];
298: }
299: System.out.println("Renumbering...");
300:
301: // renumber all remaining rows
302: for (int i = 0; i < rowNumMap.length; i++)
303: if (rowNumMap[i] != i)
304: tempTable.setElementAt(null, i);
305: int newRowNum = 0;
306: for (int i = 0; i < rowNumMap.length; i++)
307: if (tempTable.elementAt(i) != null)
308: rowNumMap[i] = newRowNum++;
309: for (int i = 0; i < rowNumMap.length; i++)
310: if (tempTable.elementAt(i) == null)
311: rowNumMap[i] = rowNumMap[rowNumMap[i]];
312:
313: for (int i = tempTable.size() - 1; i >= 0; i--) {
314: tempRow = (int[])tempTable.elementAt(i);
315: if (tempRow == null)
316: tempTable.removeElementAt(i);
317: else {
318: for (int j = 0; j < numCols; j++)
319: if (tempRow[j] != -1)
320: tempRow[j] = rowNumMap[j];
321: }
322: }
323: //for (int i = 1; i < rowNumMap.length; i++) rowNumMap[i] = i; int newRowNum = rowNumMap.length;
324: }
325: */
326:
327: private void deleteDuplicateRows(Vector tempTable) {
328: Vector work = (Vector) (tempTable.clone());
329: boolean didDeleteRow = true;
330:
331: Vector tempMapping = new Vector(work.size());
332: int[] mapping = new int[work.size()];
333: for (int i = 0; i < mapping.length; i++) {
334: mapping[i] = i;
335: tempMapping.addElement(new Integer(i));
336: }
337: boolean[] tbd = new boolean[work.size()];
338:
339: while (didDeleteRow) {
340: System.out.println(" " + work.size() + " rows...");
341: int deletedRows = 0;
342: didDeleteRow = false;
343:
344: sortTable(work, tempMapping, mapping, 1, work.size());
345: for (int i = 0; i < work.size() - 1;) {
346: System.out.print("Deleting, inspecting row " + i
347: + ", deleted " + deletedRows + " rows...\r");
348: int rowToDelete = ((Integer) (tempMapping
349: .elementAt(i + 1))).intValue();
350: int rowToMapTo = ((Integer) (tempMapping.elementAt(i)))
351: .intValue();
352: if (compareRows((int[]) work.elementAt(i), (int[]) work
353: .elementAt(i + 1), mapping) == 0) {
354: tbd[rowToDelete] = true;
355: tempTable.setElementAt(null, rowToDelete);
356: while (tbd[mapping[rowToMapTo]])
357: mapping[rowToMapTo] = mapping[mapping[rowToMapTo]];
358: mapping[rowToDelete] = mapping[rowToMapTo];
359: didDeleteRow = true;
360: deletedRows++;
361: work.removeElementAt(i + 1);
362: tempMapping.removeElementAt(i + 1);
363: } else
364: i++;
365: }
366: for (int i = 0; i < mapping.length; i++) {
367: if (tbd[i] && tbd[mapping[i]])
368: mapping[i] = mapping[mapping[i]];
369: }
370: }
371:
372: int decrementBy = 0;
373: for (int i = 0; i < mapping.length; i++) {
374: if (tbd[i])
375: decrementBy++;
376: else
377: mapping[i] -= decrementBy;
378: }
379: for (int i = 0; i < mapping.length; i++) {
380: if (tbd[i])
381: mapping[i] = mapping[mapping[i]];
382: }
383: for (int i = tempTable.size() - 1; i >= 0; i--) {
384: if (tbd[i])
385: tempTable.removeElementAt(i);
386: else {
387: int[] row = (int[]) tempTable.elementAt(i);
388: for (int j = 0; j < numCols; j++)
389: row[j] = (row[j] == -1) ? -1 : mapping[row[j]];
390: }
391: }
392: }
393:
394: private void sortTable(Vector table, Vector tempMapping,
395: int[] mapping, int start, int end) {
396: System.out.print("Sorting (" + start + ", " + end + ")...\r");
397: if (start + 1 >= end)
398: return;
399: else if (start + 10 >= end) {
400: for (int i = start + 1; i < end; i++) {
401: int[] row = (int[]) table.elementAt(i);
402: Integer tempMap = (Integer) tempMapping.elementAt(i);
403: int j;
404: for (j = i - 1; j >= start; j--) {
405: if (compareRows((int[]) table.elementAt(j), row,
406: mapping) > 0) {
407: table.setElementAt((int[]) table.elementAt(j),
408: j + 1);
409: tempMapping.setElementAt((Integer) tempMapping
410: .elementAt(j), j + 1);
411: } else {
412: table.setElementAt(row, j + 1);
413: tempMapping.setElementAt(tempMap, j + 1);
414: break;
415: }
416: }
417: if (j < start) {
418: table.setElementAt(row, start);
419: tempMapping.setElementAt(tempMap, start);
420: }
421: }
422: } else {
423: int boundaryPos = (start + end) / 2;
424: int i;
425: boolean allTheSame = true;
426: int firstDifferent = 0;
427:
428: do {
429: int[] boundary = (int[]) table.elementAt(boundaryPos);
430: i = start;
431: int j = end - 1;
432: int[] row = null;
433: byte compResult;
434: while (i < j) {
435: row = (int[]) table.elementAt(i);
436: while (i <= j
437: && compareRows(row, boundary, mapping) < 0) {
438: i++;
439: row = (int[]) table.elementAt(i);
440: }
441:
442: row = (int[]) table.elementAt(j);
443: compResult = compareRows(row, boundary, mapping);
444: while (i <= j && (compResult >= 0)) {
445: if (compResult != 0) {
446: allTheSame = false;
447: firstDifferent = j;
448: }
449: j--;
450: row = (int[]) table.elementAt(j);
451: compResult = compareRows(row, boundary, mapping);
452: }
453: if (i <= j) {
454: row = (int[]) table.elementAt(j);
455: table.setElementAt(table.elementAt(i), j);
456: table.setElementAt(row, i);
457: Object temp = tempMapping.elementAt(j);
458: tempMapping.setElementAt(tempMapping
459: .elementAt(i), j);
460: tempMapping.setElementAt(temp, i);
461: }
462: }
463: if (i <= start) {
464: if (allTheSame)
465: return;
466: else
467: boundaryPos = firstDifferent;
468: }
469: } while (i <= start);
470: sortTable(table, tempMapping, mapping, start, i);
471: sortTable(table, tempMapping, mapping, i, end);
472: }
473: }
474:
475: private byte compareRows(int[] row1, int[] row2, int[] mapping) {
476: for (int i = 0; i < numCols; i++) {
477: int c1 = (row1[i] == -1) ? -1 : mapping[row1[i]];
478: int c2 = (row2[i] == -1) ? -1 : mapping[row2[i]];
479: if (c1 < c2)
480: return -1;
481: else if (c1 > c2)
482: return 1;
483: }
484: return 0;
485: }
486:
487: private int[] buildRowIndex(Vector tempTable) {
488: int[] tempRowIndex = new int[tempTable.size()];
489: rowIndexFlagsIndex = new short[tempTable.size()];
490: Vector tempRowIndexFlags = new Vector();
491: rowIndexShifts = new byte[tempTable.size()];
492:
493: // build the row index. Each entry in the row index starts out referring
494: // to the original row (so it doesn't actually do any mapping), and we set
495: // up the index flags to show which cells in the row are populated
496: for (int i = 0; i < tempTable.size(); i++) {
497: tempRowIndex[i] = i;
498:
499: int[] row = (int[]) tempTable.elementAt(i);
500: if (row[numCols] == 1 && row[0] == 0) {
501: int j = 0;
502: while (row[j] == 0)
503: ++j;
504: rowIndexFlagsIndex[i] = (short) (-j);
505: } else {
506: int[] flags = new int[numColGroups];
507: int nextFlag = 1;
508: int colGroup = 0;
509: for (int j = 0; j < numCols; j++) {
510: if (row[j] != 0)
511: flags[colGroup] |= nextFlag;
512: nextFlag <<= 1;
513: if (nextFlag == 0) {
514: ++colGroup;
515: nextFlag = 1;
516: }
517: }
518: colGroup = 0;
519: int j = 0;
520: while (j < tempRowIndexFlags.size()) {
521: if (((Integer) tempRowIndexFlags.elementAt(j))
522: .intValue() == flags[colGroup]) {
523: ++colGroup;
524: ++j;
525: if (colGroup >= numColGroups)
526: break;
527: } else if (colGroup != 0)
528: colGroup = 0;
529: else
530: ++j;
531: }
532: rowIndexFlagsIndex[i] = (short) (j - colGroup);
533: while (colGroup < numColGroups) {
534: tempRowIndexFlags.addElement(new Integer(
535: flags[colGroup]));
536: ++colGroup;
537: }
538: }
539: }
540: rowIndexFlags = new int[tempRowIndexFlags.size()];
541: for (int i = 0; i < rowIndexFlags.length; i++)
542: rowIndexFlags[i] = ((Integer) tempRowIndexFlags
543: .elementAt(i)).intValue();
544: System.out.println("Number of column groups = " + numColGroups);
545: System.out.println("Size of rowIndexFlags = "
546: + rowIndexFlags.length);
547:
548: return tempRowIndex;
549: }
550:
551: private void stackRows(Vector tempTable) {
552: /*
553: System.out.print("Row:\t");
554: for (int i = 0; i < numCols; i++)
555: System.out.print(reverseColumnMap[i] + "\t");
556: System.out.println();
557: for (int i = 0; i < tempTable.size(); i++) {
558: System.out.print(Integer.toString(i) + ":\t");
559: int[] row = (int[])tempTable.elementAt(i);
560: for (int j = 0; j < row.length; j++)
561: if (row[j] != 0) System.out.print(Integer.toString(row[j]) + "\t");
562: else System.out.print(".\t");
563: System.out.println();
564: }
565: */
566:
567: int[] tempRowIndex = buildRowIndex(tempTable);
568: boolean[] tbd = new boolean[tempTable.size()];
569:
570: // now we actually go through and stack rows together
571: for (int i = 0; i < tempTable.size(); i++) {
572: if (tbd[i])
573: continue;
574: System.out.print("Stacking, inspecting row " + i + "...\r");
575: //System.out.println("Stacking, inspecting row " + i + "...");
576:
577: int[] destRow = (int[]) tempTable.elementAt(i);
578:
579: boolean[] tempFlags = new boolean[numCols];
580: boolean[] filledCells = new boolean[numCols];
581: for (int j = 0; j < numCols; j++)
582: filledCells[j] = destRow[j] != 0;
583:
584: for (int j = i + 1; destRow[numCols] < numCols
585: && j < tempTable.size(); j++) {
586: if (tbd[j])
587: continue;
588:
589: int[] srcRow = (int[]) tempTable.elementAt(j);
590: if (srcRow[numCols] + destRow[numCols] > numCols)
591: continue;
592:
593: int maxLeftShift = -999;
594: int maxRightShift = 0;
595: for (int k = 0; k < numCols; k++) {
596: tempFlags[k] = srcRow[k] != 0;
597: if (tempFlags[k]) {
598: if (maxLeftShift == -999)
599: maxLeftShift = -k;
600: maxRightShift = (numCols - 1) - k;
601: }
602: }
603:
604: int shift;
605: for (shift = maxLeftShift; shift <= maxRightShift; shift++) {
606: int k;
607: for (k = 0; k < numCols; k++) {
608: if (tempFlags[k] && filledCells[k + shift])
609: break;
610: }
611: if (k >= numCols)
612: break;
613: }
614: if (shift <= maxRightShift) {
615: //System.out.println("Packing row " + j + " into row " + i + " with shift = " + shift);
616: for (int k = 0; k < numCols; k++) {
617: if (tempFlags[k]) {
618: filledCells[k + shift] = true;
619: destRow[k + shift] = srcRow[k];
620: ++destRow[numCols];
621: }
622: }
623: tbd[j] = true;
624: tempRowIndex[j] = i;
625: rowIndexShifts[j] = (byte) shift;
626: }
627: }
628: }
629:
630: // finally, we squeeze out all the deleted rows
631: int decrementBy = 0;
632: for (int i = 0; i < tempRowIndex.length; i++) {
633: if (!tbd[i])
634: tempRowIndex[i] -= decrementBy;
635: else
636: ++decrementBy;
637: }
638: rowIndex = new short[tempRowIndex.length];
639: for (int i = tempRowIndex.length - 1; i >= 0; i--) {
640: if (tbd[i]) {
641: rowIndex[i] = (short) (tempRowIndex[tempRowIndex[i]]);
642: tempTable.removeElementAt(i);
643: } else
644: rowIndex[i] = (short) tempRowIndex[i];
645: }
646: }
647:
648: private void printTable() {
649: short cell;
650: int populatedCells = 0;
651: /*
652: System.out.println("Conceptual table:");
653: System.out.println(" Row: a b c d e f g h i j k l m n"
654: + " o p q r s t u v w x y z ' #");
655:
656: boolean[] rowPrintFlags = new boolean[rowIndex.length];
657: printConceptualTable("", 0, rowPrintFlags);
658: */
659:
660: System.out.println();
661: System.out.println("Conceptual table:");
662: System.out.print(" Row:");
663: for (int i = 0; i < reverseColumnMap.length; i++) {
664: System.out.print(" " + reverseColumnMap[i]);
665: }
666: for (int i = 0; i < rowIndex.length; i++) {
667: System.out.println();
668: printNumber(i, 4);
669: System.out.print(":");
670: for (int j = 0; j < numCols; j++)
671: printNumber(at(i, j), 4);
672: }
673: System.out.println('\n');
674:
675: System.out.println();
676: System.out.println("Internally stored table:");
677: System.out.print(" Row:");
678: for (int i = 0; i < reverseColumnMap.length; i++) {
679: System.out.print(" " + reverseColumnMap[i]);
680: }
681: for (int i = 0; i < table.length; i++) {
682: if (i % numCols == 0) {
683: System.out.println();
684: printNumber(i / numCols, 4);
685: System.out.print(":");
686: }
687: cell = table[i];
688: if (cell != 0)
689: populatedCells++;
690: printNumber(cell, 4);
691: }
692: System.out.println('\n');
693:
694: System.out.println("Row index:");
695: for (int i = 0; i < rowIndex.length; i++) {
696: System.out.print(" " + i + " -> " + rowIndex[i]);
697: if (rowIndexFlagsIndex[i] < 0)
698: System.out
699: .print(", flags = "
700: + Integer
701: .toBinaryString((1 << (-rowIndexFlagsIndex[i])))
702: + " (" + rowIndexFlagsIndex[i]);
703: else
704: System.out
705: .print(", flags = "
706: + Integer
707: .toBinaryString(rowIndexFlags[rowIndexFlagsIndex[i]])
708: + " (" + rowIndexFlagsIndex[i]);
709: System.out.println("), shift = " + rowIndexShifts[i]);
710: }
711: /*
712: int theoreticalMinRows = populatedCells / numCols;
713: if (populatedCells % numCols != 0)
714: theoreticalMinRows++;
715: int oneCellRows = 0;
716: for (int i = 0; i < rowIndexFlags.length; i++) {
717: double temp = Math.log(rowIndexFlags[i]) / Math.log(2);
718: if (temp == (int)temp)
719: oneCellRows++;
720: }
721:
722: System.out.println('\n');
723: System.out.println("Total words in input = " + totalWords);
724: System.out.println("Total unique words = " + uniqueWords + ", comprising " +
725: totalUniqueWordChars + " characters\n");
726: System.out.println("Number of populated cells = " + populatedCells);
727: System.out.println("Total number of cells = " + (table.length));
728: System.out.println("Residency = " + ((float)populatedCells / table.length * 100) + '%');
729: System.out.println("Ratio of populated cells to unique-word characters = " +
730: ((float)populatedCells / totalUniqueWordChars * 100) + '%');
731: System.out.println("Ratio of total cells to unique-word characters = " +
732: ((float)table.length / totalUniqueWordChars * 100) + '%');
733: System.out.println("Number of rows = " + (table.length / numCols));
734: System.out.println("Theoretical minimum number of rows = " + theoreticalMinRows);
735: System.out.println("Ratio of number of rows to theoretical minimum = " +
736: ((float)(table.length / numCols) / theoreticalMinRows * 100) + '%');
737: System.out.println("Number of conceptual rows = " + rowIndex.length);
738: System.out.println("Conceptual rows with only one populated cell = " + oneCellRows);
739: System.out.println("Ratio of one-cell rows to total conceptual rows = " + (((float)oneCellRows)
740: / rowIndex.length * 100) + '%');
741: System.out.println("Average number of populated cells in multi-cell rows = " +
742: ((float)(populatedCells - oneCellRows) / (rowIndex.length - oneCellRows)));
743:
744: int storageUsed = table.length * 2 + rowIndex.length * 2
745: + rowIndexFlags.length * 4 + rowIndexShifts.length;
746: System.out.println("Total number of bytes in table (including indexes) = " +
747: storageUsed);
748: System.out.println("Bytes of overhead per unique-word character = " + ((double)(storageUsed
749: - (totalUniqueWordChars * 2)) / totalUniqueWordChars));
750: */
751: }
752:
753: private void printConceptualTable(String initialString, int state,
754: boolean[] flags) {
755: if (initialString.length() == 0)
756: System.out.println("root:");
757: else
758: System.out.println(initialString + ':');
759:
760: if (!flags[state]) {
761: flags[state] = true;
762: printNumber(state, 4);
763: System.out.print(":");
764: for (int i = 0; i < numCols; i++)
765: printNumber(at(state, i), 4);
766: System.out.println();
767: }
768:
769: int nextState;
770: for (int i = 0; i < numCols; i++) {
771: nextState = at(state, i);
772: if (nextState > 0 && !flags[nextState]) {
773: printNumber(nextState, 4);
774: System.out.print(":");
775: for (int j = 0; j < numCols; j++)
776: printNumber(at(nextState, j), 4);
777: System.out.println();
778: }
779: }
780: for (int i = 0; i < numCols; i++) {
781: nextState = at(state, i);
782: if (nextState > 0 && !flags[nextState]) {
783: char nextChar;
784: if (nextState == 27)
785: nextChar = ' ';
786: else if (nextState == 26)
787: nextChar = '\'';
788: else
789: nextChar = (char) (i + 'a');
790: flags[nextState] = true;
791: printConceptualTable(initialString + nextChar,
792: nextState, flags);
793: }
794: }
795: }
796:
797: private void printWordList(String partialWord, int state,
798: PrintWriter out) throws IOException {
799: if (state == -1) {
800: System.out.println(partialWord);
801: if (out != null)
802: out.println(partialWord);
803: } else {
804: for (int i = 0; i < numCols; i++) {
805: if (at(state, i) != 0)
806: printWordList(partialWord + reverseColumnMap[i],
807: at(state, i), out);
808: }
809: }
810: }
811:
812: private void writeDictionaryFile(DataOutputStream out)
813: throws IOException {
814: out.writeInt(0); // version number
815:
816: char[] columnMapIndexes = columnMap.getIndexArray();
817: out.writeInt(columnMapIndexes.length);
818: for (int i = 0; i < columnMapIndexes.length; i++)
819: out.writeShort((short) columnMapIndexes[i]);
820: byte[] columnMapValues = columnMap.getValueArray();
821: out.writeInt(columnMapValues.length);
822: for (int i = 0; i < columnMapValues.length; i++)
823: out.writeByte((byte) columnMapValues[i]);
824:
825: out.writeInt(numCols);
826: out.writeInt(numColGroups);
827:
828: out.writeInt(rowIndex.length);
829: for (int i = 0; i < rowIndex.length; i++)
830: out.writeShort(rowIndex[i]);
831:
832: out.writeInt(rowIndexFlagsIndex.length);
833: for (int i = 0; i < rowIndexFlagsIndex.length; i++)
834: out.writeShort(rowIndexFlagsIndex[i]);
835: out.writeInt(rowIndexFlags.length);
836: for (int i = 0; i < rowIndexFlags.length; i++)
837: out.writeInt(rowIndexFlags[i]);
838:
839: out.writeInt(rowIndexShifts.length);
840: for (int i = 0; i < rowIndexShifts.length; i++)
841: out.writeByte(rowIndexShifts[i]);
842:
843: out.writeInt(table.length);
844: for (int i = 0; i < table.length; i++)
845: out.writeShort(table[i]);
846:
847: out.close();
848: }
849:
850: private void printNumber(int x, int width) {
851: String s = String.valueOf(x);
852: if (width > s.length())
853: System.out.print(spaces.substring(0, width - s.length()));
854: if (x != 0)
855: System.out.print(s);
856: else
857: System.out.print('.');
858: }
859:
860: public final short at(int row, char ch) {
861: int col = columnMap.elementAt(ch);
862: return at(row, col);
863: }
864:
865: public final short at(int row, int col) {
866: if (cellIsPopulated(row, col))
867: return internalAt(rowIndex[row], col + rowIndexShifts[row]);
868: else
869: return 0;
870: }
871:
872: private final boolean cellIsPopulated(int row, int col) {
873: if (rowIndexFlagsIndex[row] < 0)
874: return col == -rowIndexFlagsIndex[row];
875: else {
876: int flags = rowIndexFlags[rowIndexFlagsIndex[row]
877: + (col >> 5)];
878: return (flags & (1 << (col & 0x1f))) != 0;
879: }
880: }
881:
882: private final short internalAt(int row, int col) {
883: return table[row * numCols + col];
884: }
885:
886: private CompactByteArray columnMap = null;
887: private char[] reverseColumnMap = null;
888: private int numCols;
889: private int numColGroups;
890: private short[] table = null;
891: private short[] rowIndex = null;
892: private int[] rowIndexFlags = null;
893: private short[] rowIndexFlagsIndex = null;
894: private byte[] rowIndexShifts = null;
895:
896: private int totalWords = 0;
897: private int uniqueWords = 0;
898: private int totalUniqueWordChars = 0;
899:
900: private static final String spaces = " ";
901: }
|