001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import java.io.InputStream;
010: import java.io.DataInputStream;
011: import java.io.FileNotFoundException;
012: import java.io.UnsupportedEncodingException;
013: import java.io.IOException;
014: import java.io.FileInputStream;
015: import java.io.OutputStreamWriter;
016: import java.io.PrintWriter;
017: import java.io.FileOutputStream;
018:
019: import com.ibm.icu.util.CompactByteArray;
020:
021: /**
022: * This is the class that represents the list of known words used by
023: * DictionaryBasedBreakIterator. The conceptual data structure used
024: * here is a trie: there is a node hanging off the root node for every
025: * letter that can start a word. Each of these nodes has a node hanging
026: * off of it for every letter that can be the second letter of a word
027: * if this node is the first letter, and so on. The trie is represented
028: * as a two-dimensional array that can be treated as a table of state
029: * transitions. Indexes are used to compress this array, taking
030: * advantage of the fact that this array will always be very sparse.
031: * @internal
032: * @deprecated This API is ICU internal only.
033: */
034: public class BreakDictionary {
035: //=================================================================================
036: // testing and debugging
037: //=================================================================================
038: /**
039: * @internal
040: * @deprecated This API is ICU internal only.
041: */
042: public static void main(String args[])
043: throws FileNotFoundException, UnsupportedEncodingException,
044: IOException {
045: String filename = args[0];
046:
047: BreakDictionary dictionary = new BreakDictionary(
048: new FileInputStream(filename));
049:
050: PrintWriter out = null;
051:
052: if (args.length >= 2) {
053: out = new PrintWriter(new OutputStreamWriter(
054: new FileOutputStream(args[1]), "UnicodeLittle"));
055: }
056:
057: dictionary.printWordList("", 0, out);
058:
059: if (out != null) {
060: out.close();
061: }
062: }
063:
064: /**
065: * @internal
066: * @deprecated This API is ICU internal only.
067: */
068: public void printWordList(String partialWord, int state,
069: PrintWriter out) throws IOException {
070: if (state == 0xFFFF) {
071: System.out.println(partialWord);
072: if (out != null) {
073: out.println(partialWord);
074: }
075: } else {
076: for (int i = 0; i < numCols; i++) {
077: int newState = (at(state, i)) & 0xFFFF;
078:
079: if (newState != 0) {
080: char newChar = reverseColumnMap[i];
081: String newPartialWord = partialWord;
082:
083: if (newChar != 0) {
084: newPartialWord += newChar;
085: }
086:
087: printWordList(newPartialWord, newState, out);
088: }
089: }
090: }
091: }
092:
093: /**
094: * A map used to go from column numbers to characters. Used only
095: * for debugging right now.
096: */
097: private char[] reverseColumnMap = null;
098:
099: //=================================================================================
100: // data members
101: //=================================================================================
102:
103: /**
104: * Maps from characters to column numbers. The main use of this is to
105: * avoid making room in the array for empty columns.
106: */
107: private CompactByteArray columnMap = null;
108:
109: /**
110: * The number of actual columns in the table
111: */
112: private int numCols;
113:
114: /**
115: * Columns are organized into groups of 32. This says how many
116: * column groups. (We could calculate this, but we store the
117: * value to avoid having to repeatedly calculate it.)
118: */
119: private int numColGroups;
120:
121: /**
122: * The actual compressed state table. Each conceptual row represents
123: * a state, and the cells in it contain the row numbers of the states
124: * to transition to for each possible letter. 0 is used to indicate
125: * an illegal combination of letters (i.e., the error state). The
126: * table is compressed by eliminating all the unpopulated (i.e., zero)
127: * cells. Multiple conceptual rows can then be doubled up in a single
128: * physical row by sliding them up and possibly shifting them to one
129: * side or the other so the populated cells don't collide. Indexes
130: * are used to identify unpopulated cells and to locate populated cells.
131: */
132: private short[] table = null;
133:
134: /**
135: * This index maps logical row numbers to physical row numbers
136: */
137: private short[] rowIndex = null;
138:
139: /**
140: * A bitmap is used to tell which cells in the comceptual table are
141: * populated. This array contains all the unique bit combinations
142: * in that bitmap. If the table is more than 32 columns wide,
143: * successive entries in this array are used for a single row.
144: */
145: private int[] rowIndexFlags = null;
146:
147: /**
148: * This index maps from a logical row number into the bitmap table above.
149: * (This keeps us from storing duplicate bitmap combinations.) Since there
150: * are a lot of rows with only one populated cell, instead of wasting space
151: * in the bitmap table, we just store a negative number in this index for
152: * rows with one populated cell. The absolute value of that number is
153: * the column number of the populated cell.
154: */
155: private short[] rowIndexFlagsIndex = null;
156:
157: /**
158: * For each logical row, this index contains a constant that is added to
159: * the logical column number to get the physical column number
160: */
161: private byte[] rowIndexShifts = null;
162:
163: //=================================================================================
164: // deserialization
165: //=================================================================================
166:
167: /**
168: * @internal
169: * @deprecated This API is ICU internal only.
170: */
171: public BreakDictionary(InputStream dictionaryStream)
172: throws IOException {
173: readDictionaryFile(new DataInputStream(dictionaryStream));
174: }
175:
176: /**
177: * @internal
178: * @deprecated This API is ICU internal only.
179: */
180: public void readDictionaryFile(DataInputStream in)
181: throws IOException {
182: int l;
183:
184: // read in the version number (right now we just ignore it)
185: in.readInt();
186:
187: // read in the column map (this is serialized in its internal form:
188: // an index array followed by a data array)
189: l = in.readInt();
190: char[] temp = new char[l];
191: for (int i = 0; i < temp.length; i++)
192: temp[i] = (char) in.readShort();
193: l = in.readInt();
194: byte[] temp2 = new byte[l];
195: for (int i = 0; i < temp2.length; i++)
196: temp2[i] = in.readByte();
197: columnMap = new CompactByteArray(temp, temp2);
198:
199: // read in numCols and numColGroups
200: numCols = in.readInt();
201: numColGroups = in.readInt();
202:
203: // read in the row-number index
204: l = in.readInt();
205: rowIndex = new short[l];
206: for (int i = 0; i < rowIndex.length; i++)
207: rowIndex[i] = in.readShort();
208:
209: // load in the populated-cells bitmap: index first, then bitmap list
210: l = in.readInt();
211: rowIndexFlagsIndex = new short[l];
212: for (int i = 0; i < rowIndexFlagsIndex.length; i++)
213: rowIndexFlagsIndex[i] = in.readShort();
214: l = in.readInt();
215: rowIndexFlags = new int[l];
216: for (int i = 0; i < rowIndexFlags.length; i++)
217: rowIndexFlags[i] = in.readInt();
218:
219: // load in the row-shift index
220: l = in.readInt();
221: rowIndexShifts = new byte[l];
222: for (int i = 0; i < rowIndexShifts.length; i++)
223: rowIndexShifts[i] = in.readByte();
224:
225: // finally, load in the actual state table
226: l = in.readInt();
227: table = new short[l];
228: for (int i = 0; i < table.length; i++)
229: table[i] = in.readShort();
230:
231: // this data structure is only necessary for testing and debugging purposes
232: reverseColumnMap = new char[numCols];
233: for (char c = 0; c < 0xffff; c++) {
234: int col = columnMap.elementAt(c);
235: if (col != 0) {
236: reverseColumnMap[col] = c;
237: }
238: }
239:
240: // close the stream
241: in.close();
242: }
243:
244: //=================================================================================
245: // access to the words
246: //=================================================================================
247:
248: /**
249: * Uses the column map to map the character to a column number, then
250: * passes the row and column number to the other version of at()
251: * @param row The current state
252: * @param ch The character whose column we're interested in
253: * @return The new state to transition to
254: * @internal
255: * @deprecated This API is ICU internal only.
256: */
257: public final short at(int row, char ch) {
258: int col = columnMap.elementAt(ch);
259: return at(row, col);
260: }
261:
262: /**
263: * Returns the value in the cell with the specified (logical) row and
264: * column numbers. In DictionaryBasedBreakIterator, the row number is
265: * a state number, the column number is an input, and the return value
266: * is the row number of the new state to transition to. (0 is the
267: * "error" state, and -1 is the "end of word" state in a dictionary)
268: * @param row The row number of the current state
269: * @param col The column number of the input character (0 means "not a
270: * dictionary character")
271: * @return The row number of the new state to transition to
272: * @internal
273: * @deprecated This API is ICU internal only.
274: */
275: public final short at(int row, int col) {
276: if (cellIsPopulated(row, col)) {
277: // we map from logical to physical row number by looking up the
278: // mapping in rowIndex; we map from logical column number to
279: // physical column number by looking up a shift value for this
280: // logical row and offsetting the logical column number by
281: // the shift amount. Then we can use internalAt() to actually
282: // get the value out of the table.
283: return internalAt(rowIndex[row], col + rowIndexShifts[row]);
284: } else {
285: return 0;
286: }
287: }
288:
289: /**
290: * Given (logical) row and column numbers, returns true if the
291: * cell in that position is populated
292: */
293: private final boolean cellIsPopulated(int row, int col) {
294: // look up the entry in the bitmap index for the specified row.
295: // If it's a negative number, it's the column number of the only
296: // populated cell in the row
297: if (rowIndexFlagsIndex[row] < 0) {
298: return col == -rowIndexFlagsIndex[row];
299: }
300:
301: // if it's a positive number, it's the offset of an entry in the bitmap
302: // list. If the table is more than 32 columns wide, the bitmap is stored
303: // successive entries in the bitmap list, so we have to divide the column
304: // number by 32 and offset the number we got out of the index by the result.
305: // Once we have the appropriate piece of the bitmap, test the appropriate
306: // bit and return the result.
307: else {
308: int flags = rowIndexFlags[rowIndexFlagsIndex[row]
309: + (col >> 5)];
310: return (flags & (1 << (col & 0x1f))) != 0;
311: }
312: }
313:
314: /**
315: * Implementation of at() when we know the specified cell is populated.
316: * @param row The PHYSICAL row number of the cell
317: * @param col The PHYSICAL column number of the cell
318: * @return The value stored in the cell
319: */
320: private final short internalAt(int row, int col) {
321: // the table is a one-dimensional array, so this just does the math necessary
322: // to treat it as a two-dimensional array (we don't just use a two-dimensional
323: // array because two-dimensional arrays are inefficient in Java)
324: return table[row * numCols + col];
325: }
326: }
|