001: /*
002: *
003: * @(#)BreakDictionary.java 1.15 06/10/10
004: *
005: * Portions Copyright 2000-2006 Sun Microsystems, Inc. All Rights
006: * Reserved. Use is subject to license terms.
007: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
008: *
009: * This program is free software; you can redistribute it and/or
010: * modify it under the terms of the GNU General Public License version
011: * 2 only, as published by the Free Software Foundation.
012: *
013: * This program is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * General Public License version 2 for more details (a copy is
017: * included at /legal/license.txt).
018: *
019: * You should have received a copy of the GNU General Public License
020: * version 2 along with this work; if not, write to the Free Software
021: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022: * 02110-1301 USA
023: *
024: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
025: * Clara, CA 95054 or visit www.sun.com if you need additional
026: * information or have any questions.
027: */
028:
029: /*
030: * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
031: * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
032: *
033: * The original version of this source code and documentation
034: * is copyrighted and owned by Taligent, Inc., a wholly-owned
035: * subsidiary of IBM. These materials are provided under terms
036: * of a License Agreement between Taligent and Sun. This technology
037: * is protected by multiple US and International patents.
038: *
039: * This notice and attribution to Taligent may not be removed.
040: * Taligent is a registered trademark of Taligent, Inc.
041: */
042: package java.text;
043:
044: import java.io.*;
045: import java.util.MissingResourceException;
046: import sun.text.CompactByteArray;
047:
048: /**
049: * This is the class that represents the list of known words used by
050: * DictionaryBasedBreakIterator. The conceptual data structure used
051: * here is a trie: there is a node hanging off the root node for every
052: * letter that can start a word. Each of these nodes has a node hanging
053: * off of it for every letter that can be the second letter of a word
054: * if this node is the first letter, and so on. The trie is represented
055: * as a two-dimensional array that can be treated as a table of state
056: * transitions. Indexes are used to compress this array, taking
057: * advantage of the fact that this array will always be very sparse.
058: */
059: class BreakDictionary {
060: //=================================================================================
061: // testing and debugging
062: //=================================================================================
063: public static void main(String args[])
064: throws FileNotFoundException, UnsupportedEncodingException,
065: IOException {
066: String filename = args[0];
067:
068: BreakDictionary dictionary = new BreakDictionary(
069: new FileInputStream(filename));
070:
071: String outputFile = "";
072: if (args.length >= 2) {
073: outputFile = args[1];
074: }
075: PrintWriter out = null;
076: if (outputFile.length() != 0) {
077: out = new PrintWriter(new OutputStreamWriter(
078: new FileOutputStream(outputFile), "UnicodeLittle"));
079: }
080: dictionary.printWordList("", 0, out);
081: }
082:
083: public void printWordList(String partialWord, int state,
084: PrintWriter out) throws IOException {
085: if (state == -1) {
086: System.out.println(partialWord);
087: if (out != null) {
088: out.println(partialWord);
089: }
090: } else {
091: for (int i = 0; i < numCols; i++) {
092: if (at(state, i) != 0) {
093: printWordList(partialWord + reverseColumnMap[i],
094: at(state, i), out);
095: }
096: }
097: }
098: }
099:
100: /**
101: * A map used to go from column numbers to characters. Used only
102: * for debugging right now.
103: */
104: private char[] reverseColumnMap = null;
105:
106: //=================================================================================
107: // data members
108: //=================================================================================
109:
110: /**
111: * The version of the dictionary that was read in.
112: */
113: private static int supportedVersion = 0;
114: private int version;
115:
116: /**
117: * Maps from characters to column numbers. The main use of this is to
118: * avoid making room in the array for empty columns.
119: */
120: private CompactByteArray columnMap = null;
121:
122: /**
123: * The number of actual columns in the table
124: */
125: private int numCols;
126:
127: /**
128: * Columns are organized into groups of 32. This says how many
129: * column groups. (We could calculate this, but we store the
130: * value to avoid having to repeatedly calculate it.)
131: */
132: private int numColGroups;
133:
134: /**
135: * The actual compressed state table. Each conceptual row represents
136: * a state, and the cells in it contain the row numbers of the states
137: * to transition to for each possible letter. 0 is used to indicate
138: * an illegal combination of letters (i.e., the error state). The
139: * table is compressed by eliminating all the unpopulated (i.e., zero)
140: * cells. Multiple conceptual rows can then be doubled up in a single
141: * physical row by sliding them up and possibly shifting them to one
142: * side or the other so the populated cells don't collide. Indexes
143: * are used to identify unpopulated cells and to locate populated cells.
144: */
145: private short[] table = null;
146:
147: /**
148: * This index maps logical row numbers to physical row numbers
149: */
150: private short[] rowIndex = null;
151:
152: /**
153: * A bitmap is used to tell which cells in the comceptual table are
154: * populated. This array contains all the unique bit combinations
155: * in that bitmap. If the table is more than 32 columns wide,
156: * successive entries in this array are used for a single row.
157: */
158: private int[] rowIndexFlags = null;
159:
160: /**
161: * This index maps from a logical row number into the bitmap table above.
162: * (This keeps us from storing duplicate bitmap combinations.) Since there
163: * are a lot of rows with only one populated cell, instead of wasting space
164: * in the bitmap table, we just store a negative number in this index for
165: * rows with one populated cell. The absolute value of that number is
166: * the column number of the populated cell.
167: */
168: private short[] rowIndexFlagsIndex = null;
169:
170: /**
171: * For each logical row, this index contains a constant that is added to
172: * the logical column number to get the physical column number
173: */
174: private byte[] rowIndexShifts = null;
175:
176: //=================================================================================
177: // deserialization
178: //=================================================================================
179:
180: public BreakDictionary(InputStream dictionaryStream)
181: throws IOException {
182: readDictionaryFile(new DataInputStream(dictionaryStream));
183: }
184:
185: public void readDictionaryFile(DataInputStream in)
186: throws IOException {
187: version = in.readInt();
188: if (version != supportedVersion) {
189: throw new MissingResourceException("Dictionary version("
190: + version + ") is unsupported", in.toString(), "");
191: }
192:
193: int l;
194:
195: // read in the column map (this is serialized in its internal form:
196: // an index array followed by a data array)
197: l = in.readInt();
198: short[] temp = new short[l];
199: for (int i = 0; i < temp.length; i++)
200: temp[i] = in.readShort();
201: l = in.readInt();
202: byte[] temp2 = new byte[l];
203: for (int i = 0; i < temp2.length; i++)
204: temp2[i] = in.readByte();
205: columnMap = new CompactByteArray(temp, temp2);
206:
207: // read in numCols and numColGroups
208: numCols = in.readInt();
209: numColGroups = in.readInt();
210:
211: // read in the row-number index
212: l = in.readInt();
213: rowIndex = new short[l];
214: for (int i = 0; i < rowIndex.length; i++)
215: rowIndex[i] = in.readShort();
216:
217: // load in the populated-cells bitmap: index first, then bitmap list
218: l = in.readInt();
219: rowIndexFlagsIndex = new short[l];
220: for (int i = 0; i < rowIndexFlagsIndex.length; i++)
221: rowIndexFlagsIndex[i] = in.readShort();
222: l = in.readInt();
223: rowIndexFlags = new int[l];
224: for (int i = 0; i < rowIndexFlags.length; i++)
225: rowIndexFlags[i] = in.readInt();
226:
227: // load in the row-shift index
228: l = in.readInt();
229: rowIndexShifts = new byte[l];
230: for (int i = 0; i < rowIndexShifts.length; i++)
231: rowIndexShifts[i] = in.readByte();
232:
233: // finally, load in the actual state table
234: l = in.readInt();
235: table = new short[l];
236: for (int i = 0; i < table.length; i++)
237: table[i] = in.readShort();
238:
239: // this data structure is only necessary for testing and debugging purposes
240: reverseColumnMap = new char[numCols];
241: for (char c = 0; c < 0xffff; c++) {
242: int col = columnMap.elementAt(c);
243: if (col != 0) {
244: reverseColumnMap[col] = c;
245: }
246: }
247:
248: // close the stream
249: in.close();
250: }
251:
252: //=================================================================================
253: // access to the words
254: //=================================================================================
255:
256: /**
257: * Uses the column map to map the character to a column number, then
258: * passes the row and column number to the other version of at()
259: * @param row The current state
260: * @param ch The character whose column we're interested in
261: * @return The new state to transition to
262: */
263: public final short at(int row, char ch) {
264: int col = columnMap.elementAt(ch);
265: return at(row, col);
266: }
267:
268: /**
269: * Returns the value in the cell with the specified (logical) row and
270: * column numbers. In DictionaryBasedBreakIterator, the row number is
271: * a state number, the column number is an input, and the return value
272: * is the row number of the new state to transition to. (0 is the
273: * "error" state, and -1 is the "end of word" state in a dictionary)
274: * @param row The row number of the current state
275: * @param col The column number of the input character (0 means "not a
276: * dictionary character")
277: * @return The row number of the new state to transition to
278: */
279: public final short at(int row, int col) {
280: if (cellIsPopulated(row, col)) {
281: // we map from logical to physical row number by looking up the
282: // mapping in rowIndex; we map from logical column number to
283: // physical column number by looking up a shift value for this
284: // logical row and offsetting the logical column number by
285: // the shift amount. Then we can use internalAt() to actually
286: // get the value out of the table.
287: return internalAt(rowIndex[row], col + rowIndexShifts[row]);
288: } else {
289: return 0;
290: }
291: }
292:
293: /**
294: * Given (logical) row and column numbers, returns true if the
295: * cell in that position is populated
296: */
297: private final boolean cellIsPopulated(int row, int col) {
298: // look up the entry in the bitmap index for the specified row.
299: // If it's a negative number, it's the column number of the only
300: // populated cell in the row
301: if (rowIndexFlagsIndex[row] < 0) {
302: return col == -rowIndexFlagsIndex[row];
303: }
304:
305: // if it's a positive number, it's the offset of an entry in the bitmap
306: // list. If the table is more than 32 columns wide, the bitmap is stored
307: // successive entries in the bitmap list, so we have to divide the column
308: // number by 32 and offset the number we got out of the index by the result.
309: // Once we have the appropriate piece of the bitmap, test the appropriate
310: // bit and return the result.
311: else {
312: int flags = rowIndexFlags[rowIndexFlagsIndex[row]
313: + (col >> 5)];
314: return (flags & (1 << (col & 0x1f))) != 0;
315: }
316: }
317:
318: /**
319: * Implementation of at() when we know the specified cell is populated.
320: * @param row The PHYSICAL row number of the cell
321: * @param col The PHYSICAL column number of the cell
322: * @return The value stored in the cell
323: */
324: private final short internalAt(int row, int col) {
325: // the table is a one-dimensional array, so this just does the math necessary
326: // to treat it as a two-dimensional array (we don't just use a two-dimensional
327: // array because two-dimensional arrays are inefficient in Java)
328: return table[row * numCols + col];
329: }
330: }
|