Source Code Cross Referenced for BreakDictionary.java in » Internationalization-Localization » icu4j » com » ibm » icu » text » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.text

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        /*
002:         *******************************************************************************
003:         * Copyright (C) 1996-2006, International Business Machines Corporation and    *
004:         * others. All Rights Reserved.                                                *
005:         *******************************************************************************
006:         */
007:        package com.ibm.icu.text;
008:
009:        import java.io.InputStream;
010:        import java.io.DataInputStream;
011:        import java.io.FileNotFoundException;
012:        import java.io.UnsupportedEncodingException;
013:        import java.io.IOException;
014:        import java.io.FileInputStream;
015:        import java.io.OutputStreamWriter;
016:        import java.io.PrintWriter;
017:        import java.io.FileOutputStream;
018:
019:        import com.ibm.icu.util.CompactByteArray;
020:
021:        /**
022:         * This is the class that represents the list of known words used by
023:         * DictionaryBasedBreakIterator.  The conceptual data structure used
024:         * here is a trie: there is a node hanging off the root node for every
025:         * letter that can start a word.  Each of these nodes has a node hanging
026:         * off of it for every letter that can be the second letter of a word
027:         * if this node is the first letter, and so on.  The trie is represented
028:         * as a two-dimensional array that can be treated as a table of state
029:         * transitions.  Indexes are used to compress this array, taking
030:         * advantage of the fact that this array will always be very sparse.
031:         * @internal
032:         * @deprecated This API is ICU internal only.
033:         */
034:        public class BreakDictionary {
035:            //=================================================================================
036:            // testing and debugging
037:            //=================================================================================
038:            /** 
039:             * @internal 
040:             * @deprecated This API is ICU internal only.
041:             */
042:            public static void main(String args[])
043:                    throws FileNotFoundException, UnsupportedEncodingException,
044:                    IOException {
045:                String filename = args[0];
046:
047:                BreakDictionary dictionary = new BreakDictionary(
048:                        new FileInputStream(filename));
049:
050:                PrintWriter out = null;
051:
052:                if (args.length >= 2) {
053:                    out = new PrintWriter(new OutputStreamWriter(
054:                            new FileOutputStream(args[1]), "UnicodeLittle"));
055:                }
056:
057:                dictionary.printWordList("", 0, out);
058:
059:                if (out != null) {
060:                    out.close();
061:                }
062:            }
063:
064:            /** 
065:             * @internal 
066:             * @deprecated This API is ICU internal only.
067:             */
068:            public void printWordList(String partialWord, int state,
069:                    PrintWriter out) throws IOException {
070:                if (state == 0xFFFF) {
071:                    System.out.println(partialWord);
072:                    if (out != null) {
073:                        out.println(partialWord);
074:                    }
075:                } else {
076:                    for (int i = 0; i < numCols; i++) {
077:                        int newState = (at(state, i)) & 0xFFFF;
078:
079:                        if (newState != 0) {
080:                            char newChar = reverseColumnMap[i];
081:                            String newPartialWord = partialWord;
082:
083:                            if (newChar != 0) {
084:                                newPartialWord += newChar;
085:                            }
086:
087:                            printWordList(newPartialWord, newState, out);
088:                        }
089:                    }
090:                }
091:            }
092:
093:            /**
094:             * A map used to go from column numbers to characters.  Used only
095:             * for debugging right now.
096:             */
097:            private char[] reverseColumnMap = null;
098:
099:            //=================================================================================
100:            // data members
101:            //=================================================================================
102:
103:            /**
104:             * Maps from characters to column numbers.  The main use of this is to
105:             * avoid making room in the array for empty columns.
106:             */
107:            private CompactByteArray columnMap = null;
108:
109:            /**
110:             * The number of actual columns in the table
111:             */
112:            private int numCols;
113:
114:            /**
115:             * Columns are organized into groups of 32.  This says how many
116:             * column groups.  (We could calculate this, but we store the
117:             * value to avoid having to repeatedly calculate it.)
118:             */
119:            private int numColGroups;
120:
121:            /**
122:             * The actual compressed state table.  Each conceptual row represents
123:             * a state, and the cells in it contain the row numbers of the states
124:             * to transition to for each possible letter.  0 is used to indicate
125:             * an illegal combination of letters (i.e., the error state).  The
126:             * table is compressed by eliminating all the unpopulated (i.e., zero)
127:             * cells.  Multiple conceptual rows can then be doubled up in a single
128:             * physical row by sliding them up and possibly shifting them to one
129:             * side or the other so the populated cells don't collide.  Indexes
130:             * are used to identify unpopulated cells and to locate populated cells.
131:             */
132:            private short[] table = null;
133:
134:            /**
135:             * This index maps logical row numbers to physical row numbers
136:             */
137:            private short[] rowIndex = null;
138:
139:            /**
140:             * A bitmap is used to tell which cells in the comceptual table are
141:             * populated.  This array contains all the unique bit combinations
142:             * in that bitmap.  If the table is more than 32 columns wide,
143:             * successive entries in this array are used for a single row.
144:             */
145:            private int[] rowIndexFlags = null;
146:
147:            /**
148:             * This index maps from a logical row number into the bitmap table above.
149:             * (This keeps us from storing duplicate bitmap combinations.)  Since there
150:             * are a lot of rows with only one populated cell, instead of wasting space
151:             * in the bitmap table, we just store a negative number in this index for
152:             * rows with one populated cell.  The absolute value of that number is
153:             * the column number of the populated cell.
154:             */
155:            private short[] rowIndexFlagsIndex = null;
156:
157:            /**
158:             * For each logical row, this index contains a constant that is added to
159:             * the logical column number to get the physical column number
160:             */
161:            private byte[] rowIndexShifts = null;
162:
163:            //=================================================================================
164:            // deserialization
165:            //=================================================================================
166:
167:            /** 
168:             * @internal 
169:             * @deprecated This API is ICU internal only.
170:             */
171:            public BreakDictionary(InputStream dictionaryStream)
172:                    throws IOException {
173:                readDictionaryFile(new DataInputStream(dictionaryStream));
174:            }
175:
176:            /** 
177:             * @internal
178:             * @deprecated This API is ICU internal only.
179:             */
180:            public void readDictionaryFile(DataInputStream in)
181:                    throws IOException {
182:                int l;
183:
184:                // read in the version number (right now we just ignore it)
185:                in.readInt();
186:
187:                // read in the column map (this is serialized in its internal form:
188:                // an index array followed by a data array)
189:                l = in.readInt();
190:                char[] temp = new char[l];
191:                for (int i = 0; i < temp.length; i++)
192:                    temp[i] = (char) in.readShort();
193:                l = in.readInt();
194:                byte[] temp2 = new byte[l];
195:                for (int i = 0; i < temp2.length; i++)
196:                    temp2[i] = in.readByte();
197:                columnMap = new CompactByteArray(temp, temp2);
198:
199:                // read in numCols and numColGroups
200:                numCols = in.readInt();
201:                numColGroups = in.readInt();
202:
203:                // read in the row-number index
204:                l = in.readInt();
205:                rowIndex = new short[l];
206:                for (int i = 0; i < rowIndex.length; i++)
207:                    rowIndex[i] = in.readShort();
208:
209:                // load in the populated-cells bitmap: index first, then bitmap list
210:                l = in.readInt();
211:                rowIndexFlagsIndex = new short[l];
212:                for (int i = 0; i < rowIndexFlagsIndex.length; i++)
213:                    rowIndexFlagsIndex[i] = in.readShort();
214:                l = in.readInt();
215:                rowIndexFlags = new int[l];
216:                for (int i = 0; i < rowIndexFlags.length; i++)
217:                    rowIndexFlags[i] = in.readInt();
218:
219:                // load in the row-shift index
220:                l = in.readInt();
221:                rowIndexShifts = new byte[l];
222:                for (int i = 0; i < rowIndexShifts.length; i++)
223:                    rowIndexShifts[i] = in.readByte();
224:
225:                // finally, load in the actual state table
226:                l = in.readInt();
227:                table = new short[l];
228:                for (int i = 0; i < table.length; i++)
229:                    table[i] = in.readShort();
230:
231:                // this data structure is only necessary for testing and debugging purposes
232:                reverseColumnMap = new char[numCols];
233:                for (char c = 0; c < 0xffff; c++) {
234:                    int col = columnMap.elementAt(c);
235:                    if (col != 0) {
236:                        reverseColumnMap[col] = c;
237:                    }
238:                }
239:
240:                // close the stream
241:                in.close();
242:            }
243:
244:            //=================================================================================
245:            // access to the words
246:            //=================================================================================
247:
248:            /**
249:             * Uses the column map to map the character to a column number, then
250:             * passes the row and column number to the other version of at()
251:             * @param row The current state
252:             * @param ch The character whose column we're interested in
253:             * @return The new state to transition to
254:             * @internal
255:             * @deprecated This API is ICU internal only.
256:             */
257:            public final short at(int row, char ch) {
258:                int col = columnMap.elementAt(ch);
259:                return at(row, col);
260:            }
261:
262:            /**
263:             * Returns the value in the cell with the specified (logical) row and
264:             * column numbers.  In DictionaryBasedBreakIterator, the row number is
265:             * a state number, the column number is an input, and the return value
266:             * is the row number of the new state to transition to.  (0 is the
267:             * "error" state, and -1 is the "end of word" state in a dictionary)
268:             * @param row The row number of the current state
269:             * @param col The column number of the input character (0 means "not a
270:             * dictionary character")
271:             * @return The row number of the new state to transition to
272:             * @internal
273:             * @deprecated This API is ICU internal only.
274:             */
275:            public final short at(int row, int col) {
276:                if (cellIsPopulated(row, col)) {
277:                    // we map from logical to physical row number by looking up the
278:                    // mapping in rowIndex; we map from logical column number to
279:                    // physical column number by looking up a shift value for this
280:                    // logical row and offsetting the logical column number by
281:                    // the shift amount.  Then we can use internalAt() to actually
282:                    // get the value out of the table.
283:                    return internalAt(rowIndex[row], col + rowIndexShifts[row]);
284:                } else {
285:                    return 0;
286:                }
287:            }
288:
289:            /**
290:             * Given (logical) row and column numbers, returns true if the
291:             * cell in that position is populated
292:             */
293:            private final boolean cellIsPopulated(int row, int col) {
294:                // look up the entry in the bitmap index for the specified row.
295:                // If it's a negative number, it's the column number of the only
296:                // populated cell in the row
297:                if (rowIndexFlagsIndex[row] < 0) {
298:                    return col == -rowIndexFlagsIndex[row];
299:                }
300:
301:                // if it's a positive number, it's the offset of an entry in the bitmap
302:                // list.  If the table is more than 32 columns wide, the bitmap is stored
303:                // successive entries in the bitmap list, so we have to divide the column
304:                // number by 32 and offset the number we got out of the index by the result.
305:                // Once we have the appropriate piece of the bitmap, test the appropriate
306:                // bit and return the result.
307:                else {
308:                    int flags = rowIndexFlags[rowIndexFlagsIndex[row]
309:                            + (col >> 5)];
310:                    return (flags & (1 << (col & 0x1f))) != 0;
311:                }
312:            }
313:
314:            /**
315:             * Implementation of at() when we know the specified cell is populated.
316:             * @param row The PHYSICAL row number of the cell
317:             * @param col The PHYSICAL column number of the cell
318:             * @return The value stored in the cell
319:             */
320:            private final short internalAt(int row, int col) {
321:                // the table is a one-dimensional array, so this just does the math necessary
322:                // to treat it as a two-dimensional array (we don't just use a two-dimensional
323:                // array because two-dimensional arrays are inefficient in Java)
324:                return table[row * numCols + col];
325:            }
326:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.