001 /*
002 * Copyright 1999-2003 Sun Microsystems, Inc. All Rights Reserved.
003 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004 *
005 * This code is free software; you can redistribute it and/or modify it
006 * under the terms of the GNU General Public License version 2 only, as
007 * published by the Free Software Foundation. Sun designates this
008 * particular file as subject to the "Classpath" exception as provided
009 * by Sun in the LICENSE file that accompanied this code.
010 *
011 * This code is distributed in the hope that it will be useful, but WITHOUT
012 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014 * version 2 for more details (a copy is included in the LICENSE file that
015 * accompanied this code).
016 *
017 * You should have received a copy of the GNU General Public License version
018 * 2 along with this work; if not, write to the Free Software Foundation,
019 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020 *
021 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022 * CA 95054 USA or visit www.sun.com if you need additional information or
023 * have any questions.
024 */
025
026 /*
027 * @(#)DictionaryBasedBreakIterator.java 1.3 99/05/03
028 *
029 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
030 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
031 *
032 * The original version of this source code and documentation
033 * is copyrighted and owned by Taligent, Inc., a wholly-owned
034 * subsidiary of IBM. These materials are provided under terms
035 * of a License Agreement between Taligent and Sun. This technology
036 * is protected by multiple US and International patents.
037 *
038 * This notice and attribution to Taligent may not be removed.
039 * Taligent is a registered trademark of Taligent, Inc.
040 */
041
042 package java.text;
043
044 import java.util.Vector;
045 import java.util.Stack;
046 import java.util.Hashtable;
047 import java.text.CharacterIterator;
048 import java.io.InputStream;
049 import java.io.IOException;
050
051 /**
052 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
053 * to further subdivide ranges of text beyond what is possible using just the
054 * state-table-based algorithm. This is necessary, for example, to handle
055 * word and line breaking in Thai, which doesn't use spaces between words. The
056 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
057 * up text as far as possible, and then contiguous ranges of letters are
058 * repeatedly compared against a list of known words (i.e., the dictionary)
059 * to divide them up into words.
060 *
061 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
062 * but adds one more special substitution name: <dictionary>. This substitution
063 * name is used to identify characters in words in the dictionary. The idea is that
064 * if the iterator passes over a chunk of text that includes two or more characters
065 * in a row that are included in <dictionary>, it goes back through that range and
066 * derives additional break positions (if possible) using the dictionary.
067 *
068 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
069 * file. It follows a prescribed search path to locate the dictionary (right now,
070 * it looks for it in /com/ibm/text/resources in each directory in the classpath,
071 * and won't find it in JAR files, but this location is likely to change). The
072 * dictionary file is in a serialized binary format. We have a very primitive (and
073 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
074 * currently making it public. Contact us for help.
075 */
076 class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
077
078 /**
079 * a list of known words that is used to divide up contiguous ranges of letters,
080 * stored in a compressed, indexed, format that offers fast access
081 */
082 private BreakDictionary dictionary;
083
084 /**
085 * a list of flags indicating which character categories are contained in
086 * the dictionary file (this is used to determine which ranges of characters
087 * to apply the dictionary to)
088 */
089 private boolean[] categoryFlags;
090
091 /**
092 * a temporary hiding place for the number of dictionary characters in the
093 * last range passed over by next()
094 */
095 private int dictionaryCharCount;
096
097 /**
098 * when a range of characters is divided up using the dictionary, the break
099 * positions that are discovered are stored here, preventing us from having
100 * to use either the dictionary or the state table again until the iterator
101 * leaves this range of text
102 */
103 private int[] cachedBreakPositions;
104
105 /**
106 * if cachedBreakPositions is not null, this indicates which item in the
107 * cache the current iteration position refers to
108 */
109 private int positionInCache;
110
111 /**
112 * Constructs a DictionaryBasedBreakIterator.
113 * @param description Same as the description parameter on RuleBasedBreakIterator,
114 * except for the special meaning of "<dictionary>". This parameter is just
115 * passed through to RuleBasedBreakIterator's constructor.
116 * @param dictionaryFilename The filename of the dictionary file to use
117 */
118 public DictionaryBasedBreakIterator(String dataFile,
119 String dictionaryFile) throws IOException {
120 super (dataFile);
121 byte[] tmp = super .getAdditionalData();
122 if (tmp != null) {
123 prepareCategoryFlags(tmp);
124 super .setAdditionalData(null);
125 }
126 dictionary = new BreakDictionary(dictionaryFile);
127 }
128
129 private void prepareCategoryFlags(byte[] data) {
130 categoryFlags = new boolean[data.length];
131 for (int i = 0; i < data.length; i++) {
132 categoryFlags[i] = (data[i] == (byte) 1) ? true : false;
133 }
134 }
135
136 public void setText(CharacterIterator newText) {
137 super .setText(newText);
138 cachedBreakPositions = null;
139 dictionaryCharCount = 0;
140 positionInCache = 0;
141 }
142
143 /**
144 * Sets the current iteration position to the beginning of the text.
145 * (i.e., the CharacterIterator's starting offset).
146 * @return The offset of the beginning of the text.
147 */
148 public int first() {
149 cachedBreakPositions = null;
150 dictionaryCharCount = 0;
151 positionInCache = 0;
152 return super .first();
153 }
154
155 /**
156 * Sets the current iteration position to the end of the text.
157 * (i.e., the CharacterIterator's ending offset).
158 * @return The text's past-the-end offset.
159 */
160 public int last() {
161 cachedBreakPositions = null;
162 dictionaryCharCount = 0;
163 positionInCache = 0;
164 return super .last();
165 }
166
167 /**
168 * Advances the iterator one step backwards.
169 * @return The position of the last boundary position before the
170 * current iteration position
171 */
172 public int previous() {
173 CharacterIterator text = getText();
174
175 // if we have cached break positions and we're still in the range
176 // covered by them, just move one step backward in the cache
177 if (cachedBreakPositions != null && positionInCache > 0) {
178 --positionInCache;
179 text.setIndex(cachedBreakPositions[positionInCache]);
180 return cachedBreakPositions[positionInCache];
181 }
182
183 // otherwise, dump the cache and use the inherited previous() method to move
184 // backward. This may fill up the cache with new break positions, in which
185 // case we have to mark our position in the cache
186 else {
187 cachedBreakPositions = null;
188 int result = super .previous();
189 if (cachedBreakPositions != null) {
190 positionInCache = cachedBreakPositions.length - 2;
191 }
192 return result;
193 }
194 }
195
196 /**
197 * Sets the current iteration position to the last boundary position
198 * before the specified position.
199 * @param offset The position to begin searching from
200 * @return The position of the last boundary before "offset"
201 */
202 public int preceding(int offset) {
203 CharacterIterator text = getText();
204 checkOffset(offset, text);
205
206 // if we have no cached break positions, or "offset" is outside the
207 // range covered by the cache, we can just call the inherited routine
208 // (which will eventually call other routines in this class that may
209 // refresh the cache)
210 if (cachedBreakPositions == null
211 || offset <= cachedBreakPositions[0]
212 || offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
213 cachedBreakPositions = null;
214 return super .preceding(offset);
215 }
216
217 // on the other hand, if "offset" is within the range covered by the cache,
218 // then all we have to do is search the cache for the last break position
219 // before "offset"
220 else {
221 positionInCache = 0;
222 while (positionInCache < cachedBreakPositions.length
223 && offset > cachedBreakPositions[positionInCache]) {
224 ++positionInCache;
225 }
226 --positionInCache;
227 text.setIndex(cachedBreakPositions[positionInCache]);
228 return text.getIndex();
229 }
230 }
231
232 /**
233 * Sets the current iteration position to the first boundary position after
234 * the specified position.
235 * @param offset The position to begin searching forward from
236 * @return The position of the first boundary after "offset"
237 */
238 public int following(int offset) {
239 CharacterIterator text = getText();
240 checkOffset(offset, text);
241
242 // if we have no cached break positions, or if "offset" is outside the
243 // range covered by the cache, then dump the cache and call our
244 // inherited following() method. This will call other methods in this
245 // class that may refresh the cache.
246 if (cachedBreakPositions == null
247 || offset < cachedBreakPositions[0]
248 || offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
249 cachedBreakPositions = null;
250 return super .following(offset);
251 }
252
253 // on the other hand, if "offset" is within the range covered by the
254 // cache, then just search the cache for the first break position
255 // after "offset"
256 else {
257 positionInCache = 0;
258 while (positionInCache < cachedBreakPositions.length
259 && offset >= cachedBreakPositions[positionInCache]) {
260 ++positionInCache;
261 }
262 text.setIndex(cachedBreakPositions[positionInCache]);
263 return text.getIndex();
264 }
265 }
266
267 /**
268 * This is the implementation function for next().
269 */
270 protected int handleNext() {
271 CharacterIterator text = getText();
272
273 // if there are no cached break positions, or if we've just moved
274 // off the end of the range covered by the cache, we have to dump
275 // and possibly regenerate the cache
276 if (cachedBreakPositions == null
277 || positionInCache == cachedBreakPositions.length - 1) {
278
279 // start by using the inherited handleNext() to find a tentative return
280 // value. dictionaryCharCount tells us how many dictionary characters
281 // we passed over on our way to the tentative return value
282 int startPos = text.getIndex();
283 dictionaryCharCount = 0;
284 int result = super .handleNext();
285
286 // if we passed over more than one dictionary character, then we use
287 // divideUpDictionaryRange() to regenerate the cached break positions
288 // for the new range
289 if (dictionaryCharCount > 1 && result - startPos > 1) {
290 divideUpDictionaryRange(startPos, result);
291 }
292
293 // otherwise, the value we got back from the inherited fuction
294 // is our return value, and we can dump the cache
295 else {
296 cachedBreakPositions = null;
297 return result;
298 }
299 }
300
301 // if the cache of break positions has been regenerated (or existed all
302 // along), then just advance to the next break position in the cache
303 // and return it
304 if (cachedBreakPositions != null) {
305 ++positionInCache;
306 text.setIndex(cachedBreakPositions[positionInCache]);
307 return cachedBreakPositions[positionInCache];
308 }
309 return -9999; // SHOULD NEVER GET HERE!
310 }
311
312 /**
313 * Looks up a character category for a character.
314 */
315 protected int lookupCategory(int c) {
316 // this override of lookupCategory() exists only to keep track of whether we've
317 // passed over any dictionary characters. It calls the inherited lookupCategory()
318 // to do the real work, and then checks whether its return value is one of the
319 // categories represented in the dictionary. If it is, bump the dictionary-
320 // character count.
321 int result = super .lookupCategory(c);
322 if (result != RuleBasedBreakIterator.IGNORE
323 && categoryFlags[result]) {
324 ++dictionaryCharCount;
325 }
326 return result;
327 }
328
329 /**
330 * This is the function that actually implements the dictionary-based
331 * algorithm. Given the endpoints of a range of text, it uses the
332 * dictionary to determine the positions of any boundaries in this
333 * range. It stores all the boundary positions it discovers in
334 * cachedBreakPositions so that we only have to do this work once
335 * for each time we enter the range.
336 */
337 private void divideUpDictionaryRange(int startPos, int endPos) {
338 CharacterIterator text = getText();
339
340 // the range we're dividing may begin or end with non-dictionary characters
341 // (i.e., for line breaking, we may have leading or trailing punctuation
342 // that needs to be kept with the word). Seek from the beginning of the
343 // range to the first dictionary character
344 text.setIndex(startPos);
345 int c = getCurrent();
346 int category = lookupCategory(c);
347 while (category == IGNORE || !categoryFlags[category]) {
348 c = getNext();
349 category = lookupCategory(c);
350 }
351
352 // initialize. We maintain two stacks: currentBreakPositions contains
353 // the list of break positions that will be returned if we successfully
354 // finish traversing the whole range now. possibleBreakPositions lists
355 // all other possible word ends we've passed along the way. (Whenever
356 // we reach an error [a sequence of characters that can't begin any word
357 // in the dictionary], we back up, possibly delete some breaks from
358 // currentBreakPositions, move a break from possibleBreakPositions
359 // to currentBreakPositions, and start over from there. This process
360 // continues in this way until we either successfully make it all the way
361 // across the range, or exhaust all of our combinations of break
362 // positions.)
363 Stack currentBreakPositions = new Stack();
364 Stack possibleBreakPositions = new Stack();
365 Vector wrongBreakPositions = new Vector();
366
367 // the dictionary is implemented as a trie, which is treated as a state
368 // machine. -1 represents the end of a legal word. Every word in the
369 // dictionary is represented by a path from the root node to -1. A path
370 // that ends in state 0 is an illegal combination of characters.
371 int state = 0;
372
373 // these two variables are used for error handling. We keep track of the
374 // farthest we've gotten through the range being divided, and the combination
375 // of breaks that got us that far. If we use up all possible break
376 // combinations, the text contains an error or a word that's not in the
377 // dictionary. In this case, we "bless" the break positions that got us the
378 // farthest as real break positions, and then start over from scratch with
379 // the character where the error occurred.
380 int farthestEndPoint = text.getIndex();
381 Stack bestBreakPositions = null;
382
383 // initialize (we always exit the loop with a break statement)
384 c = getCurrent();
385 while (true) {
386
387 // if we can transition to state "-1" from our current state, we're
388 // on the last character of a legal word. Push that position onto
389 // the possible-break-positions stack
390 if (dictionary.getNextState(state, 0) == -1) {
391 possibleBreakPositions
392 .push(new Integer(text.getIndex()));
393 }
394
395 // look up the new state to transition to in the dictionary
396 state = dictionary.getNextStateFromCharacter(state, c);
397
398 // if the character we're sitting on causes us to transition to
399 // the "end of word" state, then it was a non-dictionary character
400 // and we've successfully traversed the whole range. Drop out
401 // of the loop.
402 if (state == -1) {
403 currentBreakPositions
404 .push(new Integer(text.getIndex()));
405 break;
406 }
407
408 // if the character we're sitting on causes us to transition to
409 // the error state, or if we've gone off the end of the range
410 // without transitioning to the "end of word" state, we've hit
411 // an error...
412 else if (state == 0 || text.getIndex() >= endPos) {
413
414 // if this is the farthest we've gotten, take note of it in
415 // case there's an error in the text
416 if (text.getIndex() > farthestEndPoint) {
417 farthestEndPoint = text.getIndex();
418 bestBreakPositions = (Stack) (currentBreakPositions
419 .clone());
420 }
421
422 // wrongBreakPositions is a list of all break positions
423 // we've tried starting that didn't allow us to traverse
424 // all the way through the text. Every time we pop a
425 //break position off of currentBreakPositions, we put it
426 // into wrongBreakPositions to avoid trying it again later.
427 // If we make it to this spot, we're either going to back
428 // up to a break in possibleBreakPositions and try starting
429 // over from there, or we've exhausted all possible break
430 // positions and are going to do the fallback procedure.
431 // This loop prevents us from messing with anything in
432 // possibleBreakPositions that didn't work as a starting
433 // point the last time we tried it (this is to prevent a bunch of
434 // repetitive checks from slowing down some extreme cases)
435 Integer newStartingSpot = null;
436 while (!possibleBreakPositions.isEmpty()
437 && wrongBreakPositions
438 .contains(possibleBreakPositions.peek())) {
439 possibleBreakPositions.pop();
440 }
441
442 // if we've used up all possible break-position combinations, there's
443 // an error or an unknown word in the text. In this case, we start
444 // over, treating the farthest character we've reached as the beginning
445 // of the range, and "blessing" the break positions that got us that
446 // far as real break positions
447 if (possibleBreakPositions.isEmpty()) {
448 if (bestBreakPositions != null) {
449 currentBreakPositions = bestBreakPositions;
450 if (farthestEndPoint < endPos) {
451 text.setIndex(farthestEndPoint + 1);
452 } else {
453 break;
454 }
455 } else {
456 if ((currentBreakPositions.size() == 0 || ((Integer) (currentBreakPositions
457 .peek())).intValue() != text.getIndex())
458 && text.getIndex() != startPos) {
459 currentBreakPositions.push(new Integer(text
460 .getIndex()));
461 }
462 getNext();
463 currentBreakPositions.push(new Integer(text
464 .getIndex()));
465 }
466 }
467
468 // if we still have more break positions we can try, then promote the
469 // last break in possibleBreakPositions into currentBreakPositions,
470 // and get rid of all entries in currentBreakPositions that come after
471 // it. Then back up to that position and start over from there (i.e.,
472 // treat that position as the beginning of a new word)
473 else {
474 Integer temp = (Integer) possibleBreakPositions
475 .pop();
476 Object temp2 = null;
477 while (!currentBreakPositions.isEmpty()
478 && temp.intValue() < ((Integer) currentBreakPositions
479 .peek()).intValue()) {
480 temp2 = currentBreakPositions.pop();
481 wrongBreakPositions.addElement(temp2);
482 }
483 currentBreakPositions.push(temp);
484 text.setIndex(((Integer) currentBreakPositions
485 .peek()).intValue());
486 }
487
488 // re-sync "c" for the next go-round, and drop out of the loop if
489 // we've made it off the end of the range
490 c = getCurrent();
491 if (text.getIndex() >= endPos) {
492 break;
493 }
494 }
495
496 // if we didn't hit any exceptional conditions on this last iteration,
497 // just advance to the next character and loop
498 else {
499 c = getNext();
500 }
501 }
502
503 // dump the last break position in the list, and replace it with the actual
504 // end of the range (which may be the same character, or may be further on
505 // because the range actually ended with non-dictionary characters we want to
506 // keep with the word)
507 if (!currentBreakPositions.isEmpty()) {
508 currentBreakPositions.pop();
509 }
510 currentBreakPositions.push(new Integer(endPos));
511
512 // create a regular array to hold the break positions and copy
513 // the break positions from the stack to the array (in addition,
514 // our starting position goes into this array as a break position).
515 // This array becomes the cache of break positions used by next()
516 // and previous(), so this is where we actually refresh the cache.
517 cachedBreakPositions = new int[currentBreakPositions.size() + 1];
518 cachedBreakPositions[0] = startPos;
519
520 for (int i = 0; i < currentBreakPositions.size(); i++) {
521 cachedBreakPositions[i + 1] = ((Integer) currentBreakPositions
522 .elementAt(i)).intValue();
523 }
524 positionInCache = 0;
525 }
526 }
|