001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007:
008: package com.ibm.icu.text;
009:
010: import java.util.Vector;
011: import java.util.Stack;
012: import com.ibm.icu.impl.Assert;
013: import java.text.CharacterIterator;
014: import java.io.InputStream;
015: import java.io.IOException;
016:
017: /**
018: * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
019: * to further subdivide ranges of text beyond what is possible using just the
020: * state-table-based algorithm. This is necessary, for example, to handle
021: * word and line breaking in Thai, which doesn't use spaces between words. The
022: * state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide
023: * up text as far as possible, and then contiguous ranges of letters are
024: * repeatedly compared against a list of known words (i.e., the dictionary)
025: * to divide them up into words.
026: *
027: * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old,
028: * but adds one more special substitution name: _dictionary_. This substitution
029: * name is used to identify characters in words in the dictionary. The idea is that
030: * if the iterator passes over a chunk of text that includes two or more characters
031: * in a row that are included in _dictionary_, it goes back through that range and
032: * derives additional break positions (if possible) using the dictionary.
033: *
034: * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
035: * file. It uses Class.getResource() to locate the dictionary file. The
036: * dictionary file is in a serialized binary format. We have a very primitive (and
037: * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
038: * currently making it public. Contact us for help.
039: *
040: * @stable ICU 2.0
041: */
042: public class DictionaryBasedBreakIterator extends
043: RuleBasedBreakIterator {
044:
045: /**
046: * a list of known words that is used to divide up contiguous ranges of letters,
047: * stored in a compressed, indexed, format that offers fast access
048: */
049: private BreakDictionary dictionary;
050:
051: /**
052: * a list of flags indicating which character categories are contained in
053: * the dictionary file (this is used to determine which ranges of characters
054: * to apply the dictionary to)
055: */
056: private boolean[] categoryFlags;
057:
058: /**
059: * when a range of characters is divided up using the dictionary, the break
060: * positions that are discovered are stored here, preventing us from having
061: * to use either the dictionary or the state table again until the iterator
062: * leaves this range of text
063: */
064: private int[] cachedBreakPositions;
065:
066: /**
067: * if cachedBreakPositions is not null, this indicates which item in the
068: * cache the current iteration position refers to
069: */
070: private int positionInCache;
071:
072: /**
073: * Special variable name for characters in words in dictionary
074: */
075:
076: /**
077: * Constructs a DictionaryBasedBreakIterator.
078: * @param rules Same as the rules parameter on RuleBasedBreakIterator,
079: * except for the special meaning of "_dictionary_". This parameter is just
080: * passed through to RuleBasedBreakIterator constructor.
081: * @param dictionaryStream the stream containing the dictionary data
082: * @stable ICU 2.0
083: */
084: public DictionaryBasedBreakIterator(String rules,
085: InputStream dictionaryStream) throws IOException {
086: super (rules);
087: dictionary = new BreakDictionary(dictionaryStream);
088: }
089:
090: /**
091: * Construct a DictionarBasedBreakIterator from precompiled rules.
092: * @param compiledRules an input stream containing the binary (flattened) compiled rules.
093: * @param dictionaryStream an input stream containing the dictionary data
094: * @internal
095: * @deprecated This API is ICU internal only.
096: */
097: public DictionaryBasedBreakIterator(InputStream compiledRules,
098: InputStream dictionaryStream) throws IOException {
099: fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator.
100: dictionary = new BreakDictionary(dictionaryStream);
101: }
102:
103: /** @stable ICU 2.0 */
104: public void setText(CharacterIterator newText) {
105: super .setText(newText);
106: cachedBreakPositions = null;
107: fDictionaryCharCount = 0;
108: positionInCache = 0;
109: }
110:
111: /**
112: * Sets the current iteration position to the beginning of the text.
113: * (i.e., the CharacterIterator's starting offset).
114: * @return The offset of the beginning of the text.
115: * @stable ICU 2.0
116: */
117: public int first() {
118: cachedBreakPositions = null;
119: fDictionaryCharCount = 0;
120: positionInCache = 0;
121: return super .first();
122: }
123:
124: /**
125: * Sets the current iteration position to the end of the text.
126: * (i.e., the CharacterIterator's ending offset).
127: * @return The text's past-the-end offset.
128: * @stable ICU 2.0
129: */
130: public int last() {
131: cachedBreakPositions = null;
132: fDictionaryCharCount = 0;
133: positionInCache = 0;
134: return super .last();
135: }
136:
137: /**
138: * Advances the iterator one step backwards.
139: * @return The position of the last boundary position before the
140: * current iteration position
141: * @stable ICU 2.0
142: */
143: public int previous() {
144: CharacterIterator text = getText();
145:
146: // if we have cached break positions and we're still in the range
147: // covered by them, just move one step backward in the cache
148: if (cachedBreakPositions != null && positionInCache > 0) {
149: --positionInCache;
150: text.setIndex(cachedBreakPositions[positionInCache]);
151: return cachedBreakPositions[positionInCache];
152: }
153:
154: // otherwise, dump the cache and use the inherited previous() method to move
155: // backward. This may fill up the cache with new break positions, in which
156: // case we have to mark our position in the cache. If it doesn't, use next()
157: // to move forward until we hit or pass the current position. This *will* fill
158: // the cache.
159: else {
160: cachedBreakPositions = null;
161: int offset = current();
162: int result = super .previous();
163:
164: if (cachedBreakPositions != null) {
165: positionInCache = cachedBreakPositions.length - 2;
166: return result;
167: }
168:
169: while (result < offset) {
170: int nextResult = next();
171:
172: if (nextResult >= offset) {
173: break;
174: }
175:
176: result = nextResult;
177: }
178:
179: if (cachedBreakPositions != null) {
180: positionInCache = cachedBreakPositions.length - 2;
181: }
182:
183: if (result != BreakIterator.DONE) {
184: text.setIndex(result);
185: }
186:
187: return result;
188: }
189: }
190:
191: /**
192: * Sets the current iteration position to the last boundary position
193: * before the specified position.
194: * @param offset The position to begin searching from
195: * @return The position of the last boundary before "offset"
196: * @stable ICU 2.0
197: */
198: public int preceding(int offset) {
199: CharacterIterator text = getText();
200: checkOffset(offset, text);
201:
202: // if we have no cached break positions, or "offset" is outside the
203: // range covered by the cache, we can just call the inherited routine
204: // (which will eventually call other routines in this class that may
205: // refresh the cache)
206: if (cachedBreakPositions == null
207: || offset <= cachedBreakPositions[0]
208: || offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
209: cachedBreakPositions = null;
210: return super .preceding(offset);
211: }
212:
213: // on the other hand, if "offset" is within the range covered by the cache,
214: // then all we have to do is search the cache for the last break position
215: // before "offset"
216: else {
217: positionInCache = 0;
218: while (positionInCache < cachedBreakPositions.length
219: && offset > cachedBreakPositions[positionInCache])
220: ++positionInCache;
221: --positionInCache;
222: text.setIndex(cachedBreakPositions[positionInCache]);
223: return text.getIndex();
224: }
225: }
226:
227: /**
228: * Sets the current iteration position to the first boundary position after
229: * the specified position.
230: * @param offset The position to begin searching forward from
231: * @return The position of the first boundary after "offset"
232: * @stable ICU 2.0
233: */
234: public int following(int offset) {
235: CharacterIterator text = getText();
236: checkOffset(offset, text);
237:
238: // if we have no cached break positions, or if "offset" is outside the
239: // range covered by the cache, then dump the cache and call our
240: // inherited following() method. This will call other methods in this
241: // class that may refresh the cache.
242: if (cachedBreakPositions == null
243: || offset < cachedBreakPositions[0]
244: || offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
245: cachedBreakPositions = null;
246: return super .following(offset);
247: }
248:
249: // on the other hand, if "offset" is within the range covered by the
250: // cache, then just search the cache for the first break position
251: // after "offset"
252: else {
253: positionInCache = 0;
254: while (positionInCache < cachedBreakPositions.length
255: && offset >= cachedBreakPositions[positionInCache])
256: ++positionInCache;
257: text.setIndex(cachedBreakPositions[positionInCache]);
258: return text.getIndex();
259: }
260: }
261:
262: /**
263: * Return the status tag from the break rule that determined the most recently
264: * returned break position.
265: *
266: * TODO: not supported with dictionary based break iterators.
267: *
268: * @return the status from the break rule that determined the most recently
269: * returned break position.
270: * @draft ICU 3.0
271: * @provisional This API might change or be removed in a future release.
272: */
273: public int getRuleStatus() {
274: return 0;
275: }
276:
277: /**
278: * Get the status (tag) values from the break rule(s) that determined the most
279: * recently returned break position. The values appear in the rule source
280: * within brackets, {123}, for example. The default status value for rules
281: * that do not explicitly provide one is zero.
282: * <p>
283: * TODO: not supported for dictionary based break iterator.
284: *
285: * @param fillInArray an array to be filled in with the status values.
286: * @return The number of rule status values from rules that determined
287: * the most recent boundary returned by the break iterator.
288: * In the event that the array is too small, the return value
289: * is the total number of status values that were available,
290: * not the reduced number that were actually returned.
291: * @draft ICU 3.0
292: * @provisional This API might change or be removed in a future release.
293: */
294: public int getRuleStatusVec(int[] fillInArray) {
295: if (fillInArray != null && fillInArray.length >= 1) {
296: fillInArray[0] = 0;
297: }
298: return 1;
299: }
300:
301: /**
302: * This is the implementation function for next().
303: * @internal
304: * @deprecated This API is ICU internal only.
305: */
306: protected int handleNext() {
307: CharacterIterator text = getText();
308:
309: // if there are no cached break positions, or if we've just moved
310: // off the end of the range covered by the cache, we have to dump
311: // and possibly regenerate the cache
312: if (cachedBreakPositions == null
313: || positionInCache == cachedBreakPositions.length - 1) {
314:
315: // start by using the inherited handleNext() to find a tentative return
316: // value. dictionaryCharCount tells us how many dictionary characters
317: // we passed over on our way to the tentative return value
318: int startPos = text.getIndex();
319: fDictionaryCharCount = 0;
320: int result = super .handleNext();
321:
322: // if we passed over more than one dictionary character, then we use
323: // divideUpDictionaryRange() to regenerate the cached break positions
324: // for the new range
325: if (fDictionaryCharCount > 1 && result - startPos > 1) {
326: divideUpDictionaryRange(startPos, result);
327: }
328:
329: // otherwise, the value we got back from the inherited fuction
330: // is our return value, and we can dump the cache
331: else {
332: cachedBreakPositions = null;
333: return result;
334: }
335: }
336:
337: // if the cache of break positions has been regenerated (or existed all
338: // along), then just advance to the next break position in the cache
339: // and return it
340: if (cachedBreakPositions != null) {
341: ++positionInCache;
342: text.setIndex(cachedBreakPositions[positionInCache]);
343: return cachedBreakPositions[positionInCache];
344: }
345: Assert.assrt(false);
346: return -9999; // SHOULD NEVER GET HERE!
347: }
348:
349: /**
350: * This is the function that actually implements the dictionary-based
351: * algorithm. Given the endpoints of a range of text, it uses the
352: * dictionary to determine the positions of any boundaries in this
353: * range. It stores all the boundary positions it discovers in
354: * cachedBreakPositions so that we only have to do this work once
355: * for each time we enter the range.
356: */
357: private void divideUpDictionaryRange(int startPos, int endPos) {
358: CharacterIterator text = getText();
359:
360: // the range we're dividing may begin or end with non-dictionary characters
361: // (i.e., for line breaking, we may have leading or trailing punctuation
362: // that needs to be kept with the word). Seek from the beginning of the
363: // range to the first dictionary character
364: text.setIndex(startPos);
365: int c = CICurrent32(text);
366: while (isDictionaryChar(c) == false) {
367: c = CINext32(text);
368: }
369:
370: //System.out.println("\nDividing up range from " + (text.getIndex() + 1) + " to " + endPos);
371:
372: // initialize. We maintain two stacks: currentBreakPositions contains
373: // the list of break positions that will be returned if we successfully
374: // finish traversing the whole range now. possibleBreakPositions lists
375: // all other possible word ends we've passed along the way. (Whenever
376: // we reach an error [a sequence of characters that can't begin any word
377: // in the dictionary], we back up, possibly delete some breaks from
378: // currentBreakPositions, move a break from possibleBreakPositions
379: // to currentBreakPositions, and start over from there. This process
380: // continues in this way until we either successfully make it all the way
381: // across the range, or exhaust all of our combinations of break
382: // positions.)
383: Stack currentBreakPositions = new Stack();
384: Stack possibleBreakPositions = new Stack();
385: Vector wrongBreakPositions = new Vector();
386:
387: // the dictionary is implemented as a trie, which is treated as a state
388: // machine. -1 represents the end of a legal word. Every word in the
389: // dictionary is represented by a path from the root node to -1. A path
390: // that ends in state 0 is an illegal combination of characters.
391: int state = 0;
392:
393: // these two variables are used for error handling. We keep track of the
394: // farthest we've gotten through the range being divided, and the combination
395: // of breaks that got us that far. If we use up all possible break
396: // combinations, the text contains an error or a word that's not in the
397: // dictionary. In this case, we "bless" the break positions that got us the
398: // farthest as real break positions, and then start over from scratch with
399: // the character where the error occurred.
400: int farthestEndPoint = text.getIndex();
401: Stack bestBreakPositions = null;
402:
403: // initialize (we always exit the loop with a break statement)
404: c = CICurrent32(text);
405: while (true) {
406: //System.out.print("c = " + Integer.toString(c, 16) + ", pos = " + text.getIndex());
407:
408: // if we can transition to state "-1" from our current state, we're
409: // on the last character of a legal word. Push that position onto
410: // the possible-break-positions stack
411: if (dictionary.at(state, 0) == -1) {
412: possibleBreakPositions
413: .push(new Integer(text.getIndex()));
414: }
415:
416: // look up the new state to transition to in the dictionary
417: // There will be no supplementaries here because the Thai dictionary
418: // does not include any. This code is going away soon, not worth
419: // fixing.
420: state = (dictionary.at(state, (char) c)) & 0xFFFF; // TODO: fix supplementaries
421: //System.out.print(", state = " + state);
422:
423: // if the character we're sitting on causes us to transition to
424: // the "end of word" state, then it was a non-dictionary character
425: // and we've successfully traversed the whole range. Drop out
426: // of the loop.
427: if (state == /*-1*/0xFFFF) {
428: currentBreakPositions
429: .push(new Integer(text.getIndex()));
430: break;
431: }
432:
433: // if the character we're sitting on causes us to transition to
434: // the error state, or if we've gone off the end of the range
435: // without transitioning to the "end of word" state, we've hit
436: // an error...
437: else if (state == 0 || text.getIndex() >= endPos) {
438:
439: // if this is the farthest we've gotten, take note of it in
440: // case there's an error in the text
441: if (text.getIndex() > farthestEndPoint) {
442: farthestEndPoint = text.getIndex();
443: bestBreakPositions = (Stack) (currentBreakPositions
444: .clone());
445: }
446:
447: // wrongBreakPositions is a list of all break positions we've tried starting
448: // that didn't allow us to traverse all the way through the text. Every time
449: // we pop a break position off of currentBreakPositions, we put it into
450: // wrongBreakPositions to avoid trying it again later. If we make it to this
451: // spot, we're either going to back up to a break in possibleBreakPositions
452: // and try starting over from there, or we've exhausted all possible break
453: // positions and are going to do the fallback procedure. This loop prevents
454: // us from messing with anything in possibleBreakPositions that didn't work as
455: // a starting point the last time we tried it (this is to prevent a bunch of
456: // repetitive checks from slowing down some extreme cases)
457: // variable not used Integer newStartingSpot = null;
458: while (!possibleBreakPositions.isEmpty()
459: && wrongBreakPositions
460: .contains(possibleBreakPositions.peek())) {
461: possibleBreakPositions.pop();
462: }
463:
464: // if we've used up all possible break-position combinations, there's
465: // an error or an unknown word in the text. In this case, we start
466: // over, treating the farthest character we've reached as the beginning
467: // of the range, and "blessing" the break positions that got us that
468: // far as real break positions
469: if (possibleBreakPositions.isEmpty()) {
470: if (bestBreakPositions != null) {
471: currentBreakPositions = bestBreakPositions;
472: if (farthestEndPoint < endPos) {
473: text.setIndex(farthestEndPoint + 1);
474: } else {
475: break;
476: }
477: } else {
478: if ((currentBreakPositions.size() == 0 || ((Integer) (currentBreakPositions
479: .peek())).intValue() != text.getIndex())
480: && text.getIndex() != startPos) {
481: currentBreakPositions.push(new Integer(text
482: .getIndex()));
483: }
484: CINext32(text);
485: currentBreakPositions.push(new Integer(text
486: .getIndex()));
487: }
488: }
489:
490: // if we still have more break positions we can try, then promote the
491: // last break in possibleBreakPositions into currentBreakPositions,
492: // and get rid of all entries in currentBreakPositions that come after
493: // it. Then back up to that position and start over from there (i.e.,
494: // treat that position as the beginning of a new word)
495: else {
496: Integer temp = (Integer) possibleBreakPositions
497: .pop();
498: Object temp2 = null;
499: while (!currentBreakPositions.isEmpty()
500: && temp.intValue() < ((Integer) currentBreakPositions
501: .peek()).intValue()) {
502: temp2 = currentBreakPositions.pop();
503: wrongBreakPositions.addElement(temp2);
504: }
505: currentBreakPositions.push(temp);
506: text.setIndex(((Integer) currentBreakPositions
507: .peek()).intValue());
508: }
509:
510: // re-sync "c" for the next go-round, and drop out of the loop if
511: // we've made it off the end of the range
512: c = CICurrent32(text);
513: state = 0;
514: if (text.getIndex() >= endPos) {
515: break;
516: }
517: }
518:
519: // if we didn't hit any exceptional conditions on this last iteration,
520: // just advance to the next character and loop
521: else {
522: c = CINext32(text);
523: }
524: //System.out.print(", possibleBreakPositions = { "); for (int i = 0; i < possibleBreakPositions.size(); i++) System.out.print(possibleBreakPositions.elementAt(i) + " "); System.out.print("}");
525: //System.out.print(", currentBreakPositions = { "); for (int i = 0; i < currentBreakPositions.size(); i++) System.out.print(currentBreakPositions.elementAt(i) + " "); System.out.println("}");
526: }
527:
528: // dump the last break position in the list, and replace it with the actual
529: // end of the range (which may be the same character, or may be further on
530: // because the range actually ended with non-dictionary characters we want to
531: // keep with the word)
532: if (!currentBreakPositions.isEmpty()) {
533: currentBreakPositions.pop();
534: }
535: currentBreakPositions.push(new Integer(endPos));
536:
537: // create a regular array to hold the break positions and copy
538: // the break positions from the stack to the array (in addition,
539: // our starting position goes into this array as a break position).
540: // This array becomes the cache of break positions used by next()
541: // and previous(), so this is where we actually refresh the cache.
542: cachedBreakPositions = new int[currentBreakPositions.size() + 1];
543: cachedBreakPositions[0] = startPos;
544:
545: for (int i = 0; i < currentBreakPositions.size(); i++) {
546: cachedBreakPositions[i + 1] = ((Integer) currentBreakPositions
547: .elementAt(i)).intValue();
548: }
549: positionInCache = 0;
550: }
551: }
|