001: /*
002: *
003: * @(#)DictionaryBasedBreakIterator.java 1.13 06/10/03
004: *
005: * Portions Copyright 2000-2006 Sun Microsystems, Inc. All Rights
006: * Reserved. Use is subject to license terms.
007: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
008: *
009: * This program is free software; you can redistribute it and/or
010: * modify it under the terms of the GNU General Public License version
011: * 2 only, as published by the Free Software Foundation.
012: *
013: * This program is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * General Public License version 2 for more details (a copy is
017: * included at /legal/license.txt).
018: *
019: * You should have received a copy of the GNU General Public License
020: * version 2 along with this work; if not, write to the Free Software
021: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022: * 02110-1301 USA
023: *
024: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
025: * Clara, CA 95054 or visit www.sun.com if you need additional
026: * information or have any questions.
027: */
028:
029: /*
030: * @(#)DictionaryBasedBreakIterator.java 1.3 99/05/03
031: *
032: * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
033: * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
034: *
035: * The original version of this source code and documentation
036: * is copyrighted and owned by Taligent, Inc., a wholly-owned
037: * subsidiary of IBM. These materials are provided under terms
038: * of a License Agreement between Taligent and Sun. This technology
039: * is protected by multiple US and International patents.
040: *
041: * This notice and attribution to Taligent may not be removed.
042: * Taligent is a registered trademark of Taligent, Inc.
043: */
044:
045: package java.text;
046:
047: import java.util.Vector;
048: import java.util.Stack;
049: import java.util.Hashtable;
050: import java.text.CharacterIterator;
051: import java.io.InputStream;
052: import java.io.IOException;
053:
054: /**
055: * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
056: * to further subdivide ranges of text beyond what is possible using just the
057: * state-table-based algorithm. This is necessary, for example, to handle
058: * word and line breaking in Thai, which doesn't use spaces between words. The
059: * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
060: * up text as far as possible, and then contiguous ranges of letters are
061: * repeatedly compared against a list of known words (i.e., the dictionary)
062: * to divide them up into words.
063: *
064: * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
065: * but adds one more special substitution name: <dictionary>. This substitution
066: * name is used to identify characters in words in the dictionary. The idea is that
067: * if the iterator passes over a chunk of text that includes two or more characters
068: * in a row that are included in <dictionary>, it goes back through that range and
069: * derives additional break positions (if possible) using the dictionary.
070: *
071: * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
072: * file. It follows a prescribed search path to locate the dictionary (right now,
073: * it looks for it in /com/ibm/text/resources in each directory in the classpath,
074: * and won't find it in JAR files, but this location is likely to change). The
075: * dictionary file is in a serialized binary format. We have a very primitive (and
076: * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
077: * currently making it public. Contact us for help.
078: */
079: class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
080:
081: /**
082: * a list of known words that is used to divide up contiguous ranges of letters,
083: * stored in a compressed, indexed, format that offers fast access
084: */
085: private BreakDictionary dictionary;
086:
087: /**
088: * a list of flags indicating which character categories are contained in
089: * the dictionary file (this is used to determine which ranges of characters
090: * to apply the dictionary to)
091: */
092: private boolean[] categoryFlags;
093:
094: /**
095: * a temporary hiding place for the number of dictionary characters in the
096: * last range passed over by next()
097: */
098: private int dictionaryCharCount;
099:
100: /**
101: * when a range of characters is divided up using the dictionary, the break
102: * positions that are discovered are stored here, preventing us from having
103: * to use either the dictionary or the state table again until the iterator
104: * leaves this range of text
105: */
106: private int[] cachedBreakPositions;
107:
108: /**
109: * if cachedBreakPositions is not null, this indicates which item in the
110: * cache the current iteration position refers to
111: */
112: private int positionInCache;
113:
114: /**
115: * Constructs a DictionaryBasedBreakIterator.
116: * @param description Same as the description parameter on RuleBasedBreakIterator,
117: * except for the special meaning of "<dictionary>". This parameter is just
118: * passed through to RuleBasedBreakIterator's constructor.
119: * @param dictionaryFilename The filename of the dictionary file to use
120: */
121: public DictionaryBasedBreakIterator(String description,
122: InputStream dictionaryStream) throws IOException {
123: super (description);
124: dictionary = new BreakDictionary(dictionaryStream);
125: }
126:
127: /**
128: * Returns a Builder that is customized to build a DictionaryBasedBreakIterator.
129: * This is the same as RuleBasedBreakIterator.Builder, except for the extra code
130: * to handle the <dictionary> tag.
131: */
132: protected RuleBasedBreakIterator.Builder makeBuilder() {
133: return new Builder();
134: }
135:
136: public void setText(CharacterIterator newText) {
137: super .setText(newText);
138: cachedBreakPositions = null;
139: dictionaryCharCount = 0;
140: positionInCache = 0;
141: }
142:
143: /**
144: * Sets the current iteration position to the beginning of the text.
145: * (i.e., the CharacterIterator's starting offset).
146: * @return The offset of the beginning of the text.
147: */
148: public int first() {
149: cachedBreakPositions = null;
150: dictionaryCharCount = 0;
151: positionInCache = 0;
152: return super .first();
153: }
154:
155: /**
156: * Sets the current iteration position to the end of the text.
157: * (i.e., the CharacterIterator's ending offset).
158: * @return The text's past-the-end offset.
159: */
160: public int last() {
161: cachedBreakPositions = null;
162: dictionaryCharCount = 0;
163: positionInCache = 0;
164: return super .last();
165: }
166:
167: /**
168: * Advances the iterator one step backwards.
169: * @return The position of the last boundary position before the
170: * current iteration position
171: */
172: public int previous() {
173: CharacterIterator text = getText();
174:
175: // if we have cached break positions and we're still in the range
176: // covered by them, just move one step backward in the cache
177: if (cachedBreakPositions != null && positionInCache > 0) {
178: --positionInCache;
179: text.setIndex(cachedBreakPositions[positionInCache]);
180: return cachedBreakPositions[positionInCache];
181: }
182:
183: // otherwise, dump the cache and use the inherited previous() method to move
184: // backward. This may fill up the cache with new break positions, in which
185: // case we have to mark our position in the cache
186: else {
187: cachedBreakPositions = null;
188: int result = super .previous();
189: if (cachedBreakPositions != null)
190: positionInCache = cachedBreakPositions.length - 2;
191: return result;
192: }
193: }
194:
195: /**
196: * Sets the current iteration position to the last boundary position
197: * before the specified position.
198: * @param offset The position to begin searching from
199: * @return The position of the last boundary before "offset"
200: */
201: public int preceding(int offset) {
202: CharacterIterator text = getText();
203: checkOffset(offset, text);
204:
205: // if we have no cached break positions, or "offset" is outside the
206: // range covered by the cache, we can just call the inherited routine
207: // (which will eventually call other routines in this class that may
208: // refresh the cache)
209: if (cachedBreakPositions == null
210: || offset <= cachedBreakPositions[0]
211: || offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
212: cachedBreakPositions = null;
213: return super .preceding(offset);
214: }
215:
216: // on the other hand, if "offset" is within the range covered by the cache,
217: // then all we have to do is search the cache for the last break position
218: // before "offset"
219: else {
220: positionInCache = 0;
221: while (positionInCache < cachedBreakPositions.length
222: && offset > cachedBreakPositions[positionInCache])
223: ++positionInCache;
224: --positionInCache;
225: text.setIndex(cachedBreakPositions[positionInCache]);
226: return text.getIndex();
227: }
228: }
229:
230: /**
231: * Sets the current iteration position to the first boundary position after
232: * the specified position.
233: * @param offset The position to begin searching forward from
234: * @return The position of the first boundary after "offset"
235: */
236: public int following(int offset) {
237: CharacterIterator text = getText();
238: checkOffset(offset, text);
239:
240: // if we have no cached break positions, or if "offset" is outside the
241: // range covered by the cache, then dump the cache and call our
242: // inherited following() method. This will call other methods in this
243: // class that may refresh the cache.
244: if (cachedBreakPositions == null
245: || offset < cachedBreakPositions[0]
246: || offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
247: cachedBreakPositions = null;
248: return super .following(offset);
249: }
250:
251: // on the other hand, if "offset" is within the range covered by the
252: // cache, then just search the cache for the first break position
253: // after "offset"
254: else {
255: positionInCache = 0;
256: while (positionInCache < cachedBreakPositions.length
257: && offset >= cachedBreakPositions[positionInCache])
258: ++positionInCache;
259: text.setIndex(cachedBreakPositions[positionInCache]);
260: return text.getIndex();
261: }
262: }
263:
264: /**
265: * This is the implementation function for next().
266: */
267: protected int handleNext() {
268: CharacterIterator text = getText();
269:
270: // if there are no cached break positions, or if we've just moved
271: // off the end of the range covered by the cache, we have to dump
272: // and possibly regenerate the cache
273: if (cachedBreakPositions == null
274: || positionInCache == cachedBreakPositions.length - 1) {
275:
276: // start by using the inherited handleNext() to find a tentative return
277: // value. dictionaryCharCount tells us how many dictionary characters
278: // we passed over on our way to the tentative return value
279: int startPos = text.getIndex();
280: dictionaryCharCount = 0;
281: int result = super .handleNext();
282:
283: // if we passed over more than one dictionary character, then we use
284: // divideUpDictionaryRange() to regenerate the cached break positions
285: // for the new range
286: if (dictionaryCharCount > 1 && result - startPos > 1) {
287: divideUpDictionaryRange(startPos, result);
288: }
289:
290: // otherwise, the value we got back from the inherited fuction
291: // is our return value, and we can dump the cache
292: else {
293: cachedBreakPositions = null;
294: return result;
295: }
296: }
297:
298: // if the cache of break positions has been regenerated (or existed all
299: // along), then just advance to the next break position in the cache
300: // and return it
301: if (cachedBreakPositions != null) {
302: ++positionInCache;
303: text.setIndex(cachedBreakPositions[positionInCache]);
304: return cachedBreakPositions[positionInCache];
305: }
306: return -9999; // SHOULD NEVER GET HERE!
307: }
308:
309: /**
310: * Looks up a character category for a character.
311: */
312: protected int lookupCategory(char c) {
313: // this override of lookupCategory() exists only to keep track of whether we've
314: // passed over any dictionary characters. It calls the inherited lookupCategory()
315: // to do the real work, and then checks whether its return value is one of the
316: // categories represented in the dictionary. If it is, bump the dictionary-
317: // character count.
318: int result = super .lookupCategory(c);
319: if (result != RuleBasedBreakIterator.IGNORE
320: && categoryFlags[result]) {
321: ++dictionaryCharCount;
322: }
323: return result;
324: }
325:
326: /**
327: * This is the function that actually implements the dictionary-based
328: * algorithm. Given the endpoints of a range of text, it uses the
329: * dictionary to determine the positions of any boundaries in this
330: * range. It stores all the boundary positions it discovers in
331: * cachedBreakPositions so that we only have to do this work once
332: * for each time we enter the range.
333: */
334: private void divideUpDictionaryRange(int startPos, int endPos) {
335: CharacterIterator text = getText();
336:
337: // the range we're dividing may begin or end with non-dictionary characters
338: // (i.e., for line breaking, we may have leading or trailing punctuation
339: // that needs to be kept with the word). Seek from the beginning of the
340: // range to the first dictionary character
341: text.setIndex(startPos);
342: char c = text.current();
343: int category = lookupCategory(c);
344: while (category == IGNORE || !categoryFlags[category]) {
345: c = text.next();
346: category = lookupCategory(c);
347: }
348:
349: // initialize. We maintain two stacks: currentBreakPositions contains
350: // the list of break positions that will be returned if we successfully
351: // finish traversing the whole range now. possibleBreakPositions lists
352: // all other possible word ends we've passed along the way. (Whenever
353: // we reach an error [a sequence of characters that can't begin any word
354: // in the dictionary], we back up, possibly delete some breaks from
355: // currentBreakPositions, move a break from possibleBreakPositions
356: // to currentBreakPositions, and start over from there. This process
357: // continues in this way until we either successfully make it all the way
358: // across the range, or exhaust all of our combinations of break
359: // positions.)
360: Stack currentBreakPositions = new Stack();
361: Stack possibleBreakPositions = new Stack();
362: Vector wrongBreakPositions = new Vector();
363:
364: // the dictionary is implemented as a trie, which is treated as a state
365: // machine. -1 represents the end of a legal word. Every word in the
366: // dictionary is represented by a path from the root node to -1. A path
367: // that ends in state 0 is an illegal combination of characters.
368: int state = 0;
369:
370: // these two variables are used for error handling. We keep track of the
371: // farthest we've gotten through the range being divided, and the combination
372: // of breaks that got us that far. If we use up all possible break
373: // combinations, the text contains an error or a word that's not in the
374: // dictionary. In this case, we "bless" the break positions that got us the
375: // farthest as real break positions, and then start over from scratch with
376: // the character where the error occurred.
377: int farthestEndPoint = text.getIndex();
378: Stack bestBreakPositions = null;
379:
380: // initialize (we always exit the loop with a break statement)
381: c = text.current();
382: while (true) {
383:
384: // if we can transition to state "-1" from our current state, we're
385: // on the last character of a legal word. Push that position onto
386: // the possible-break-positions stack
387: if (dictionary.at(state, 0) == -1) {
388: possibleBreakPositions
389: .push(new Integer(text.getIndex()));
390: }
391:
392: // look up the new state to transition to in the dictionary
393: state = dictionary.at(state, c);
394:
395: // if the character we're sitting on causes us to transition to
396: // the "end of word" state, then it was a non-dictionary character
397: // and we've successfully traversed the whole range. Drop out
398: // of the loop.
399: if (state == -1) {
400: currentBreakPositions
401: .push(new Integer(text.getIndex()));
402: break;
403: }
404:
405: // if the character we're sitting on causes us to transition to
406: // the error state, or if we've gone off the end of the range
407: // without transitioning to the "end of word" state, we've hit
408: // an error...
409: else if (state == 0 || text.getIndex() >= endPos) {
410:
411: // if this is the farthest we've gotten, take note of it in
412: // case there's an error in the text
413: if (text.getIndex() > farthestEndPoint) {
414: farthestEndPoint = text.getIndex();
415: bestBreakPositions = (Stack) (currentBreakPositions
416: .clone());
417: }
418:
419: // wrongBreakPositions is a list of all break positions
420: // we've tried starting that didn't allow us to traverse
421: // all the way through the text. Every time we pop a
422: //break position off of currentBreakPositions, we put it
423: // into wrongBreakPositions to avoid trying it again later.
424: // If we make it to this spot, we're either going to back
425: // up to a break in possibleBreakPositions and try starting
426: // over from there, or we've exhausted all possible break
427: // positions and are going to do the fallback procedure.
428: // This loop prevents us from messing with anything in
429: // possibleBreakPositions that didn't work as a starting
430: // point the last time we tried it (this is to prevent a bunch of
431: // repetitive checks from slowing down some extreme cases)
432: Integer newStartingSpot = null;
433: while (!possibleBreakPositions.isEmpty()
434: && wrongBreakPositions
435: .contains(possibleBreakPositions.peek())) {
436: possibleBreakPositions.pop();
437: }
438:
439: // if we've used up all possible break-position combinations, there's
440: // an error or an unknown word in the text. In this case, we start
441: // over, treating the farthest character we've reached as the beginning
442: // of the range, and "blessing" the break positions that got us that
443: // far as real break positions
444: if (possibleBreakPositions.isEmpty()) {
445: if (bestBreakPositions != null) {
446: currentBreakPositions = bestBreakPositions;
447: if (farthestEndPoint < endPos) {
448: text.setIndex(farthestEndPoint + 1);
449: } else {
450: break;
451: }
452: } else {
453: if ((currentBreakPositions.size() == 0 || ((Integer) (currentBreakPositions
454: .peek())).intValue() != text.getIndex())
455: && text.getIndex() != startPos) {
456: currentBreakPositions.push(new Integer(text
457: .getIndex()));
458: }
459: text.next();
460: currentBreakPositions.push(new Integer(text
461: .getIndex()));
462: }
463: }
464:
465: // if we still have more break positions we can try, then promote the
466: // last break in possibleBreakPositions into currentBreakPositions,
467: // and get rid of all entries in currentBreakPositions that come after
468: // it. Then back up to that position and start over from there (i.e.,
469: // treat that position as the beginning of a new word)
470: else {
471: Integer temp = (Integer) possibleBreakPositions
472: .pop();
473: Object temp2 = null;
474: while (!currentBreakPositions.isEmpty()
475: && temp.intValue() < ((Integer) currentBreakPositions
476: .peek()).intValue()) {
477: temp2 = currentBreakPositions.pop();
478: wrongBreakPositions.addElement(temp2);
479: }
480: currentBreakPositions.push(temp);
481: text.setIndex(((Integer) currentBreakPositions
482: .peek()).intValue());
483: }
484:
485: // re-sync "c" for the next go-round, and drop out of the loop if
486: // we've made it off the end of the range
487: c = text.current();
488: if (text.getIndex() >= endPos) {
489: break;
490: }
491: }
492:
493: // if we didn't hit any exceptional conditions on this last iteration,
494: // just advance to the next character and loop
495: else {
496: c = text.next();
497: }
498: }
499:
500: // dump the last break position in the list, and replace it with the actual
501: // end of the range (which may be the same character, or may be further on
502: // because the range actually ended with non-dictionary characters we want to
503: // keep with the word)
504: if (!currentBreakPositions.isEmpty()) {
505: currentBreakPositions.pop();
506: }
507: currentBreakPositions.push(new Integer(endPos));
508:
509: // create a regular array to hold the break positions and copy
510: // the break positions from the stack to the array (in addition,
511: // our starting position goes into this array as a break position).
512: // This array becomes the cache of break positions used by next()
513: // and previous(), so this is where we actually refresh the cache.
514: cachedBreakPositions = new int[currentBreakPositions.size() + 1];
515: cachedBreakPositions[0] = startPos;
516:
517: for (int i = 0; i < currentBreakPositions.size(); i++) {
518: cachedBreakPositions[i + 1] = ((Integer) currentBreakPositions
519: .elementAt(i)).intValue();
520: }
521: positionInCache = 0;
522: }
523:
524: /**
525: * The Builder class for DictionaryBasedBreakIterator inherits almost all of
526: * its functionality from the Builder class for RuleBasedBreakIterator, but
527: * extends it with extra logic to handle the "<dictionary>" token
528: */
529: protected class Builder extends RuleBasedBreakIterator.Builder {
530:
531: /**
532: * A CharSet that contains all the characters represented in the dictionary
533: */
534: private CharSet dictionaryChars = new CharSet();
535: private String dictionaryExpression = "";
536:
537: /**
538: * No special initialization
539: */
540: public Builder() {
541: DictionaryBasedBreakIterator.this .super ();
542: }
543:
544: /**
545: * We override handleSpecialSubstitution() to add logic to handle
546: * the <dictionary> tag. If we see a substitution named "<dictionary>",
547: * parse the substitution expression and store the result in
548: * dictionaryChars.
549: */
550: protected void handleSpecialSubstitution(String replace,
551: String replaceWith, int startPos, String description) {
552: super .handleSpecialSubstitution(replace, replaceWith,
553: startPos, description);
554:
555: if (replace.equals("<dictionary>")) {
556: if (replaceWith.charAt(0) == '(') {
557: error("Dictionary group can't be enclosed in (",
558: startPos, description);
559: }
560: dictionaryExpression = replaceWith;
561: dictionaryChars = CharSet.parseString(replaceWith);
562: }
563: }
564:
565: /**
566: * The other half of the logic to handle the dictionary characters happens here.
567: * After the inherited builder has derived the real character categories, we
568: * set up the categoryFlags array in the iterator. This array contains "true"
569: * for every character category that includes a dictionary character.
570: */
571: protected void buildCharCategories(Vector tempRuleList) {
572: super .buildCharCategories(tempRuleList);
573:
574: categoryFlags = new boolean[categories.size()];
575: for (int i = 0; i < categories.size(); i++) {
576: CharSet cs = (CharSet) categories.elementAt(i);
577: if (!(cs.intersection(dictionaryChars).empty())) {
578: categoryFlags[i] = true;
579: }
580: }
581: }
582:
583: // This function is actually called by
584: // RuleBasedBreakIterator.buildCharCategories(), which is called
585: // by the function above. This gives us a way to create a separate
586: // character category for the dictionary characters even when
587: // RuleBasedBreakIterator isn't making a distinction.
588: protected void mungeExpressionList(Hashtable expressions) {
589: expressions.put(dictionaryExpression, dictionaryChars);
590: }
591: }
592: }
|