Source Code Cross Referenced for DictionaryBasedBreakIterator.java in » Internationalization-Localization » icu4j » com » ibm » icu » text » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.text
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         *******************************************************************************
003:         * Copyright (C) 1996-2006, International Business Machines Corporation and    *
004:         * others. All Rights Reserved.                                                *
005:         *******************************************************************************
006:         */
007:
008:        package com.ibm.icu.text;
009:
010:        import java.util.Vector;
011:        import java.util.Stack;
012:        import com.ibm.icu.impl.Assert;
013:        import java.text.CharacterIterator;
014:        import java.io.InputStream;
015:        import java.io.IOException;
016:
017:        /**
018:         * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
019:         * to further subdivide ranges of text beyond what is possible using just the
020:         * state-table-based algorithm.  This is necessary, for example, to handle
021:         * word and line breaking in Thai, which doesn't use spaces between words.  The
022:         * state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide
023:         * up text as far as possible, and then contiguous ranges of letters are
024:         * repeatedly compared against a list of known words (i.e., the dictionary)
025:         * to divide them up into words.
026:         *
027:         * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old,
028:         * but adds one more special substitution name: _dictionary_.  This substitution
029:         * name is used to identify characters in words in the dictionary.  The idea is that
030:         * if the iterator passes over a chunk of text that includes two or more characters
031:         * in a row that are included in _dictionary_, it goes back through that range and
032:         * derives additional break positions (if possible) using the dictionary.
033:         *
034:         * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
035:         * file.  It uses Class.getResource() to locate the dictionary file.  The
036:         * dictionary file is in a serialized binary format.  We have a very primitive (and
037:         * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
038:         * currently making it public.  Contact us for help.
039:         *
040:         * @stable ICU 2.0
041:         */
042:        public class DictionaryBasedBreakIterator extends
043:                RuleBasedBreakIterator {
044:
045:            /**
046:             * a list of known words that is used to divide up contiguous ranges of letters,
047:             * stored in a compressed, indexed, format that offers fast access
048:             */
049:            private BreakDictionary dictionary;
050:
051:            /**
052:             * a list of flags indicating which character categories are contained in
053:             * the dictionary file (this is used to determine which ranges of characters
054:             * to apply the dictionary to)
055:             */
056:            private boolean[] categoryFlags;
057:
058:            /**
059:             * when a range of characters is divided up using the dictionary, the break
060:             * positions that are discovered are stored here, preventing us from having
061:             * to use either the dictionary or the state table again until the iterator
062:             * leaves this range of text
063:             */
064:            private int[] cachedBreakPositions;
065:
066:            /**
067:             * if cachedBreakPositions is not null, this indicates which item in the
068:             * cache the current iteration position refers to
069:             */
070:            private int positionInCache;
071:
072:            /**
073:             * Special variable name for characters in words in dictionary
074:             */
075:
076:            /**
077:             * Constructs a DictionaryBasedBreakIterator.
078:             * @param rules Same as the rules parameter on RuleBasedBreakIterator,
079:             * except for the special meaning of "_dictionary_".  This parameter is just
080:             * passed through to RuleBasedBreakIterator constructor.
081:             * @param dictionaryStream the stream containing the dictionary data
082:             * @stable ICU 2.0
083:             */
084:            public DictionaryBasedBreakIterator(String rules,
085:                    InputStream dictionaryStream) throws IOException {
086:                super (rules);
087:                dictionary = new BreakDictionary(dictionaryStream);
088:            }
089:
090:            /**
091:             * Construct a DictionarBasedBreakIterator from precompiled rules.
092:             * @param compiledRules an input stream containing the binary (flattened) compiled rules.
093:             * @param dictionaryStream an input stream containing the dictionary data
094:             * @internal
095:             * @deprecated This API is ICU internal only.
096:             */
097:            public DictionaryBasedBreakIterator(InputStream compiledRules,
098:                    InputStream dictionaryStream) throws IOException {
099:                fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator.
100:                dictionary = new BreakDictionary(dictionaryStream);
101:            }
102:
103:            /** @stable ICU 2.0 */
104:            public void setText(CharacterIterator newText) {
105:                super .setText(newText);
106:                cachedBreakPositions = null;
107:                fDictionaryCharCount = 0;
108:                positionInCache = 0;
109:            }
110:
111:            /**
112:             * Sets the current iteration position to the beginning of the text.
113:             * (i.e., the CharacterIterator's starting offset).
114:             * @return The offset of the beginning of the text.
115:             * @stable ICU 2.0
116:             */
117:            public int first() {
118:                cachedBreakPositions = null;
119:                fDictionaryCharCount = 0;
120:                positionInCache = 0;
121:                return super .first();
122:            }
123:
124:            /**
125:             * Sets the current iteration position to the end of the text.
126:             * (i.e., the CharacterIterator's ending offset).
127:             * @return The text's past-the-end offset.
128:             * @stable ICU 2.0
129:             */
130:            public int last() {
131:                cachedBreakPositions = null;
132:                fDictionaryCharCount = 0;
133:                positionInCache = 0;
134:                return super .last();
135:            }
136:
137:            /**
138:             * Advances the iterator one step backwards.
139:             * @return The position of the last boundary position before the
140:             * current iteration position
141:             * @stable ICU 2.0
142:             */
143:            public int previous() {
144:                CharacterIterator text = getText();
145:
146:                // if we have cached break positions and we're still in the range
147:                // covered by them, just move one step backward in the cache
148:                if (cachedBreakPositions != null && positionInCache > 0) {
149:                    --positionInCache;
150:                    text.setIndex(cachedBreakPositions[positionInCache]);
151:                    return cachedBreakPositions[positionInCache];
152:                }
153:
154:                // otherwise, dump the cache and use the inherited previous() method to move
155:                // backward.  This may fill up the cache with new break positions, in which
156:                // case we have to mark our position in the cache. If it doesn't, use next()
157:                // to move forward until we hit or pass the current position. This *will* fill
158:                // the cache.
159:                else {
160:                    cachedBreakPositions = null;
161:                    int offset = current();
162:                    int result = super .previous();
163:
164:                    if (cachedBreakPositions != null) {
165:                        positionInCache = cachedBreakPositions.length - 2;
166:                        return result;
167:                    }
168:
169:                    while (result < offset) {
170:                        int nextResult = next();
171:
172:                        if (nextResult >= offset) {
173:                            break;
174:                        }
175:
176:                        result = nextResult;
177:                    }
178:
179:                    if (cachedBreakPositions != null) {
180:                        positionInCache = cachedBreakPositions.length - 2;
181:                    }
182:
183:                    if (result != BreakIterator.DONE) {
184:                        text.setIndex(result);
185:                    }
186:
187:                    return result;
188:                }
189:            }
190:
191:            /**
192:             * Sets the current iteration position to the last boundary position
193:             * before the specified position.
194:             * @param offset The position to begin searching from
195:             * @return The position of the last boundary before "offset"
196:             * @stable ICU 2.0
197:             */
198:            public int preceding(int offset) {
199:                CharacterIterator text = getText();
200:                checkOffset(offset, text);
201:
202:                // if we have no cached break positions, or "offset" is outside the
203:                // range covered by the cache, we can just call the inherited routine
204:                // (which will eventually call other routines in this class that may
205:                // refresh the cache)
206:                if (cachedBreakPositions == null
207:                        || offset <= cachedBreakPositions[0]
208:                        || offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
209:                    cachedBreakPositions = null;
210:                    return super .preceding(offset);
211:                }
212:
213:                // on the other hand, if "offset" is within the range covered by the cache,
214:                // then all we have to do is search the cache for the last break position
215:                // before "offset"
216:                else {
217:                    positionInCache = 0;
218:                    while (positionInCache < cachedBreakPositions.length
219:                            && offset > cachedBreakPositions[positionInCache])
220:                        ++positionInCache;
221:                    --positionInCache;
222:                    text.setIndex(cachedBreakPositions[positionInCache]);
223:                    return text.getIndex();
224:                }
225:            }
226:
227:            /**
228:             * Sets the current iteration position to the first boundary position after
229:             * the specified position.
230:             * @param offset The position to begin searching forward from
231:             * @return The position of the first boundary after "offset"
232:             * @stable ICU 2.0
233:             */
234:            public int following(int offset) {
235:                CharacterIterator text = getText();
236:                checkOffset(offset, text);
237:
238:                // if we have no cached break positions, or if "offset" is outside the
239:                // range covered by the cache, then dump the cache and call our
240:                // inherited following() method.  This will call other methods in this
241:                // class that may refresh the cache.
242:                if (cachedBreakPositions == null
243:                        || offset < cachedBreakPositions[0]
244:                        || offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
245:                    cachedBreakPositions = null;
246:                    return super .following(offset);
247:                }
248:
249:                // on the other hand, if "offset" is within the range covered by the
250:                // cache, then just search the cache for the first break position
251:                // after "offset"
252:                else {
253:                    positionInCache = 0;
254:                    while (positionInCache < cachedBreakPositions.length
255:                            && offset >= cachedBreakPositions[positionInCache])
256:                        ++positionInCache;
257:                    text.setIndex(cachedBreakPositions[positionInCache]);
258:                    return text.getIndex();
259:                }
260:            }
261:
262:            /**
263:             * Return the status tag from the break rule that determined the most recently
264:             * returned break position. 
265:             * 
266:             * TODO:  not supported with dictionary based break iterators.
267:             *
268:             * @return the status from the break rule that determined the most recently
269:             * returned break position.
270:             * @draft ICU 3.0
271:             * @provisional This API might change or be removed in a future release.
272:             */
273:            public int getRuleStatus() {
274:                return 0;
275:            }
276:
277:            /**
278:             * Get the status (tag) values from the break rule(s) that determined the most 
279:             * recently returned break position.  The values appear in the rule source
280:             * within brackets, {123}, for example.  The default status value for rules
281:             * that do not explicitly provide one is zero.
282:             * <p>
283:             * TODO: not supported for dictionary based break iterator. 
284:             *
285:             * @param fillInArray an array to be filled in with the status values.  
286:             * @return          The number of rule status values from rules that determined 
287:             *                  the most recent boundary returned by the break iterator.
288:             *                  In the event that the array is too small, the return value
289:             *                  is the total number of status values that were available,
290:             *                  not the reduced number that were actually returned.
291:             * @draft ICU 3.0
292:             * @provisional This API might change or be removed in a future release.
293:             */
294:            public int getRuleStatusVec(int[] fillInArray) {
295:                if (fillInArray != null && fillInArray.length >= 1) {
296:                    fillInArray[0] = 0;
297:                }
298:                return 1;
299:            }
300:
301:            /**
302:             * This is the implementation function for next().
303:             * @internal
304:             * @deprecated This API is ICU internal only.
305:             */
306:            protected int handleNext() {
307:                CharacterIterator text = getText();
308:
309:                // if there are no cached break positions, or if we've just moved
310:                // off the end of the range covered by the cache, we have to dump
311:                // and possibly regenerate the cache
312:                if (cachedBreakPositions == null
313:                        || positionInCache == cachedBreakPositions.length - 1) {
314:
315:                    // start by using the inherited handleNext() to find a tentative return
316:                    // value.   dictionaryCharCount tells us how many dictionary characters
317:                    // we passed over on our way to the tentative return value
318:                    int startPos = text.getIndex();
319:                    fDictionaryCharCount = 0;
320:                    int result = super .handleNext();
321:
322:                    // if we passed over more than one dictionary character, then we use
323:                    // divideUpDictionaryRange() to regenerate the cached break positions
324:                    // for the new range
325:                    if (fDictionaryCharCount > 1 && result - startPos > 1) {
326:                        divideUpDictionaryRange(startPos, result);
327:                    }
328:
329:                    // otherwise, the value we got back from the inherited fuction
330:                    // is our return value, and we can dump the cache
331:                    else {
332:                        cachedBreakPositions = null;
333:                        return result;
334:                    }
335:                }
336:
337:                // if the cache of break positions has been regenerated (or existed all
338:                // along), then just advance to the next break position in the cache
339:                // and return it
340:                if (cachedBreakPositions != null) {
341:                    ++positionInCache;
342:                    text.setIndex(cachedBreakPositions[positionInCache]);
343:                    return cachedBreakPositions[positionInCache];
344:                }
345:                Assert.assrt(false);
346:                return -9999; // SHOULD NEVER GET HERE!
347:            }
348:
349:            /**
350:             * This is the function that actually implements the dictionary-based
351:             * algorithm.  Given the endpoints of a range of text, it uses the
352:             * dictionary to determine the positions of any boundaries in this
353:             * range.  It stores all the boundary positions it discovers in
354:             * cachedBreakPositions so that we only have to do this work once
355:             * for each time we enter the range.
356:             */
357:            private void divideUpDictionaryRange(int startPos, int endPos) {
358:                CharacterIterator text = getText();
359:
360:                // the range we're dividing may begin or end with non-dictionary characters
361:                // (i.e., for line breaking, we may have leading or trailing punctuation
362:                // that needs to be kept with the word).  Seek from the beginning of the
363:                // range to the first dictionary character
364:                text.setIndex(startPos);
365:                int c = CICurrent32(text);
366:                while (isDictionaryChar(c) == false) {
367:                    c = CINext32(text);
368:                }
369:
370:                //System.out.println("\nDividing up range from " + (text.getIndex() + 1) + " to " + endPos);
371:
372:                // initialize.  We maintain two stacks: currentBreakPositions contains
373:                // the list of break positions that will be returned if we successfully
374:                // finish traversing the whole range now.  possibleBreakPositions lists
375:                // all other possible word ends we've passed along the way.  (Whenever
376:                // we reach an error [a sequence of characters that can't begin any word
377:                // in the dictionary], we back up, possibly delete some breaks from
378:                // currentBreakPositions, move a break from possibleBreakPositions
379:                // to currentBreakPositions, and start over from there.  This process
380:                // continues in this way until we either successfully make it all the way
381:                // across the range, or exhaust all of our combinations of break
382:                // positions.)
383:                Stack currentBreakPositions = new Stack();
384:                Stack possibleBreakPositions = new Stack();
385:                Vector wrongBreakPositions = new Vector();
386:
387:                // the dictionary is implemented as a trie, which is treated as a state
388:                // machine.  -1 represents the end of a legal word.  Every word in the
389:                // dictionary is represented by a path from the root node to -1.  A path
390:                // that ends in state 0 is an illegal combination of characters.
391:                int state = 0;
392:
393:                // these two variables are used for error handling.  We keep track of the
394:                // farthest we've gotten through the range being divided, and the combination
395:                // of breaks that got us that far.  If we use up all possible break
396:                // combinations, the text contains an error or a word that's not in the
397:                // dictionary.  In this case, we "bless" the break positions that got us the
398:                // farthest as real break positions, and then start over from scratch with
399:                // the character where the error occurred.
400:                int farthestEndPoint = text.getIndex();
401:                Stack bestBreakPositions = null;
402:
403:                // initialize (we always exit the loop with a break statement)
404:                c = CICurrent32(text);
405:                while (true) {
406:                    //System.out.print("c = " + Integer.toString(c, 16) + ", pos = " + text.getIndex());
407:
408:                    // if we can transition to state "-1" from our current state, we're
409:                    // on the last character of a legal word.  Push that position onto
410:                    // the possible-break-positions stack
411:                    if (dictionary.at(state, 0) == -1) {
412:                        possibleBreakPositions
413:                                .push(new Integer(text.getIndex()));
414:                    }
415:
416:                    // look up the new state to transition to in the dictionary
417:                    //    There will be no supplementaries here because the Thai dictionary
418:                    //     does not include any.  This code is going away soon, not worth
419:                    //     fixing.
420:                    state = (dictionary.at(state, (char) c)) & 0xFFFF; // TODO: fix supplementaries
421:                    //System.out.print(", state = " + state);
422:
423:                    // if the character we're sitting on causes us to transition to
424:                    // the "end of word" state, then it was a non-dictionary character
425:                    // and we've successfully traversed the whole range.  Drop out
426:                    // of the loop.
427:                    if (state == /*-1*/0xFFFF) {
428:                        currentBreakPositions
429:                                .push(new Integer(text.getIndex()));
430:                        break;
431:                    }
432:
433:                    // if the character we're sitting on causes us to transition to
434:                    // the error state, or if we've gone off the end of the range
435:                    // without transitioning to the "end of word" state, we've hit
436:                    // an error...
437:                    else if (state == 0 || text.getIndex() >= endPos) {
438:
439:                        // if this is the farthest we've gotten, take note of it in
440:                        // case there's an error in the text
441:                        if (text.getIndex() > farthestEndPoint) {
442:                            farthestEndPoint = text.getIndex();
443:                            bestBreakPositions = (Stack) (currentBreakPositions
444:                                    .clone());
445:                        }
446:
447:                        // wrongBreakPositions is a list of all break positions we've tried starting
448:                        // that didn't allow us to traverse all the way through the text.  Every time
449:                        // we pop a break position off of currentBreakPositions, we put it into
450:                        // wrongBreakPositions to avoid trying it again later.  If we make it to this
451:                        // spot, we're either going to back up to a break in possibleBreakPositions
452:                        // and try starting over from there, or we've exhausted all possible break
453:                        // positions and are going to do the fallback procedure.  This loop prevents
454:                        // us from messing with anything in possibleBreakPositions that didn't work as
455:                        // a starting point the last time we tried it (this is to prevent a bunch of
456:                        // repetitive checks from slowing down some extreme cases)
457:                        // variable not used Integer newStartingSpot = null;
458:                        while (!possibleBreakPositions.isEmpty()
459:                                && wrongBreakPositions
460:                                        .contains(possibleBreakPositions.peek())) {
461:                            possibleBreakPositions.pop();
462:                        }
463:
464:                        // if we've used up all possible break-position combinations, there's
465:                        // an error or an unknown word in the text.  In this case, we start
466:                        // over, treating the farthest character we've reached as the beginning
467:                        // of the range, and "blessing" the break positions that got us that
468:                        // far as real break positions
469:                        if (possibleBreakPositions.isEmpty()) {
470:                            if (bestBreakPositions != null) {
471:                                currentBreakPositions = bestBreakPositions;
472:                                if (farthestEndPoint < endPos) {
473:                                    text.setIndex(farthestEndPoint + 1);
474:                                } else {
475:                                    break;
476:                                }
477:                            } else {
478:                                if ((currentBreakPositions.size() == 0 || ((Integer) (currentBreakPositions
479:                                        .peek())).intValue() != text.getIndex())
480:                                        && text.getIndex() != startPos) {
481:                                    currentBreakPositions.push(new Integer(text
482:                                            .getIndex()));
483:                                }
484:                                CINext32(text);
485:                                currentBreakPositions.push(new Integer(text
486:                                        .getIndex()));
487:                            }
488:                        }
489:
490:                        // if we still have more break positions we can try, then promote the
491:                        // last break in possibleBreakPositions into currentBreakPositions,
492:                        // and get rid of all entries in currentBreakPositions that come after
493:                        // it.  Then back up to that position and start over from there (i.e.,
494:                        // treat that position as the beginning of a new word)
495:                        else {
496:                            Integer temp = (Integer) possibleBreakPositions
497:                                    .pop();
498:                            Object temp2 = null;
499:                            while (!currentBreakPositions.isEmpty()
500:                                    && temp.intValue() < ((Integer) currentBreakPositions
501:                                            .peek()).intValue()) {
502:                                temp2 = currentBreakPositions.pop();
503:                                wrongBreakPositions.addElement(temp2);
504:                            }
505:                            currentBreakPositions.push(temp);
506:                            text.setIndex(((Integer) currentBreakPositions
507:                                    .peek()).intValue());
508:                        }
509:
510:                        // re-sync "c" for the next go-round, and drop out of the loop if
511:                        // we've made it off the end of the range
512:                        c = CICurrent32(text);
513:                        state = 0;
514:                        if (text.getIndex() >= endPos) {
515:                            break;
516:                        }
517:                    }
518:
519:                    // if we didn't hit any exceptional conditions on this last iteration,
520:                    // just advance to the next character and loop
521:                    else {
522:                        c = CINext32(text);
523:                    }
524:                    //System.out.print(", possibleBreakPositions = { "); for (int i = 0; i < possibleBreakPositions.size(); i++) System.out.print(possibleBreakPositions.elementAt(i) + " "); System.out.print("}");
525:                    //System.out.print(", currentBreakPositions = { "); for (int i = 0; i < currentBreakPositions.size(); i++) System.out.print(currentBreakPositions.elementAt(i) + " "); System.out.println("}");
526:                }
527:
528:                // dump the last break position in the list, and replace it with the actual
529:                // end of the range (which may be the same character, or may be further on
530:                // because the range actually ended with non-dictionary characters we want to
531:                // keep with the word)
532:                if (!currentBreakPositions.isEmpty()) {
533:                    currentBreakPositions.pop();
534:                }
535:                currentBreakPositions.push(new Integer(endPos));
536:
537:                // create a regular array to hold the break positions and copy
538:                // the break positions from the stack to the array (in addition,
539:                // our starting position goes into this array as a break position).
540:                // This array becomes the cache of break positions used by next()
541:                // and previous(), so this is where we actually refresh the cache.
542:                cachedBreakPositions = new int[currentBreakPositions.size() + 1];
543:                cachedBreakPositions[0] = startPos;
544:
545:                for (int i = 0; i < currentBreakPositions.size(); i++) {
546:                    cachedBreakPositions[i + 1] = ((Integer) currentBreakPositions
547:                            .elementAt(i)).intValue();
548:                }
549:                positionInCache = 0;
550:            }
551:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.