001: /*
002: *
003: * @(#)BreakIterator.java 1.34 06/10/10
004: *
005: * Portions Copyright 2000-2006 Sun Microsystems, Inc. All Rights
006: * Reserved. Use is subject to license terms.
007: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
008: *
009: * This program is free software; you can redistribute it and/or
010: * modify it under the terms of the GNU General Public License version
011: * 2 only, as published by the Free Software Foundation.
012: *
013: * This program is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * General Public License version 2 for more details (a copy is
017: * included at /legal/license.txt).
018: *
019: * You should have received a copy of the GNU General Public License
020: * version 2 along with this work; if not, write to the Free Software
021: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022: * 02110-1301 USA
023: *
024: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
025: * Clara, CA 95054 or visit www.sun.com if you need additional
026: * information or have any questions.
027: */
028:
029: /*
030: * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
031: * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
032: *
033: * The original version of this source code and documentation
034: * is copyrighted and owned by Taligent, Inc., a wholly-owned
035: * subsidiary of IBM. These materials are provided under terms
036: * of a License Agreement between Taligent and Sun. This technology
037: * is protected by multiple US and International patents.
038: *
039: * This notice and attribution to Taligent may not be removed.
040: * Taligent is a registered trademark of Taligent, Inc.
041: *
042: */
043:
044: package java.text;
045:
046: import java.util.Vector;
047: import java.util.Locale;
048: import java.util.ResourceBundle;
049: import java.util.MissingResourceException;
050: import sun.text.resources.LocaleData;
051: import java.text.CharacterIterator;
052: import java.text.StringCharacterIterator;
053:
054: import java.net.URL;
055: import java.io.InputStream;
056: import java.io.IOException;
057:
058: import java.lang.ref.SoftReference;
059: import java.security.AccessController;
060: import java.security.PrivilegedAction;
061:
062: /**
063: * The <code>BreakIterator</code> class implements methods for finding
064: * the location of boundaries in text. Instances of <code>BreakIterator</code>
065: * maintain a current position and scan over text
066: * returning the index of characters where boundaries occur.
067: * Internally, <code>BreakIterator</code> scans text using a
068: * <code>CharacterIterator</code>, and is thus able to scan text held
069: * by any object implementing that protocol. A <code>StringCharacterIterator</code>
070: * is used to scan <code>String</code> objects passed to <code>setText</code>.
071: *
072: * <p>
073: * You use the factory methods provided by this class to create
074: * instances of various types of break iterators. In particular,
075: * use <code>getWordInstance</code>, <code>getLineInstance</code>,
076: * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
077: * to create <code>BreakIterator</code>s that perform
078: * word, line, sentence, and character boundary analysis respectively.
079: * A single <code>BreakIterator</code> can work only on one unit
080: * (word, line, sentence, and so on). You must use a different iterator
081: * for each unit boundary analysis you wish to perform.
082: *
083: * <p>
084: * Line boundary analysis determines where a text string can be
085: * broken when line-wrapping. The mechanism correctly handles
086: * punctuation and hyphenated words.
087: *
088: * <p>
089: * Sentence boundary analysis allows selection with correct interpretation
090: * of periods within numbers and abbreviations, and trailing punctuation
091: * marks such as quotation marks and parentheses.
092: *
093: * <p>
094: * Word boundary analysis is used by search and replace functions, as
095: * well as within text editing applications that allow the user to
096: * select words with a double click. Word selection provides correct
097: * interpretation of punctuation marks within and following
098: * words. Characters that are not part of a word, such as symbols
099: * or punctuation marks, have word-breaks on both sides.
100: *
101: * <p>
102: * Character boundary analysis allows users to interact with characters
103: * as they expect to, for example, when moving the cursor through a text
104: * string. Character boundary analysis provides correct navigation
105: * through character strings, regardless of how the character is stored.
106: * For example, an accented character might be stored as a base character
107: * and a diacritical mark. What users consider to be a character can
108: * differ between languages.
109: *
110: * <p>
111: * <code>BreakIterator</code> is intended for use with natural
112: * languages only. Do not use this class to tokenize a programming language.
113: *
114: * <P>
115: * <strong>Examples</strong>:<P>
116: * Creating and using text boundaries
117: * <blockquote>
118: * <pre>
119: * public static void main(String args[]) {
120: * if (args.length == 1) {
121: * String stringToExamine = args[0];
122: * //print each word in order
123: * BreakIterator boundary = BreakIterator.getWordInstance();
124: * boundary.setText(stringToExamine);
125: * printEachForward(boundary, stringToExamine);
126: * //print each sentence in reverse order
127: * boundary = BreakIterator.getSentenceInstance(Locale.US);
128: * boundary.setText(stringToExamine);
129: * printEachBackward(boundary, stringToExamine);
130: * printFirst(boundary, stringToExamine);
131: * printLast(boundary, stringToExamine);
132: * }
133: * }
134: * </pre>
135: * </blockquote>
136: *
137: * Print each element in order
138: * <blockquote>
139: * <pre>
140: * public static void printEachForward(BreakIterator boundary, String source) {
141: * int start = boundary.first();
142: * for (int end = boundary.next();
143: * end != BreakIterator.DONE;
144: * start = end, end = boundary.next()) {
145: * System.out.println(source.substring(start,end));
146: * }
147: * }
148: * </pre>
149: * </blockquote>
150: *
151: * Print each element in reverse order
152: * <blockquote>
153: * <pre>
154: * public static void printEachBackward(BreakIterator boundary, String source) {
155: * int end = boundary.last();
156: * for (int start = boundary.previous();
157: * start != BreakIterator.DONE;
158: * end = start, start = boundary.previous()) {
159: * System.out.println(source.substring(start,end));
160: * }
161: * }
162: * </pre>
163: * </blockquote>
164: *
165: * Print first element
166: * <blockquote>
167: * <pre>
168: * public static void printFirst(BreakIterator boundary, String source) {
169: * int start = boundary.first();
170: * int end = boundary.next();
171: * System.out.println(source.substring(start,end));
172: * }
173: * </pre>
174: * </blockquote>
175: *
176: * Print last element
177: * <blockquote>
178: * <pre>
179: * public static void printLast(BreakIterator boundary, String source) {
180: * int end = boundary.last();
181: * int start = boundary.previous();
182: * System.out.println(source.substring(start,end));
183: * }
184: * </pre>
185: * </blockquote>
186: *
187: * Print the element at a specified position
188: * <blockquote>
189: * <pre>
190: * public static void printAt(BreakIterator boundary, int pos, String source) {
191: * int end = boundary.following(pos);
192: * int start = boundary.previous();
193: * System.out.println(source.substring(start,end));
194: * }
195: * </pre>
196: * </blockquote>
197: *
198: * Find the next word
199: * <blockquote>
200: * <pre>
201: * public static int nextWordStartAfter(int pos, String text) {
202: * BreakIterator wb = BreakIterator.getWordInstance();
203: * wb.setText(text);
204: * int last = wb.following(pos);
205: * int current = wb.next();
206: * while (current != BreakIterator.DONE) {
207: * for (int p = last; p < current; p++) {
208: * if (Character.isLetter(text.charAt(p)))
209: * return last;
210: * }
211: * last = current;
212: * current = wb.next();
213: * }
214: * return BreakIterator.DONE;
215: * }
216: * </pre>
217: * (The iterator returned by BreakIterator.getWordInstance() is unique in that
218: * the break positions it returns don't represent both the start and end of the
219: * thing being iterated over. That is, a sentence-break iterator returns breaks
220: * that each represent the end of one sentence and the beginning of the next.
221: * With the word-break iterator, the characters between two boundaries might be a
222: * word, or they might be the punctuation or whitespace between two words. The
223: * above code uses a simple heuristic to determine which boundary is the beginning
224: * of a word: If the characters between this boundary and the next boundary
225: * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
226: * a Hangul syllable, a Kana character, etc.), then the text between this boundary
227: * and the next is a word; otherwise, it's the material between words.)
228: * </blockquote>
229:
230: *
231: * @see CharacterIterator
232: *
233: */
234:
235: public abstract class BreakIterator implements Cloneable {
236: /**
237: * Constructor. BreakIterator is stateless and has no default behavior.
238: */
239: protected BreakIterator() {
240: }
241:
242: /**
243: * Create a copy of this iterator
244: * @return A copy of this
245: */
246: public Object clone() {
247: try {
248: return super .clone();
249: } catch (CloneNotSupportedException e) {
250: throw new InternalError();
251: }
252: }
253:
254: /**
255: * DONE is returned by previous() and next() after all valid
256: * boundaries have been returned.
257: */
258: public static final int DONE = -1;
259:
260: /**
261: * Return the first boundary. The iterator's current position is set
262: * to the first boundary.
263: * @return The character index of the first text boundary.
264: */
265: public abstract int first();
266:
267: /**
268: * Return the last boundary. The iterator's current position is set
269: * to the last boundary.
270: * @return The character index of the last text boundary.
271: */
272: public abstract int last();
273:
274: /**
275: * Return the nth boundary from the current boundary
276: * @param n which boundary to return. A value of 0
277: * does nothing. Negative values move to previous boundaries
278: * and positive values move to later boundaries.
279: * @return The index of the nth boundary from the current position.
280: */
281: public abstract int next(int n);
282:
283: /**
284: * Return the boundary following the current boundary.
285: * @return The character index of the next text boundary or DONE if all
286: * boundaries have been returned. Equivalent to next(1).
287: */
288: public abstract int next();
289:
290: /**
291: * Return the boundary preceding the current boundary.
292: * @return The character index of the previous text boundary or DONE if all
293: * boundaries have been returned.
294: */
295: public abstract int previous();
296:
297: /**
298: * Return the first boundary following the specified offset.
299: * The value returned is always greater than the offset or
300: * the value BreakIterator.DONE
301: * @param offset the offset to begin scanning. Valid values
302: * are determined by the CharacterIterator passed to
303: * setText(). Invalid values cause
304: * an IllegalArgumentException to be thrown.
305: * @return The first boundary after the specified offset.
306: */
307: public abstract int following(int offset);
308:
309: /**
310: * Return the last boundary preceding the specfied offset.
311: * The value returned is always less than the offset or the value
312: * BreakIterator.DONE.
313: * @param offset the offset to begin scanning. Valid values are
314: * determined by the CharacterIterator passed to setText().
315: * Invalid values cause an IllegalArgumentException to be thrown.
316: * @return The last boundary before the specified offset.
317: * @since 1.2
318: */
319: public int preceding(int offset) {
320: // NOTE: This implementation is here solely because we can't add new
321: // abstract methods to an existing class. There is almost ALWAYS a
322: // better, faster way to do this.
323: int pos = following(offset);
324: while (pos >= offset && pos != DONE)
325: pos = previous();
326: return pos;
327: }
328:
329: /**
330: * Return true if the specified position is a boundary position.
331: * @param offset the offset to check.
332: * @return True if "offset" is a boundary position.
333: * @since 1.2
334: */
335: public boolean isBoundary(int offset) {
336: // NOTE: This implementation probably is wrong for most situations
337: // because it fails to take into account the possibility that a
338: // CharacterIterator passed to setText() may not have a begin offset
339: // of 0. But since the abstract BreakIterator doesn't have that
340: // knowledge, it assumes the begin offset is 0. If you subclass
341: // BreakIterator, copy the SimpleTextBoundary implementation of this
342: // function into your subclass. [This should have been abstract at
343: // this level, but it's too late to fix that now.]
344: if (offset == 0)
345: return true;
346: else
347: return following(offset - 1) == offset;
348: }
349:
350: /**
351: * Return character index of the text boundary that was most recently
352: * returned by next(), previous(), first(), or last()
353: * @return The boundary most recently returned.
354: */
355: public abstract int current();
356:
357: /**
358: * Get the text being scanned
359: * @return the text being scanned
360: */
361: public abstract CharacterIterator getText();
362:
363: /**
364: * Set a new text string to be scanned. The current scan
365: * position is reset to first().
366: * @param newText new text to scan.
367: */
368: public void setText(String newText) {
369: setText(new StringCharacterIterator(newText));
370: }
371:
372: /**
373: * Set a new text for scanning. The current scan
374: * position is reset to first().
375: * @param newText new text to scan.
376: */
377: public abstract void setText(CharacterIterator newText);
378:
379: private static final int CHARACTER_INDEX = 0;
380: private static final int WORD_INDEX = 1;
381: private static final int LINE_INDEX = 2;
382: private static final int SENTENCE_INDEX = 3;
383: private static final SoftReference[] iterCache = new SoftReference[4];
384:
385: /**
386: * Create BreakIterator for word-breaks using default locale.
387: * Returns an instance of a BreakIterator implementing word breaks.
388: * WordBreak is usefull for word selection (ex. double click)
389: * @return A BreakIterator for word-breaks
390: * @see java.util.Locale#getDefault
391: */
392: public static BreakIterator getWordInstance() {
393: return getWordInstance(Locale.getDefault());
394: }
395:
396: /**
397: * Create BreakIterator for word-breaks using specified locale.
398: * Returns an instance of a BreakIterator implementing word breaks.
399: * WordBreak is usefull for word selection (ex. double click)
400: * @param where the local. If a specific WordBreak is not
401: * avaliable for the specified locale, a default WordBreak is returned.
402: * @return A BreakIterator for word-breaks
403: */
404: public static BreakIterator getWordInstance(Locale where) {
405: return getBreakInstance(where, WORD_INDEX, "WordBreakRules",
406: "WordBreakDictionary");
407: }
408:
409: /**
410: * Create BreakIterator for line-breaks using default locale.
411: * Returns an instance of a BreakIterator implementing line breaks. Line
412: * breaks are logically possible line breaks, actual line breaks are
413: * usually determined based on display width.
414: * LineBreak is useful for word wrapping text.
415: * @return A BreakIterator for line-breaks
416: * @see java.util.Locale#getDefault
417: */
418: public static BreakIterator getLineInstance() {
419: return getLineInstance(Locale.getDefault());
420: }
421:
422: /**
423: * Create BreakIterator for line-breaks using specified locale.
424: * Returns an instance of a BreakIterator implementing line breaks. Line
425: * breaks are logically possible line breaks, actual line breaks are
426: * usually determined based on display width.
427: * LineBreak is useful for word wrapping text.
428: * @param where the local. If a specific LineBreak is not
429: * avaliable for the specified locale, a default LineBreak is returned.
430: * @return A BreakIterator for line-breaks
431: */
432: public static BreakIterator getLineInstance(Locale where) {
433: return getBreakInstance(where, LINE_INDEX, "LineBreakRules",
434: "LineBreakDictionary");
435: }
436:
437: /**
438: * Create BreakIterator for character-breaks using default locale
439: * Returns an instance of a BreakIterator implementing character breaks.
440: * Character breaks are boundaries of combining character sequences.
441: * @return A BreakIterator for character-breaks
442: * @see Locale#getDefault
443: */
444: public static BreakIterator getCharacterInstance() {
445: return getCharacterInstance(Locale.getDefault());
446: }
447:
448: /**
449: * Create BreakIterator for character-breaks using specified locale
450: * Returns an instance of a BreakIterator implementing character breaks.
451: * Character breaks are boundaries of combining character sequences.
452: * @param where the local. If a specific character break is not
453: * avaliable for the specified local, a default character break is returned.
454: * @return A BreakIterator for character-breaks
455: */
456: public static BreakIterator getCharacterInstance(Locale where) {
457: return getBreakInstance(where, CHARACTER_INDEX,
458: "CharacterBreakRules", "CharacterBreakDictionary");
459: }
460:
461: /**
462: * Create BreakIterator for sentence-breaks using default locale
463: * Returns an instance of a BreakIterator implementing sentence breaks.
464: * @return A BreakIterator for sentence-breaks
465: * @see java.util.Locale#getDefault
466: */
467: public static BreakIterator getSentenceInstance() {
468: return getSentenceInstance(Locale.getDefault());
469: }
470:
471: /**
472: * Create BreakIterator for sentence-breaks using specified locale
473: * Returns an instance of a BreakIterator implementing sentence breaks.
474: * @param where the local. If a specific SentenceBreak is not
475: * avaliable for the specified local, a default SentenceBreak is returned.
476: * @return A BreakIterator for sentence-breaks
477: */
478: public static BreakIterator getSentenceInstance(Locale where) {
479: return getBreakInstance(where, SENTENCE_INDEX,
480: "SentenceBreakRules", "SentenceBreakDictionary");
481: }
482:
483: private static BreakIterator getBreakInstance(Locale where,
484: int type, String rulesName, String dictionaryName) {
485: if (iterCache[type] != null) {
486: BreakIteratorCache cache = (BreakIteratorCache) iterCache[type]
487: .get();
488: if (cache != null) {
489: if (cache.getLocale().equals(where)) {
490: return cache.createBreakInstance();
491: }
492: }
493: }
494:
495: BreakIterator result = createBreakInstance(where, type,
496: rulesName, dictionaryName);
497: BreakIteratorCache cache = new BreakIteratorCache(where, result);
498: iterCache[type] = new SoftReference(cache);
499: return result;
500: }
501:
502: private static ResourceBundle getBundle(final String baseName,
503: final Locale locale) {
504: return (ResourceBundle) AccessController
505: .doPrivileged(new PrivilegedAction() {
506: public Object run() {
507: return ResourceBundle.getBundle(baseName,
508: locale);
509: }
510: });
511: }
512:
513: private static BreakIterator createBreakInstance(Locale where,
514: int type, String rulesName, String dictionaryName) {
515:
516: ResourceBundle bundle = getBundle(
517: "sun.text.resources.BreakIteratorRules", where);
518: String[] classNames = bundle
519: .getStringArray("BreakIteratorClasses");
520:
521: String rules = bundle.getString(rulesName);
522:
523: if (classNames[type].equals("RuleBasedBreakIterator")) {
524: return new RuleBasedBreakIterator(rules);
525: } else if (classNames[type]
526: .equals("DictionaryBasedBreakIterator")) {
527: try {
528: URL url = (URL) bundle.getObject(dictionaryName);
529: InputStream dictionary = url.openStream();
530: return new DictionaryBasedBreakIterator(rules,
531: dictionary);
532: } catch (IOException e) {
533: } catch (MissingResourceException e) {
534: }
535: return new RuleBasedBreakIterator(rules);
536: } else
537: throw new IllegalArgumentException(
538: "Invalid break iterator class \""
539: + classNames[type] + "\"");
540: }
541:
542: /**
543: * Get the set of Locales for which BreakIterators are installed
544: * @return available locales
545: */
546: public static synchronized Locale[] getAvailableLocales() {
547: //NOTE - this is a known issue. It should return
548: //all locales.
549: return LocaleData.getAvailableLocales("NumberPatterns");
550: }
551:
552: private static final class BreakIteratorCache {
553:
554: private BreakIterator iter;
555: private Locale where;
556:
557: BreakIteratorCache(Locale where, BreakIterator iter) {
558: this .where = where;
559: this .iter = (BreakIterator) iter.clone();
560: }
561:
562: Locale getLocale() {
563: return where;
564: }
565:
566: BreakIterator createBreakInstance() {
567: return (BreakIterator) iter.clone();
568: }
569: }
570: }
|