001: /**
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.util;
007:
008: import java.util.Enumeration;
009: import java.util.NoSuchElementException;
010: import com.ibm.icu.text.UnicodeSet;
011: import com.ibm.icu.text.UTF16;
012:
013: /**
014: * <p>The string tokenizer class allows an application to break a string
015: * into tokens by performing code point comparison.
016: * The <code>StringTokenizer</code> methods do not distinguish
017: * among identifiers, numbers, and quoted strings, nor do they recognize
018: * and skip comments.</p>
019: * <p>
020: * The set of delimiters (the codepoints that separate tokens) may be
021: * specified either at creation time or on a per-token basis.
022: * </p>
023: * <p>
024: * An instance of <code>StringTokenizer</code> behaves in one of three ways,
025: * depending on whether it was created with the <code>returnDelims</code>
026: * and <code>coalesceDelims</code>
027: * flags having the value <code>true</code> or <code>false</code>:
028: * <ul>
029: * <li>If returnDelims is <code>false</code>, delimiter code points serve to
030: * separate tokens. A token is a maximal sequence of consecutive
031: * code points that are not delimiters.
032: * <li>If returnDelims is <code>true</code>, delimiter code points are
033: * themselves considered to be tokens. In this case, if coalesceDelims is
034: * <code>true</code>, such tokens will be the maximal sequence of consecutive
035: * code points that <em>are</em> delimiters. If coalesceDelims is false,
036: * a token will be received for each delimiter code point.
037: * </ul>
038: * <p>A token is thus either one
039: * delimiter code point, a maximal sequence of consecutive code points that
040: * are delimiters, or a maximal sequence of consecutive code
041: * points that are not delimiters.
042: * </p>
043: * <p>
044: * A <tt>StringTokenizer</tt> object internally maintains a current
045: * position within the string to be tokenized. Some operations advance this
046: * current position past the code point processed.
047: * </p>
048: * <p>
049: * A token is returned by taking a substring of the string that was used to
050: * create the <tt>StringTokenizer</tt> object.
051: * </p>
052: * <p>
053: * Example of the use of the default delimiter tokenizer.
054: * <blockquote><pre>
055: * StringTokenizer st = new StringTokenizer("this is a test");
056: * while (st.hasMoreTokens()) {
057: * println(st.nextToken());
058: * }
059: * </pre></blockquote>
060: * </p>
061: * <p>
062: * prints the following output:
063: * <blockquote><pre>
064: * this
065: * is
066: * a
067: * test
068: * </pre></blockquote>
069: * </p>
070: * <p>
071: * Example of the use of the tokenizer with user specified delimiter.
072: * <blockquote><pre>
073: * StringTokenizer st = new StringTokenizer(
074: * "this is a test with supplementary characters \ud800\ud800\udc00\udc00",
075: * " \ud800\udc00");
076: * while (st.hasMoreTokens()) {
077: * println(st.nextToken());
078: * }
079: * </pre></blockquote>
080: * </p>
081: * <p>
082: * prints the following output:
083: * <blockquote><pre>
084: * this
085: * is
086: * a
087: * test
088: * with
089: * supplementary
090: * characters
091: * \ud800
092: * \udc00
093: * </pre></blockquote>
094: * </p>
095: * @author syn wee
096: * @stable ICU 2.4
097: */
098: public final class StringTokenizer implements Enumeration {
099: // public constructors ---------------------------------------------
100:
101: /**
102: * <p>Constructs a string tokenizer for the specified string. All
103: * characters in the delim argument are the delimiters for separating
104: * tokens.</p>
105: * <p>If the returnDelims flag is false, the delimiter characters are
106: * skipped and only serve as separators between tokens.</p>
107: * <p>If the returnDelims flag is true, then the delimiter characters
108: * are also returned as tokens, one per delimiter.
109: * @param str a string to be parsed.
110: * @param delim the delimiters.
111: * @param returndelims flag indicating whether to return the delimiters
112: * as tokens.
113: * @exception throws a NullPointerException if str is null
114: * @stable ICU 2.4
115: */
116: public StringTokenizer(String str, UnicodeSet delim,
117: boolean returndelims) {
118: this (str, delim, returndelims, false);
119: }
120:
121: /**
122: * <p>Constructs a string tokenizer for the specified string. All
123: * characters in the delim argument are the delimiters for separating
124: * tokens.</p>
125: * <p>If the returnDelims flag is false, the delimiter characters are
126: * skipped and only serve as separators between tokens.</p>
127: * <p>If the returnDelims flag is true, then the delimiter characters
128: * are also returned as tokens. If coalescedelims is true, one token
129: * is returned for each run of delimiter characters, otherwise one
130: * token is returned per delimiter. Since surrogate pairs can be
131: * delimiters, the returned token might be two chars in length.</p>
132: * @param str a string to be parsed.
133: * @param delim the delimiters.
134: * @param returndelims flag indicating whether to return the delimiters
135: * as tokens.
136: * @param coalescedelims flag indicating whether to return a run of
137: * delimiters as a single token or as one token per delimiter.
138: * This only takes effect if returndelims is true.
139: * @exception throws a NullPointerException if str is null
140: * @internal ICU 3.4.3
141: * @deprecated This API is ICU internal only.
142: */
143: public StringTokenizer(String str, UnicodeSet delim,
144: boolean returndelims, boolean coalescedelims) {
145: m_source_ = str;
146: m_length_ = str.length();
147: if (delim == null) {
148: m_delimiters_ = EMPTY_DELIMITER_;
149: } else {
150: m_delimiters_ = delim;
151: }
152: m_returnDelimiters_ = returndelims;
153: m_coalesceDelimiters_ = coalescedelims;
154: m_tokenOffset_ = -1;
155: m_tokenSize_ = -1;
156: if (m_length_ == 0) {
157: // string length 0, no tokens
158: m_nextOffset_ = -1;
159: } else {
160: m_nextOffset_ = 0;
161: if (!returndelims) {
162: m_nextOffset_ = getNextNonDelimiter(0);
163: }
164: }
165: }
166:
167: /**
168: * <p>Constructs a string tokenizer for the specified string. The
169: * characters in the delim argument are the delimiters for separating
170: * tokens.</p>
171: * <p>Delimiter characters themselves will not be treated as tokens.</p>
172: * @param str a string to be parsed.
173: * @param delim the delimiters.
174: * @exception throws a NullPointerException if str is null
175: * @stable ICU 2.4
176: */
177: public StringTokenizer(String str, UnicodeSet delim) {
178: this (str, delim, false, false);
179: }
180:
181: /**
182: * <p>Constructs a string tokenizer for the specified string. All
183: * characters in the delim argument are the delimiters for separating
184: * tokens.</p>
185: * <p>If the returnDelims flag is false, the delimiter characters are
186: * skipped and only serve as separators between tokens.</p>
187: * <p>If the returnDelims flag is true, then the delimiter characters
188: * are also returned as tokens, one per delimiter.
189: * @param str a string to be parsed.
190: * @param delim the delimiters.
191: * @param returndelims flag indicating whether to return the delimiters
192: * as tokens.
193: * @exception throws a NullPointerException if str is null
194: * @stable ICU 2.4
195: */
196: public StringTokenizer(String str, String delim,
197: boolean returndelims) {
198: this (str, delim, returndelims, false); // java default behavior
199: }
200:
201: /**
202: * <p>Constructs a string tokenizer for the specified string. All
203: * characters in the delim argument are the delimiters for separating
204: * tokens.</p>
205: * <p>If the returnDelims flag is false, the delimiter characters are
206: * skipped and only serve as separators between tokens.</p>
207: * <p>If the returnDelims flag is true, then the delimiter characters
208: * are also returned as tokens. If coalescedelims is true, one token
209: * is returned for each run of delimiter characters, otherwise one
210: * token is returned per delimiter. Since surrogate pairs can be
211: * delimiters, the returned token might be two chars in length.</p>
212: * @param str a string to be parsed.
213: * @param delim the delimiters.
214: * @param returndelims flag indicating whether to return the delimiters
215: * as tokens.
216: * @param coalescedelims flag indicating whether to return a run of
217: * delimiters as a single token or as one token per delimiter.
218: * This only takes effect if returndelims is true.
219: * @exception throws a NullPointerException if str is null
220: * @internal ICU 3.4.3
221: * @deprecated This API is ICU internal only.
222: */
223: public StringTokenizer(String str, String delim,
224: boolean returndelims, boolean coalescedelims) {
225: // don't ignore whitespace
226: m_delimiters_ = EMPTY_DELIMITER_;
227: if (delim != null && delim.length() > 0) {
228: m_delimiters_ = new UnicodeSet();
229: m_delimiters_.addAll(delim);
230: checkDelimiters();
231: }
232: m_coalesceDelimiters_ = coalescedelims;
233: m_source_ = str;
234: m_length_ = str.length();
235: m_returnDelimiters_ = returndelims;
236: m_tokenOffset_ = -1;
237: m_tokenSize_ = -1;
238: if (m_length_ == 0) {
239: // string length 0, no tokens
240: m_nextOffset_ = -1;
241: } else {
242: m_nextOffset_ = 0;
243: if (!returndelims) {
244: m_nextOffset_ = getNextNonDelimiter(0);
245: }
246: }
247: }
248:
249: /**
250: * <p>Constructs a string tokenizer for the specified string. The
251: * characters in the delim argument are the delimiters for separating
252: * tokens.</p>
253: * <p>Delimiter characters themselves will not be treated as tokens.</p>
254: * @param str a string to be parsed.
255: * @param delim the delimiters.
256: * @exception throws a NullPointerException if str is null
257: * @stable ICU 2.4
258: */
259: public StringTokenizer(String str, String delim) {
260: // don't ignore whitespace
261: this (str, delim, false, false);
262: }
263:
264: /**
265: * <p>Constructs a string tokenizer for the specified string.
266: * The tokenizer uses the default delimiter set, which is
267: * " \t\n\r\f":
268: * the space character, the tab character, the newline character, the
269: * carriage-return character, and the form-feed character.</p>
270: * <p>Delimiter characters themselves will not be treated as tokens.</p>
271: * @param str a string to be parsed
272: * @exception throws a NullPointerException if str is null
273: * @stable ICU 2.4
274: */
275: public StringTokenizer(String str) {
276: this (str, DEFAULT_DELIMITERS_, false, false);
277: }
278:
279: // public methods --------------------------------------------------
280:
281: /**
282: * Tests if there are more tokens available from this tokenizer's
283: * string.
284: * If this method returns <tt>true</tt>, then a subsequent call to
285: * <tt>nextToken</tt> with no argument will successfully return a token.
286: * @return <code>true</code> if and only if there is at least one token
287: * in the string after the current position; <code>false</code>
288: * otherwise.
289: * @stable ICU 2.4
290: */
291: public boolean hasMoreTokens() {
292: return m_nextOffset_ >= 0;
293: }
294:
295: /**
296: * Returns the next token from this string tokenizer.
297: * @return the next token from this string tokenizer.
298: * @exception NoSuchElementException if there are no more tokens in
299: * this tokenizer's string.
300: * @stable ICU 2.4
301: */
302: public String nextToken() {
303: if (m_tokenOffset_ < 0) {
304: if (m_nextOffset_ < 0) {
305: throw new NoSuchElementException(
306: "No more tokens in String");
307: }
308: // pre-calculations of tokens not done
309: if (m_returnDelimiters_) {
310: int tokenlimit = 0;
311: int c = UTF16.charAt(m_source_, m_nextOffset_);
312: boolean contains = delims == null ? m_delimiters_
313: .contains(c) : c < delims.length && delims[c];
314: if (contains) {
315: if (m_coalesceDelimiters_) {
316: tokenlimit = getNextNonDelimiter(m_nextOffset_);
317: } else {
318: tokenlimit = m_nextOffset_
319: + UTF16.getCharCount(c);
320: if (tokenlimit == m_length_) {
321: tokenlimit = -1;
322: }
323: }
324: } else {
325: tokenlimit = getNextDelimiter(m_nextOffset_);
326: }
327: String result;
328: if (tokenlimit < 0) {
329: result = m_source_.substring(m_nextOffset_);
330: } else {
331: result = m_source_.substring(m_nextOffset_,
332: tokenlimit);
333: }
334: m_nextOffset_ = tokenlimit;
335: return result;
336: } else {
337: int tokenlimit = getNextDelimiter(m_nextOffset_);
338: String result;
339: if (tokenlimit < 0) {
340: result = m_source_.substring(m_nextOffset_);
341: m_nextOffset_ = tokenlimit;
342: } else {
343: result = m_source_.substring(m_nextOffset_,
344: tokenlimit);
345: m_nextOffset_ = getNextNonDelimiter(tokenlimit);
346: }
347:
348: return result;
349: }
350: }
351: // count was called before and we have all the tokens
352: if (m_tokenOffset_ >= m_tokenSize_) {
353: throw new NoSuchElementException("No more tokens in String");
354: }
355: String result;
356: if (m_tokenLimit_[m_tokenOffset_] >= 0) {
357: result = m_source_.substring(m_tokenStart_[m_tokenOffset_],
358: m_tokenLimit_[m_tokenOffset_]);
359: } else {
360: result = m_source_.substring(m_tokenStart_[m_tokenOffset_]);
361: }
362: m_tokenOffset_++;
363: m_nextOffset_ = -1;
364: if (m_tokenOffset_ < m_tokenSize_) {
365: m_nextOffset_ = m_tokenStart_[m_tokenOffset_];
366: }
367: return result;
368: }
369:
370: /**
371: * Returns the next token in this string tokenizer's string. First,
372: * the set of characters considered to be delimiters by this
373: * <tt>StringTokenizer</tt> object is changed to be the characters in
374: * the string <tt>delim</tt>. Then the next token in the string
375: * after the current position is returned. The current position is
376: * advanced beyond the recognized token. The new delimiter set
377: * remains the default after this call.
378: * @param delim the new delimiters.
379: * @return the next token, after switching to the new delimiter set.
380: * @exception NoSuchElementException if there are no more tokens in
381: * this tokenizer's string.
382: * @stable ICU 2.4
383: */
384: public String nextToken(String delim) {
385: m_delimiters_ = EMPTY_DELIMITER_;
386: if (delim != null && delim.length() > 0) {
387: m_delimiters_ = new UnicodeSet();
388: m_delimiters_.addAll(delim);
389: }
390: return nextToken(m_delimiters_);
391: }
392:
393: /**
394: * Returns the next token in this string tokenizer's string. First,
395: * the set of characters considered to be delimiters by this
396: * <tt>StringTokenizer</tt> object is changed to be the characters in
397: * the string <tt>delim</tt>. Then the next token in the string
398: * after the current position is returned. The current position is
399: * advanced beyond the recognized token. The new delimiter set
400: * remains the default after this call.
401: * @param delim the new delimiters.
402: * @return the next token, after switching to the new delimiter set.
403: * @exception NoSuchElementException if there are no more tokens in
404: * this tokenizer's string.
405: * @stable ICU 2.4
406: */
407: public String nextToken(UnicodeSet delim) {
408: m_delimiters_ = delim;
409: checkDelimiters();
410: m_tokenOffset_ = -1;
411: m_tokenSize_ = -1;
412: if (!m_returnDelimiters_) {
413: m_nextOffset_ = getNextNonDelimiter(m_nextOffset_);
414: }
415: return nextToken();
416: }
417:
418: /**
419: * Returns the same value as the <code>hasMoreTokens</code> method.
420: * It exists so that this class can implement the
421: * <code>Enumeration</code> interface.
422: * @return <code>true</code> if there are more tokens;
423: * <code>false</code> otherwise.
424: * @see #hasMoreTokens()
425: * @stable ICU 2.4
426: */
427: public boolean hasMoreElements() {
428: return hasMoreTokens();
429: }
430:
431: /**
432: * Returns the same value as the <code>nextToken</code> method, except
433: * that its declared return value is <code>Object</code> rather than
434: * <code>String</code>. It exists so that this class can implement the
435: * <code>Enumeration</code> interface.
436: * @return the next token in the string.
437: * @exception NoSuchElementException if there are no more tokens in
438: * this tokenizer's string.
439: * @see #nextToken()
440: * @stable ICU 2.4
441: */
442: public Object nextElement() {
443: return nextToken();
444: }
445:
446: /**
447: * Calculates the number of times that this tokenizer's
448: * <code>nextToken</code> method can be called before it generates an
449: * exception. The current position is not advanced.
450: * @return the number of tokens remaining in the string using the
451: * current delimiter set.
452: * @see #nextToken()
453: * @stable ICU 2.4
454: */
455: public int countTokens() {
456: int result = 0;
457: if (hasMoreTokens()) {
458: if (m_tokenOffset_ >= 0) {
459: return m_tokenSize_ - m_tokenOffset_;
460: }
461: if (m_tokenStart_ == null) {
462: m_tokenStart_ = new int[TOKEN_SIZE_];
463: m_tokenLimit_ = new int[TOKEN_SIZE_];
464: }
465: do {
466: if (m_tokenStart_.length == result) {
467: int temptokenindex[] = m_tokenStart_;
468: int temptokensize[] = m_tokenLimit_;
469: int originalsize = temptokenindex.length;
470: int newsize = originalsize + TOKEN_SIZE_;
471: m_tokenStart_ = new int[newsize];
472: m_tokenLimit_ = new int[newsize];
473: System.arraycopy(temptokenindex, 0, m_tokenStart_,
474: 0, originalsize);
475: System.arraycopy(temptokensize, 0, m_tokenLimit_,
476: 0, originalsize);
477: }
478: m_tokenStart_[result] = m_nextOffset_;
479: if (m_returnDelimiters_) {
480: int c = UTF16.charAt(m_source_, m_nextOffset_);
481: boolean contains = delims == null ? m_delimiters_
482: .contains(c) : c < delims.length
483: && delims[c];
484: if (contains) {
485: if (m_coalesceDelimiters_) {
486: m_tokenLimit_[result] = getNextNonDelimiter(m_nextOffset_);
487: } else {
488: int p = m_nextOffset_ + 1;
489: if (p == m_length_) {
490: p = -1;
491: }
492: m_tokenLimit_[result] = p;
493:
494: }
495: } else {
496: m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
497: }
498: m_nextOffset_ = m_tokenLimit_[result];
499: } else {
500: m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
501: m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]);
502: }
503: result++;
504: } while (m_nextOffset_ >= 0);
505: m_tokenOffset_ = 0;
506: m_tokenSize_ = result;
507: m_nextOffset_ = m_tokenStart_[0];
508: }
509: return result;
510: }
511:
512: // private data members -------------------------------------------------
513:
514: /**
515: * Current offset to the token array. If the array token is not set up yet,
516: * this value is a -1
517: */
518: private int m_tokenOffset_;
519: /**
520: * Size of the token array. If the array token is not set up yet,
521: * this value is a -1
522: */
523: private int m_tokenSize_;
524: /**
525: * Array of pre-calculated tokens start indexes in source string terminated
526: * by -1.
527: * This is only set up during countTokens() and only stores the remaining
528: * tokens, not all tokens including parsed ones
529: */
530: private int m_tokenStart_[];
531: /**
532: * Array of pre-calculated tokens limit indexes in source string.
533: * This is only set up during countTokens() and only stores the remaining
534: * tokens, not all tokens including parsed ones
535: */
536: private int m_tokenLimit_[];
537: /**
538: * UnicodeSet containing delimiters
539: */
540: private UnicodeSet m_delimiters_;
541: /**
542: * String to parse for tokens
543: */
544: private String m_source_;
545: /**
546: * Length of m_source_
547: */
548: private int m_length_;
549: /**
550: * Current position in string to parse for tokens
551: */
552: private int m_nextOffset_;
553: /**
554: * Flag indicator if delimiters are to be treated as tokens too
555: */
556: private boolean m_returnDelimiters_;
557:
558: /**
559: * Flag indicating whether to coalesce runs of delimiters into single tokens
560: */
561: private boolean m_coalesceDelimiters_;
562:
563: /**
564: * Default set of delimiters \t\n\r\f
565: */
566: private static final UnicodeSet DEFAULT_DELIMITERS_ = new UnicodeSet(
567: "[ \t\n\r\f]", false);
568: /**
569: * Array size increments
570: */
571: private static final int TOKEN_SIZE_ = 100;
572: /**
573: * A empty delimiter UnicodeSet, used when user specified null delimiters
574: */
575: private static final UnicodeSet EMPTY_DELIMITER_ = new UnicodeSet();
576:
577: // private methods ------------------------------------------------------
578:
579: /**
580: * Gets the index of the next delimiter after offset
581: * @param offset to the source string
582: * @return offset of the immediate next delimiter, otherwise
583: * (- source string length - 1) if there
584: * are no more delimiters after m_nextOffset
585: */
586: private int getNextDelimiter(int offset) {
587: if (offset >= 0) {
588: int result = offset;
589: int c = 0;
590: if (delims == null) {
591: do {
592: c = UTF16.charAt(m_source_, result);
593: if (m_delimiters_.contains(c)) {
594: break;
595: }
596: result++;
597: } while (result < m_length_);
598: } else {
599: do {
600: c = UTF16.charAt(m_source_, result);
601: if (c < delims.length && delims[c]) {
602: break;
603: }
604: result++;
605: } while (result < m_length_);
606: }
607: if (result < m_length_) {
608: return result;
609: }
610: }
611: return -1 - m_length_;
612: }
613:
614: /**
615: * Gets the index of the next non-delimiter after m_nextOffset_
616: * @param offset to the source string
617: * @return offset of the immediate next non-delimiter, otherwise
618: * (- source string length - 1) if there
619: * are no more delimiters after m_nextOffset
620: */
621: private int getNextNonDelimiter(int offset) {
622: if (offset >= 0) {
623: int result = offset;
624: int c = 0;
625: if (delims == null) {
626: do {
627: c = UTF16.charAt(m_source_, result);
628: if (!m_delimiters_.contains(c)) {
629: break;
630: }
631: result++;
632: } while (result < m_length_);
633: } else {
634: do {
635: c = UTF16.charAt(m_source_, result);
636: if (!(c < delims.length && delims[c])) {
637: break;
638: }
639: result++;
640: } while (result < m_length_);
641: }
642: if (result < m_length_) {
643: return result;
644: }
645: }
646: return -1 - m_length_;
647: }
648:
649: void checkDelimiters() {
650: if (m_delimiters_ == null || m_delimiters_.size() == 0) {
651: delims = new boolean[0];
652: } else {
653: int maxChar = m_delimiters_.getRangeEnd(m_delimiters_
654: .getRangeCount() - 1);
655: if (maxChar < 0x7f) {
656: delims = new boolean[maxChar + 1];
657: for (int i = 0, ch; -1 != (ch = m_delimiters_.charAt(i)); ++i) {
658: delims[ch] = true;
659: }
660: } else {
661: delims = null;
662: }
663: }
664: }
665:
666: private boolean[] delims;
667: }
|