0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017: package org.apache.commons.lang.text;
0018:
0019: import java.util.ArrayList;
0020: import java.util.Collections;
0021: import java.util.List;
0022: import java.util.ListIterator;
0023: import java.util.NoSuchElementException;
0024:
0025: /**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
0028: * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer};
 * however, it offers much more control and flexibility, including
 * implementing the <code>ListIterator</code> interface. By default, it is
 * set up like <code>StringTokenizer</code>.
0034: * <p>
0035: * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
0037: * One or more delimiter characters must be specified.
0038: * <p>
0039: * Each token may be surrounded by quotes.
0040: * The <i>quote</i> matcher specifies the quote character(s).
0041: * A quote may be escaped within a quoted section by duplicating itself.
0042: * <p>
 * Between each token and the delimiter there may be characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One use might be to trim whitespace characters.
0046: * <p>
 * At any point outside the quotes there may be invalid characters.
 * The <i>ignored</i> matcher specifies the characters to be removed.
 * One use might be to remove new line characters.
0050: * <p>
0051: * Empty tokens may be removed or returned as null.
0052: * <pre>
0053: * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
0054: * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
0055: * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
0056: * </pre>
0057: * <p>
0058: *
0059: * This tokenizer has the following properties and options:
0060: *
0061: * <table>
0062: * <tr>
0063: * <th>Property</th><th>Type</th><th>Default</th>
0064: * </tr>
0065: * <tr>
0066: * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
0067: * </tr>
0068: * <tr>
0069: * <td>quote</td><td>NoneMatcher</td><td>{}</td>
0070: * </tr>
 * <tr>
 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 * <td>trimmer</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
0074: * <tr>
0075: * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
0076: * </tr>
0077: * <tr>
0078: * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
0079: * </tr>
0080: * </table>
0081: *
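 * <p>
 * A typical usage pattern might look like this (a minimal sketch; the
 * printed output assumes the comma delimiter shown and the default
 * settings described above):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a,b,c", ',');
 * while (tok.hasNext()) {
 *     System.out.println(tok.nextToken());
 * }
 * // prints "a", "b" and then "c"
 * </pre>
 *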
0082: * @author Matthew Inger
0083: * @author Stephen Colebourne
0084: * @author Gary D. Gregory
0085: * @since 2.2
0086: * @version $Id: StrTokenizer.java 491653 2007-01-01 22:03:58Z ggregory $
0087: */
0088: public class StrTokenizer implements ListIterator, Cloneable {
0089:
0090: private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
0091: private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }
0117:
0118: /** The text to work on. */
0119: private char chars[];
0120: /** The parsed tokens */
0121: private String tokens[];
0122: /** The current iteration position */
0123: private int tokenPos;
0124:
0125: /** The delimiter matcher */
0126: private StrMatcher delimMatcher = StrMatcher.splitMatcher();
0127: /** The quote matcher */
0128: private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
0129: /** The ignored matcher */
0130: private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
0131: /** The trimmer matcher */
0132: private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
0133:
0134: /** Whether to return empty tokens as null */
0135: private boolean emptyAsNull = false;
0136: /** Whether to ignore empty tokens */
0137: private boolean ignoreEmptyTokens = true;
0138:
0139: //-----------------------------------------------------------------------
0140:
0141: /**
0142: * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
0143: *
0144: * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
0145: */
0146: private static StrTokenizer getCSVClone() {
0147: return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
0148: }
0149:
0150: /**
     * Gets a new tokenizer instance which parses Comma Separated Value (CSV)
     * strings. The default for CSV processing is to trim whitespace from
     * both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
0158: */
0159: public static StrTokenizer getCSVInstance() {
0160: return getCSVClone();
0161: }
0162:
    /**
     * Gets a new tokenizer instance which parses Comma Separated Value (CSV)
     * strings, initializing it with the given input. The default for CSV
     * processing is to trim whitespace from both ends (which can be
     * overridden with the setTrimmer method).
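     * <p>
     * For example, a minimal sketch of parsing one quoted CSV line
     * (the result shown assumes the default CSV configuration above):
     * <pre>
     * String[] fields = StrTokenizer.getCSVInstance("a, \"b,c\", d").getTokenArray();
     * // fields is {"a", "b,c", "d"}
     * </pre>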
0168: *
0169: * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
0171: */
0172: public static StrTokenizer getCSVInstance(String input) {
0173: StrTokenizer tok = getCSVClone();
0174: tok.reset(input);
0175: return tok;
0176: }
0177:
0178: /**
     * Gets a new tokenizer instance which parses Comma Separated Value (CSV)
     * strings, initializing it with the given input. The default for CSV
     * processing is to trim whitespace from both ends (which can be
     * overridden with the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
0186: */
0187: public static StrTokenizer getCSVInstance(char[] input) {
0188: StrTokenizer tok = getCSVClone();
0189: tok.reset(input);
0190: return tok;
0191: }
0192:
0193: /**
0194: * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
0195: *
0196: * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
0197: */
0198: private static StrTokenizer getTSVClone() {
0199: return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
0200: }
0201:
0202: /**
     * Gets a new tokenizer instance which parses Tab Separated Value (TSV)
     * strings. The default for TSV processing is to trim whitespace from
     * both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
0209: */
0210: public static StrTokenizer getTSVInstance() {
0211: return getTSVClone();
0212: }
0213:
0214: /**
     * Gets a new tokenizer instance which parses Tab Separated Value (TSV)
     * strings, initializing it with the given input. The default for TSV
     * processing is to trim whitespace from both ends (which can be
     * overridden with the setTrimmer method).
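     * <p>
     * For example, a minimal sketch (the result shown assumes the default
     * TSV configuration above):
     * <pre>
     * String[] fields = StrTokenizer.getTSVInstance("a\tb\tc").getTokenArray();
     * // fields is {"a", "b", "c"}
     * </pre>
     *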
0218: * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
0220: */
0221: public static StrTokenizer getTSVInstance(String input) {
0222: StrTokenizer tok = getTSVClone();
0223: tok.reset(input);
0224: return tok;
0225: }
0226:
0227: /**
     * Gets a new tokenizer instance which parses Tab Separated Value (TSV)
     * strings, initializing it with the given input. The default for TSV
     * processing is to trim whitespace from both ends (which can be
     * overridden with the setTrimmer method).
     *
     * @param input the character array to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
0233: */
0234: public static StrTokenizer getTSVInstance(char[] input) {
0235: StrTokenizer tok = getTSVClone();
0236: tok.reset(input);
0237: return tok;
0238: }
0239:
0240: //-----------------------------------------------------------------------
0241: /**
0242: * Constructs a tokenizer splitting on space, tab, newline and formfeed
0243: * as per StringTokenizer, but with no text to tokenize.
0244: * <p>
0245: * This constructor is normally used with {@link #reset(String)}.
0246: */
    public StrTokenizer() {
        super();
        this.chars = null;
    }
0251:
0252: /**
0253: * Constructs a tokenizer splitting on space, tab, newline and formfeed
0254: * as per StringTokenizer.
0255: *
0256: * @param input the string which is to be parsed
0257: */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }
0266:
0267: /**
0268: * Constructs a tokenizer splitting on the specified delimiter character.
0269: *
0270: * @param input the string which is to be parsed
0271: * @param delim the field delimiter character
0272: */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }
0277:
0278: /**
0279: * Constructs a tokenizer splitting on the specified delimiter string.
0280: *
0281: * @param input the string which is to be parsed
0282: * @param delim the field delimiter string
0283: */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }
0288:
0289: /**
0290: * Constructs a tokenizer splitting using the specified delimiter matcher.
0291: *
0292: * @param input the string which is to be parsed
0293: * @param delim the field delimiter matcher
0294: */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
0299:
0300: /**
0301: * Constructs a tokenizer splitting on the specified delimiter character
0302: * and handling quotes using the specified quote character.
0303: *
0304: * @param input the string which is to be parsed
0305: * @param delim the field delimiter character
0306: * @param quote the field quoted string character
0307: */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
0312:
0313: /**
0314: * Constructs a tokenizer splitting using the specified delimiter matcher
0315: * and handling quotes using the specified quote matcher.
0316: *
0317: * @param input the string which is to be parsed
0318: * @param delim the field delimiter matcher
0319: * @param quote the field quoted string matcher
0320: */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
0325:
0326: /**
0327: * Constructs a tokenizer splitting on space, tab, newline and formfeed
0328: * as per StringTokenizer.
0329: * <p>
0330: * The input character array is not cloned, and must not be altered after
0331: * passing in to this method.
0332: *
0333: * @param input the string which is to be parsed, not cloned
0334: */
    public StrTokenizer(char[] input) {
        super();
        this.chars = input;
    }
0339:
0340: /**
0341: * Constructs a tokenizer splitting on the specified character.
0342: * <p>
0343: * The input character array is not cloned, and must not be altered after
0344: * passing in to this method.
0345: *
0346: * @param input the string which is to be parsed, not cloned
0347: * @param delim the field delimiter character
0348: */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }
0353:
0354: /**
0355: * Constructs a tokenizer splitting on the specified string.
0356: * <p>
0357: * The input character array is not cloned, and must not be altered after
0358: * passing in to this method.
0359: *
0360: * @param input the string which is to be parsed, not cloned
0361: * @param delim the field delimiter string
0362: */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }
0367:
0368: /**
0369: * Constructs a tokenizer splitting using the specified delimiter matcher.
0370: * <p>
0371: * The input character array is not cloned, and must not be altered after
0372: * passing in to this method.
0373: *
0374: * @param input the string which is to be parsed, not cloned
0375: * @param delim the field delimiter matcher
0376: */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
0381:
0382: /**
0383: * Constructs a tokenizer splitting on the specified delimiter character
0384: * and handling quotes using the specified quote character.
0385: * <p>
0386: * The input character array is not cloned, and must not be altered after
0387: * passing in to this method.
0388: *
0389: * @param input the string which is to be parsed, not cloned
0390: * @param delim the field delimiter character
0391: * @param quote the field quoted string character
0392: */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
0397:
0398: /**
0399: * Constructs a tokenizer splitting using the specified delimiter matcher
0400: * and handling quotes using the specified quote matcher.
0401: * <p>
0402: * The input character array is not cloned, and must not be altered after
0403: * passing in to this method.
0404: *
0405: * @param input the string which is to be parsed, not cloned
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
0408: */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
0413:
0414: // API
0415: //-----------------------------------------------------------------------
0416: /**
0417: * Gets the number of tokens found in the String.
0418: *
0419: * @return the number of matched tokens
0420: */
0421: public int size() {
0422: checkTokenized();
0423: return tokens.length;
0424: }
0425:
0426: /**
0427: * Gets the next token from the String.
0428: *
0429: * @return the next sequential token, or null when no more tokens are found
0430: */
0431: public String nextToken() {
0432: if (hasNext()) {
0433: return tokens[tokenPos++];
0434: }
0435: return null;
0436: }
0437:
0438: /**
0439: * Gets the previous token from the String.
0440: *
0441: * @return the previous sequential token, or null when no more tokens are found
0442: */
0443: public String previousToken() {
0444: if (hasPrevious()) {
0445: return tokens[--tokenPos];
0446: }
0447: return null;
0448: }
0449:
0450: /**
0451: * Gets a copy of the full token list as an independent modifiable array.
0452: *
0453: * @return the tokens as a String array
0454: */
0455: public String[] getTokenArray() {
0456: checkTokenized();
0457: return (String[]) tokens.clone();
0458: }
0459:
0460: /**
0461: * Gets a copy of the full token list as an independent modifiable list.
0462: *
     * @return the tokens as a String list
0464: */
0465: public List getTokenList() {
0466: checkTokenized();
0467: List list = new ArrayList(tokens.length);
0468: for (int i = 0; i < tokens.length; i++) {
0469: list.add(tokens[i]);
0470: }
0471: return list;
0472: }
0473:
0474: /**
0475: * Resets this tokenizer, forgetting all parsing and iteration already completed.
0476: * <p>
0477: * This method allows the same tokenizer to be reused for the same String.
0478: *
0479: * @return this, to enable chaining
0480: */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }
0486:
0487: /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can reuse a tokenizer with the same settings
     * on multiple input lines.
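     * <p>
     * For example, a minimal sketch of reusing one tokenizer for two lines:
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * String[] first = tok.reset("a,b,c").getTokenArray();
     * String[] second = tok.reset("d,e,f").getTokenArray();
     * </pre>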
0491: *
0492: * @param input the new string to tokenize, null sets no text to parse
0493: * @return this, to enable chaining
0494: */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }
0504:
0505: /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can reuse a tokenizer with the same settings
     * on multiple input lines.
0509: * <p>
0510: * The input character array is not cloned, and must not be altered after
0511: * passing in to this method.
0512: *
0513: * @param input the new character array to tokenize, not cloned, null sets no text to parse
0514: * @return this, to enable chaining
0515: */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = input;
        return this;
    }
0521:
0522: // ListIterator
0523: //-----------------------------------------------------------------------
0524: /**
0525: * Checks whether there are any more tokens.
0526: *
0527: * @return true if there are more tokens
0528: */
0529: public boolean hasNext() {
0530: checkTokenized();
0531: return tokenPos < tokens.length;
0532: }
0533:
0534: /**
     * Gets the next token. This method behaves like {@link #nextToken()},
     * except that it throws {@link NoSuchElementException} when there are
     * no more tokens.
0536: *
0537: * @return the next String token
0538: */
0539: public Object next() {
0540: if (hasNext()) {
0541: return tokens[tokenPos++];
0542: }
0543: throw new NoSuchElementException();
0544: }
0545:
0546: /**
0547: * Gets the index of the next token to return.
0548: *
0549: * @return the next token index
0550: */
0551: public int nextIndex() {
0552: return tokenPos;
0553: }
0554:
0555: /**
0556: * Checks whether there are any previous tokens that can be iterated to.
0557: *
0558: * @return true if there are previous tokens
0559: */
0560: public boolean hasPrevious() {
0561: checkTokenized();
0562: return tokenPos > 0;
0563: }
0564:
0565: /**
0566: * Gets the token previous to the last returned token.
0567: *
0568: * @return the previous token
0569: */
0570: public Object previous() {
0571: if (hasPrevious()) {
0572: return tokens[--tokenPos];
0573: }
0574: throw new NoSuchElementException();
0575: }
0576:
0577: /**
0578: * Gets the index of the previous token.
0579: *
0580: * @return the previous token index
0581: */
0582: public int previousIndex() {
0583: return tokenPos - 1;
0584: }
0585:
0586: /**
0587: * Unsupported ListIterator operation.
0588: *
0589: * @throws UnsupportedOperationException always
0590: */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }
0595:
0596: /**
0597: * Unsupported ListIterator operation.
0598: * @param obj this parameter ignored.
0599: * @throws UnsupportedOperationException always
0600: */
0601: public void set(Object obj) {
0602: throw new UnsupportedOperationException("set() is unsupported");
0603: }
0604:
0605: /**
0606: * Unsupported ListIterator operation.
0607: * @param obj this parameter ignored.
0608: * @throws UnsupportedOperationException always
0609: */
0610: public void add(Object obj) {
0611: throw new UnsupportedOperationException("add() is unsupported");
0612: }
0613:
0614: // Implementation
0615: //-----------------------------------------------------------------------
0616: /**
     * Checks if tokenization has been done, and if not, does it.
0618: */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                List split = tokenize(null, 0, 0);
                tokens = (String[]) split.toArray(new String[split.size()]);
            } else {
                List split = tokenize(chars, 0, chars.length);
                tokens = (String[]) split.toArray(new String[split.size()]);
            }
        }
    }
0633:
0634: /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method; however, a subclass
     * may pass other values, or even an entirely different array.
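     * <p>
     * A minimal sketch of such a subclass (the <code>FilteringTokenizer</code>
     * name and its post-processing step are illustrative only):
     * <pre>
     * public class FilteringTokenizer extends StrTokenizer {
     *     protected List tokenize(char[] chars, int offset, int count) {
     *         List tokens = super.tokenize(chars, offset, count);
     *         // post-process or filter the returned list here
     *         return tokens;
     *     }
     * }
     * </pre>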
0648: *
0649: * @param chars the character array being tokenized, may be null
0650: * @param offset the start position within the character array, must be valid
0651: * @param count the number of characters to tokenize, must be valid
0652: * @return the modifiable list of String tokens, unmodifiable if null array or zero count
0653: */
0654: protected List tokenize(char[] chars, int offset, int count) {
0655: if (chars == null || count == 0) {
0656: return Collections.EMPTY_LIST;
0657: }
0658: StrBuilder buf = new StrBuilder();
0659: List tokens = new ArrayList();
0660: int pos = offset;
0661:
0662: // loop around the entire buffer
0663: while (pos >= 0 && pos < count) {
0664: // find next token
0665: pos = readNextToken(chars, pos, count, buf, tokens);
0666:
0667: // handle case where end of string is a delimiter
0668: if (pos >= count) {
0669: addToken(tokens, "");
0670: }
0671: }
0672: return tokens;
0673: }
0674:
0675: /**
0676: * Adds a token to a list, paying attention to the parameters we've set.
0677: *
0678: * @param list the list to add to
0679: * @param tok the token to add
0680: */
0681: private void addToken(List list, String tok) {
0682: if (tok == null || tok.length() == 0) {
0683: if (isIgnoreEmptyTokens()) {
0684: return;
0685: }
0686: if (isEmptyTokenAsNull()) {
0687: tok = null;
0688: }
0689: }
0690: list.add(tok);
0691: }
0692:
0693: /**
0694: * Reads character by character through the String to get the next token.
0695: *
0696: * @param chars the character array being tokenized
0697: * @param start the first character of field
0698: * @param len the length of the character array being tokenized
0699: * @param workArea a temporary work area
0700: * @param tokens the list of parsed tokens
0701: * @return the starting position of the next field (the character
0702: * immediately after the delimiter), or -1 if end of string found
0703: */
    private int readNextToken(char[] chars, int start, int len,
            StrBuilder workArea, List tokens) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0
                    || getDelimiterMatcher().isMatch(chars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len,
                    workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }
0745:
0746: /**
0747: * Reads a possibly quoted string token.
0748: *
0749: * @param chars the character array being tokenized
0750: * @param start the first character of field
0751: * @param len the length of the character array being tokenized
0752: * @param workArea a temporary work area
0753: * @param tokens the list of parsed tokens
0754: * @param quoteStart the start position of the matched quote, 0 if no quoting
0755: * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
0759: */
    private int readWithQuotes(char[] chars, int start, int len,
            StrBuilder workArea, List tokens, int quoteStart, int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = (quoteLen > 0);
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0) {
                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                        quoting = true;
                        pos += quoteLen;
                        continue;
                    }
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }
0851:
0852: /**
0853: * Checks if the characters at the index specified match the quote
0854: * already matched in readNextToken().
0855: *
0856: * @param chars the character array being tokenized
0857: * @param pos the position to check for a quote
0858: * @param len the length of the character array being tokenized
0859: * @param quoteStart the start position of the matched quote, 0 if no quoting
0860: * @param quoteLen the length of the matched quote, 0 if no quoting
0861: * @return true if a quote is matched
0862: */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }
0873:
0874: // Delimiter
0875: //-----------------------------------------------------------------------
0876: /**
0877: * Gets the field delimiter matcher.
0878: *
0879: * @return the delimiter matcher in use
0880: */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }
0884:
0885: /**
0886: * Sets the field delimiter matcher.
0887: * <p>
     * The delimiter is used to separate one token from another.
0889: *
0890: * @param delim the delimiter matcher to use
0891: * @return this, to enable chaining
0892: */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }
0901:
0902: /**
0903: * Sets the field delimiter character.
0904: *
0905: * @param delim the delimiter character to use
0906: * @return this, to enable chaining
0907: */
0908: public StrTokenizer setDelimiterChar(char delim) {
0909: return setDelimiterMatcher(StrMatcher.charMatcher(delim));
0910: }
0911:
0912: /**
0913: * Sets the field delimiter string.
0914: *
0915: * @param delim the delimiter string to use
0916: * @return this, to enable chaining
0917: */
0918: public StrTokenizer setDelimiterString(String delim) {
0919: return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
0920: }
0921:
0922: // Quote
0923: //-----------------------------------------------------------------------
0924: /**
0925: * Gets the quote matcher currently in use.
0926: * <p>
0927: * The quote character is used to wrap data between the tokens.
0928: * This enables delimiters to be entered as data.
     * By default, quoting is disabled (the none matcher is used).
0930: *
0931: * @return the quote matcher in use
0932: */
0933: public StrMatcher getQuoteMatcher() {
0934: return quoteMatcher;
0935: }
0936:
0937: /**
     * Sets the quote matcher to use.
0939: * <p>
0940: * The quote character is used to wrap data between the tokens.
0941: * This enables delimiters to be entered as data.
0942: *
0943: * @param quote the quote matcher to use, null ignored
0944: * @return this, to enable chaining
0945: */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }
0952:
0953: /**
0954: * Sets the quote character to use.
0955: * <p>
0956: * The quote character is used to wrap data between the tokens.
0957: * This enables delimiters to be entered as data.
0958: *
0959: * @param quote the quote character to use
0960: * @return this, to enable chaining
0961: */
0962: public StrTokenizer setQuoteChar(char quote) {
0963: return setQuoteMatcher(StrMatcher.charMatcher(quote));
0964: }
0965:
0966: // Ignored
0967: //-----------------------------------------------------------------------
0968: /**
0969: * Gets the ignored character matcher.
0970: * <p>
0971: * These characters are ignored when parsing the String, unless they are
0972: * within a quoted region.
0973: * The default value is not to ignore anything.
0974: *
0975: * @return the ignored matcher in use
0976: */
0977: public StrMatcher getIgnoredMatcher() {
0978: return ignoredMatcher;
0979: }
0980:
0981: /**
     * Sets the matcher for characters to ignore.
0983: * <p>
0984: * These characters are ignored when parsing the String, unless they are
0985: * within a quoted region.
0986: *
0987: * @param ignored the ignored matcher to use, null ignored
0988: * @return this, to enable chaining
0989: */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }
0996:
0997: /**
     * Sets the character to ignore.
0999: * <p>
1000: * This character is ignored when parsing the String, unless it is
1001: * within a quoted region.
1002: *
1003: * @param ignored the ignored character to use
1004: * @return this, to enable chaining
1005: */
1006: public StrTokenizer setIgnoredChar(char ignored) {
1007: return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
1008: }
1009:
1010: // Trimmer
1011: //-----------------------------------------------------------------------
1012: /**
1013: * Gets the trimmer character matcher.
1014: * <p>
1015: * These characters are trimmed off on each side of the delimiter
1016: * until the token or quote is found.
1017: * The default value is not to trim anything.
1018: *
1019: * @return the trimmer matcher in use
1020: */
1021: public StrMatcher getTrimmerMatcher() {
1022: return trimmerMatcher;
1023: }
1024:
1025: /**
1026: * Sets the matcher for characters to trim.
1027: * <p>
1028: * These characters are trimmed off on each side of the delimiter
1029: * until the token or quote is found.
1030: *
1031: * @param trimmer the trimmer matcher to use, null ignored
1032: * @return this, to enable chaining
1033: */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }
1040:
1041: //-----------------------------------------------------------------------
1042: /**
1043: * Gets whether the tokenizer currently returns empty tokens as null.
1044: * The default for this property is false.
1045: *
1046: * @return true if empty tokens are returned as null
1047: */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }
1051:
1052: /**
1053: * Sets whether the tokenizer should return empty tokens as null.
1054: * The default for this property is false.
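     * <p>
     * For example, a minimal sketch (a comma delimiter is assumed here):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a,,c", ',');
     * tok.setIgnoreEmptyTokens(false);
     * tok.setEmptyTokenAsNull(true);
     * // the tokens are "a", null, "c"
     * </pre>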
1055: *
1056: * @param emptyAsNull whether empty tokens are returned as null
1057: * @return this, to enable chaining
1058: */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }
1063:
1064: //-----------------------------------------------------------------------
1065: /**
1066: * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
1068: *
1069: * @return true if empty tokens are not returned
1070: */
1071: public boolean isIgnoreEmptyTokens() {
1072: return ignoreEmptyTokens;
1073: }
1074:
1075: /**
1076: * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
1078: *
1079: * @param ignoreEmptyTokens whether empty tokens are not returned
1080: * @return this, to enable chaining
1081: */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }
1086:
1087: //-----------------------------------------------------------------------
1088: /**
1089: * Gets the String content that the tokenizer is parsing.
1090: *
1091: * @return the string content being parsed
1092: */
1093: public String getContent() {
1094: if (chars == null) {
1095: return null;
1096: }
1097: return new String(chars);
1098: }
1099:
1100: //-----------------------------------------------------------------------
1101: /**
1102: * Creates a new instance of this Tokenizer. The new instance is reset so
1103: * that it will be at the start of the token list.
1104: * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1105: *
1106: * @return a new instance of this Tokenizer which has been reset.
1107: */
1108: public Object clone() {
1109: try {
1110: return cloneReset();
1111: } catch (CloneNotSupportedException ex) {
1112: return null;
1113: }
1114: }
1115:
1116: /**
1117: * Creates a new instance of this Tokenizer. The new instance is reset so that
1118: * it will be at the start of the token list.
1119: *
1120: * @return a new instance of this Tokenizer which has been reset.
1121: * @throws CloneNotSupportedException if there is a problem cloning
1122: */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = (char[]) cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }
1132:
1133: //-----------------------------------------------------------------------
1134: /**
     * Gets a String representation of this tokenizer, including the token
     * list if tokenization has already occurred.
     *
     * @return the string representation of this tokenizer
1138: */
1139: public String toString() {
1140: if (tokens == null) {
1141: return "StrTokenizer[not tokenized yet]";
1142: }
1143: return "StrTokenizer" + getTokenList();
1144: }
1145:
1146: }
|