001: /*
002: * <copyright>
003: *
004: * Copyright 2003-2004 BBNT Solutions, LLC
005: * under sponsorship of the Defense Advanced Research Projects
006: * Agency (DARPA).
007: *
008: * You can redistribute this software and/or modify it under the
009: * terms of the Cougaar Open Source License as published on the
010: * Cougaar Open Source Website (www.cougaar.org).
011: *
012: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
013: * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
014: * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
015: * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
016: * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
017: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
018: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
019: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
020: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
021: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
022: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
023: *
024: * </copyright>
025: *
026: * See original perl version of the parser (from japhy) at the end.
027: */
028:
029: package org.cougaar.util;
030:
031: import java.util.ArrayList;
032: import java.util.Collection;
033: import java.util.List;
034: import java.util.regex.Matcher;
035: import java.util.regex.Pattern;
036:
/**
 * Static helpers for splitting a single line of CSV text into its fields.
 *
 * <p>Fields are separated by a separator character (a comma by default).
 * A field may be wrapped in double quotes, in which case it may contain
 * the separator, and a literal double quote inside it is written as two
 * consecutive double quotes. Whitespace before a field is skipped and
 * whitespace after an unquoted field is removed. A separator at the very
 * end of the input does not produce a trailing empty field, because the
 * pattern refuses to start a match at the end of the input.
 *
 * <p>The class cannot be instantiated; all methods are static.
 */
public class CSVUtility {
  private CSVUtility() {
  }

  // The field pattern is assembled as PREFIX + sep + MIDDLE + sep + SUFFIX:
  //   \G\s*(?!$)            continue where the last match ended, skip blanks,
  //                         and refuse to match at end of input
  //   ("..."|[^sep"]*)      group 1: a quoted field (with "" escapes) or a
  //                         run of characters that are neither sep nor quote
  //   sep?                  an optional separator that ends the field
  private static final String PATTERN_PREFIX = "\\G\\s*(?!$)(\\\"[^\\\"]*(?:\\\"\\\"[^\\\"]*)*\\\"|[^";
  private static final String PATTERN_MIDDLE = "\\\"]*)";
  private static final String PATTERN_SUFFIX = "?";

  /** Pre-compiled pattern for the default comma separator. */
  private static final Pattern COMMA_PATTERN = buildPattern(',');

  /** Zero-length array used only to tell toArray which array type to return. */
  private static final String[] EMPTY_STRINGS = {};

  /**
   * Builds the field-matching pattern for the given separator character.
   *
   * @param comma the separator character
   * @return the compiled pattern; group 1 of each match is one raw field
   */
  private static Pattern buildPattern(char comma) {
    String sep = quoteSeparator(comma);
    return Pattern.compile(PATTERN_PREFIX + sep + PATTERN_MIDDLE
        + sep + PATTERN_SUFFIX);
  }

  /**
   * Returns the regex source that matches the separator literally, both
   * inside and outside a character class.
   *
   * <p>Letters and digits are never regex metacharacters, and a backslash
   * in front of them would create an escape such as {@code \d} or a back
   * reference such as {@code \1}, so they are returned unchanged. Every
   * other character is prefixed with a backslash, which the regex syntax
   * permits for any non-alphabetic character.
   */
  private static String quoteSeparator(char sep) {
    if (Character.isLetterOrDigit(sep)) {
      return String.valueOf(sep);
    }
    return "\\" + sep;
  }

  /**
   * Parses a single string in Microsoft-style CSV format, using a comma
   * as the separator.
   *
   * @param str the line to parse; must not be null
   * @return the fields in order of appearance, never null
   * @throws NullPointerException if str is null
   */
  public static String[] parse(String str) {
    return parse(str, COMMA_PATTERN);
  }

  /**
   * Parses a single string in CSV format using the given separator.
   *
   * @param str the line to parse; must not be null
   * @param sep the character that separates fields
   * @return the fields in order of appearance, never null
   * @throws NullPointerException if str is null
   */
  public static String[] parse(String str, char sep) {
    return parse(str, buildPattern(sep));
  }

  /** Parses with a ready-made pattern and copies the result into an array. */
  private static String[] parse(String str, Pattern p) {
    List fields = parseToList(str, p);
    return (String[]) fields.toArray(EMPTY_STRINGS);
  }

  /**
   * Parses a comma-separated string into a list of fields.
   *
   * @param str the line to parse; must not be null
   * @return a new modifiable list of String fields
   * @throws NullPointerException if str is null
   */
  public static List parseToList(String str) {
    return parseToList(str, COMMA_PATTERN);
  }

  /**
   * Parses a string into a list of fields using the given separator.
   *
   * @param str the line to parse; must not be null
   * @param sep the character that separates fields
   * @return a new modifiable list of String fields
   * @throws NullPointerException if str is null
   */
  public static List parseToList(String str, char sep) {
    return parseToList(str, buildPattern(sep));
  }

  /**
   * Parses a string into a list of fields using a caller-supplied pattern.
   *
   * <p>The pattern is applied repeatedly with {@link Matcher#find()}; the
   * text captured by group 1 of every match becomes one field. The field
   * is trimmed, and if it is wrapped in double quotes the quotes are
   * removed and each doubled quote inside is collapsed to a single one.
   *
   * @param str the line to parse; must not be null
   * @param p the pattern whose capture group 1 yields one raw field
   * @return a new modifiable list of String fields
   * @throws NullPointerException if str is null
   */
  public static List parseToList(String str, Pattern p) {
    List fields = new ArrayList();
    Matcher m = p.matcher(str);
    while (m.find()) {
      // Strings are immutable, so the trimmed value has to be reassigned.
      String v = m.group(1).trim();
      // The length check keeps a lone quote character from being treated
      // as both the opening and the closing quote.
      if (v.length() > 1 && v.startsWith("\"")
          && v.endsWith("\"")) {
        v = v.substring(1, v.length() - 1);
        v = v.replaceAll("\"\"", "\"");
      }
      fields.add(v);
    }
    return fields;
  }

  /** @deprecated Use parseToList instead **/
  public static Collection parseToCollection(String str) {
    return parseToList(str, COMMA_PATTERN);
  }
}
096:
097: /*
098: From the perl at http://japhy.perlmonk.org/
099:
100: @fields = parseCSV(
101: 'this,that,"those,these, and mine","""I want those,"" he said"'
102: );
103:
104: print map "($_)", @fields;
105:
106:
107: sub parseCSV {
108: my $str = @_ ? shift : $_;
109: my @ret;
110: while ($str =~ /\G\s*(?!$)("[^"]*(?:""[^"]*)*"|[^,"]*),?/g) {
111: push @ret, $1;
112: $ret[-1] =~ s/\s+$//;
113: if ($ret[-1] =~ s/^"//) { chop $ret[-1]; $ret[-1] =~ s/""/"/g }
114: }
115: return @ret;
116: }
117:
118:
119: __END__
120:
121: $str =~ m{
122: \G # where the last match left off (defaults to ^)
123: \s* # any whitespace (we don't save this)
124: (?!$) # NOT followed by end-of-string
125: ( # save this to $1 (this is ONE CSV)
126: " # a "
127: [^"]* # followed by 0 or more non-"
128: (?: # then...
129: "" # a "" (which is a CSV 'escape' for a ")
130: [^"]* # followed by 0 or more non-"
131: )* # 0 or more times
132: " # the ending "
133: | # *OR*
134: [^,"]* # 0 or more non-, and non-" characters
135: )
136: ,? # with an optional ending comma
137: }gx;
138: */
|