001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.generation;
018:
019: import java.io.BufferedReader;
020: import java.io.ByteArrayInputStream;
021: import java.io.CharArrayWriter;
022: import java.io.IOException;
023: import java.io.InputStream;
024: import java.io.InputStreamReader;
025: import java.io.Reader;
026: import java.io.Serializable;
027: import java.util.HashMap;
028: import java.util.Map;
029:
030: import org.apache.avalon.framework.parameters.Parameters;
031: import org.apache.cocoon.ProcessingException;
032: import org.apache.cocoon.environment.SourceResolver;
033: import org.apache.excalibur.source.Source;
034: import org.xml.sax.Attributes;
035: import org.xml.sax.Locator;
036: import org.xml.sax.SAXException;
037: import org.xml.sax.helpers.AttributesImpl;
038:
039: /**
040: * <p>A simple parser converting a Comma Separated Values (CSV) file into XML.</p>
041: *
042: * <p>This parser is controlled by the following sitemap parameters:</p>
043: *
044: * <ul>
045: * <li>
046: * <b>process-headers</b>: whether the first line in the CSV is considered
047: * to be the header defining column names; the structure of the resulting
048: * output depends on this setting (default: <i>false</i>).
049: * </li>
050: * <li>
051: * <b>max-records</b>: the maximum number of records to read
052: * (default: <i>-1</i> read all records).
053: * </li>
054: * <li>
055: * <b>encoding</b>: the character encoding (UTF-8, ISO8859-1, ...) used to
056: * interpret the input CSV source file (default: <i>system default</i>).
057: * </li>
058: * <li>
059: * <b>separator</b>: the field-separator character in the CSV file (comma,
060: * tab, ...) (default: <i>,</i> <small>comma</small>).
061: * </li>
062: * <li>
063: * <b>escape</b>: the character used to escape fields, or part of them, in
064: * the CSV file (default: <i>"</i> <small>quote</small>).
065: * </li>
066: * <li>
067: * <b>buffer-size</b>: the size of the buffer used for reading the source
068: * CSV file (default: <i>4096 bytes</i>).
069: * </li>
070: * </ul>
071: *
072: * <p>The generated output will look something like the following:</p>
073: *
074: * <pre>
075: * &lt;?xml version="1.0" encoding="ISO-8859-1"?&gt;
076: * &lt;csv:document xmlns:csv="http://apache.org/cocoon/csv/1.0"&gt;
077: *   &lt;csv:header&gt;
078: *     &lt;csv:column number="1"&gt;Column A&lt;/csv:column&gt;
079: *     &lt;csv:column number="2"&gt;Column B&lt;/csv:column&gt;
080: *     &lt;csv:column number="3"&gt;Column C&lt;/csv:column&gt;
081: *   &lt;/csv:header&gt;
082: *   &lt;csv:record number="1"&gt;
083: *     &lt;csv:field number="1" column="Column A"&gt;Field A1&lt;/csv:field&gt;
084: *     &lt;csv:field number="2" column="Column B"&gt;Field B1&lt;/csv:field&gt;
085: *     &lt;csv:field number="3" column="Column C"&gt;Field C1&lt;/csv:field&gt;
086: *   &lt;/csv:record&gt;
087: *   &lt;csv:record number="2"&gt;
088: *     &lt;csv:field number="1" column="Column A"&gt;Field A2&lt;/csv:field&gt;
089: *     &lt;csv:field number="2" column="Column B"&gt;Field B2&lt;/csv:field&gt;
090: *     &lt;csv:field number="3" column="Column C"&gt;Field C2&lt;/csv:field&gt;
091: *   &lt;/csv:record&gt;
092: * &lt;/csv:document&gt;
093: * </pre>
094: *
095: * <p>Note that this generator has been thoroughly tested with CSV files generated
096: * by <a href="http://office.microsoft.com/" target="_new">Microsoft Excel</a>.
097: * Unfortunately no official CSV specification has ever been published by
098: * any standards body, so the interpretation of the format might be slightly
099: * different in some cases.</p>
100: *
101: * @author <a href="mailto:pier@apache.org">Pier Fumagalli</a>
102: */
103: public class CSVGenerator extends FileGenerator {
104:
105: /** <p>The namespace URI of XML generated by this instance.</p> */
106: public static final String NAMESPACE_URI = "http://apache.org/cocoon/csv/1.0";
107: /** <p>The namespace prefix of XML generated by this instance.</p> */
108: public static final String NAMESPACE_PREFIX = "csv";
109:
110: /** <p>The default encoding configured in the Java VM.</p> */
111: private static final String DEFAULT_ENCODING = new InputStreamReader(
112: new ByteArrayInputStream(new byte[0])).getEncoding();
113: /** <p>The default field separator character.</p> */
114: private static final String DEFAULT_SEPARATOR = ",";
115: /** <p>The default field separator character.</p> */
116: private static final String DEFAULT_ESCAPE = "\"";
117: /** <p>The default field separator character.</p> */
118: private static final int DEFAULT_BUFFER_SIZE = 4096;
119: private static final int UNLIMITED_MAXRECORDS = -1;
120: /** <p>A string used for indenting.</p> */
121: private static final char INDENT_STRING[] = "\n "
122: .toCharArray();
123:
124: /** <p>The encoding used to read the CSV resource from a stream.</p> */
125: private String encoding = DEFAULT_ENCODING;
126: /** <p>The character used to separate fields.</p> */
127: private char separator = DEFAULT_SEPARATOR.charAt(0);
128: /** <p>The character used to initiate and terminate esacaped sequences.</p> */
129: private char escape = DEFAULT_ESCAPE.charAt(0);
130: /** <p>The size of the buffer used to read the input.</p> */
131: private int buffersize = DEFAULT_BUFFER_SIZE;
132: /** <p>The current field (column) number in the current record.</p> */
133: private int fieldnumber = 1;
134: /** <p>The current record (line) number in the current CSV.</p> */
135: private int recordnumber = 1;
136: /** <p>The maximum number of records to read (-1 = read all records)</p> */
137: private int maxrecords;
138: /** <p>A flag indicating whether the <record> tag was opened.</p> */
139: private boolean openrecord = false;
140: /** <p>The character buffer for the current field.</p> */
141: private CharArrayWriter buffer = null;
142: /** <p>A map of all known columns or null if no headers are processed.</p> */
143: private Map columns = null;
144:
145: /**
146: * <p>Create a new {@link CSVGenerator} instance.</p>
147: */
148: public CSVGenerator() {
149: super ();
150: }
151:
152: /**
153: * <p>Recycle this component.</p>.
154: */
155: public void recycle() {
156: super .recycle();
157:
158: this .encoding = DEFAULT_ENCODING;
159: this .separator = DEFAULT_SEPARATOR.charAt(0);
160: this .escape = DEFAULT_ESCAPE.charAt(0);
161: this .buffersize = DEFAULT_BUFFER_SIZE;
162: this .buffer = null;
163: this .columns = null;
164: this .recordnumber = 1;
165: this .fieldnumber = 1;
166: this .openrecord = false;
167: }
168:
169: /**
170: * <p>Setup this {@link CSVGenerator} instance.</p>
171: */
172: public void setup(SourceResolver resolver, Map object_model,
173: String source, Parameters parameters)
174: throws ProcessingException, SAXException, IOException {
175: super .setup(resolver, object_model, source, parameters);
176:
177: boolean header = parameters.getParameterAsBoolean(
178: "process-headers", false);
179:
180: this .encoding = parameters.getParameter("encoding",
181: DEFAULT_ENCODING);
182: this .separator = parameters.getParameter("separator",
183: DEFAULT_SEPARATOR).charAt(0);
184: this .escape = parameters.getParameter("escape", DEFAULT_ESCAPE)
185: .charAt(0);
186: this .buffersize = parameters.getParameterAsInteger(
187: "buffer-size", DEFAULT_BUFFER_SIZE);
188: this .maxrecords = parameters.getParameterAsInteger(
189: "max-records", UNLIMITED_MAXRECORDS);
190: this .buffer = new CharArrayWriter();
191: this .columns = (header ? new HashMap() : null);
192: this .recordnumber = (header ? 0 : 1);
193: this .fieldnumber = 1;
194: this .openrecord = false;
195: }
196:
197: /**
198: * <p>Generate the unique key.</p>
199: */
200: public Serializable getKey() {
201: StringBuffer key = new StringBuffer(this .inputSource.getURI());
202: if (this .columns != null)
203: key.append("headers");
204: key.append(separator);
205: key.append(maxrecords);
206: key.append(escape);
207: return key;
208: }
209:
210: /**
211: * <p>Generate XML data from a Comma Separated Value resource.</p>.
212: */
213: public void generate() throws IOException, SAXException,
214: ProcessingException {
215:
216: /* Create a new Reader correctly decoding the source stream */
217: CSVReader csv = new CSVReader(this .inputSource, this .encoding,
218: this .buffersize);
219:
220: try {
221: /* Start the document */
222: this .contentHandler.setDocumentLocator(csv);
223: this .contentHandler.startDocument();
224: this .contentHandler.startPrefixMapping(NAMESPACE_PREFIX,
225: NAMESPACE_URI);
226: this .indent(0);
227: this .startElement("document");
228:
229: /* Allocate buffer and status for parsing */
230: boolean unescaped = true;
231: int prev = -1;
232: int curr = -1;
233:
234: /* Parse the file reading characters one-by-one */
235: while ((curr = csv.read()) >= 0
236: && (this .maxrecords == UNLIMITED_MAXRECORDS || recordnumber <= this .maxrecords)) {
237:
238: /* Process any occurrence of the escape character */
239: if (curr == this .escape) {
240: if ((unescaped) && (prev == this .escape)) {
241: this .buffer.write(this .escape);
242: }
243: unescaped = !unescaped;
244: prev = curr;
245: continue;
246: }
247:
248: /* Process any occurrence of the field separator */
249: if ((unescaped) && (curr == this .separator)) {
250: this .dumpField();
251: prev = curr;
252: continue;
253: }
254:
255: /* Process newline characters */
256: if ((unescaped) && ((curr == '\r') || (curr == '\n'))) {
257: this .dumpField();
258: this .dumpRecord();
259:
260: /* Record numbering */
261: if (((curr == '\n') && (prev != '\r'))
262: || (curr == '\r')) {
263: this .recordnumber++;
264: }
265:
266: /* Nothing else to do */
267: prev = curr;
268: continue;
269: }
270:
271: /* Any other character simply gets added to the buffer */
272: this .buffer.write(curr);
273: prev = curr;
274: }
275:
276: /* Terminate any hanging open record element (just in case) */
277: this .dumpField();
278: this .dumpRecord();
279:
280: /* Terminate the document */
281: this .indent(0);
282: this .endElement("document");
283: this .contentHandler.endPrefixMapping(NAMESPACE_PREFIX);
284: this .contentHandler.endDocument();
285:
286: } finally {
287: csv.close();
288: }
289: }
290:
291: private void dumpField() throws SAXException {
292: if (this .buffer.size() < 1) {
293: this .fieldnumber++;
294: return;
295: }
296:
297: if (!this .openrecord) {
298: this .indent(4);
299:
300: if (this .recordnumber > 0) {
301: AttributesImpl attributes = new AttributesImpl();
302: String value = Integer.toString(this .recordnumber);
303: attributes.addAttribute("", "number", "number",
304: "CDATA", value);
305: this .startElement("record", attributes);
306: } else {
307: this .startElement("header");
308: }
309: this .openrecord = true;
310: }
311:
312: /* Enclode the field in the proper element */
313: String element = "field";
314: char array[] = this .buffer.toCharArray();
315: this .indent(8);
316:
317: AttributesImpl attributes = new AttributesImpl();
318: String value = Integer.toString(this .fieldnumber);
319: attributes.addAttribute("", "number", "number", "CDATA", value);
320:
321: if (this .recordnumber < 1) {
322: this .columns.put(new Integer(this .fieldnumber), new String(
323: array));
324: element = "column";
325: } else if (this .columns != null) {
326: String header = (String) this .columns.get(new Integer(
327: this .fieldnumber));
328: if (header != null) {
329: attributes.addAttribute("", "column", "column",
330: "CDATA", header);
331: }
332: }
333:
334: this .startElement(element, attributes);
335: this .contentHandler.characters(array, 0, array.length);
336: this .endElement(element);
337: this .buffer.reset();
338:
339: this .fieldnumber++;
340: }
341:
342: private void dumpRecord() throws SAXException {
343: if (this .openrecord) {
344: this .indent(4);
345: if (this .recordnumber > 0) {
346: this .endElement("record");
347: } else {
348: this .endElement("header");
349: }
350: this .openrecord = false;
351: }
352: this .fieldnumber = 1;
353: }
354:
355: private void indent(int level) throws SAXException {
356: this .contentHandler.characters(INDENT_STRING, 0, level + 1);
357: }
358:
359: private void startElement(String name) throws SAXException {
360: this .startElement(name, new AttributesImpl());
361: }
362:
363: private void startElement(String name, Attributes atts)
364: throws SAXException {
365: if (name == null)
366: throw new NullPointerException("Null name");
367: if (atts == null)
368: atts = new AttributesImpl();
369: String qual = NAMESPACE_PREFIX + ':' + name;
370: this .contentHandler.startElement(NAMESPACE_URI, name, qual,
371: atts);
372: }
373:
374: private void endElement(String name) throws SAXException {
375: String qual = NAMESPACE_PREFIX + ':' + name;
376: this .contentHandler.endElement(NAMESPACE_URI, name, qual);
377: }
378:
379: private static final class CSVReader extends Reader implements
380: Locator {
381:
382: private String uri = null;
383: private Reader input = null;
384: private int column = 1;
385: private int line = 1;
386: private int last = -1;
387:
388: private CSVReader(Source source, String encoding, int buffer)
389: throws IOException {
390: InputStream stream = source.getInputStream();
391: Reader reader = new InputStreamReader(stream, encoding);
392: this .input = new BufferedReader(reader, buffer);
393: this .uri = source.getURI();
394: }
395:
396: public String getPublicId() {
397: return null;
398: }
399:
400: public String getSystemId() {
401: return this .uri;
402: }
403:
404: public int getLineNumber() {
405: return this .line;
406: }
407:
408: public int getColumnNumber() {
409: return this .column;
410: }
411:
412: public void close() throws IOException {
413: this .input.close();
414: }
415:
416: public int read() throws IOException {
417: int c = this .input.read();
418: if (c < 0)
419: return c;
420:
421: if (((c == '\n') && (this .last != '\r')) || (c == '\r')) {
422: this .column = 1;
423: this .line++;
424: }
425:
426: this .last = c;
427: return c;
428: }
429:
430: public int read(char b[], int o, int l) throws IOException {
431: if (b == null)
432: throw new NullPointerException();
433: if ((o < 0) || (o > b.length) || (l < 0)
434: || ((o + l) > b.length) || ((o + l) < 0)) {
435: throw new IndexOutOfBoundsException();
436: }
437: if (l == 0)
438: return 0;
439:
440: int c = read();
441: if (c == -1)
442: return -1;
443: b[o] = (char) c;
444:
445: int i = 1;
446: try {
447: for (i = 1; i < l; i++) {
448: c = read();
449: if (c == -1)
450: break;
451: b[o + i] = (char) c;
452: }
453: } catch (IOException ee) {
454: return i;
455: }
456: return i;
457: }
458: }
459: }
|