001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.slop.parsing;
018:
019: import org.xml.sax.ContentHandler;
020: import org.xml.sax.SAXException;
021: import org.xml.sax.helpers.AttributesImpl;
022: import org.apache.cocoon.ProcessingException;
023: import org.apache.cocoon.xml.XMLUtils;
024: import org.apache.cocoon.slop.interfaces.SlopParser;
025: import org.apache.cocoon.slop.interfaces.SlopConstants;
026:
027: /**
028: * Simplistic SLOP parser, recognizes the following constructs:
029: *
030: * Field: a line starting with letters and : is considered a field
031: *
032: * Empty lines are detected.
033: * Other lines are output as line elements
034: *
035: * This is sufficient for basic parsing of RFC 822 headers,
036: * but a configurable rfc822 mode would be good to differentiate
037: * between the header and body of the email message and parse them
038: * with different rules.
039: *
040: * @author <a href="mailto:bdelacretaz@apache.org">Bertrand Delacretaz</a>
041: * @version $Id: SimpleSlopParser.java 433543 2006-08-22 06:22:54Z crossley $
042: */
043: public class SimpleSlopParser implements SlopParser, SlopConstants {
044:
045: private ContentHandler contentHandler;
046:
047: /** chars that can be part of a field name (other than letters) */
048: private final static String DEFAULT_TAGNAME_CHARS = "-_";
049: private String tagnameChars = DEFAULT_TAGNAME_CHARS;
050:
051: /** valid characters in an XML element name (in addition to letters and digits) */
052: final static String VALID_TAGNAME_CHARS = "_-";
053: final static String TAGNAME_REPLACEMENT_CHAR = "_";
054:
055: /** optionally preserve whitespace in input */
056: private boolean preserveSpace = false;
057:
058: /** count lines */
059: private int lineCounter;
060:
061: /** result of parsing a line */
062: static class ParsedLine {
063: final String name;
064: final String contents;
065:
066: ParsedLine(String elementName, String elementContents) {
067: name = filterElementName(elementName);
068: contents = elementContents;
069: }
070: }
071:
072: /** make sure element names are valid XML */
073: static String filterElementName(String str) {
074: final StringBuffer sb = new StringBuffer();
075: for (int i = 0; i < str.length(); i++) {
076: final char c = str.charAt(i);
077: if (Character.isLetter(c)) {
078: sb.append(c);
079: } else if (Character.isDigit(c) && i > 0) {
080: sb.append(c);
081: } else if (VALID_TAGNAME_CHARS.indexOf(c) >= 0) {
082: sb.append(c);
083: } else {
084: sb.append(TAGNAME_REPLACEMENT_CHAR);
085: }
086: }
087: return sb.toString();
088: }
089:
090: /** set the list of valid chars for tag names (in addition to letters) */
091: public void setValidTagnameChars(String str) {
092: tagnameChars = (str == null ? DEFAULT_TAGNAME_CHARS : str
093: .trim());
094: }
095:
096: /** optionally preserve whitespace in input */
097: public void setPreserveWhitespace(boolean b) {
098: preserveSpace = b;
099: }
100:
101: /** must be called before any call to processLine() */
102: public void startDocument(ContentHandler destination)
103: throws SAXException, ProcessingException {
104: contentHandler = destination;
105: contentHandler.startDocument();
106: contentHandler.startPrefixMapping("", SLOP_NAMESPACE_URI);
107: contentHandler.startElement(SLOP_NAMESPACE_URI,
108: SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT,
109: XMLUtils.EMPTY_ATTRIBUTES);
110: }
111:
112: /** must be called once all calls to processLine() are done */
113: public void endDocument() throws SAXException, ProcessingException {
114: contentHandler.endElement(SLOP_NAMESPACE_URI,
115: SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT);
116: contentHandler.endPrefixMapping("");
117: contentHandler.endDocument();
118: contentHandler = null;
119: }
120:
121: /** add simple name-value attribute to attr */
122: private void setAttribute(AttributesImpl attr, String name,
123: String value) {
124: final String ATTR_TYPE = "NMTOKEN";
125: attr.addAttribute("", name, name, ATTR_TYPE, value);
126: }
127:
128: /** call this to process input lines, does the actual parsing */
129: public void processLine(String line) throws SAXException,
130: ProcessingException {
131: if (contentHandler == null) {
132: throw new ProcessingException(
133: "SimpleSlopParser content handler is null (startDocument not called?)");
134: }
135:
136: // find out which element name to use, based on the contents of the line
137: final ParsedLine p = parseLine(line);
138:
139: // generate the element and its contents
140: lineCounter++;
141: final AttributesImpl atts = new AttributesImpl();
142: setAttribute(atts, SLOP_ATTR_LINENUMBER, String
143: .valueOf(lineCounter));
144: contentHandler.startElement(SLOP_NAMESPACE_URI, p.name, p.name,
145: atts);
146: contentHandler.characters(p.contents.toCharArray(), 0,
147: p.contents.length());
148: contentHandler.endElement(SLOP_NAMESPACE_URI, p.name, p.name);
149: }
150:
151: /** parse a line, extract element name and contents */
152: protected ParsedLine parseLine(String line) {
153: ParsedLine result = null;
154:
155: // empty lines
156: if (line == null || line.trim().length() == 0) {
157: result = new ParsedLine(SLOP_EMPTY_LINE_ELEMENT, "");
158: }
159:
160: // simple extraction of field names, lines starting with alpha chars followed
161: // by a colon are parsed as follows:
162: //
163: // input:
164: // field-name: this line is a field
165: // output:
166: // <field-name>this line is a field</field-name>
167: if (result == null) {
168: final int colonPos = line.indexOf(':');
169: if (colonPos > 0) {
170: boolean fieldFound = true;
171: for (int i = 0; i < colonPos; i++) {
172: final char c = line.charAt(i);
173: final boolean isFieldChar = Character.isLetter(c)
174: || tagnameChars.indexOf(c) >= 0;
175: if (!isFieldChar) {
176: fieldFound = false;
177: break;
178: }
179: }
180:
181: if (fieldFound) {
182: String contents = "";
183: if (line.length() > colonPos + 1) {
184: final String str = line.substring(colonPos + 1);
185: contents = (preserveSpace ? str : str.trim());
186: }
187: result = new ParsedLine(
188: line.substring(0, colonPos), contents);
189: }
190: }
191: }
192:
193: // default: output a line element
194: if (result == null) {
195: final String str = (preserveSpace ? line : line.trim());
196: result = new ParsedLine(SLOP_LINE_ELEMENT, str);
197: }
198:
199: return result;
200: }
201: }
|