001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.transformation;
018:
019: import net.sourceforge.chaperon.build.LexicalAutomatonBuilder;
020: import net.sourceforge.chaperon.common.Decoder;
021: import net.sourceforge.chaperon.model.lexicon.Lexicon;
022: import net.sourceforge.chaperon.model.lexicon.LexiconFactory;
023: import net.sourceforge.chaperon.process.LexicalAutomaton;
024: import net.sourceforge.chaperon.process.PatternProcessor;
025:
026: import org.apache.avalon.excalibur.pool.Recyclable;
027: import org.apache.avalon.framework.activity.Disposable;
028: import org.apache.avalon.framework.logger.LogEnabled;
029: import org.apache.avalon.framework.logger.Logger;
030: import org.apache.avalon.framework.parameters.ParameterException;
031: import org.apache.avalon.framework.parameters.Parameterizable;
032: import org.apache.avalon.framework.parameters.Parameters;
033: import org.apache.avalon.framework.service.ServiceException;
034: import org.apache.avalon.framework.service.ServiceManager;
035: import org.apache.avalon.framework.service.Serviceable;
036:
037: import org.apache.cocoon.ProcessingException;
038: import org.apache.cocoon.xml.XMLUtils;
039: import org.apache.cocoon.caching.CacheableProcessingComponent;
040: import org.apache.cocoon.components.source.SourceUtil;
041: import org.apache.cocoon.environment.SourceResolver;
042:
043: //import org.apache.commons.logging.impl.AvalonLogger;
044:
045: import org.apache.excalibur.source.Source;
046: import org.apache.excalibur.source.SourceException;
047: import org.apache.excalibur.source.SourceValidity;
048: import org.apache.excalibur.store.Store;
049:
050: import org.xml.sax.Attributes;
051: import org.xml.sax.SAXException;
052: import org.xml.sax.helpers.AttributesImpl;
053:
054: import java.io.IOException;
055: import java.io.Serializable;
056:
057: import java.util.Map;
058:
059: /**
060: * This transfomer transforms text pattern of a XML file into lexemes by using a lexicon file.
061: *
062: * <p>
063: * Input:
064: * </p>
065: * <pre>
066: * <section>
067: * Text 123 bla
068: * </section>
069: * </pre>
070: *
071: * <p>
072: * can be transformed into the following output:
073: * </p>
074: * <pre>
075: * <section>
076: * Text
077: * <lexeme symbol="number" text="123"/>
078: * bla
079: * </section>
080: * </pre>
081: *
082: * @author <a href="mailto:stephan@apache.org">Stephan Michels</a>
083: * @version $Id: PatternTransformer.java 433543 2006-08-22 06:22:54Z crossley $
084: */
085: public class PatternTransformer extends AbstractTransformer implements
086: LogEnabled, Serviceable, Recyclable, Disposable,
087: Parameterizable, CacheableProcessingComponent {
088:
089: /** Namespace for the SAX events. */
090: public static final String NS = "http://chaperon.sourceforge.net/schema/lexemes/2.0";
091: private String lexicon = null;
092: private Source lexiconSource = null;
093: private Logger logger = null;
094: private ServiceManager manager = null;
095: private SourceResolver resolver = null;
096: private LexicalAutomaton automaton = null;
097: private PatternProcessor processor = new PatternProcessor();
098: private boolean groups = false;
099: private StringBuffer buffer = new StringBuffer();
100: private StringBuffer output = new StringBuffer();
101:
102: /**
103: * Provide component with a logger.
104: *
105: * @param logger the logger
106: */
107: public void enableLogging(Logger logger) {
108: this .logger = logger;
109: }
110:
111: /**
112: * Pass the ServiceManager to the Serviceable. The Serviceable implementation should use the
113: * specified ServiceManager to acquire the services it needs for execution.
114: *
115: * @param manager The ServiceManager which this Serviceable uses.
116: */
117: public void service(ServiceManager manager) {
118: this .manager = manager;
119: }
120:
121: /**
122: * Provide component with parameters.
123: *
124: * @param parameters the parameters
125: *
126: * @throws ParameterException if parameters are invalid
127: */
128: public void parameterize(Parameters parameters)
129: throws ParameterException {
130: groups = parameters.getParameterAsBoolean("groups", false);
131: }
132:
133: /**
134: * Set the SourceResolver, objectModel Map, the source and sitemap Parameters used to process the
135: * request.
136: *
137: * @param resolver Source resolver
138: * @param objectmodel Object model
139: * @param src Source
140: * @param parameters Parameters
141: *
142: * @throws IOException
143: * @throws ProcessingException
144: * @throws SAXException
145: */
146: public void setup(SourceResolver resolver, Map objectmodel,
147: String src, Parameters parameters)
148: throws ProcessingException, SAXException, IOException {
149: this .resolver = resolver;
150:
151: Store store = null;
152:
153: try {
154: this .lexicon = src;
155:
156: this .lexiconSource = resolver.resolveURI(this .lexicon);
157:
158: // Retrieve the lexical automaton from the transient store
159: store = (Store) this .manager.lookup(Store.TRANSIENT_STORE);
160:
161: LexicalAutomatonEntry entry = (LexicalAutomatonEntry) store
162: .get(this .lexiconSource.getURI());
163:
164: // If the lexicon has changed, rebuild the lexical automaton
165: if ((entry == null)
166: || (entry.getValidity() == null)
167: || (entry.getValidity().isValid(
168: this .lexiconSource.getValidity()) <= 0)) {
169: this .logger.info("(Re)building the automaton from '"
170: + this .lexiconSource.getURI() + "'");
171:
172: if (this .lexiconSource.getInputStream() == null)
173: throw new ProcessingException("Source '"
174: + this .lexiconSource.getURI()
175: + "' not found");
176:
177: LexiconFactory factory = new LexiconFactory();
178: SourceUtil.toSAX(this .manager, this .lexiconSource,
179: null, factory);
180:
181: Lexicon lexicon = factory.getLexicon();
182:
183: LexicalAutomatonBuilder builder = new LexicalAutomatonBuilder(
184: lexicon/*, new AvalonLogger(this.logger)*/);
185:
186: this .automaton = builder.getLexicalAutomaton();
187:
188: this .logger.info("Store automaton into store for '"
189: + this .lexiconSource.getURI() + "'");
190: store.store(this .lexiconSource.getURI(),
191: new LexicalAutomatonEntry(this .automaton,
192: this .lexiconSource.getValidity()));
193: } else {
194: this .logger.info("Getting automaton from store for '"
195: + this .lexiconSource.getURI() + "'");
196: this .automaton = entry.getLexicalAutomaton();
197: }
198: } catch (SourceException se) {
199: throw new ProcessingException("Error during resolving of '"
200: + src + "'.", se);
201: } catch (ServiceException se) {
202: throw new ProcessingException(
203: "Could not lookup for component", se);
204: } finally {
205: if (store != null)
206: this .manager.release(store);
207: }
208: }
209:
210: /**
211: * Generate the unique key. This key must be unique inside the space of this component.
212: *
213: * @return The generated key hashes the src
214: */
215: public Serializable getKey() {
216: return this .lexiconSource.getURI();
217: }
218:
219: /**
220: * Generate the validity object.
221: *
222: * @return The generated validity object or <code>null</code> if the component is currently not
223: * cacheable.
224: */
225: public SourceValidity getValidity() {
226: return this .lexiconSource.getValidity();
227: }
228:
229: /**
230: * Recycle this component. All instance variables are set to <code>null</code>.
231: */
232: public void recycle() {
233: if ((this .resolver != null) && (this .lexiconSource != null)) {
234: this .resolver.release(this .lexiconSource);
235: this .lexiconSource = null;
236: }
237:
238: this .automaton = null;
239: super .recycle();
240: }
241:
242: /**
243: * The dispose operation is called at the end of a components lifecycle.
244: */
245: public void dispose() {
246: if ((this .resolver != null) && (this .lexiconSource != null)) {
247: this .resolver.release(this .lexiconSource);
248: this .lexiconSource = null;
249: }
250:
251: this .manager = null;
252: }
253:
254: /**
255: * Receive notification of the beginning of an element.
256: *
257: * @param uri The Namespace URI, or the empty string if the element has no Namespace URI or if
258: * Namespace processing is not being performed.
259: * @param loc The local name (without prefix), or the empty string if Namespace processing is not
260: * being performed.
261: * @param raw The raw XML 1.0 name (with prefix), or the empty string if raw names are not
262: * available.
263: * @param a The attributes attached to the element. If there are no attributes, it shall be an
264: * empty Attributes object.
265: *
266: * @throws SAXException
267: */
268: public void startElement(String uri, String loc, String raw,
269: Attributes a) throws SAXException {
270: search();
271:
272: if (contentHandler != null)
273: contentHandler.startElement(uri, loc, raw, a);
274: }
275:
276: /**
277: * Receive notification of the end of an element.
278: *
279: * @param uri The Namespace URI, or the empty string if the element has no Namespace URI or if
280: * Namespace processing is not being performed.
281: * @param loc The local name (without prefix), or the empty string if Namespace processing is not
282: * being performed.
283: * @param raw The raw XML 1.0 name (with prefix), or the empty string if raw names are not
284: * available.
285: *
286: * @throws SAXException
287: */
288: public void endElement(String uri, String loc, String raw)
289: throws SAXException {
290: search();
291:
292: if (contentHandler != null)
293: contentHandler.endElement(uri, loc, raw);
294: }
295:
296: /**
297: * Receive notification of character data.
298: *
299: * @param c The characters from the XML document.
300: * @param start The start position in the array.
301: * @param len The number of characters to read from the array.
302: *
303: * @throws SAXException
304: */
305: public void characters(char[] c, int start, int len)
306: throws SAXException {
307: buffer.append(c, start, len);
308: }
309:
310: /**
311: * Receive notification of ignorable whitespace in element content.
312: *
313: * @param c The characters from the XML document.
314: * @param start The start position in the array.
315: * @param len The number of characters to read from the array.
316: *
317: * @throws SAXException
318: */
319: public void ignorableWhitespace(char[] c, int start, int len)
320: throws SAXException {
321: buffer.append(c, start, len);
322: }
323:
324: /**
325: * Receive notification of a processing instruction.
326: *
327: * @param target The processing instruction target.
328: * @param data The processing instruction data, or null if none was supplied.
329: *
330: * @throws SAXException
331: */
332: public void processingInstruction(String target, String data)
333: throws SAXException {
334: search();
335:
336: if (contentHandler != null)
337: contentHandler.processingInstruction(target, data);
338: }
339:
340: /**
341: * Report an XML comment anywhere in the document.
342: *
343: * @param ch An array holding the characters in the comment.
344: * @param start The starting position in the array.
345: * @param len The number of characters to use from the array.
346: *
347: * @throws SAXException
348: */
349: public void comment(char[] ch, int start, int len)
350: throws SAXException {
351: search();
352:
353: if (lexicalHandler != null)
354: lexicalHandler.comment(ch, start, len);
355: }
356:
357: /**
358: * @throws SAXException
359: */
360: private void search() throws SAXException {
361: if (buffer.length() <= 0)
362: return;
363:
364: char[] text = buffer.toString().toCharArray();
365:
366: String lexemesymbol;
367: String lexemetext;
368: String[] groups = null;
369: int lexemeindex = 0;
370: int position = 0;
371:
372: output.setLength(0);
373: do {
374: lexemesymbol = null;
375: lexemetext = null;
376:
377: for (lexemeindex = automaton.getLexemeCount() - 1; lexemeindex >= 0; lexemeindex--) {
378: processor.setPatternAutomaton(automaton
379: .getLexemeDefinition(lexemeindex));
380:
381: if ((processor.match(text, position))
382: && ((lexemetext == null) || (processor
383: .getGroup().length() >= lexemetext
384: .length()))) {
385: lexemesymbol = automaton
386: .getLexemeSymbol(lexemeindex);
387: lexemetext = processor.getGroup();
388: if (this .groups) {
389: groups = new String[processor.getGroupCount()];
390: for (int group = 0; group < processor
391: .getGroupCount(); group++)
392: groups[group] = processor.getGroup(group);
393: }
394: }
395: }
396:
397: if ((lexemetext != null) && (lexemetext.length() > 0)) {
398: if (lexemesymbol != null) {
399: if (logger != null)
400: logger.debug("Recognize token " + lexemesymbol
401: + " with "
402: + Decoder.toString(lexemetext));
403:
404: if (output.length() > 0)
405: contentHandler.characters(output.toString()
406: .toCharArray(), 0, output.length());
407:
408: output.setLength(0);
409:
410: contentHandler.startPrefixMapping("", NS);
411:
412: AttributesImpl atts = new AttributesImpl();
413:
414: atts.addAttribute("", "symbol", "symbol", "CDATA",
415: lexemesymbol);
416: atts.addAttribute("", "text", "text", "CDATA",
417: lexemetext);
418: contentHandler.startElement(NS, "lexeme", "lexeme",
419: atts);
420:
421: if (this .groups) {
422: for (int group = 0; group < groups.length; group++) {
423: contentHandler.startElement(NS, "group",
424: "group", XMLUtils.EMPTY_ATTRIBUTES);
425: contentHandler.characters(groups[group]
426: .toCharArray(), 0, groups[group]
427: .length());
428: contentHandler.endElement(NS, "group",
429: "group");
430: }
431: }
432:
433: contentHandler.endElement(NS, "lexeme", "lexeme");
434: contentHandler.endPrefixMapping("");
435: } else if (logger != null)
436: logger.debug("Ignore lexeme with "
437: + Decoder.toString(lexemetext));
438:
439: position += lexemetext.length();
440: } else {
441: output.append(text[position]);
442: position++;
443: }
444: } while (position < text.length);
445:
446: if (output.length() > 0)
447: contentHandler.characters(output.toString().toCharArray(),
448: 0, output.length());
449:
450: buffer.setLength(0);
451: }
452:
453: /**
454: * This class represent a entry in a store to cache the lexical automaton.
455: */
456: public static class LexicalAutomatonEntry implements Serializable {
457: private SourceValidity validity = null;
458: private LexicalAutomaton automaton = null;
459:
460: /**
461: * Create a new entry.
462: *
463: * @param automaton Lexical automaton.
464: * @param validity Validity of the lexicon file.
465: */
466: public LexicalAutomatonEntry(LexicalAutomaton automaton,
467: SourceValidity validity) {
468: this .automaton = automaton;
469: this .validity = validity;
470: }
471:
472: /**
473: * Return the validity of the lexicon file.
474: *
475: * @return Validity of the lexicon file.
476: */
477: public SourceValidity getValidity() {
478: return this .validity;
479: }
480:
481: /**
482: * Return the lexical automaton.
483: *
484: * @return Lexical automaton.
485: */
486: public LexicalAutomaton getLexicalAutomaton() {
487: return this .automaton;
488: }
489:
490: private void writeObject(java.io.ObjectOutputStream out)
491: throws IOException {
492: out.writeObject(validity);
493: out.writeObject(automaton);
494: }
495:
496: private void readObject(java.io.ObjectInputStream in)
497: throws IOException, ClassNotFoundException {
498: validity = (SourceValidity) in.readObject();
499: automaton = (LexicalAutomaton) in.readObject();
500: }
501: }
502: }
|