001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.transformation;
018:
019: import java.io.IOException;
020: import java.util.HashMap;
021: import java.util.HashSet;
022: import java.util.Map;
023: import java.util.Set;
024: import java.util.StringTokenizer;
025:
026: import org.apache.avalon.framework.activity.Disposable;
027: import org.apache.avalon.framework.activity.Initializable;
028: import org.apache.avalon.framework.configuration.Configuration;
029: import org.apache.avalon.framework.configuration.ConfigurationException;
030: import org.apache.avalon.framework.parameters.ParameterException;
031: import org.apache.avalon.framework.parameters.Parameters;
032: import org.apache.cocoon.ProcessingException;
033: import org.apache.cocoon.components.modules.input.InputModuleHelper;
034: import org.apache.cocoon.environment.SourceResolver;
035: import org.apache.cocoon.transformation.helpers.VariableConfiguration;
036: import org.apache.regexp.RE;
037: import org.apache.regexp.RECompiler;
038: import org.apache.regexp.REProgram;
039: import org.apache.regexp.RESyntaxException;
040: import org.xml.sax.Attributes;
041: import org.xml.sax.SAXException;
042: import org.xml.sax.helpers.AttributesImpl;
043:
044: /**
045: * Rewrites URIs in links to a value determined by an InputModule.
046: * The URI scheme identifies the InputModule to use, and the rest of the URI is
047: * used as the attribute name.
048: *
049: * <h3>Example</h3>
050: * <p>For instance, if we had an {@link
051: * org.apache.cocoon.components.modules.input.XMLFileModule}, configured to
052: * read values from an XML file:
053: * <pre>
054: * <site>
055: * <faq>
056: * <how_to_boil_eggs href="faq/eggs.html"/>
057: * </faq>
058: * </site>
059: * </pre>
060: *
061: * mapped to the prefix 'site:', then <code><link
062: * href="site:/site/faq/how_to_boil_eggs/@href"></code> would be replaced
063: * with <code><link href="faq/eggs.html"></code>
064: *
065: * <h3>InputModule Configuration</h3>
066: * <p>InputModules are configured twice; first statically in
067: * <code>cocoon.xconf</code>, and then dynamically at runtime, with dynamic
* configuration (if any) taking precedence. This transformer allows
* you to pass a dynamic configuration to the InputModules it uses, as follows.
070: *
071: * <p>First, a template Configuration is specified in the static
072: * <map:components> block of the sitemap within <input-module> tags:
073: * <pre>
074: * <map:transformer name="linkrewriter"
075: * src="org.apache.cocoon.transformation.LinkRewriterTransformer">
076: * <link-attrs>href src</link-attrs>
077: * <schemes>site ext</schemes>
078: * <input-module name="site">
079: * <file src="cocoon://samples/link/linkmap" reloadable="true"/>
080: * </input-module>
081: * <input-module name="mapper">
082: * <input-module name="site">
083: * <file src="{src}" reloadable="true"/>
084: * </input-module>
085: * <prefix>/site/</prefix>
086: * <suffix>/@href</suffix>
087: * </input-module>
088: * </map:transformer>
089: * </pre>
090: *
091: * Here, we have first configured which attributes to examine, and which URL
092: * schemes to consider rewriting. In this example, <a href="site:index">
093: * would be processed. See below for more configuration options.
094: *
095: * <p>Then, we have established dynamic configuration templates for two modules,
* 'site' (an {@link org.apache.cocoon.components.modules.input.XMLFileModule})
* and 'mapper' (a {@link
* org.apache.cocoon.components.modules.input.SimpleMappingMetaModule}). All
099: * other InputModules will use their static configs. Note that, when
100: * configuring a meta InputModule like 'mapper', we need to also configure the
101: * 'inner' module (here, 'site') with a nested <input-module>.
102: *
103: * <p>There is one further twist; to have <em>really</em> dynamic configuration,
104: * we need information available only when the transformer actually runs. This
105: * is why the above config was called a "template" configuration; it needs to
106: * be 'instantiated' and provided extra info, namely:
107: * <ul>
108: * <li>The {src} string will be replaced with the map:transform @src attribute value.
109: * <li>Any other {variables} will be replaced with map:parameter values
110: * </ul>
111: *
112: * With the above config template, we can have a matcher like:
113: *
114: * <pre>
115: * <map:match pattern="**welcome">
116: * <map:generate src="index.xml"/>
117: * <map:transform type="linkrewriter" src="cocoon:/{1}linkmap"/>
118: * <map:serialize type="xml"/>
119: * </map:match>
120: * </pre>
121: *
122: * Which would cause the 'mapper' XMLFileModule to be configured with a
123: * different XML file, depending on the request.
124: *
125: * <p>Similarly, we could use a dynamic prefix:
126: * <pre>
127: * <prefix>{prefix}</prefix>
128: * </pre>
129: * in the template config, and:
130: * <pre>
131: * <map:parameter name="prefix" value="/site/"/>
132: * </pre>
133: * in the map:transform
134: *
135: * <p>A live example of LinkRewriterTransformer can be found in the <a
136: * href="http://forrest.apache.org/">Apache Forrest</a> sitemap.
137: *
138: * <h3>Transformer Configuration</h3>
139: * <p>
140: * The following configuration entries in map:transformer block are recognised:
141: * <dl>
142: * <dt>link-attrs</dt>
143: * <dd>Space-separated list of attributes to consider links (to be
144: * transformed). The whole value of the attribute is considered link and
145: * transformed.</dd>
146: *
147: * <dt>link-attr</dt>
148: * <dd>0..n of these elements each specify an attribute containing link(s)
149: * (to be transformed) and optionally a regular expression to locate
150: * substring(s) of the attribute value considered link(s). Has two
151: * attributes:
152: * <dl>
153: * <dt>name</dt>
154: * <dd>(required) name of the attribute whose value contains link(s).</dd>
155: * <dt>pattern</dt>
156: * <dd>(optional) regular expression such that when matched against the
157: * attribute value, all parenthesized expressions (except number 0) will
158: * be considered links that should be transformed. If absent, the whole value
159: * of the attribute is considered to be a link, as if the attribute was
160: * included in 'link-attrs'.</dd>
161: * </dl>
162: * </dd>
163: *
164: * <dt>schemes</dt>
165: * <dd>Space-separated list of URI schemes to explicitly include.
166: * If specified, all URIs with unlisted schemes will <i>not</i> be converted.</dd>
167: *
168: * <dt>exclude-schemes</dt>
169: * <dd>Space-separated list of URI schemes to explicitly exclude.
170: * Defaults to 'http https ftp news mailto'.</dd>
171: *
172: * <dt>bad-link-str</dt>
173: * <dd>String to use for links with a correct InputModule prefix, but no value
174: * therein. Defaults to the original URI.</dd>
175: *
176: * <dt>namespace-uri</dt>
177: * <dd>The namespace uri of elements whose attributes are considered for
178: * transformation. Defaults to the empty namespace ("").</dd>
179: * </dl>
180: *
181: * <p>
182: * The attributes considered to contain links are a <em>set</em> of the attributes
183: * specified in 'link-attrs' element and all 'link-attr' elements. Each attribute
184: * should be specified only once either in 'link-attrs' or 'link-attr'; i.e. an
185: * attribute can have at most 1 regular expression associated with it. If neither
186: * 'link-attrs' nor 'link-attr' configuration is present, defaults to 'href'.
187: *
188: * <p>Below is an example of regular expression usage that will transform links
189: * <code>x1</code> and <code>x2</code> in
190: * <code><action target="foo url(x1) bar url(x2)"/></code>:
191: *
192: * <pre>
193: * <map:transformer name="linkrewriter"
194: * src="org.apache.cocoon.transformation.LinkRewriterTransformer">
195: * <link-attr name="target" pattern="(?:url\((.*?)\).*?){1,2}$"/>
196: * <!-- additional configuration ... -->
197: * </map:transformer>
198: * </pre>
199: *
200: * <p>
201: * When matched against the value of <code>target</code> attribute above,
202: * the parenthesized expressions are:<br/>
203: * <samp>
204: * $0 = url(x1) bar url(x2)<br/>
205: * $1 = x1<br/>
206: * $2 = x2<br/>
207: * </samp>
208: *
209: * <p>
210: * Expression number 0 is always discarded by the transformer and the rest
211: * are considered links and re-written.
212: *
213: * <p>If present, map:parameter's from the map:transform block override the
214: * corresponding configuration entries from map:transformer. As an exception,
215: * 'link-attr' parameters are not recognised; 'link-attrs' parameter overrides
216: * both 'link-attrs' and 'link-attr' configuration.
217: *
218: * <p>
219: *
220: * @version $Id: LinkRewriterTransformer.java 433543 2006-08-22 06:22:54Z crossley $
221: */
222: public class LinkRewriterTransformer extends AbstractSAXTransformer
223: implements Initializable, Disposable {
224:
225: private final static String NAMESPACE = "";
226:
227: /**
228: * A guardian object denoting absense of regexp pattern for a given
229: * attribute. Used as value in linkAttrs and origLinkAttrs maps.
230: */
231: private final static Object NO_REGEXP = new Object();
232:
233: //
234: // Configure()'d parameters
235: //
236:
237: /** Configuration passed to the component once through configure(). */
238: private Configuration origConf;
239:
240: private String origBadLinkStr;
241: private String origInSchemes;
242: private String origOutSchemes;
243: private String origNamespaceURI;
244:
245: /**
246: * A map where keys are those attributes which are considered 'links'.
247: * Obtained from configuration passed to the component once through
248: * the configure() method.
249: *
250: * <p>Map contains NO_REGEXP object for attributes whose whole values are
251: * considered links, or compiled RE expressions for attributes whose values
252: * might contain a link.
253: */
254: private Map origLinkAttrs;
255:
256: //
257: // Setup()'d parameters
258: //
259:
260: /**
261: * Derivation of origConf with variables obtained from setup() parameters.
262: * Recreated once per invocation.
263: */
264: private Configuration conf;
265:
266: /**
267: * String to use for links with a correct InputModule prefix, but no value
268: * therein.
269: */
270: private String badLinkStr;
271:
272: /** Set containing schemes (protocols) of links to process */
273: private Set inSchemes;
274:
275: /** Set containing schemes (protocols) of links to exclude from processing */
276: private Set outSchemes;
277:
278: /**
279: * A map of attributes considered 'links' and corresponding RE expression
280: * or NO_REGEXP object. Recreated once per invocation or copied from
281: * origLinkAttrs based on setup() method parameters.
282: */
283: private Map linkAttrs;
284:
285: private InputModuleHelper modHelper;
286:
287: /**
288: * Configure this component from the map:transformer block. Called before
289: * initialization and setup.
290: */
291: public void configure(Configuration conf)
292: throws ConfigurationException {
293: super .configure(conf);
294:
295: this .origConf = conf;
296: this .origBadLinkStr = conf.getChild("bad-link-str").getValue(
297: null);
298: this .origInSchemes = conf.getChild("schemes").getValue("");
299: this .origOutSchemes = conf.getChild("exclude-schemes")
300: .getValue("http https ftp news mailto");
301:
302: this .origNamespaceURI = conf.getChild("namespace-uri")
303: .getValue(NAMESPACE);
304:
305: /*
306: * Setup origLinkAttrs map from the original Configuration:
307: * 1. Parse link-attrs Configuration
308: * 2. Process link-attr Children, warn if overwriting
309: * 3. If no link-attrs, and no link-attr are available, defaults to "href"
310: */
311:
312: String linkAttrsValue = conf.getChild("link-attrs")
313: .getValue("");
314: this .origLinkAttrs = split(linkAttrsValue, " ", NO_REGEXP);
315:
316: Configuration[] attrConfs = conf.getChildren("link-attr");
317: if (attrConfs.length > 0) {
318: RECompiler compiler = new RECompiler();
319: for (int i = 0; i < attrConfs.length; i++) {
320: String attr = attrConfs[i].getAttribute("name");
321: if (getLogger().isWarnEnabled()
322: && origLinkAttrs.containsKey(attr)) {
323: getLogger()
324: .warn(
325: "Duplicate configuration entry found for attribute '"
326: + attr
327: + "', overwriting previous configuration");
328: }
329:
330: String pattern = attrConfs[i].getAttribute("pattern",
331: null);
332: if (pattern == null) {
333: this .origLinkAttrs.put(attr, NO_REGEXP);
334: } else {
335: try {
336: this .origLinkAttrs.put(attr, compiler
337: .compile(pattern));
338: } catch (RESyntaxException e) {
339: String msg = "Invalid regexp pattern '"
340: + pattern
341: + "' specified for attribute '" + attr
342: + "'";
343: throw new ConfigurationException(msg,
344: attrConfs[i], e);
345: }
346: }
347: }
348: }
349:
350: // If nothing configured, default to href attribute
351: if (this .origLinkAttrs.size() == 0) {
352: this .origLinkAttrs.put("href", NO_REGEXP);
353: }
354: }
355:
356: /**
357: * Initiate resources prior to this component becoming active.
358: */
359: public void initialize() throws Exception {
360: this .modHelper = new InputModuleHelper();
361: this .modHelper.setup(this .manager);
362: }
363:
364: /**
365: * Setup this component to handle a map:transform instance.
366: */
367: public void setup(SourceResolver resolver, Map objectModel,
368: String src, Parameters parameters)
369: throws ProcessingException, SAXException, IOException {
370: super .setup(resolver, objectModel, src, parameters);
371:
372: this .badLinkStr = parameters.getParameter("bad-link-str", // per-request config
373: this .origBadLinkStr); // else fall back to per-instance config
374:
375: this .namespaceURI = parameters.getParameter("namespace-uri",
376: this .origNamespaceURI);
377:
378: this .inSchemes = split(parameters.getParameter("schemes",
379: this .origInSchemes), " ");
380: this .outSchemes = split(parameters.getParameter(
381: "exclude-schemes", this .origOutSchemes), " ");
382:
383: this .linkAttrs = this .origLinkAttrs;
384: if (parameters.isParameter("link-attrs")) {
385: try {
386: this .linkAttrs = split(parameters
387: .getParameter("link-attrs"), " ", NO_REGEXP);
388: } catch (ParameterException ex) {
389: // shouldn't happen
390: }
391: }
392:
393: if (getLogger().isDebugEnabled()) {
394: getLogger().debug("bad-link-str = " + badLinkStr);
395: getLogger().debug("link-attrs = " + linkAttrs);
396: getLogger().debug("schemes = " + inSchemes);
397: getLogger().debug("exclude-schemes = " + outSchemes);
398: getLogger().debug("namespace-uri = " + namespaceURI);
399: }
400:
401: // Generate conf
402: VariableConfiguration varConf = new VariableConfiguration(
403: this .origConf);
404: varConf.addVariable("src", src);
405: varConf.addVariables(parameters);
406: try {
407: this .conf = varConf.getConfiguration();
408: } catch (ConfigurationException ce) {
409: throw new ProcessingException(
410: "Couldn't create dynamic config ", ce);
411: }
412: }
413:
414: /** Recycle this component for use in another map:transform. */
415: public void recycle() {
416: // Note: configure() and initialize() are not called after every
417: // recycle, so don't null origConf, origLinkAttrs, etc.
418: this .conf = null;
419: this .badLinkStr = null;
420: this .linkAttrs = null;
421: this .inSchemes = null;
422: this .outSchemes = null;
423:
424: super .recycle();
425: }
426:
427: /**
428: * Split a string into a Set of strings.
429: *
430: * @param str String to split
431: * @param delim Delimiter character
432: * @return A Set of strings in 'str'
433: */
434: private Set split(String str, String delim) {
435: if (str == null) {
436: return null;
437: }
438:
439: Set tokens = new HashSet();
440: StringTokenizer st = new StringTokenizer(str, delim);
441: while (st.hasMoreTokens()) {
442: tokens.add(st.nextToken());
443: }
444: return tokens;
445: }
446:
447: /**
448: * Split a string and create a Map where keys are the tokens from the string.
449: *
450: * @param str String to split
451: * @param delim Delimiter character
452: * @param valueObj Object to insert in the Map (may be null)
453: * @return A Map of strings in 'str'
454: */
455: private Map split(String str, String delim, Object valueObj) {
456: if (str == null) {
457: return null;
458: }
459:
460: // valueObj may be null, because HashMap permits null values
461: Map schemes = new HashMap();
462: StringTokenizer st = new StringTokenizer(str, delim);
463: while (st.hasMoreTokens()) {
464: String pfx = st.nextToken();
465: if (schemes.containsKey(pfx) && getLogger().isWarnEnabled()) {
466: getLogger()
467: .warn(
468: "Duplicate configuration entry found for attribute '"
469: + pfx
470: + "', overwriting previous configuration");
471: }
472: schemes.put(pfx, valueObj);
473: }
474: return schemes;
475: }
476:
477: /**
478: * Start processing elements of our namespace.
479: * This hook is invoked for each sax event with our namespace.
480: * @param uri The namespace of the element.
481: * @param name The local name of the element.
482: * @param raw The qualified name of the element.
483: * @param attr The attributes of the element.
484: */
485: public void startTransformingElement(String uri, String name,
486: String raw, Attributes attr) throws ProcessingException,
487: IOException, SAXException {
488: boolean matched = false;
489:
490: for (int attrIdx = 0; attrIdx < attr.getLength(); attrIdx++) {
491: String attrName = attr.getQName(attrIdx);
492:
493: String attrValue = createTransformedAttr(attrName, attr
494: .getValue(attrIdx));
495: if (attrValue != null) {
496: if (!matched) {
497: attr = new AttributesImpl(attr);
498: matched = true;
499: }
500: ((AttributesImpl) attr).setValue(attrIdx, attrValue);
501: }
502: }
503: super .startTransformingElement(uri, name, raw, attr);
504: }
505:
506: /**
507: * Rewrite set of links in an attribute.
508: *
509: * @param attrName QName of the attribute containing unconverted link(s).
510: * @param oldAttrValue value of the attribute containing unconverted link(s).
511: * @return new value of the attribute based on <code>oldAttrValue</code>, but with link(s) rewritten. If not
512: * modified, returns null (for example, if attribute not found in <code>linkAttrs</code> or not matched to
513: * regexp pattern).
514: */
515: private String createTransformedAttr(String attrName,
516: String oldAttrValue) {
517: if (!linkAttrs.containsKey(attrName)) {
518: return null;
519: }
520:
521: String newAttrValue = null;
522: Object reProgram = linkAttrs.get(attrName);
523: if (reProgram == NO_REGEXP) {
524: newAttrValue = createTransformedLink(oldAttrValue);
525: } else {
526: // must be instanceof REProgram
527: RE r = new RE((REProgram) reProgram);
528: if (r.match(oldAttrValue)) {
529: StringBuffer bufOut = new StringBuffer(oldAttrValue);
530: int offset = 0;
531: String link = null;
532: String newLink = null;
533: boolean modified = false;
534:
535: // skip the first paren
536: for (int i = 1; i < r.getParenCount(); i++) {
537: link = r.getParen(i);
538: newLink = createTransformedLink(link);
539: if (newLink != null) {
540: bufOut.replace(r.getParenStart(i) + offset, r
541: .getParenEnd(i)
542: + offset, newLink);
543: offset += newLink.length()
544: - r.getParenLength(i);
545: modified = true;
546: }
547: }
548: if (modified) {
549: newAttrValue = bufOut.toString();
550: }
551: }
552: }
553:
554: return newAttrValue;
555: }
556:
557: /**
558: * Rewrite a link - use InputModule to obtain new value for the link based on <code>oldLink</code>.
559: *
560: * @param oldLink value of the unconverted link.
561: * @return new value of the link. If not modified, returns null (for example, if link scheme
562: * is in <code>outSchemes</code>.
563: */
564: private String createTransformedLink(String oldLink) {
565: String newLink = null;
566: int i = oldLink.indexOf(":");
567: if (i != -1) {
568: String scheme = oldLink.substring(0, i);
569: String addr = oldLink.substring(i + 1);
570: if (outSchemes.contains(scheme)) {
571: if (getLogger().isDebugEnabled()) {
572: getLogger()
573: .debug("Ignoring link '" + oldLink + "'");
574: }
575: } else if (inSchemes.contains(scheme)
576: || inSchemes.size() == 0) {
577: // If the link wasn't deliberately excluded from a
578: // list of 'good' links, then include it.
579: try {
580: newLink = (String) modHelper.getAttribute(
581: this .objectModel, getConf(scheme), scheme,
582: addr, (badLinkStr != null ? badLinkStr
583: : scheme + ":" + addr));
584: if (getLogger().isDebugEnabled()) {
585: getLogger().debug(
586: "Converted link '" + oldLink + "' to '"
587: + newLink + "'");
588: }
589: } catch (org.apache.avalon.framework.CascadingRuntimeException e) {
590: // Rethrow Configuration errors
591: if (e.getCause() instanceof ConfigurationException) {
592: throw e;
593: }
594:
595: // Swallow IM errors, usually prefixes like 'telnet' that aren't
596: // bound to an InputModule. These should really be declared in
597: // 'exclude-schemes', hence the 'error' classification of this log.
598: if (getLogger().isErrorEnabled()) {
599: getLogger().error(
600: "Error rewriting link '" + oldLink
601: + "': " + e.getMessage());
602: }
603: }
604: }
605: }
606: return newLink;
607: }
608:
609: /**
610: * Retrieve a dynamic configuration for a specific InputModule.
611: *
612: * @param scheme InputModule name
613: * @return Configuration for specified scheme, from the map:transformer block.
614: */
615: private Configuration getConf(String scheme) {
616: Configuration[] schemeConfs = this .conf
617: .getChildren("input-module");
618: for (int i = 0; i < schemeConfs.length; i++) {
619: if (scheme
620: .equals(schemeConfs[i].getAttribute("name", null))) {
621: return schemeConfs[i];
622: }
623: }
624: return null;
625: }
626:
627: /* (non-Javadoc)
628: * @see org.apache.avalon.framework.activity.Disposable#dispose()
629: */
630: public void dispose() {
631: if (this.modHelper != null) {
632: this.modHelper.releaseAll();
633: this.modHelper = null;
634: }
635: super.dispose();
636: }
637: }
|