001: /*---------------------------------------------------------------------------*\
002: $Id: ParsedFeedURLEditPlugIn.java 7041 2007-09-09 01:04:47Z bmc $
003: ---------------------------------------------------------------------------
004: This software is released under a BSD-style license:
005:
006: Copyright (c) 2004-2007 Brian M. Clapper. All rights reserved.
007:
008: Redistribution and use in source and binary forms, with or without
009: modification, are permitted provided that the following conditions are
010: met:
011:
012: 1. Redistributions of source code must retain the above copyright notice,
013: this list of conditions and the following disclaimer.
014:
015: 2. The end-user documentation included with the redistribution, if any,
016: must include the following acknowlegement:
017:
018: "This product includes software developed by Brian M. Clapper
019: (bmc@clapper.org, http://www.clapper.org/bmc/). That software is
020: copyright (c) 2004-2007 Brian M. Clapper."
021:
022: Alternately, this acknowlegement may appear in the software itself,
023: if wherever such third-party acknowlegements normally appear.
024:
025: 3. Neither the names "clapper.org", "curn", nor any of the names of the
026: project contributors may be used to endorse or promote products
027: derived from this software without prior written permission. For
028: written permission, please contact bmc@clapper.org.
029:
030: 4. Products derived from this software may not be called "curn", nor may
031: "clapper.org" appear in their names without prior written permission
032: of Brian M. Clapper.
033:
034: THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
035: WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
036: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
037: NO EVENT SHALL BRIAN M. CLAPPER BE LIABLE FOR ANY DIRECT, INDIRECT,
038: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
039: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
040: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
041: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
042: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
043: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
044: \*---------------------------------------------------------------------------*/
045:
046: package org.clapper.curn.plugins;
047:
048: import org.clapper.curn.CurnConfig;
049: import org.clapper.curn.CurnException;
050: import org.clapper.curn.FeedInfo;
051: import org.clapper.curn.FeedConfigItemPlugIn;
052: import org.clapper.curn.PostFeedParsePlugIn;
053:
054: import org.clapper.curn.parser.RSSItem;
055: import org.clapper.curn.parser.RSSLink;
056:
057: import org.clapper.util.classutil.ClassUtil;
058: import org.clapper.util.config.ConfigurationException;
059: import org.clapper.util.logging.Logger;
060: import org.clapper.util.regex.RegexUtil;
061: import org.clapper.util.regex.RegexException;
062:
063: import java.net.URL;
064: import java.net.MalformedURLException;
065:
066: import java.util.ArrayList;
067: import java.util.HashMap;
068: import java.util.List;
069: import java.util.Map;
070: import org.clapper.curn.FeedCache;
071: import org.clapper.curn.parser.RSSChannel;
072:
073: /**
074: * The <tt>ParsedFeedURLEditPlugIn</tt> edits a feed after it has been
075: * parsed, adjusting the URLs in the feed (i.e., the item URLs and the
076: * channel, or feed, URL) according to various configuration parameters. It
077: * can be used to fix known errors in the XML. It intercepts the following
078: * per-feed configuration parameters:
079: *
080: * <table border="1">
081: * <tr valign="top">
082: * <td><tt>EditItemURL<i>suffix</i></tt></td>
083: * <td>Specifies a regular expression to be applied to the URLs
084: * for all items in the feed. Multiple expressions may be specified
085: * per feed. See the User's Guide for details.
086: * </td>
087: * </tr>
088: * <tr valign="top">
089: * <td><tt>EditFeedURL<i>suffix</i></tt></td>
090: * <td>Specifies a regular expression to be applied to the channel, or
091: * feed, URL. Multiple expressions may be specified per feed. See
092: * the User's Guide for details.
093: * </td>
094: * </tr>
095: * <tr valign="top">
096: * <td><tt>PruneURLs</tt></td>
097: * <td>Specifies that all URLs should be pruned of their HTTP parameters.
098: * This action also can be accomplished with edit directives, using
099: * the above configuration items; this parameter is a convenience.</td>
100: * </tr>
101: * </table>
102: *
103: * @version <tt>$Revision: 7041 $</tt>
104: */
105: public class ParsedFeedURLEditPlugIn implements FeedConfigItemPlugIn,
106: PostFeedParsePlugIn {
107: /*----------------------------------------------------------------------*\
108: Private Constants
109: \*----------------------------------------------------------------------*/
110:
111: private static final String VAR_PRUNE_URLS = "PruneURLs";
112: private static final boolean DEF_PRUNE_URLS = false;
113: private static final String VAR_EDIT_ITEM_URL = "EditItemURL";
114: private static final String VAR_EDIT_FEED_URL = "EditFeedURL";
115:
116: /*----------------------------------------------------------------------*\
117: Private Classes
118: \*----------------------------------------------------------------------*/
119:
120: /**
121: * Feed edit info
122: */
123: class FeedEditInfo {
124: boolean pruneURLs = DEF_PRUNE_URLS;
125: List<String> itemURLEditEditCmds = new ArrayList<String>();
126: List<String> channelURLEditEditCmds = new ArrayList<String>();
127:
128: FeedEditInfo() {
129: // Nothing to do
130: }
131: }
132:
133: /*----------------------------------------------------------------------*\
134: Private Data Items
135: \*----------------------------------------------------------------------*/
136:
137: /**
138: * Feed save data, by feed
139: */
140: private Map<FeedInfo, FeedEditInfo> perFeedEditInfoMap = new HashMap<FeedInfo, FeedEditInfo>();
141:
142: /**
143: * For log messages
144: */
145: private static final Logger log = new Logger(
146: ParsedFeedURLEditPlugIn.class);
147:
148: /**
149: * Regular expression helper
150: */
151: private RegexUtil regexUtil = new RegexUtil();
152:
153: /*----------------------------------------------------------------------*\
154: Constructor
155: \*----------------------------------------------------------------------*/
156:
157: /**
158: * Default constructor (required).
159: */
160: public ParsedFeedURLEditPlugIn() {
161: // Nothing to do
162: }
163:
164: /*----------------------------------------------------------------------*\
165: Public Methods Required by *PlugIn Interfaces
166: \*----------------------------------------------------------------------*/
167:
168: /**
169: * Get a displayable name for the plug-in.
170: *
171: * @return the name
172: */
173: public String getPlugInName() {
174: return "Edit Parsed Feed URL";
175: }
176:
177: /**
178: * Get the sort key for this plug-in.
179: *
180: * @return the sort key string.
181: */
182: public String getPlugInSortKey() {
183: return ClassUtil.getShortClassName(getClass().getName());
184: }
185:
186: /**
187: * Initialize the plug-in. This method is called before any of the
188: * plug-in methods are called.
189: *
190: * @throws CurnException on error
191: */
192: public void initPlugIn() throws CurnException {
193: }
194:
195: /*----------------------------------------------------------------------*\
196: Public Methods Required by *PlugIn Interfaces
197: \*----------------------------------------------------------------------*/
198:
199: /**
200: * Called immediately after <i>curn</i> has read and processed a
201: * configuration item in a "feed" configuration section. All
202: * configuration items are passed, one by one, to each loaded plug-in.
203: * If a plug-in class is not interested in a particular configuration
204: * item, this method should simply return without doing anything. Note
205: * that some configuration items may simply be variable assignment;
206: * there's no real way to distinguish a variable assignment from a
207: * blessed configuration item.
208: *
209: * @param sectionName the name of the configuration section where
210: * the item was found
211: * @param paramName the name of the parameter
212: * @param config the active configuration
213: * @param feedInfo partially complete <tt>FeedInfo</tt> object
214: * for the feed. The URL is guaranteed to be
215: * present, but no other fields are.
216: *
217: * @return <tt>true</tt> to continue processing the feed,
218: * <tt>false</tt> to skip it
219: *
220: * @throws CurnException on error
221: *
222: * @see CurnConfig
223: * @see FeedInfo
224: * @see FeedInfo#getURL
225: */
226: public boolean runFeedConfigItemPlugIn(String sectionName,
227: String paramName, CurnConfig config, FeedInfo feedInfo)
228: throws CurnException {
229: try {
230: if (paramName.startsWith(VAR_EDIT_ITEM_URL)) {
231: FeedEditInfo editInfo = getOrMakeFeedEditInfo(feedInfo);
232: String value = config.getConfigurationValue(
233: sectionName, paramName);
234: editInfo.itemURLEditEditCmds.add(value);
235: log.debug("[" + sectionName + "]: added item regexp "
236: + value);
237: }
238:
239: else if (paramName.startsWith(VAR_EDIT_FEED_URL)) {
240: FeedEditInfo editInfo = getOrMakeFeedEditInfo(feedInfo);
241: String value = config.getConfigurationValue(
242: sectionName, paramName);
243: editInfo.channelURLEditEditCmds.add(value);
244: log.debug("[" + sectionName + "]: added feed regexp "
245: + value);
246: }
247:
248: else if (paramName.equals(VAR_PRUNE_URLS)) {
249: FeedEditInfo editInfo = getOrMakeFeedEditInfo(feedInfo);
250: editInfo.pruneURLs = config.getRequiredBooleanValue(
251: sectionName, paramName);
252: log.debug("[" + sectionName + "]: set PruneURLs="
253: + editInfo.pruneURLs);
254: }
255:
256: return true;
257: }
258:
259: catch (ConfigurationException ex) {
260: throw new CurnException(ex);
261: }
262: }
263:
264: /**
265: * Called immediately after a feed is parsed, but before it is
266: * otherwise processed. This method can return <tt>false</tt> to signal
267: * <i>curn</i> that the feed should be skipped. For instance, a plug-in
268: * that filters on the parsed feed data could use this method to weed
269: * out non-matching feeds before they are downloaded. Similarly, a
270: * plug-in that edits the parsed data (removing or editing individual
271: * items, for instance) could use method to do so.
272: *
273: * @param feedInfo the {@link FeedInfo} object for the feed that
274: * has been downloaded and parsed.
275: * @param feedCache the feed cache
276: * @param channel the parsed channel data
277: *
278: * @return <tt>true</tt> if <i>curn</i> should continue to process the
279: * feed, <tt>false</tt> to skip the feed. A return value of
280: * <tt>false</tt> aborts all further processing on the feed.
281: * In particular, <i>curn</i> will not pass the feed along to
282: * other plug-ins that have yet to be notified of this event.
283: *
284: * @throws CurnException on error
285: *
286: * @see RSSChannel
287: * @see FeedInfo
288: */
289: public boolean runPostFeedParsePlugIn(FeedInfo feedInfo,
290: FeedCache feedCache, RSSChannel channel)
291: throws CurnException {
292: FeedEditInfo editInfo = perFeedEditInfoMap.get(feedInfo);
293:
294: if (editInfo != null) {
295: // First the channel itself.
296:
297: if (editInfo.pruneURLs
298: || (editInfo.channelURLEditEditCmds.size() > 0)) {
299: RSSLink channelLink = channel.getURL();
300: URL channelURL = channelLink.getURL();
301: String strChannelURL = channelURL.toExternalForm();
302:
303: log.debug("Before editing, feed URL=" + strChannelURL);
304:
305: if (editInfo.pruneURLs)
306: strChannelURL = pruneURL(strChannelURL);
307:
308: for (String editCmd : editInfo.channelURLEditEditCmds)
309: strChannelURL = editURL(strChannelURL, editCmd);
310:
311: log.debug("After editing, feed URL=" + strChannelURL);
312:
313: try {
314: channelLink.setURL(new URL(strChannelURL));
315: }
316:
317: catch (MalformedURLException ex) {
318: throw new CurnException("After editing feed URL \""
319: + channelURL + "\", result \""
320: + strChannelURL + "\" is an illegal URL.");
321: }
322: }
323:
324: // Now the individual items.
325:
326: if (editInfo.pruneURLs
327: || (editInfo.itemURLEditEditCmds.size() > 0)) {
328: for (RSSItem item : channel.getItems()) {
329: RSSLink itemLink = item.getURL();
330: URL itemURL = itemLink.getURL();
331: String strItemURL = itemURL.toExternalForm();
332:
333: log.debug("Before editing, item URL=" + strItemURL);
334:
335: if (editInfo.pruneURLs)
336: strItemURL = pruneURL(strItemURL);
337:
338: for (String editCmd : editInfo.itemURLEditEditCmds)
339: strItemURL = editURL(strItemURL, editCmd);
340:
341: log.debug("After editing, item URL=" + strItemURL);
342:
343: try {
344: itemLink.setURL(new URL(strItemURL));
345: }
346:
347: catch (MalformedURLException ex) {
348: throw new CurnException(
349: "After editing item URL \"" + itemURL
350: + "\", result \"" + strItemURL
351: + "\" is an illegal URL", ex);
352: }
353: }
354: }
355: }
356:
357: return true;
358: }
359:
360: /*----------------------------------------------------------------------*\
361: Private Methods
362: \*----------------------------------------------------------------------*/
363:
364: private FeedEditInfo getOrMakeFeedEditInfo(FeedInfo feedInfo) {
365: FeedEditInfo editInfo = perFeedEditInfoMap.get(feedInfo);
366: if (editInfo == null) {
367: editInfo = new FeedEditInfo();
368: perFeedEditInfoMap.put(feedInfo, editInfo);
369: }
370:
371: return editInfo;
372: }
373:
374: /**
375: * Prune a URL string of its HTTP parameters.
376: *
377: * @param urlString the URL string
378: *
379: * @return the possibly edited result
380: */
381: private String pruneURL(String urlString) {
382: int i = urlString.indexOf("?");
383:
384: if (i != -1)
385: urlString = urlString.substring(0, i);
386:
387: return urlString;
388: }
389:
390: /**
391: * Apply a regular expression to a URL, returning the result.
392: *
393: * @param urlString the URL string
394: * @param editCmd the substitution command
395: *
396: * @return the possibly edited result
397: *
398: * @throws CurnException on error
399: */
400: private String editURL(String urlString, String editCmd)
401: throws CurnException {
402: try {
403: return regexUtil.substitute(editCmd, urlString);
404: }
405:
406: catch (RegexException ex) {
407: throw new CurnException("Failed to edit URL \"" + urlString
408: + "\" with \"" + editCmd + "\"", ex);
409: }
410: }
411: }
|