001: /*---------------------------------------------------------------------------*\
002: $Id: CommonXMLFixupsPlugIn.java 7041 2007-09-09 01:04:47Z bmc $
003: ---------------------------------------------------------------------------
004: This software is released under a BSD-style license:
005:
006: Copyright (c) 2004-2007 Brian M. Clapper. All rights reserved.
007:
008: Redistribution and use in source and binary forms, with or without
009: modification, are permitted provided that the following conditions are
010: met:
011:
012: 1. Redistributions of source code must retain the above copyright notice,
013: this list of conditions and the following disclaimer.
014:
015: 2. The end-user documentation included with the redistribution, if any,
016: must include the following acknowlegement:
017:
018: "This product includes software developed by Brian M. Clapper
019: (bmc@clapper.org, http://www.clapper.org/bmc/). That software is
020: copyright (c) 2004-2007 Brian M. Clapper."
021:
022: Alternately, this acknowlegement may appear in the software itself,
023: if wherever such third-party acknowlegements normally appear.
024:
025: 3. Neither the names "clapper.org", "curn", nor any of the names of the
026: project contributors may be used to endorse or promote products
027: derived from this software without prior written permission. For
028: written permission, please contact bmc@clapper.org.
029:
030: 4. Products derived from this software may not be called "curn", nor may
031: "clapper.org" appear in their names without prior written permission
032: of Brian M. Clapper.
033:
034: THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
035: WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
036: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
037: NO EVENT SHALL BRIAN M. CLAPPER BE LIABLE FOR ANY DIRECT, INDIRECT,
038: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
039: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
040: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
041: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
042: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
043: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
044: \*---------------------------------------------------------------------------*/
045:
046: package org.clapper.curn.plugins;
047:
048: import org.clapper.curn.CurnConfig;
049: import org.clapper.curn.CurnException;
050: import org.clapper.curn.FeedInfo;
051: import org.clapper.curn.MainConfigItemPlugIn;
052: import org.clapper.curn.FeedConfigItemPlugIn;
053: import org.clapper.curn.PostFeedDownloadPlugIn;
054:
055: import org.clapper.util.classutil.ClassUtil;
056: import org.clapper.util.config.ConfigurationException;
057: import org.clapper.util.logging.Logger;
058: import java.io.File;
059: import java.util.Arrays;
060: import java.util.HashMap;
061: import java.util.Map;
062:
063: /**
064: * <p>The <tt>CommonXMLFixupsPlugIn</tt> attempts to fix some common errors
065: * in the downloaded, pre-parsed XML in any feed for which it is enabled.
066: * There is some XML badness that is surprisingly common across feeds,
067: * including (but not limited to):</p>
068: *
069: * <ul>
070: * <li>Using a "naked" ampersand (&) without escaping it.
071: * <li>Use of nonexistent entities (e.g., &ouml;, &nbsp;)
072: * <li>Improperly formatted entity escapes
073: * <li>"Demoronizing" (with apologies to John Walker and his
074: * <a href="http://www.fourmilab.ch/webtools/demoroniser"><i>demoroniser</i></a>
075: * tool). Demoronizing is the act of replacing Microsoft Windows-specific
076: * characters with more reasonable, universal values--values that will
077: * actually display properly in my Firefox browser on Unix or FreeBSD.
078: * These annoying characters include the Windows 1252 character set's
079: * "smart" quotes, trademark symbol, em dash, and other characters
080: * that don't display properly in non-Windows character sets.
081: * </ul>
082: *
083: * <p>This plug-in attempts to fix those problems.</p>
084: *
085: * <p>This plug-in intercepts the following configuration parameters:</p>
086: *
087: * <table border="1">
088: * <tr valign="bottom" align="left">
089: * <th>Section</th>
090: * <th>Parameter</th>
091: * <th>Legal Values</th>
092: * <th>Meaning</th>
093: * </tr>
094: * <tr valign="top">
095: * <td><tt>[curn]</tt></td>
096: * <td><tt>CommonXMLFixups</tt></td>
097: * <td><tt>true</tt>, <tt>false</tt></td>
098: * <td>The global setting, which can be used to enable or disable
099: * this plug-in for all feeds (though the plug-in can still be
100: * disabled or enabled on a per-feed basis). If not specified, this
101: * parameter defaults to <tt>false</tt>.</td>
102: * </tr>
103: * <tr valign="top">
104: * <td><tt>[Feed<i>xxx</i>]</tt></td>
105: * <td><tt>CommonXMLFixups</tt></td>
106: * <td><tt>true</tt>, <tt>false</tt></td>
107: * <td>Enables or disables this plug-in for a specific feed. If not
108: * specified, this parameter defaults to the global setting.</td>
109: * </tr>
110: * </table>
111: *
112: * @version <tt>$Revision: 7041 $</tt>
113: */
114: public class CommonXMLFixupsPlugIn extends AbstractXMLEditPlugIn
115: implements MainConfigItemPlugIn, FeedConfigItemPlugIn,
116: PostFeedDownloadPlugIn {
117: /*----------------------------------------------------------------------*\
118: Private Constants
119: \*----------------------------------------------------------------------*/
120:
121: private static final String VAR_COMMON_XML_FIXUPS = "CommonXMLFixups";
122:
123: /**
124: * The table of edit commands.
125: */
126: private static final String[] EDITS = new String[] {
127: // Various forms of unescaped ampersands.
128:
129: "s/ & / \\& /g",
130: "s/&$/\\&/g",
131: "s/ &amp; / \\& /g",
132: "s/&([^;]+)(\\s)/\\&$1$2/g",
133:
134: // Remove " " and "nbsp;". The first is legal HTML, but not
135: // legal XML. The second is illegal. Also have to handle this:
136: //
137: // &nbsp
138: //
139: // That doesn't have to be removed.
140:
141: "s/ / /g",
142: "s/([^&;])nbsp;/$1 /g",
143:
144: // Non-existent XML entities
145:
146: "s/ö/\\ö/g",
147: "s/—/\\—/g",
148:
149: // For some reason, no one seems to escape "AT&T" properly...
150:
151: "s/AT&T/AT\\&T/g",
152:
153: // Demoronization
154: // CP-1252 What it is
155: // --------------------------------------
156: "s/€/\\€/g", // 0x80 Euro sign
157: "s/‚/\\‚/g", // 0x82 Single low-9 quote mark
158: "s/ƒ/\\ƒ/g", // 0x83 Latin small letter "f" w/ hook
159: "s/„/\\„/g", // 0x84 Double low-9 quote mark
160: "s/…/\\…/g", // 0x85 Horizontal ellipsis
161: "s/†/\\†/g", // 0x86 Dagger
162: "s/‡/\\‡/g", // 0x87 Double dagger
163: "s/ˆ/\\ˆ/g", // 0x88 Circumflex accent
164: "s/‰/\\‰/g", // 0x89 Per mille sign
165: "s/Š/\\Š/g", // 0x8A Latin capital "S" with caron
166: "s/‹/\\‹/g", // 0x8B Single left angle quote
167: "s/Œ/\\Œ/g", // 0x8C Latin capital ligature "OE"
168: "s/Ž/\\Ž/g", // 0x8E Latin capital "Z" with caron
169: "s/‘/\\‘/g", // 0x91 Left single quote mark
170: "s/’/\\’/g", // 0x92 Right single quote mark
171: "s/“/\\“/g", // 0x93 Left double quote mark
172: "s/”/\\”/g", // 0x94 Right double quote mark
173: "s/•/\\•/g", // 0x95 Bullet
174: "s/–/\\–/g", // 0x96 En dash
175: "s/—/\\—/g", // 0x97 Em dash
176: "s/˜/\\˜/g", // 0x98 Small tilde
177: "s/™/\\™/g", // 0x99 Trademark sign
178: "s/š/\\š/g", // 0x9A Latin small "s" with caron
179: "s/›/\\›/g", // 0x9B Single right angle quote
180: "s/œ/\\œ/g", // 0x9C Latin small ligature "oe"
181: "s/ž/\\ž/g", // 0x9E Latin small "z" with caron
182: "s/Ÿ/\\Ÿ/g", // 0x9F Latin capital "Y" with diaeresis
183:
184: // Try to handle XML with references to illegal character entities.
185: // First, translate any legal entities to something else.
186:
187: "s/&(amp|quot|apos|lt|gt);/@@AMP@@$1;/g",
188: "s/&(#[0-9]{1,4});/@@AMP@@$1;/g",
189: "s/&(#x[0-9A-Fa-f]{1,4});/@@AMP@@$1;/g",
190:
191: // Next, find anything else, and escape it.
192:
193: "s/&/\\&/g",
194:
195: // Finally, restore the escaped stuff.
196:
197: "s/@@AMP@@/\\&/g"
198:
199: };
200:
201: /*----------------------------------------------------------------------*\
202: Private Data Items
203: \*----------------------------------------------------------------------*/
204:
205: /**
206: * Feed save data, by feed
207: */
208: private Map<FeedInfo, Boolean> perFeedEnabledFlag = new HashMap<FeedInfo, Boolean>();
209:
210: /**
211: * Whether globally enabled or not.
212: */
213: private boolean globallyEnabled = false;
214:
215: /**
216: * For log messages
217: */
218: private static final Logger log = new Logger(
219: CommonXMLFixupsPlugIn.class);
220:
221: /*----------------------------------------------------------------------* \
222: Constructor
223: \*----------------------------------------------------------------------*/
224:
225: /**
226: * Default constructor (required).
227: */
228: public CommonXMLFixupsPlugIn() {
229: // Nothing to do
230: }
231:
232: /*----------------------------------------------------------------------*\
233: Public Methods Required by *PlugIn Interfaces
234: \*----------------------------------------------------------------------*/
235:
236: /**
237: * Get a displayable name for the plug-in.
238: *
239: * @return the name
240: */
241: public String getPlugInName() {
242: return "Common XML Fixups";
243: }
244:
245: /**
246: * Get the sort key for this plug-in.
247: *
248: * @return the sort key string.
249: */
250: public String getPlugInSortKey() {
251: return ClassUtil.getShortClassName(getClass().getName());
252: }
253:
254: /**
255: * Initialize the plug-in. This method is called before any of the
256: * plug-in methods are called.
257: *
258: * @throws CurnException on error
259: */
260: public void initPlugIn() throws CurnException {
261: }
262:
263: /**
264: * Called immediately after <i>curn</i> has read and processed a
265: * configuration item in the main [curn] configuration section. All
266: * configuration items are passed, one by one, to each loaded plug-in.
267: * If a plug-in class is not interested in a particular configuration
268: * item, this method should simply return without doing anything. Note
269: * that some configuration items may simply be variable assignment;
270: * there's no real way to distinguish a variable assignment from a
271: * blessed configuration item.
272: *
273: * @param sectionName the name of the configuration section where
274: * the item was found
275: * @param paramName the name of the parameter
276: * @param config the {@link CurnConfig} object
277: *
278: * @throws CurnException on error
279: *
280: * @see CurnConfig
281: */
282: public void runMainConfigItemPlugIn(String sectionName,
283: String paramName, CurnConfig config) throws CurnException {
284: try {
285: if (paramName.equals(VAR_COMMON_XML_FIXUPS)) {
286: globallyEnabled = config.getRequiredBooleanValue(
287: sectionName, paramName);
288: }
289: }
290:
291: catch (ConfigurationException ex) {
292: throw new CurnException(ex);
293: }
294: }
295:
296: /**
297: * Called immediately after <i>curn</i> has read and processed a
298: * configuration item in a "feed" configuration section. All
299: * configuration items are passed, one by one, to each loaded plug-in.
300: * If a plug-in class is not interested in a particular configuration
301: * item, this method should simply return without doing anything. Note
302: * that some configuration items may simply be variable assignment;
303: * there's no real way to distinguish a variable assignment from a
304: * blessed configuration item.
305: *
306: * @param sectionName the name of the configuration section where
307: * the item was found
308: * @param paramName the name of the parameter
309: * @param config the active configuration
310: * @param feedInfo partially complete <tt>FeedInfo</tt> object
311: * for the feed. The URL is guaranteed to be
312: * present, but no other fields are.
313: *
314: * @return <tt>true</tt> to continue processing the feed,
315: * <tt>false</tt> to skip it
316: *
317: * @throws CurnException on error
318: *
319: * @see CurnConfig
320: * @see FeedInfo
321: * @see FeedInfo#getURL
322: */
323: public boolean runFeedConfigItemPlugIn(String sectionName,
324: String paramName, CurnConfig config, FeedInfo feedInfo)
325: throws CurnException {
326: try {
327: if (paramName.equals(VAR_COMMON_XML_FIXUPS)) {
328: boolean flag = config.getRequiredBooleanValue(
329: sectionName, paramName);
330: perFeedEnabledFlag.put(feedInfo, flag);
331: log.debug("[" + sectionName + "]: " + paramName + "="
332: + flag);
333: }
334:
335: return true;
336: }
337:
338: catch (ConfigurationException ex) {
339: throw new CurnException(ex);
340: }
341: }
342:
343: /**
344: * Called immediately after a feed is downloaded. This method can
345: * return <tt>false</tt> to signal <i>curn</i> that the feed should be
346: * skipped. For instance, a plug-in that filters on the unparsed XML
347: * feed content could use this method to weed out non-matching feeds
348: * before they are downloaded.
349: *
350: * @param feedInfo the {@link FeedInfo} object for the feed that
351: * has been downloaded
352: * @param feedDataFile the file containing the downloaded, unparsed feed
353: * XML. <b><i>curn</i> may delete this file after all
354: * plug-ins are notified!</b>
355: * @param encoding the encoding used to store the data in the file,
356: * or null for the default
357: *
358: * @return <tt>true</tt> if <i>curn</i> should continue to process the
359: * feed, <tt>false</tt> to skip the feed. A return value of
360: * <tt>false</tt> aborts all further processing on the feed.
361: * In particular, <i>curn</i> will not pass the feed along to
362: * other plug-ins that have yet to be notified of this event.
363: *
364: * @throws CurnException on error
365: *
366: * @see FeedInfo
367: */
368: public boolean runPostFeedDownloadPlugIn(FeedInfo feedInfo,
369: File feedDataFile, String encoding) throws CurnException {
370: Boolean enabledBoxed = perFeedEnabledFlag.get(feedInfo);
371: boolean enabled = globallyEnabled;
372:
373: if (enabledBoxed != null)
374: enabled = enabledBoxed;
375:
376: if (enabled)
377: editXML(feedInfo, feedDataFile, encoding, Arrays
378: .asList(EDITS));
379:
380: return true;
381: }
382:
383: /*----------------------------------------------------------------------*\
384: Protected Methods
385: \*----------------------------------------------------------------------*/
386:
387: protected Logger getLogger() {
388: return log;
389: }
390: }
|