001: /*---------------------------------------------------------------------------*\
002: $Id: IgnoreDuplicateArticlesPlugIn.java 7041 2007-09-09 01:04:47Z bmc $
003: ---------------------------------------------------------------------------
004: This software is released under a BSD-style license:
005:
006: Copyright (c) 2004-2007 Brian M. Clapper. All rights reserved.
007:
008: Redistribution and use in source and binary forms, with or without
009: modification, are permitted provided that the following conditions are
010: met:
011:
012: 1. Redistributions of source code must retain the above copyright notice,
013: this list of conditions and the following disclaimer.
014:
015: 2. The end-user documentation included with the redistribution, if any,
016: must include the following acknowlegement:
017:
018: "This product includes software developed by Brian M. Clapper
019: (bmc@clapper.org, http://www.clapper.org/bmc/). That software is
020: copyright (c) 2004-2007 Brian M. Clapper."
021:
022: Alternately, this acknowlegement may appear in the software itself,
023: if wherever such third-party acknowlegements normally appear.
024:
025: 3. Neither the names "clapper.org", "curn", nor any of the names of the
026: project contributors may be used to endorse or promote products
027: derived from this software without prior written permission. For
028: written permission, please contact bmc@clapper.org.
029:
030: 4. Products derived from this software may not be called "curn", nor may
031: "clapper.org" appear in their names without prior written permission
032: of Brian M. Clapper.
033:
034: THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
035: WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
036: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
037: NO EVENT SHALL BRIAN M. CLAPPER BE LIABLE FOR ANY DIRECT, INDIRECT,
038: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
039: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
040: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
041: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
042: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
043: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
044: \*---------------------------------------------------------------------------*/
045:
046: package org.clapper.curn.plugins;
047:
import org.clapper.curn.CurnConfig;
import org.clapper.curn.CurnException;
import org.clapper.curn.FeedCache;
import org.clapper.curn.FeedConfigItemPlugIn;
import org.clapper.curn.FeedInfo;
import org.clapper.curn.PostFeedParsePlugIn;
import org.clapper.curn.parser.RSSChannel;
import org.clapper.curn.parser.RSSItem;
import org.clapper.curn.parser.RSSLink;

import org.clapper.util.classutil.ClassUtil;
import org.clapper.util.config.ConfigurationException;
import org.clapper.util.logging.Logger;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
064:
065: /**
066: * The <tt>IgnoreDuplicateArticlesPlugIn</tt> handles removing duplicate
067: * items from downloaded feeds, where "duplicate" means "has the same
068: * title". It intercepts the following per-feed configuration parameters:
069: *
070: * <table border="1">
071: * <tr valign="top" align="left">
072: * <th>Parameter</th>
073: * <th>Meaning</th>
074: * </tr>
075: * <tr valign="top">
076: * <td><tt>IgnoreDuplicateTitles</tt></td>
077: * <td>Set to "true" to strip duplicate titles, "false" to pass them
078: * along. Defaults to "false".</td>
079: * </tr>
080: * </table>
081: *
082: * @version <tt>$Revision: 7041 $</tt>
083: */
084: public class IgnoreDuplicateArticlesPlugIn implements
085: FeedConfigItemPlugIn, PostFeedParsePlugIn {
086: /*----------------------------------------------------------------------*\
087: Private Constants
088: \*----------------------------------------------------------------------*/
089:
090: private static final String VAR_IGNORE_DUP_TITLES = "IgnoreDuplicateTitles";
091:
092: /*----------------------------------------------------------------------*\
093: Private Data Items
094: \*----------------------------------------------------------------------*/
095:
096: /**
097: * Feed "ignore" flags, by feed
098: */
099: private Map<FeedInfo, Boolean> perFeedIgnoreFlagMap = new HashMap<FeedInfo, Boolean>();
100:
101: /**
102: * For log messages
103: */
104: private static final Logger log = new Logger(
105: IgnoreDuplicateArticlesPlugIn.class);
106:
107: /*----------------------------------------------------------------------*\
108: Constructor
109: \*----------------------------------------------------------------------*/
110:
111: /**
112: * Default constructor (required).
113: */
114: public IgnoreDuplicateArticlesPlugIn() {
115: // Nothing to do
116: }
117:
118: /*----------------------------------------------------------------------*\
119: Public Methods Required by *PlugIn Interfaces
120: \*----------------------------------------------------------------------*/
121:
122: /**
123: * Get a displayable name for the plug-in.
124: *
125: * @return the name
126: */
127: public String getPlugInName() {
128: return "Ignore Duplicate Articles";
129: }
130:
131: /**
132: * Get the sort key for this plug-in.
133: *
134: * @return the sort key string.
135: */
136: public String getPlugInSortKey() {
137: return ClassUtil.getShortClassName(getClass().getName());
138: }
139:
140: /**
141: * Initialize the plug-in. This method is called before any of the
142: * plug-in methods are called.
143: *
144: * @throws CurnException on error
145: */
146: public void initPlugIn() throws CurnException {
147: }
148:
149: /**
150: * Called immediately after <i>curn</i> has read and processed a
151: * configuration item in a "feed" configuration section. All
152: * configuration items are passed, one by one, to each loaded plug-in.
153: * If a plug-in class is not interested in a particular configuration
154: * item, this method should simply return without doing anything. Note
155: * that some configuration items may simply be variable assignment;
156: * there's no real way to distinguish a variable assignment from a
157: * blessed configuration item.
158: *
159: * @param sectionName the name of the configuration section where
160: * the item was found
161: * @param paramName the name of the parameter
162: * @param config the active configuration
163: * @param feedInfo partially complete <tt>FeedInfo</tt> object
164: * for the feed. The URL is guaranteed to be
165: * present, but no other fields are.
166: *
167: * @return <tt>true</tt> to continue processing the feed,
168: * <tt>false</tt> to skip it
169: *
170: * @throws CurnException on error
171: *
172: * @see CurnConfig
173: * @see FeedInfo
174: * @see FeedInfo#getURL
175: */
176: public boolean runFeedConfigItemPlugIn(String sectionName,
177: String paramName, CurnConfig config, FeedInfo feedInfo)
178: throws CurnException {
179: try {
180: if (paramName.equals(VAR_IGNORE_DUP_TITLES)) {
181: boolean flag = config.getRequiredBooleanValue(
182: sectionName, paramName);
183: perFeedIgnoreFlagMap.put(feedInfo, flag);
184: log.debug("[" + sectionName + "]: " + paramName + "="
185: + flag);
186: }
187:
188: return true;
189: }
190:
191: catch (ConfigurationException ex) {
192: throw new CurnException(ex);
193: }
194: }
195:
196: /**
197: * Called immediately after a feed is parsed, but before it is
198: * otherwise processed. This method can return <tt>false</tt> to signal
199: * <i>curn</i> that the feed should be skipped. For instance, a plug-in
200: * that filters on the parsed feed data could use this method to weed
201: * out non-matching feeds before they are downloaded. Similarly, a
202: * plug-in that edits the parsed data (removing or editing individual
203: * items, for instance) could use method to do so.
204: *
205: * @param feedInfo the {@link FeedInfo} object for the feed that
206: * has been downloaded and parsed.
207: * @param feedCache the feed cache
208: * @param channel the parsed channel data
209: *
210: * @return <tt>true</tt> if <i>curn</i> should continue to process the
211: * feed, <tt>false</tt> to skip the feed. A return value of
212: * <tt>false</tt> aborts all further processing on the feed.
213: * In particular, <i>curn</i> will not pass the feed along to
214: * other plug-ins that have yet to be notified of this event.
215: *
216: * @throws CurnException on error
217: *
218: * @see RSSChannel
219: * @see FeedInfo
220: */
221: public boolean runPostFeedParsePlugIn(FeedInfo feedInfo,
222: FeedCache feedCache, RSSChannel channel)
223: throws CurnException {
224: Boolean ignore = perFeedIgnoreFlagMap.get(feedInfo);
225: if ((ignore != null) && (ignore)) {
226: String feedURL = feedInfo.getURL().toString();
227: log.debug("Stripping duplicate titles from " + feedURL);
228: Map<String, RSSItem> titlesSeen = new HashMap<String, RSSItem>();
229:
230: for (RSSItem item : channel.getItems()) {
231: RSSLink itemLink = item.getURL();
232: String strItemURL = itemLink.getURL().toString();
233: String title = item.getTitle();
234: String titleKey;
235:
236: if (title == null)
237: titleKey = strItemURL;
238:
239: else {
240: // Convert to lower case and consolidate multiple
241: // adjacent white space characters.
242:
243: titleKey = title.toLowerCase().replaceAll("\\s+",
244: " ");
245: }
246:
247: RSSItem firstOne = titlesSeen.get(titleKey);
248: if (firstOne != null) {
249: String strFirstOneURL = firstOne.getURL()
250: .toString();
251: String firstTitle = firstOne.getTitle();
252:
253: if (firstTitle == null)
254: firstTitle = strFirstOneURL;
255:
256: log
257: .debug("Feed "
258: + feedURL
259: + ": Ignoring item with URL \""
260: + strItemURL
261: + "\" and title \""
262: + title
263: + "\": It matches already seen item with URL \""
264: + strFirstOneURL
265: + "\" and title \"" + firstTitle
266: + "\"");
267:
268: // Since getItems() returns a copy of the list of
269: // items, this call will not cause a
270: // ConcurrentModificationException to be thrown.
271:
272: channel.removeItem(item);
273: }
274:
275: else {
276: titlesSeen.put(titleKey, item);
277: }
278: }
279: }
280:
281: return true;
282: }
283: }
|