001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. The ASF licenses this file to You
004: * under the Apache License, Version 2.0 (the "License"); you may not
005: * use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License. For additional information regarding
015: * copyright in this work, please see the NOTICE file in the top level
016: * directory of this distribution.
017: */
018: package org.apache.roller.util;
019:
020: import java.io.BufferedReader;
021: import java.io.IOException;
022: import java.io.InputStream;
023: import java.io.InputStreamReader;
024: import java.io.StringReader;
025: import java.net.MalformedURLException;
026: import java.net.URL;
027: import java.util.Arrays;
028: import java.util.Iterator;
029: import java.util.List;
030:
031: import javax.swing.text.MutableAttributeSet;
032: import javax.swing.text.html.HTML;
033: import javax.swing.text.html.HTMLEditorKit;
034: import javax.swing.text.html.HTML.Tag;
035: import javax.swing.text.html.HTMLEditorKit.Parser;
036: import javax.swing.text.html.HTMLEditorKit.ParserCallback;
037:
038: import org.apache.commons.logging.Log;
039: import org.apache.commons.logging.LogFactory;
040:
041: import com.sun.syndication.feed.synd.SyndEntry;
042: import com.sun.syndication.feed.synd.SyndFeed;
043: import com.sun.syndication.io.FeedException;
044: import com.sun.syndication.io.SyndFeedInput;
045:
046: /**
047: * Parses HTML file for referring linkback title and excerpt.
048: *
049: * @author David M Johnson
050: */
051: public class LinkbackExtractor {
052: private static Log mLogger = LogFactory.getFactory().getInstance(
053: LinkbackExtractor.class);
054: private boolean mFound = false;
055: private String mTitle = "";
056: private String mRssLink = null;
057: private String mExcerpt = null;
058: private String mPermalink = null;
059: private int mStart = 0;
060: private int mEnd = 0;
061: private int mMaxExcerpt = 500; // characters
062: private String mRequestURL = null;
063: private String mRequestURLWWW = null;
064: private String mRefererURL;
065:
066: //------------------------------------------------------------------------
067: /**
068: * Extract referring page title, excerpt, and permalink.
069: *
070: * @param refererUrl
071: * @param requestUrl
072: */
073: public LinkbackExtractor(String refererURL, String requestURL)
074: throws MalformedURLException, IOException {
075: try {
076: extractByParsingHtml(refererURL, requestURL);
077: if (mRssLink != null) {
078: extractByParsingRss(mRssLink, requestURL);
079: }
080: } catch (Exception e) {
081: if (mLogger.isDebugEnabled()) {
082: mLogger.debug("Extracting linkback", e);
083: }
084: }
085: }
086:
087: //------------------------------------------------------------------------
088: private void extractByParsingHtml(String refererURL,
089: String requestURL) throws MalformedURLException,
090: IOException {
091: URL url = new URL(refererURL);
092: InputStream is = url.openStream();
093:
094: mRefererURL = refererURL;
095:
096: if (requestURL.startsWith("http://www.")) {
097: mRequestURLWWW = requestURL;
098: mRequestURL = "http://" + mRequestURLWWW.substring(11);
099: } else {
100: mRequestURL = requestURL;
101: mRequestURLWWW = "http://www." + mRequestURL.substring(7);
102: }
103:
104: // Trick gets Swing's HTML parser
105: Parser parser = (new HTMLEditorKit() {
106: public Parser getParser() {
107: return super .getParser();
108: }
109: }).getParser();
110:
111: // Read HTML file into string
112: StringBuffer sb = new StringBuffer();
113: InputStreamReader isr = new InputStreamReader(is);
114: BufferedReader br = new BufferedReader(isr);
115: try {
116: String line = null;
117: while ((line = br.readLine()) != null) {
118: sb.append(line);
119: }
120: } finally {
121: br.close();
122: }
123:
124: // Parse HTML string to find title and start and end position
125: // of the referring excerpt.
126: StringReader sr = new StringReader(sb.toString());
127: parser.parse(sr, new LinkbackCallback(), true);
128:
129: if (mStart != 0 && mEnd != 0 && mEnd > mStart) {
130: mExcerpt = sb.toString().substring(mStart, mEnd);
131: mExcerpt = Utilities.removeHTML(mExcerpt);
132:
133: if (mExcerpt.length() > mMaxExcerpt) {
134: mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
135: }
136: }
137:
138: if (mTitle.startsWith(">") && mTitle.length() > 1) {
139: mTitle = mTitle.substring(1);
140: }
141: }
142:
143: //------------------------------------------------------------------------
144: private void extractByParsingRss(String rssLink, String requestURL)
145: throws IllegalArgumentException, MalformedURLException,
146: FeedException, IOException {
147: SyndFeedInput feedInput = new SyndFeedInput();
148: SyndFeed feed = feedInput.build(new InputStreamReader(new URL(
149: rssLink).openStream()));
150: Iterator itemIter = feed.getEntries().iterator();
151: String feedTitle = feed.getTitle();
152:
153: int count = 0;
154:
155: if (mLogger.isDebugEnabled()) {
156: mLogger.debug("Feed parsed, title: " + feedTitle);
157: }
158:
159: while (itemIter.hasNext()) {
160: count++;
161: SyndEntry item = (SyndEntry) itemIter.next();
162: if (item.getDescription().getValue().indexOf(requestURL) != -1) {
163: mFound = true;
164: mPermalink = item.getLink().toString();
165: if (feedTitle != null && feedTitle.trim().length() > 0) {
166: mTitle = feedTitle + ": " + item.getTitle();
167: } else {
168: mTitle = item.getTitle();
169: }
170: mExcerpt = item.getDescription().getValue();
171: mExcerpt = Utilities.removeHTML(mExcerpt);
172: if (mExcerpt.length() > mMaxExcerpt) {
173: mExcerpt = mExcerpt.substring(0, mMaxExcerpt)
174: + "...";
175: }
176: break;
177: }
178: }
179:
180: if (mLogger.isDebugEnabled()) {
181: mLogger.debug("Parsed " + count
182: + " articles, found linkback=" + mFound);
183: }
184: }
185:
186: //------------------------------------------------------------------------
187: /**
188: * Returns the excerpt.
189: *
190: * @return String
191: */
192: public String getExcerpt() {
193: return mExcerpt;
194: }
195:
196: //------------------------------------------------------------------------
197: /**
198: * Returns the title.
199: *
200: * @return String
201: */
202: public String getTitle() {
203: return mTitle;
204: }
205:
206: //------------------------------------------------------------------------
207: /**
208: * Returns the permalink.
209: *
210: * @return String
211: */
212: public String getPermalink() {
213: return mPermalink;
214: }
215:
216: //------------------------------------------------------------------------
217: /**
218: * Sets the permalink.
219: *
220: * @param permalink
221: * The permalink to set
222: */
223: public void setPermalink(String permalink) {
224: mPermalink = permalink;
225: }
226:
227: /////////////////////////////////////////////////////////////////////////
228:
229: /**
230: * Parser callback that finds title and excerpt. As we walk through the HTML
231: * tags, we keep track of the most recently encountered divider tag in the
232: * mStart field. Once we find the referring permalink, we set the mFound
233: * flag. After that, we look for the next divider tag and save it's position
234: * in the mEnd field.
235: */
236: private final class LinkbackCallback extends ParserCallback {
237: // Dividers
238: private Tag[] mDivTags = { Tag.TD, Tag.DIV, Tag.SPAN,
239: Tag.BLOCKQUOTE, Tag.P, Tag.LI, Tag.BR, Tag.HR, Tag.PRE,
240: Tag.H1, Tag.H2, Tag.H3, Tag.H4, Tag.H5, Tag.H6 };
241:
242: private List mList = Arrays.asList(mDivTags);
243:
244: private Tag mCurrentTag = null;
245:
246: /**
247: * Look for divider tags and for the permalink.
248: *
249: * @param tag
250: * HTML tag
251: * @param atts
252: * Attributes of that tag
253: * @param pos
254: * Tag's position in file
255: */
256: public void handleStartTag(Tag tag, MutableAttributeSet atts,
257: int pos) {
258: if (mList.contains(tag) && !mFound) {
259: mStart = pos;
260: } else if (mList.contains(tag) && mFound && mEnd == 0) {
261: mEnd = pos;
262: } else if (tag.equals(Tag.A)) {
263: String href = (String) atts
264: .getAttribute(HTML.Attribute.HREF);
265: if (href == null)
266: return;
267: int hashPos = href.lastIndexOf('#');
268: if (hashPos != -1) {
269: href = href.substring(0, hashPos);
270: }
271: if (href != null
272: && (href.equals(mRequestURL) || href
273: .equals(mRequestURLWWW))) {
274: mFound = true;
275: } else {
276: /*
277: * if (mLogger.isDebugEnabled()) { mLogger.debug("No match:
278: * "+href); }
279: */
280: }
281: }
282: mCurrentTag = tag;
283: }
284:
285: /**
286: * Needed to handle SPAN tag.
287: */
288: public void handleSimpleTag(Tag tag, MutableAttributeSet atts,
289: int pos) {
290: if (mList.contains(tag) && mFound && mEnd == 0) {
291: mEnd = pos;
292: } else if (tag.equals(Tag.LINK)) {
293: // Look out for RSS autodiscovery link
294: String title = (String) atts
295: .getAttribute(HTML.Attribute.TITLE);
296: String type = (String) atts
297: .getAttribute(HTML.Attribute.TYPE);
298: if (title != null && type != null
299: && type.equals("application/rss+xml")
300: && title.equals("RSS")) {
301: mRssLink = (String) atts
302: .getAttribute(HTML.Attribute.HREF);
303:
304: if (mLogger.isDebugEnabled()) {
305: mLogger.debug("Found RSS link " + mRssLink);
306: }
307:
308: if (mRssLink.startsWith("/")
309: && mRssLink.length() > 1) {
310: try {
311: URL url = new URL(mRefererURL);
312: mRssLink = url.getProtocol() + "://"
313: + url.getHost() + ":"
314: + url.getPort() + mRssLink;
315: } catch (MalformedURLException e) {
316: mRssLink = null;
317: if (mLogger.isDebugEnabled()) {
318: mLogger.debug("Determining RSS URL", e);
319: }
320: }
321: } else if (!mRssLink.startsWith("http")) {
322: int slash = mRefererURL.lastIndexOf("/");
323: if (slash != -1) {
324: mRssLink = mRefererURL.substring(0, slash)
325: + "/" + mRssLink;
326: }
327: }
328: if (mLogger.isDebugEnabled()) {
329: mLogger.debug("Qualified RSS link is "
330: + mRssLink);
331: }
332: }
333: }
334: }
335:
336: /**
337: * Stop at the very first divider tag after the permalink.
338: *
339: * @param tag
340: * End tag
341: * @param pos
342: * Position in HTML file
343: */
344: public void handleEndTag(Tag tag, int pos) {
345: if (mList.contains(tag) && mFound && mEnd == 0) {
346: mEnd = pos;
347: } else if (mList.contains(tag) && !mFound) {
348: mStart = pos;
349: } else {
350: mCurrentTag = null;
351: }
352: }
353:
354: /**
355: * Get the page title
356: */
357: public void handleText(char[] data, int pos) {
358: if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE)) {
359: String newText = new String(data);
360: if (mTitle.length() < 50) {
361: mTitle += newText;
362: }
363: }
364: }
365: }
366: }
|