Source Code Cross Referenced for LinkbackExtractor.java in » Blogger-System » apache-roller-3.1 » org » apache » roller » util » Java Source Code / Java Documentation - Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Blogger System » apache roller 3.1 » org.apache.roller.util

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        /*
002:         * Licensed to the Apache Software Foundation (ASF) under one or more
003:         *  contributor license agreements.  The ASF licenses this file to You
004:         * under the Apache License, Version 2.0 (the "License"); you may not
005:         * use this file except in compliance with the License.
006:         * You may obtain a copy of the License at
007:         *
008:         *     http://www.apache.org/licenses/LICENSE-2.0
009:         *
010:         * Unless required by applicable law or agreed to in writing, software
011:         * distributed under the License is distributed on an "AS IS" BASIS,
012:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013:         * See the License for the specific language governing permissions and
014:         * limitations under the License.  For additional information regarding
015:         * copyright in this work, please see the NOTICE file in the top level
016:         * directory of this distribution.
017:         */
018:        package org.apache.roller.util;
019:
020:        import java.io.BufferedReader;
021:        import java.io.IOException;
022:        import java.io.InputStream;
023:        import java.io.InputStreamReader;
024:        import java.io.StringReader;
025:        import java.net.MalformedURLException;
026:        import java.net.URL;
027:        import java.util.Arrays;
028:        import java.util.Iterator;
029:        import java.util.List;
030:
031:        import javax.swing.text.MutableAttributeSet;
032:        import javax.swing.text.html.HTML;
033:        import javax.swing.text.html.HTMLEditorKit;
034:        import javax.swing.text.html.HTML.Tag;
035:        import javax.swing.text.html.HTMLEditorKit.Parser;
036:        import javax.swing.text.html.HTMLEditorKit.ParserCallback;
037:
038:        import org.apache.commons.logging.Log;
039:        import org.apache.commons.logging.LogFactory;
040:
041:        import com.sun.syndication.feed.synd.SyndEntry;
042:        import com.sun.syndication.feed.synd.SyndFeed;
043:        import com.sun.syndication.io.FeedException;
044:        import com.sun.syndication.io.SyndFeedInput;
045:
046:        /**
047:         * Parses HTML file for referring linkback title and excerpt.
048:         * 
049:         * @author David M Johnson
050:         */
051:        public class LinkbackExtractor {
052:            private static Log mLogger = LogFactory.getFactory().getInstance(
053:                    LinkbackExtractor.class);
054:            private boolean mFound = false;
055:            private String mTitle = "";
056:            private String mRssLink = null;
057:            private String mExcerpt = null;
058:            private String mPermalink = null;
059:            private int mStart = 0;
060:            private int mEnd = 0;
061:            private int mMaxExcerpt = 500; // characters
062:            private String mRequestURL = null;
063:            private String mRequestURLWWW = null;
064:            private String mRefererURL;
065:
066:            //------------------------------------------------------------------------
067:            /**
068:             * Extract referring page title, excerpt, and permalink.
069:             * 
070:             * @param refererUrl
071:             * @param requestUrl
072:             */
073:            public LinkbackExtractor(String refererURL, String requestURL)
074:                    throws MalformedURLException, IOException {
075:                try {
076:                    extractByParsingHtml(refererURL, requestURL);
077:                    if (mRssLink != null) {
078:                        extractByParsingRss(mRssLink, requestURL);
079:                    }
080:                } catch (Exception e) {
081:                    if (mLogger.isDebugEnabled()) {
082:                        mLogger.debug("Extracting linkback", e);
083:                    }
084:                }
085:            }
086:
087:            //------------------------------------------------------------------------
088:            private void extractByParsingHtml(String refererURL,
089:                    String requestURL) throws MalformedURLException,
090:                    IOException {
091:                URL url = new URL(refererURL);
092:                InputStream is = url.openStream();
093:
094:                mRefererURL = refererURL;
095:
096:                if (requestURL.startsWith("http://www.")) {
097:                    mRequestURLWWW = requestURL;
098:                    mRequestURL = "http://" + mRequestURLWWW.substring(11);
099:                } else {
100:                    mRequestURL = requestURL;
101:                    mRequestURLWWW = "http://www." + mRequestURL.substring(7);
102:                }
103:
104:                // Trick gets Swing's HTML parser
105:                Parser parser = (new HTMLEditorKit() {
106:                    public Parser getParser() {
107:                        return super .getParser();
108:                    }
109:                }).getParser();
110:
111:                // Read HTML file into string
112:                StringBuffer sb = new StringBuffer();
113:                InputStreamReader isr = new InputStreamReader(is);
114:                BufferedReader br = new BufferedReader(isr);
115:                try {
116:                    String line = null;
117:                    while ((line = br.readLine()) != null) {
118:                        sb.append(line);
119:                    }
120:                } finally {
121:                    br.close();
122:                }
123:
124:                // Parse HTML string to find title and start and end position
125:                // of the referring excerpt.
126:                StringReader sr = new StringReader(sb.toString());
127:                parser.parse(sr, new LinkbackCallback(), true);
128:
129:                if (mStart != 0 && mEnd != 0 && mEnd > mStart) {
130:                    mExcerpt = sb.toString().substring(mStart, mEnd);
131:                    mExcerpt = Utilities.removeHTML(mExcerpt);
132:
133:                    if (mExcerpt.length() > mMaxExcerpt) {
134:                        mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
135:                    }
136:                }
137:
138:                if (mTitle.startsWith(">") && mTitle.length() > 1) {
139:                    mTitle = mTitle.substring(1);
140:                }
141:            }
142:
143:            //------------------------------------------------------------------------
144:            private void extractByParsingRss(String rssLink, String requestURL)
145:                    throws IllegalArgumentException, MalformedURLException,
146:                    FeedException, IOException {
147:                SyndFeedInput feedInput = new SyndFeedInput();
148:                SyndFeed feed = feedInput.build(new InputStreamReader(new URL(
149:                        rssLink).openStream()));
150:                Iterator itemIter = feed.getEntries().iterator();
151:                String feedTitle = feed.getTitle();
152:
153:                int count = 0;
154:
155:                if (mLogger.isDebugEnabled()) {
156:                    mLogger.debug("Feed parsed, title: " + feedTitle);
157:                }
158:
159:                while (itemIter.hasNext()) {
160:                    count++;
161:                    SyndEntry item = (SyndEntry) itemIter.next();
162:                    if (item.getDescription().getValue().indexOf(requestURL) != -1) {
163:                        mFound = true;
164:                        mPermalink = item.getLink().toString();
165:                        if (feedTitle != null && feedTitle.trim().length() > 0) {
166:                            mTitle = feedTitle + ": " + item.getTitle();
167:                        } else {
168:                            mTitle = item.getTitle();
169:                        }
170:                        mExcerpt = item.getDescription().getValue();
171:                        mExcerpt = Utilities.removeHTML(mExcerpt);
172:                        if (mExcerpt.length() > mMaxExcerpt) {
173:                            mExcerpt = mExcerpt.substring(0, mMaxExcerpt)
174:                                    + "...";
175:                        }
176:                        break;
177:                    }
178:                }
179:
180:                if (mLogger.isDebugEnabled()) {
181:                    mLogger.debug("Parsed " + count
182:                            + " articles, found linkback=" + mFound);
183:                }
184:            }
185:
186:            //------------------------------------------------------------------------
187:            /**
188:             * Returns the excerpt.
189:             * 
190:             * @return String
191:             */
192:            public String getExcerpt() {
193:                return mExcerpt;
194:            }
195:
196:            //------------------------------------------------------------------------
197:            /**
198:             * Returns the title.
199:             * 
200:             * @return String
201:             */
202:            public String getTitle() {
203:                return mTitle;
204:            }
205:
206:            //------------------------------------------------------------------------
207:            /**
208:             * Returns the permalink.
209:             * 
210:             * @return String
211:             */
212:            public String getPermalink() {
213:                return mPermalink;
214:            }
215:
216:            //------------------------------------------------------------------------
217:            /**
218:             * Sets the permalink.
219:             * 
220:             * @param permalink
221:             *            The permalink to set
222:             */
223:            public void setPermalink(String permalink) {
224:                mPermalink = permalink;
225:            }
226:
227:            /////////////////////////////////////////////////////////////////////////
228:
229:            /**
230:             * Parser callback that finds title and excerpt. As we walk through the HTML
231:             * tags, we keep track of the most recently encountered divider tag in the
232:             * mStart field. Once we find the referring permalink, we set the mFound
233:             * flag. After that, we look for the next divider tag and save it's position
234:             * in the mEnd field.
235:             */
236:            private final class LinkbackCallback extends ParserCallback {
237:                // Dividers
238:                private Tag[] mDivTags = { Tag.TD, Tag.DIV, Tag.SPAN,
239:                        Tag.BLOCKQUOTE, Tag.P, Tag.LI, Tag.BR, Tag.HR, Tag.PRE,
240:                        Tag.H1, Tag.H2, Tag.H3, Tag.H4, Tag.H5, Tag.H6 };
241:
242:                private List mList = Arrays.asList(mDivTags);
243:
244:                private Tag mCurrentTag = null;
245:
246:                /**
247:                 * Look for divider tags and for the permalink.
248:                 * 
249:                 * @param tag
250:                 *            HTML tag
251:                 * @param atts
252:                 *            Attributes of that tag
253:                 * @param pos
254:                 *            Tag's position in file
255:                 */
256:                public void handleStartTag(Tag tag, MutableAttributeSet atts,
257:                        int pos) {
258:                    if (mList.contains(tag) && !mFound) {
259:                        mStart = pos;
260:                    } else if (mList.contains(tag) && mFound && mEnd == 0) {
261:                        mEnd = pos;
262:                    } else if (tag.equals(Tag.A)) {
263:                        String href = (String) atts
264:                                .getAttribute(HTML.Attribute.HREF);
265:                        if (href == null)
266:                            return;
267:                        int hashPos = href.lastIndexOf('#');
268:                        if (hashPos != -1) {
269:                            href = href.substring(0, hashPos);
270:                        }
271:                        if (href != null
272:                                && (href.equals(mRequestURL) || href
273:                                        .equals(mRequestURLWWW))) {
274:                            mFound = true;
275:                        } else {
276:                            /*
277:                             * if (mLogger.isDebugEnabled()) { mLogger.debug("No match:
278:                             * "+href); }
279:                             */
280:                        }
281:                    }
282:                    mCurrentTag = tag;
283:                }
284:
285:                /**
286:                 * Needed to handle SPAN tag.
287:                 */
288:                public void handleSimpleTag(Tag tag, MutableAttributeSet atts,
289:                        int pos) {
290:                    if (mList.contains(tag) && mFound && mEnd == 0) {
291:                        mEnd = pos;
292:                    } else if (tag.equals(Tag.LINK)) {
293:                        // Look out for RSS autodiscovery link
294:                        String title = (String) atts
295:                                .getAttribute(HTML.Attribute.TITLE);
296:                        String type = (String) atts
297:                                .getAttribute(HTML.Attribute.TYPE);
298:                        if (title != null && type != null
299:                                && type.equals("application/rss+xml")
300:                                && title.equals("RSS")) {
301:                            mRssLink = (String) atts
302:                                    .getAttribute(HTML.Attribute.HREF);
303:
304:                            if (mLogger.isDebugEnabled()) {
305:                                mLogger.debug("Found RSS link " + mRssLink);
306:                            }
307:
308:                            if (mRssLink.startsWith("/")
309:                                    && mRssLink.length() > 1) {
310:                                try {
311:                                    URL url = new URL(mRefererURL);
312:                                    mRssLink = url.getProtocol() + "://"
313:                                            + url.getHost() + ":"
314:                                            + url.getPort() + mRssLink;
315:                                } catch (MalformedURLException e) {
316:                                    mRssLink = null;
317:                                    if (mLogger.isDebugEnabled()) {
318:                                        mLogger.debug("Determining RSS URL", e);
319:                                    }
320:                                }
321:                            } else if (!mRssLink.startsWith("http")) {
322:                                int slash = mRefererURL.lastIndexOf("/");
323:                                if (slash != -1) {
324:                                    mRssLink = mRefererURL.substring(0, slash)
325:                                            + "/" + mRssLink;
326:                                }
327:                            }
328:                            if (mLogger.isDebugEnabled()) {
329:                                mLogger.debug("Qualified RSS link is "
330:                                        + mRssLink);
331:                            }
332:                        }
333:                    }
334:                }
335:
336:                /**
337:                 * Stop at the very first divider tag after the permalink.
338:                 * 
339:                 * @param tag
340:                 *            End tag
341:                 * @param pos
342:                 *            Position in HTML file
343:                 */
344:                public void handleEndTag(Tag tag, int pos) {
345:                    if (mList.contains(tag) && mFound && mEnd == 0) {
346:                        mEnd = pos;
347:                    } else if (mList.contains(tag) && !mFound) {
348:                        mStart = pos;
349:                    } else {
350:                        mCurrentTag = null;
351:                    }
352:                }
353:
354:                /**
355:                 * Get the page title
356:                 */
357:                public void handleText(char[] data, int pos) {
358:                    if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE)) {
359:                        String newText = new String(data);
360:                        if (mTitle.length() < 50) {
361:                            mTitle += newText;
362:                        }
363:                    }
364:                }
365:            }
366:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.