001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU Library General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
015: */
016: package web.rss;
017:
018: import java.io.BufferedReader;
019: import java.io.BufferedWriter;
020: import java.io.ByteArrayInputStream;
021: import java.io.File;
022: import java.io.FileNotFoundException;
023: import java.io.FileReader;
024: import java.io.FileWriter;
025: import java.io.FilenameFilter;
026: import java.io.IOException;
027: import java.io.InputStream;
028: import java.io.Reader;
029: import java.io.StringReader;
030: import java.text.MessageFormat;
031: import java.text.ParseException;
032: import java.util.Enumeration;
033: import java.util.Hashtable;
034: import java.util.Properties;
035:
036: import javax.xml.parsers.DocumentBuilder;
037: import javax.xml.parsers.DocumentBuilderFactory;
038:
039: import org.apache.commons.digester.Digester;
040: import org.apache.commons.httpclient.HttpClient;
041: import org.apache.commons.httpclient.HttpStatus;
042: import org.apache.commons.httpclient.methods.GetMethod;
043: import org.apache.commons.logging.Log;
044: import org.apache.commons.logging.LogFactory;
045: import org.w3c.dom.Document;
046: import org.w3c.dom.Element;
047:
/**
 * Fetches and parses a site's feed summary (RSS / Atom / RDF).
 * Usage:
 * Channel channel = RssHunter.parse("http://www.javayou.com/blog/rss1.jsp");
 * @author Winter Lau
 */
054: public abstract class RssHunter {
055:
056: static Log log;
057:
058: static Hashtable hunters;
059: static {
060: log = LogFactory.getLog(RssHunter.class);
061: hunters = new Hashtable();
062: InputStream in = RssHunter.class
063: .getResourceAsStream("rss.properties");
064: if (in != null) {
065: Properties props = new Properties();
066: try {
067: props.load(in);
068: } catch (IOException e) {
069: log.error("load res.properties failed.", e);
070: } finally {
071: if (in != null) {
072: try {
073: in.close();
074: } catch (Exception e) {
075: }
076: }
077: }
078: Enumeration keys = props.keys();
079: while (keys.hasMoreElements()) {
080: String key = (String) keys.nextElement();
081: try {
082: RssHunter hunter = (RssHunter) Class.forName(
083: props.getProperty(key)).newInstance();
084: hunters.put(key.toLowerCase(), hunter);
085: } catch (Exception e) {
086: log.error("initialize RssHunter failure.", e);
087: }
088: }
089: }
090: }
091:
    /**
     * Protected constructor: the class is abstract and concrete hunters are
     * instantiated reflectively from rss.properties (see the static initializer).
     */
    protected RssHunter() {
    }
094:
095: /**
096: * 获取指定协议对应的摘要信息加载类
097: * @param protocol
098: * @return
099: */
100: protected static RssHunter getHunter(String protocol) {
101: return (RssHunter) hunters.get(protocol.toLowerCase());
102: }
103:
104: /**
105: * 从URL获取信息内容并解析到Channel对象中
106: * @param url
107: * @return
108: * @throws Exception
109: */
110: public static Channel parse(String url) throws Exception {
111: String[] result = getContent(url);
112: String encoding = getEncoding(result[0].substring(0, 50));
113: try {
114: RssHunter hunter = getHunter(result[1]);
115: return hunter.parse(new StringReader(result[0].trim()));
116: } catch (Exception e) {
117: System.out.println("in parse mode=" + result[1] + ",url="
118: + url);
119: throw e;
120: }
121: }
122:
123: /**
124: * 解析XML所用的编码方式
125: * @param xml
126: * @return
127: * @throws ParseException
128: */
129: protected static String getEncoding(String xml)
130: throws ParseException {
131: MessageFormat mf = new MessageFormat("{1}encoding=\"{0}\"{2}");
132: try {
133: return (String) (mf.parse(xml)[0]);
134: } catch (Exception e) {
135: return "UTF-8";
136: }
137: }
138:
139: /**
140: * 从地址URL中抓取摘要信息以及内容格式
141: * @param url
142: * @return 两个元素的字符串数据,第一个元素为摘要内容,第二个元素为内容格式,例如rss
143: * @throws Exception
144: */
145: protected static String[] getContent(String url) throws Exception {
146: StringBuffer content = new StringBuffer();
147: StringBuffer mode = new StringBuffer();
148: long lastReload = load(url, content, mode);
149: if (needReload(lastReload) || content.length() == 0
150: || (mode != null && mode.length() == 0)) {
151: HttpClient client = new HttpClient();
152: GetMethod get = new GetMethod(url);
153: get.addRequestHeader("user-agent",
154: "DLOG4J(http://www.javayou.com) RssHunter 1.0");
155: try {
156: client.executeMethod(get);
157: String charset = get.getResponseCharSet();
158: if (get.getStatusCode() == HttpStatus.SC_OK) {
159: String ct = get.getResponseBodyAsString().trim();
160: String encoding = getEncoding(ct.substring(0, 50));
161: DocumentBuilderFactory dbf = DocumentBuilderFactory
162: .newInstance();
163: DocumentBuilder db = dbf.newDocumentBuilder();
164: Document doc = db.parse(new ByteArrayInputStream(ct
165: .getBytes(charset)));
166: String sMode = null;
167: Element elem = doc.getDocumentElement();
168: if ("feed".equals(elem.getNodeName()))
169: sMode = "atom";
170: else if ("rss".equals(elem.getNodeName()))
171: sMode = "rss";
172: else if ("rdf:RDF".equals(elem.getNodeName()))
173: sMode = "rdf";
174: else
175: throw new IllegalArgumentException(url);
176:
177: mode = new StringBuffer(sMode);
178:
179: ct = new String(ct.getBytes(charset), encoding);
180: save(url, ct, sMode);
181: content = new StringBuffer(ct);
182: }
183: } catch (Exception e) {
184: log.error("fetch content from " + url + " failed.", e);
185: } finally {
186: get.releaseConnection();
187: }
188: }
189: return new String[] { content.toString().trim(),
190: mode.toString() };
191: }
192:
193: /**
194: * 加载指定站点上次的保存的时间
195: * @param url 输入参数,站点的URL
196: * @param content 输出参数,保存信息的内容
197: * @return
198: * @throws IOException
199: */
200: protected static long load(String url, StringBuffer content,
201: StringBuffer mode) throws IOException {
202: String path = getCachePath();
203: BufferedReader reader = null;
204: long lastModified = 0L;
205: try {
206: File f = new File(path);
207: if (f.exists()) {
208: final String pattern = Math.abs(url.hashCode()) + ".";
209: File[] fs = f.listFiles(new FilenameFilter() {
210: public boolean accept(File dir, String name) {
211: return name.startsWith(pattern);
212: }
213: });
214: if (fs.length > 0) {
215: mode.append(fs[0].getName().substring(
216: pattern.length()));
217: lastModified = fs[0].lastModified();
218: reader = new BufferedReader(new FileReader(fs[0]));
219: String lineSep = System
220: .getProperty("line.separator");
221: StringBuffer tmpContent = new StringBuffer();
222: do {
223: String line = reader.readLine();
224: if (line == null)
225: break;
226: tmpContent.append(line);
227: tmpContent.append(lineSep);
228: } while (true);
229: content.append(tmpContent.toString().trim());
230: }
231: }
232: } catch (FileNotFoundException e) {
233: } finally {
234: if (reader != null)
235: reader.close();
236: }
237: return lastModified;
238: }
239:
240: /**
241: * 保存某个站点的最新信息
242: * @param url 输入参数,站点URL
243: * @param content 输入参数,站点摘要信息
244: * @throws IOException
245: */
246: protected static void save(String url, String content, String mode)
247: throws IOException {
248: StringBuffer path = new StringBuffer(getCachePath());
249: path.append(Math.abs(url.hashCode()));
250: path.append('.');
251: path.append(mode);
252: BufferedWriter writer = null;
253: try {
254: File f = new File(path.toString());
255: if (!f.getParentFile().exists())
256: f.getParentFile().mkdirs();
257: writer = new BufferedWriter(new FileWriter(f));
258: writer.write(content);
259: } finally {
260: if (writer != null)
261: writer.close();
262: }
263: }
264:
265: /**
266: * 获取存放缓冲文件所在的目录,默认为临时目录下的dlog4j_cache子目录
267: * @return
268: */
269: protected static String getCachePath() {
270: String tmpDir = System.getProperty("java.io.tmpdir");
271: if (!tmpDir.endsWith(File.separator))
272: tmpDir += File.separator;
273: tmpDir += "dlog4j_cache";
274: tmpDir += File.separator;
275: return tmpDir;
276: }
277:
278: /**
279: * 默认的重新加载策略:上一次加载二十分钟后允许重新加载
280: * 子类可覆盖该方法进行策略的重新定义
281: * @param lastReload
282: * @return
283: */
284: protected static boolean needReload(long lastReload) {
285: long currentTime = System.currentTimeMillis();
286: return (currentTime - lastReload) > 3600000;
287: }
288:
    /**
     * Parse feed text into a Channel. Each concrete hunter implements this
     * for the format (rss/atom/rdf) it is registered under in rss.properties.
     * @param content reader over the feed text
     * @return the parsed channel
     * @throws Exception when the content cannot be parsed
     */
    protected abstract Channel parse(Reader content) throws Exception;
296:
297: /**
298: * 获取XML解析器
299: * @param channel
300: * @return
301: */
302: protected Digester getDigester() {
303: Digester digester = new Digester();
304: digester.push(new Channel());
305: digester.setNamespaceAware(true);
306: digester.setValidating(false);
307: return digester;
308: }
309:
310: /**
311: * 获取XML解析器
312: * @param channel
313: * @return
314: */
315: protected Digester getDigester(Channel channel) {
316: Digester digester = new Digester();
317: digester.push(channel);
318: digester.setNamespaceAware(true);
319: digester.setValidating(false);
320: return digester;
321: }
322:
323: public static void main(String[] args) throws Exception {
324: Channel site = parse(args[0]);
325: System.out.println("site.title:" + site.getTitle());
326: System.out.println("site.link:" + site.getLink());
327: System.out.println("site.description:" + site.getDescription());
328: System.out.println("============ ITEMS ============");
329: for (int i = 0; i < site.getItems().size(); i++) {
330: Item log = (Item) site.getItems().get(i);
331: System.out.println("log.title:" + log.getTitle());
332: System.out.println("log.link:" + log.getLink());
333: System.out.println("log.description:"
334: + log.getDescription());
335: System.out.println("-----------------------------------");
336: }
337: }
338:
339: }
|