001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: *
017: */
018:
019: /* $Id: HTML.java 473861 2006-11-12 03:51:14Z gregor $ */
020:
021: package org.apache.lenya.util;
022:
023: import java.io.FileReader;
024: import java.io.IOException;
025: import java.io.InputStreamReader;
026: import java.io.Reader;
027: import java.net.URL;
028: import java.net.URLConnection;
029: import java.util.Iterator;
030: import java.util.List;
031:
032: import javax.swing.text.html.parser.ParserDelegator;
033:
034: /**
035: * http://developer.java.sun/developer/TechTips/1999/tt0923.html
036: */
037: public class HTML {
038: HTMLHandler htmlHandler;
039:
040: /**
041: * Creates a new HTML object from a URI
042: * @param uri The URI
043: * @throws IOException if a IO error occurs
044: */
045: public HTML(String uri) throws IOException {
046: ParserDelegator pd = new ParserDelegator();
047: this .htmlHandler = new HTMLHandler();
048: pd.parse(getReader(uri), this .htmlHandler, true);
049: }
050:
051: /**
052: * Command line interface
053: * @param args Command line args
054: */
055: public static void main(String[] args) {
056: if (args.length != 1) {
057: System.err.println("Usage: HTML uri (file or url)");
058:
059: return;
060: }
061:
062: try {
063: HTML html = new HTML(args[0]);
064:
065: List img_src_list = html.getImageSrcs(false);
066: System.out.println("<im src");
067:
068: Iterator img_src_iterator = img_src_list.iterator();
069:
070: while (img_src_iterator.hasNext()) {
071: System.out.println((String) img_src_iterator.next());
072: }
073:
074: List a_href_list = html.getAnchorHRefs(false);
075: System.out.println("<a href");
076:
077: Iterator a_href_iterator = a_href_list.iterator();
078:
079: while (a_href_iterator.hasNext()) {
080: System.out.println((String) a_href_iterator.next());
081: }
082:
083: List link_href_list = html.getLinkHRefs(false);
084: System.out.println("<link href");
085:
086: Iterator link_href_iterator = link_href_list.iterator();
087:
088: while (link_href_iterator.hasNext()) {
089: System.out.println((String) link_href_iterator.next());
090: }
091: } catch (final IOException e) {
092: System.err.println("IO error : " + e);
093: }
094: }
095:
096: /**
097: * Get Anchor Hrefs
098: * @param duplicate Whether you want duplicate HREFS
099: * @return A list of Hrefs
100: */
101: public List getAnchorHRefs(boolean duplicate) {
102: if (duplicate) {
103: return this .htmlHandler.getAllAHRefs();
104: }
105: return this .htmlHandler.getAHRefs();
106: }
107:
108: /**
109: * Get Link hrefs
110: * @param duplicate Whether you want duplicate Hrefs
111: * @return A list of Hrefs
112: */
113: public List getLinkHRefs(boolean duplicate) {
114: if (duplicate) {
115: return this .htmlHandler.getAllLinkHRefs();
116: }
117: return this .htmlHandler.getLinkHRefs();
118: }
119:
120: /**
121: * Get Image src attributes
122: * @param duplicate Whether you want duplicates
123: * @return A list of src Attributes
124: */
125: public List getImageSrcs(boolean duplicate) {
126: if (duplicate) {
127: return this .htmlHandler.getAllImageSrcs();
128: }
129: return this .htmlHandler.getImageSrcs();
130: }
131:
132: private Reader getReader(String uri) throws IOException {
133: if (uri.startsWith("http:")) {
134: // uri is url
135: URLConnection connection = new URL(uri).openConnection();
136:
137: return new InputStreamReader(connection.getInputStream());
138: }
139: // uri is file
140: return new FileReader(uri);
141: }
142: }
|