001: package net.matuschek.html;
002:
003: /************************************************
004: Copyright (c) 2001/2002 by Daniel Matuschek
005: *************************************************/
006:
007: import java.net.MalformedURLException;
008: import java.net.URL;
009: import java.util.Vector;
010: import java.util.StringTokenizer;
011: import java.io.*;
012:
013: import org.w3c.dom.Document;
014: import org.w3c.dom.Element;
015: import org.w3c.dom.NodeList;
016: import org.w3c.tidy.Tidy;
017:
018: import org.apache.log4j.Category;
019:
020: import net.matuschek.util.AttribValuePair;
021:
022: /**
023: * This class implements an HTML document
024: *
025: * It uses JTidy to parse the given HTML code to an internal DOM
026: * representation.
027: *
028: * @author Daniel Matuschek
029: * @version $Id $
030: */
031: public class HtmlDocument {
032:
033: /** URL of this document */
034: private URL url = null;
035:
036: /** Content text as an array of bytes (this is how we get it from HTTP !) */
037: private byte[] content = null;
038:
039: /** the DOM representation of this HTML document */
040: private Document domDoc = null;
041:
042: /** Log4J category for logging purposes */
043: private Category log;
044:
045: /** encoding */
046: private String encoding;
047:
048: /** Base URL */
049: private URL baseURL = null;
050:
051: /** All links */
052: Vector<URL> links;
053:
054: /**
055: * initializes HTML document without content
056: */
057: private HtmlDocument(URL url) {
058: log = Category.getInstance(getClass().getName());
059: this .url = url;
060: }
061:
062: /**
063: * Initializes an HTML document with the given content.
064: *
065: * @param url the URL of this document. Needed for link extraction.
066: * @param content some HTML text as an array of bytes
067: */
068: public HtmlDocument(URL url, byte[] content) {
069: this (url);
070: this .content = content;
071: parse();
072: }
073:
074: /**
075: * Initializes an HTML document with the given content.
076: *
077: * @param url the URL of this document. Needed for link extraction.
078: * @param content some HTML text as an array of bytes
079: * @param newEncoding Is the encoding of the content.
080: */
081: public HtmlDocument(URL url, byte[] content, String newEncoding) {
082: this (url);
083: this .content = content;
084: encoding = newEncoding;
085: parse();
086: }
087:
088: /**
089: * Initalizes an HTML document from a String. Convert string to
090: * bytes using default encoding
091: */
092: public HtmlDocument(URL url, String contentStr) {
093: this (url);
094: this .content = new byte[contentStr.length() + 1];
095: for (int i = 0; i < contentStr.length(); i++) {
096: this .content[i] = (byte) contentStr.charAt(i);
097: }
098: parse();
099: }
100:
101: /**
102: * Extracts all links to other documents from this HTML document.
103: *
104: * @return a Vector of URLs containing the included links
105: */
106: private void parse() {
107: if (domDoc == null) {
108: parseToDOM();
109: }
110: this .links = new Vector<URL>();
111: extractLinks(domDoc.getDocumentElement(), links);
112: }
113:
114: public Vector<URL> getLinks() {
115: return this .links;
116: }
117:
118: /**
119: * Extracts all links to included images from this HTML document.
120: *
121: * @return a Vector of URLs containing the included links
122: */
123: public Vector getImageLinks() {
124: if (domDoc == null) {
125: parseToDOM();
126: }
127: Vector<URL> links = new Vector<URL>();
128: extractImageLinks(domDoc.getDocumentElement(), links);
129:
130: return links;
131: }
132:
133: /**
134: * gets all Element nodes of a given type as a Vector
135: * @param type the type of elements to return. e.g. type="a"
136: * will return all <A> tags. type must be lowercase
137: * @return a Vector containing all element nodes of the given type
138: */
139: public Vector getElements(String type) {
140: if (domDoc == null) {
141: parseToDOM();
142: }
143:
144: Vector<Element> links = new Vector<Element>();
145: extractElements(domDoc.getDocumentElement(), type, links);
146:
147: return links;
148: }
149:
150: /**
151: * Extract links from the given DOM subtree and put it into the given
152: * vector.
153: *
154: * @param element the top level DOM element of the DOM tree to parse
155: * @param links the vector that will store the links
156: */
157: protected void extractLinks(Element element, Vector<URL> links) {
158:
159: // this should not happen !
160: if (element == null) {
161: log.error("got a null element");
162: return;
163: }
164:
165: String name = element.getNodeName().toLowerCase();
166:
167: if (name.equals("a")) {
168:
169: // A HREF=
170: addLink(element.getAttribute("href"), links);
171:
172: } else if (name.equals("base")) {
173:
174: // BASE HREF=
175: try {
176: this .baseURL = new URL(element.getAttribute("href"));
177: log.info("baseUR=" + baseURL);
178: } catch (MalformedURLException e) {
179: }
180:
181: } else if (name.equals("frame")) {
182:
183: // FRAME SRC=
184: addLink(element.getAttribute("src"), links);
185:
186: // handle internal frame (iframes) as well
187: } else if (name.equals("iframe")) {
188:
189: // IFRAME SRC=
190: addLink(element.getAttribute("src"), links);
191:
192: } else if (name.equals("image")) {
193:
194: // IMAGEG SRC= (incorrect, but seems to work in some browsers :(
195: addLink(element.getAttribute("src"), links);
196:
197: } else if (name.equals("img")) {
198:
199: // IMG SRC=
200: addLink(element.getAttribute("src"), links);
201:
202: } else if (name.equals("area")) {
203:
204: // AREA HREF=
205: addLink(element.getAttribute("href"), links);
206:
207: } else if (name.equals("meta")) {
208:
209: // META HTTP-EQUIV=REFRESH
210: String equiv = element.getAttribute("http-equiv");
211: if ((equiv != null) && (equiv.equalsIgnoreCase("refresh"))) {
212: String refreshcontent = element.getAttribute("content");
213: if (refreshcontent == null) {
214: refreshcontent = "";
215: }
216:
217: StringTokenizer st = new StringTokenizer(
218: refreshcontent, ";");
219: while (st.hasMoreTokens()) {
220: String token = st.nextToken().trim();
221: AttribValuePair av = new AttribValuePair(token);
222: if (av.getAttrib().equals("url")) {
223: addLink(av.getValue(), links);
224: }
225: }
226: }
227:
228: } else if (name.equals("body")) {
229: // BODY BACKGROUND=
230: String background = element.getAttribute("background");
231: if (!(background == null) || (background.equals(""))) {
232: addLink(background, links);
233: }
234:
235: } else {
236: log.info("Ignore tag name: " + name);
237: }
238:
239: // recursive travel through all childs
240: NodeList childs = element.getChildNodes();
241:
242: for (int i = 0; i < childs.getLength(); i++) {
243: if (childs.item(i) instanceof Element) {
244: extractLinks((Element) childs.item(i), links);
245: }
246: }
247:
248: }
249:
250: /**
251: * Extract links to includes images from the given DOM subtree and
252: * put them into the given vector.
253: *
254: * @param element the top level DOM element of the DOM tree to parse
255: * @param links the vector that will store the links
256: */
257: protected void extractImageLinks(Element element, Vector<URL> links) {
258:
259: // this should not happen !
260: if (element == null) {
261: log.error("got a null element");
262: return;
263: }
264:
265: String name = element.getNodeName();
266:
267: if (name.equals("img")) {
268: // IMG SRC=
269: addLink(element.getAttribute("src"), links);
270: }
271:
272: if (name.equals("image")) {
273: // IMAGE SRC=
274: addLink(element.getAttribute("src"), links);
275: }
276:
277: // recursive travel through all childs
278: NodeList childs = element.getChildNodes();
279:
280: for (int i = 0; i < childs.getLength(); i++) {
281: if (childs.item(i) instanceof Element) {
282: extractImageLinks((Element) childs.item(i), links);
283: }
284: }
285:
286: }
287:
288: /**
289: * Extract elements from the given DOM subtree and put it into the given
290: * vector.
291: *
292: * @param element the top level DOM element of the DOM tree to parse
293: * @param type HTML tag to extract (e.g. "a", "form", "head" ...)
294: * @param elementList the vector that will store the elements
295: */
296: protected void extractElements(Element element, String type,
297: Vector<Element> elementList) {
298:
299: // this should not happen !
300: if (element == null) {
301: log.error("got a null element");
302: return;
303: }
304:
305: String name = element.getNodeName();
306:
307: if (name.equals(type)) {
308: elementList.add(element);
309: }
310:
311: // recursive travel through all childs
312: NodeList childs = element.getChildNodes();
313:
314: for (int i = 0; i < childs.getLength(); i++) {
315: if (childs.item(i) instanceof Element) {
316: extractElements((Element) childs.item(i), type,
317: elementList);
318: }
319: }
320:
321: }
322:
323: /**
324: * parses the document to a DOM tree using Tidy
325: */
326: private void parseToDOM() {
327: ByteArrayInputStream is = new ByteArrayInputStream(content);
328:
329: // set tidy parameters
330: Tidy tidy = new Tidy();
331: tidy.setUpperCaseTags(false);
332: tidy.setUpperCaseAttrs(false);
333: tidy.setErrout(new PrintWriter(System.err));
334:
335: domDoc = tidy.parseDOM(is, null);
336: }
337:
338: /**
339: * adds a links to the given vector. ignores (but logs) possible errors
340: */
341: private void addLink(String newURL, Vector<URL> links) {
342:
343: // remove part after # from the URL
344: // thanks to Johannes Christen for bug fix.
345: if ((newURL == null) || (newURL.equals("")))
346: return;
347: int pos = newURL.indexOf("#");
348: if (pos >= 0) {
349: newURL = newURL.substring(0, pos);
350: }
351:
352: if (encoding != null) {
353: try {
354: newURL = new String(newURL.getBytes(), encoding);
355: } catch (UnsupportedEncodingException e) {
356: }
357: } else {
358: try {
359: newURL = new String(newURL.getBytes(), "ISO-8859-1");
360: } catch (UnsupportedEncodingException e) {
361: }
362: }
363:
364: try {
365: URL u = null;
366: if (this .baseURL != null) {
367: u = new URL(this .baseURL, newURL);
368: } else {
369: u = new URL(url, newURL);
370: }
371: links.add(u);
372: } catch (Exception e) {
373: log.debug("error during link extraction: " + e.getMessage()
374: + " " + newURL);
375: }
376: }
377:
378: public URL getBaseURL() {
379: return baseURL;
380: }
381:
382: }
|