001: package bplatt.spider;
002:
003: /**
004: * WebPageXtractor - extracts information from a WebPage
005: * passed as an input stream. Makes use of SimpleHTMLParser
006: * object. Used to use HTMLEditorKit and HTMLEditorKit.Parser.
007: * This turned out to be too buggy for this application.
008: * Cannot use XML parser as HTML does not follow stricter XML
009: * syntax rules. In fact many Web pages are a "tag salad" that
010: * don't even follow proper HTML syntax. WebPageXtractor parses
011: * a page and extracts links, images, and title(s).
012: *
013: * Copyright 2002, Robert L. Platt, All rights reserved
014: * @author Robert L. Platt
015: *
016: * This program is free software; you can redistribute it and/or modify
017: * it under the terms of the GNU General Public License as published by
018: * the Free Software Foundation; either version 2 of the License, or
019: * (at your option) any later version.
020: *
021: * This program is distributed in the hope that it will be useful,
022: * but WITHOUT ANY WARRANTY; without even the implied warranty of
023: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
024: * GNU General Public License for more details.
025: *
026: * You should have received a copy of the GNU General Public License
027: * along with this program; if not, write to the Free Software
028: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
029: */
030:
031: import java.io.*;
032: import java.util.*;
033:
034: public class WebPageXtractor extends SimpleHTMLParser {
035: private ArrayList links;
036: private ArrayList images;
037: private ArrayList title;
038: private boolean inTitle;
039:
040: /** Constructor */
041: public WebPageXtractor() {
042: super ();
043: links = new ArrayList();
044: images = new ArrayList();
045: title = new ArrayList();
046: }
047:
048: /**
049: * If we're within TITLE tags - save the title
050: * @see SimpleHTMLParser#processContent(SimpleHTMLToken)
051: */
052: public void processContent(SimpleHTMLToken token) {
053: String s = token.getContent().trim();
054: if (s != null && s.length() != 0) {
055: if (inTitle)
056: title.add(s);
057: }
058: }
059:
060: /**
061: * Look for </title> tags
062: * @see SimpleHTMLParser#processEndTag(SimpleHTMLToken)
063: */
064: public void processEndTag(SimpleHTMLToken token) throws IOException {
065: String tag = SimpleHTMLParser.getTagType(token, true);
066: if (tag == null)
067: throw new IOException("HTML parsing error");
068: else if (tag.equals("title"))
069: inTitle = false;
070: }
071:
072: /**
073: * Handle Anchor, Image, Frame, and Title tags
074: * @see SimpleHTMLParser#processTag(SimpleHTMLToken)
075: */
076: public void processTag(SimpleHTMLToken token) throws IOException {
077: String tag = SimpleHTMLParser.getTagType(token, true);
078: if (tag == null)
079: throw new IOException("HTML parsing error");
080: else if (tag.equals("a")) {
081: String link = extractHref(token.getContent());
082: if (link != null)
083: links.add(link);
084: } else if (tag.equals("img")) {
085: String image = extractSrc(token.getContent());
086: if (image != null)
087: images.add(image);
088: } else if (tag.equals("frame")) {
089: String link = extractSrc(token.getContent());
090: if (link != null)
091: links.add(link);
092: } else if (tag.equals("title"))
093: inTitle = true;
094: }
095:
096: // Utility method for extracting href attribute
097: private String extractHref(String tag) {
098: String delims = "\t\r\f\n \'\"=";
099: StringTokenizer tt = new StringTokenizer(tag, delims);
100: while (tt.hasMoreElements()) {
101: String s = tt.nextToken();
102: if (s.equalsIgnoreCase("href")) {
103: if (!tt.hasMoreElements())
104: return (null);
105: else
106: return (tt.nextToken());
107: }
108: }
109: return (null);
110: }
111:
112: // Utility method for extracting src attribute
113: private String extractSrc(String tag) {
114: String delims = "\t\r\f\n \'\"=";
115: StringTokenizer tt = new StringTokenizer(tag, delims);
116: while (tt.hasMoreElements()) {
117: String s = tt.nextToken();
118: if (s.equalsIgnoreCase("src")) {
119: if (!tt.hasMoreElements())
120: return (null);
121: else
122: return (tt.nextToken());
123: }
124: }
125: return (null);
126: }
127:
128: /**
129: * Returns the images.
130: * @return ArrayList
131: */
132: public ArrayList getImages() {
133: return images;
134: }
135:
136: /**
137: * Returns the links.
138: * @return ArrayList
139: */
140: public ArrayList getLinks() {
141: return links;
142: }
143:
144: /**
145: * Returns the title.
146: * @return ArrayList
147: */
148: public ArrayList getTitle() {
149: return title;
150: }
151: }
|