001: package bplatt.spider;
002:
003: /** PageInfo - Web Page Information object
004: * Copyright 2002, Robert L. Platt, All rights reserved
005: * @author Robert L. Platt
006: *
007: * This program is free software; you can redistribute it and/or modify
008: * it under the terms of the GNU General Public License as published by
009: * the Free Software Foundation; either version 2 of the License, or
010: * (at your option) any later version.
011: *
012: * This program is distributed in the hope that it will be useful,
013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
015: * GNU General Public License for more details.
016: *
017: * You should have received a copy of the GNU General Public License
018: * along with this program; if not, write to the Free Software
019: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
020: */
021:
022: import java.io.*;
023: import java.net.*;
024: import java.util.*;
025: import javax.swing.text.*;
026: import javax.swing.text.html.*;
027:
028: public class PageInfo {
029: private URL url;
030: private URL parentUrl;
031: private String title;
032: private URL[] links;
033: private URL[] images;
034: private boolean valid;
035: private int responseCode;
036: private String contentType;
037: private int contentLength;
038: private final static URL[] dummy = new URL[1];
039: private final static String HTML = "text/html";
040:
041: /** Constructor */
042: public PageInfo(URL url, URL parentUrl, String contentType,
043: int contentLength, int responseCode) {
044: this .url = url;
045: this .parentUrl = parentUrl;
046: this .contentType = contentType;
047: this .contentLength = contentLength;
048: this .responseCode = responseCode;
049: valid = false;
050: }
051:
052: // Accessors
053: public URL getUrl() {
054: return (url);
055: }
056:
057: public URL getParentUrl() {
058: return (parentUrl);
059: }
060:
061: public String getTitle() {
062: return (title);
063: }
064:
065: public URL[] getLinks() {
066: return (links);
067: }
068:
069: public URL[] getImages() {
070: return (images);
071: }
072:
073: public String getContentType() {
074: return (contentType);
075: }
076:
077: public boolean isValid() {
078: return (valid);
079: }
080:
081: public int getResponseCode() {
082: return responseCode;
083: }
084:
085: /** Call WebPageXtractor and process WebPage */
086: public void extract(Reader reader) throws IOException {
087: // Note: contentLength of -1 means UNKNOWN
088: if (reader == null || url == null
089: || responseCode != HttpURLConnection.HTTP_OK
090: || contentLength == 0
091: || contentType.equalsIgnoreCase(HTML) == false) {
092: valid = false;
093: return;
094: }
095: WebPageXtractor x = new WebPageXtractor();
096: try {
097: x.parse(reader);
098: } catch (EOFException e) {
099: valid = false;
100: return;
101: } catch (SocketTimeoutException e) {
102: valid = false;
103: throw (e);
104: } catch (IOException e) {
105: valid = false;
106: return;
107: }
108: ArrayList rawlinks = x.getLinks();
109: ArrayList rawimages = x.getImages();
110:
111: // Get web page title (1st title if more than one!)
112: ArrayList rawtitle = x.getTitle();
113: if (rawtitle.isEmpty())
114: title = null;
115: else
116: title = new String((String) rawtitle.get(0));
117:
118: // Get links
119: int numelem = rawlinks.size();
120: if (numelem == 0)
121: links = null;
122: else {
123: ArrayList t = new ArrayList();
124: for (int i = 0; i < numelem; ++i) {
125: String slink = (String) rawlinks.get(i);
126: try {
127: URL link = new URL(url, slink);
128: t.add(link);
129: } catch (MalformedURLException e) { /* Ignore */
130: }
131: }
132: if (t.isEmpty())
133: links = null;
134: else
135: links = (URL[]) t.toArray(dummy);
136: }
137:
138: // Get images
139: numelem = rawimages.size();
140: if (numelem == 0)
141: images = null;
142: else {
143: ArrayList t = new ArrayList();
144: for (int i = 0; i < numelem; ++i) {
145: String simage = (String) rawimages.get(i);
146: try {
147: URL image = new URL(url, simage);
148: t.add(image);
149: } catch (MalformedURLException e) {
150: }
151: }
152: if (t.isEmpty())
153: images = null;
154: else
155: images = (URL[]) t.toArray(dummy);
156: }
157:
158: // Set valid flag
159: valid = true;
160: }
161:
162: /** For debugging - dump page information */
163: public void dump() {
164: System.out.println("URL: " + url);
165: System.out.println("Parent URL: " + parentUrl);
166: System.out.println("Title: " + title);
167: if (links != null) {
168: System.out.print("Links: [");
169: for (int i = 0; i < links.length; ++i) {
170: System.out.print(links[i]);
171: if (i < (links.length - 1))
172: System.out.print(", ");
173: }
174: System.out.println("]");
175: }
176: if (images != null) {
177: System.out.print("Images: [");
178: for (int i = 0; i < images.length; ++i) {
179: System.out.print(images[i]);
180: if (i < (images.length - 1))
181: System.out.print(", ");
182: }
183: System.out.println("]");
184: }
185: System.out.println("Valid: " + valid);
186: System.out.println("Response Code: " + responseCode);
187: System.out.println("Content Type: " + contentType);
188: System.out.println("Content Length: " + contentLength);
189: }
190: }
|