001: package bplatt.spider;
002:
003: /**
004: * SimpleHTMLParser object - simple parser for HTML
005: * Copyright 2002, Robert L. Platt, All rights reserved
006: * @author Robert L. Platt
007: *
008: * This program is free software; you can redistribute it and/or modify
009: * it under the terms of the GNU General Public License as published by
010: * the Free Software Foundation; either version 2 of the License, or
011: * (at your option) any later version.
012: *
013: * This program is distributed in the hope that it will be useful,
014: * but WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016: * GNU General Public License for more details.
017: *
018: * You should have received a copy of the GNU General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
021: */
022:
023: import java.io.*;
024: import java.util.*;
025: import java.net.*;
026:
027: public abstract class SimpleHTMLParser {
028:
029: /**
030: * Constructor for HTMLParser.
031: */
032: public SimpleHTMLParser() {
033: }
034:
035: /** Parse an HTML page from an input stream
036: * Handles three types of tokens - TAG, ENDTAG,
037: * and CONTENT. Throws out any comments.
038: */
039: public void parse(Reader r) throws IOException {
040: char buf[] = new char[10];
041:
042: BufferedReader in = new BufferedReader(r);
043:
044: // Get rid of any initial (not-tag) chars.
045: while (true) {
046: read(in, buf, 1);
047: if (buf[0] == '<')
048: break;
049: }
050:
051: // Process page
052: int readahead;
053: while (true) {
054: // Process tag or comment
055: readahead = 3;
056: in.mark(readahead);
057: read(in, buf, readahead);
058: if (buf[0] == '!' && buf[1] == '-' && buf[2] == '-')
059: handle_comment(in);
060: else if (buf[0] == '/') {
061: in.reset();
062: read(in, buf, 1);
063: handle_tag(SimpleHTMLToken.ENDTAG, in);
064: } else {
065: in.reset();
066: handle_tag(SimpleHTMLToken.TAG, in);
067: }
068:
069: // determine if next char is start of new tag or content
070: readahead = 1;
071: in.mark(readahead);
072: try {
073: read(in, buf, readahead);
074: } catch (SocketTimeoutException e) {
075: throw (e);
076: } // Re-throw exception
077: catch (EOFException e) {
078: return;
079: } // EOF is OK after tag
080: catch (IOException e) {
081: throw (e);
082: } // Re-throw exception
083: if (buf[0] != '<') {
084: in.reset();
085: if (handle_content(in) == false)
086: return; // EOF is OK
087: }
088: }
089: }
090:
091: // Handle a tag
092: private void handle_tag(int type, BufferedReader in)
093: throws IOException {
094: char buf[] = new char[10];
095: StringBuffer guts = new StringBuffer();
096: while (true) {
097: read(in, buf, 1);
098: if (buf[0] == '>')
099: break;
100: guts.append(buf[0]);
101: }
102: SimpleHTMLToken token = new SimpleHTMLToken(type, guts
103: .toString());
104: if (type == SimpleHTMLToken.TAG)
105: processTag(token);
106: else
107: processEndTag(token);
108: }
109:
110: // Throw away comment
111: private void handle_comment(BufferedReader in) throws IOException {
112: char buf[] = new char[10];
113: while (true) {
114: read(in, buf, 1);
115: if (buf[0] == '-') {
116: int readahead = 2;
117: in.mark(readahead);
118: read(in, buf, readahead);
119: if (buf[0] == '-' && buf[1] == '>')
120: return;
121: else
122: in.reset();
123: }
124: }
125:
126: }
127:
128: // Handle tag content - return true if more content, false if EOF
129: private boolean handle_content(BufferedReader in)
130: throws IOException {
131: char buf[] = new char[10];
132: StringBuffer guts = new StringBuffer();
133: while (true) {
134: try {
135: read(in, buf, 1);
136: } catch (SocketTimeoutException e) {
137: throw (e);
138: } // Re-throw exception
139: catch (EOFException e) {
140: return (false);
141: } // EOF is OK after tag
142: catch (IOException e) {
143: throw (e);
144: } // Re-throw exception
145: if (buf[0] == '<')
146: break;
147: else
148: guts.append(buf[0]);
149: }
150: SimpleHTMLToken token = new SimpleHTMLToken(
151: SimpleHTMLToken.CONTENT, guts.toString());
152: processContent(token);
153: return (true);
154: }
155:
156: /** processTag - process a tag */
157: public abstract void processTag(SimpleHTMLToken token)
158: throws IOException;
159:
160: /** processEndTag - process an end tag */
161: public abstract void processEndTag(SimpleHTMLToken token)
162: throws IOException;
163:
164: /** processContent - process content */
165: public abstract void processContent(SimpleHTMLToken token)
166: throws IOException;
167:
168: /** Process a token and return the tag or null
169: * flag indicates whether tag is to be converted to lower case
170: */
171: public static String getTagType(SimpleHTMLToken token,
172: boolean lowerCaseFlag) {
173: if (token.getType() == SimpleHTMLToken.CONTENT)
174: return (null);
175: String content = token.getContent();
176: if (content == null || content.length() == 0)
177: return (null);
178: StringTokenizer tt = new StringTokenizer(content);
179: String tag = null;
180: try {
181: tag = tt.nextToken();
182: } catch (NoSuchElementException e) {
183: return (null);
184: }
185: return ((lowerCaseFlag ? tag.toLowerCase() : tag));
186: }
187:
188: // Read() - handle blocking / EOF
189: private void read(BufferedReader r, char[] buf, int nchars)
190: throws IOException {
191: int flag = 10;
192: int charsToRead = nchars;
193:
194: while (charsToRead != 0 && flag != 0) {
195: int charsRead = r.read(buf, 0, nchars);
196: if (charsRead == -1)
197: throw new EOFException(
198: "Premature EOF while parsing HTML");
199: charsToRead = charsToRead - charsRead;
200: flag--;
201: if (flag <= 5) {
202: // Wait a second
203: Thread mythread = Thread.currentThread();
204: try {
205: mythread.sleep(1000, 0);
206: } catch (InterruptedException e) { /* Ignore it */
207: }
208: }
209: }
210: if (flag == 0)
211: throw new SocketTimeoutException(
212: "Input timed-out while parsing HTML");
213: }
214: }
|