001: package com.quadcap.http.client;
002:
003: /* Copyright 1999 - 2003 Quadcap Software. All rights reserved.
004: *
005: * This software is distributed under the Quadcap Free Software License.
006: * This software may be used or modified for any purpose, personal or
007: * commercial. Open Source redistributions are permitted. Commercial
008: * redistribution of larger works derived from, or works which bundle
009: * this software requires a "Commercial Redistribution License"; see
010: * http://www.quadcap.com/purchase.
011: *
012: * Redistributions qualify as "Open Source" under one of the following terms:
013: *
014: * Redistributions are made at no charge beyond the reasonable cost of
015: * materials and delivery.
016: *
017: * Redistributions are accompanied by a copy of the Source Code or by an
018: * irrevocable offer to provide a copy of the Source Code for up to three
019: * years at the cost of materials and delivery. Such redistributions
020: * must allow further use, modification, and redistribution of the Source
021: * Code under substantially the same terms as this license.
022: *
023: * Redistributions of source code must retain the copyright notices as they
024: * appear in each source code file, these license terms, and the
025: * disclaimer/limitation of liability set forth as paragraph 6 below.
026: *
027: * Redistributions in binary form must reproduce this Copyright Notice,
028: * these license terms, and the disclaimer/limitation of liability set
029: * forth as paragraph 6 below, in the documentation and/or other materials
030: * provided with the distribution.
031: *
032: * The Software is provided on an "AS IS" basis. No warranty is
033: * provided that the Software is free of defects, or fit for a
034: * particular purpose.
035: *
036: * Limitation of Liability. Quadcap Software shall not be liable
037: * for any damages suffered by the Licensee or any third party resulting
038: * from use of the Software.
039: */
040:
041: import java.io.CharArrayWriter;
042: import java.io.IOException;
043: import java.io.Reader;
044:
045: import org.xml.sax.AttributeList;
046: import org.xml.sax.DocumentHandler;
047: import org.xml.sax.DTDHandler;
048: import org.xml.sax.EntityResolver;
049: import org.xml.sax.ErrorHandler;
050: import org.xml.sax.InputSource;
051: import org.xml.sax.Parser;
052: import org.xml.sax.SAXException;
053:
054: import org.xml.sax.helpers.AttributeListImpl;
055:
056: import com.quadcap.util.collections.ArrayQueue;
057:
/**
 * A SAX (version 1) parser for HTML.
 *
 * <p>The parser is a small, forgiving character-level state machine. It
 * reports start tags, end tags and character data to the registered
 * {@link DocumentHandler}:</p>
 * <ul>
 * <li>Tag names are reported exactly as written; attribute names are
 *     lower-cased. Every attribute is reported with the type
 *     <code>"string"</code>.</li>
 * <li>An attribute without a value (<code>&lt;input disabled&gt;</code>) is
 *     reported with an empty value.</li>
 * <li><code>&lt;tag/&gt;</code> produces a start event immediately followed
 *     by an end event.</li>
 * <li>Comments and other <code>&lt;!...&gt;</code> declarations are not
 *     interpreted; their raw text is delivered as character data. Markup
 *     that appears inside a <code>&lt;!-- ... --&gt;</code> comment is part
 *     of that text and does not produce element events.</li>
 * <li>Character and entity references are passed through undecoded.</li>
 * </ul>
 *
 * <p>All parse state lives in instance fields, so a single instance must not
 * be used by several threads at the same time.</p>
 *
 * @author Stan Bailes
 */
public class HtmlParser implements Parser {
    InputSource in;
    Reader r;
    DocumentHandler docHandler = null;
    DTDHandler dtdHandler = null;
    EntityResolver entityResolver = null;
    /** Scratch buffer for the tag name, attribute name or attribute value being collected. */
    CharArrayWriter tag = new CharArrayWriter();
    /** Character data collected since the last element event. */
    CharArrayWriter data = new CharArrayWriter();
    /** Attributes of the start tag currently being read. */
    AttributeListImpl attributes = new AttributeListImpl();
    String tagName = null;

    final static int TAG = 1;

    // Tokenizer states.
    private static final int ST_TEXT = 0;                // outside any tag
    private static final int ST_TAG_OPEN = 1;            // seen '<'
    private static final int ST_TAG_NAME = 5;            // collecting a start tag name
    private static final int ST_ATTRS = 6;               // between attributes / collecting a name
    private static final int ST_AFTER_ATTR_NAME = 7;     // seen name + whitespace
    private static final int ST_END_TAG = 8;             // seen "</"
    private static final int ST_TAG_SLASH = 9;           // inside a start tag, seen '/'
    private static final int ST_ATTR_EQUALS = 10;        // seen name=
    private static final int ST_ATTR_DQUOTE = 12;        // seen name="
    private static final int ST_ATTR_SQUOTE = 121;       // seen name='
    private static final int ST_ATTR_UNQUOTED = 13;      // seen name=c
    private static final int ST_ATTR_UNQUOTED_SLASH = 14; // seen name=value/

    // Comment tracking, run alongside ST_TEXT.
    private static final int CM_NONE = 0;       // not in a comment
    private static final int CM_BODY = 1;       // inside the comment text
    private static final int CM_DASH = 2;       // inside a comment, seen '-'
    private static final int CM_DASH_DASH = 3;  // seen "--"; '>' closes the comment
    private static final int CM_OPEN = 4;       // seen "<!"
    private static final int CM_OPEN_DASH = 5;  // seen "<!-"

    public HtmlParser() {
    }

    /**
     * Parses the HTML document supplied by <code>in</code>.
     *
     * <p>The character stream of the source is used when present. Otherwise
     * the byte stream is decoded with the source's declared encoding, or with
     * the platform default charset when no encoding is declared.</p>
     *
     * @param in the document to parse
     * @throws SAXException if <code>in</code> is null or offers neither a
     *         character stream nor a byte stream, or if the handler throws
     * @throws IOException if reading the input fails
     */
    public void parse(InputSource in) throws SAXException, IOException {
        if (in == null) {
            throw new SAXException("No input source supplied");
        }
        Reader reader = in.getCharacterStream();
        if (reader == null) {
            java.io.InputStream bytes = in.getByteStream();
            if (bytes == null) {
                throw new SAXException(
                    "InputSource has neither a character stream nor a byte stream");
            }
            String encoding = in.getEncoding();
            reader = (encoding != null)
                ? new java.io.InputStreamReader(bytes, encoding)
                : new java.io.InputStreamReader(bytes);
        }
        this.in = in;
        this.r = reader;
        // Discard anything left behind by a previous, possibly truncated, parse.
        tag.reset();
        data.reset();
        attributes.clear();
        parse();
    }

    /**
     * Does nothing: parsing from a system identifier is not implemented.
     * Use {@link #parse(InputSource)} instead.
     *
     * @param s the system identifier (ignored)
     */
    public void parse(String s) {
    }

    /**
     * Registers the handler that receives document events. When no handler
     * is registered, events are silently discarded.
     */
    public void setDocumentHandler(DocumentHandler dh) {
        this.docHandler = dh;
    }

    /** Stores the DTD handler; this parser never invokes it. */
    public void setDTDHandler(DTDHandler dh) {
        this.dtdHandler = dh;
    }

    /** Stores the entity resolver; this parser never invokes it. */
    public void setEntityResolver(EntityResolver er) {
        this.entityResolver = er;
    }

    /** Returns the resolver last passed to {@link #setEntityResolver}. */
    public EntityResolver getEntityResolver() {
        return entityResolver;
    }

    /** Ignored: this parser reports no errors or warnings. */
    public void setErrorHandler(ErrorHandler er) {
    }

    /** Ignored: this parser produces no localized messages. */
    public void setLocale(java.util.Locale locale) {
    }

    /**
     * Runs the tokenizer over the current reader until end of input.
     *
     * <p>Emits <code>startDocument</code>, then element and character events
     * in document order, then <code>endDocument</code>. Character data still
     * buffered at end of input is delivered before <code>endDocument</code>;
     * a tag that is cut off by end of input is dropped.</p>
     *
     * @throws SAXException if no input has been set, or if the handler throws
     * @throws IOException if reading the input fails
     */
    public void parse() throws SAXException, IOException {
        if (r == null) {
            throw new SAXException("No input: call parse(InputSource) first");
        }
        DocumentHandler handler = docHandler;
        if (handler == null) {
            // SAX 1: without a registered handler, events are ignored.
            handler = new org.xml.sax.HandlerBase();
        }

        int state = ST_TEXT;
        int commentState = CM_NONE;
        String attrName = null;

        handler.startDocument();
        for (int c = r.read(); c >= 0; c = r.read()) {
            commentState = nextCommentState(commentState, c);

            switch (state) {
            case ST_TEXT:
                // Inside a comment '<' is ordinary text, not the start of a tag.
                if (c == '<' && !insideComment(commentState)) {
                    flushText(handler);
                    state = ST_TAG_OPEN;
                } else {
                    data.write(c);
                }
                break;

            case ST_TAG_OPEN:
                switch (c) {
                case '!':
                    // Comment or declaration: pass it through as text.
                    data.write('<');
                    data.write('!');
                    commentState = CM_OPEN;
                    state = ST_TEXT;
                    break;
                case '/':
                    state = ST_END_TAG;
                    break;
                default:
                    tag.write(c);
                    state = ST_TAG_NAME;
                    break;
                }
                break;

            case ST_TAG_NAME:
                if (isSpace(c)) {
                    tagName = takeBuffer();
                    state = ST_ATTRS;
                } else if (c == '/') {
                    tagName = takeBuffer();
                    state = ST_TAG_SLASH;
                } else if (c == '>') {
                    tagName = takeBuffer();
                    startElement(handler);
                    state = ST_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case ST_ATTRS:
                if (isSpace(c)) {
                    // Whitespace after a name: wait to see whether '=' follows.
                    if (tag.size() > 0) {
                        state = ST_AFTER_ATTR_NAME;
                    }
                } else if (c == '/') {
                    state = ST_TAG_SLASH;
                } else if (c == '>') {
                    addValuelessAttribute();
                    startElement(handler);
                    state = ST_TEXT;
                } else if (c == '=') {
                    attrName = takeBuffer();
                    state = ST_ATTR_EQUALS;
                } else {
                    tag.write(c);
                }
                break;

            case ST_AFTER_ATTR_NAME:
                if (isSpace(c)) {
                    // keep skipping
                } else if (c == '=') {
                    attrName = takeBuffer();
                    state = ST_ATTR_EQUALS;
                } else if (c == '/') {
                    addValuelessAttribute();
                    state = ST_TAG_SLASH;
                } else if (c == '>') {
                    addValuelessAttribute();
                    startElement(handler);
                    state = ST_TEXT;
                } else {
                    // The previous name had no value; c starts the next name.
                    addValuelessAttribute();
                    tag.write(c);
                    state = ST_ATTRS;
                }
                break;

            case ST_END_TAG:
                if (c == '>') {
                    // "</b >" names the element "b".
                    tagName = takeBuffer().trim();
                    handler.endElement(tagName);
                    state = ST_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case ST_TAG_SLASH:
                if (c == '>') {
                    addValuelessAttribute();
                    startElement(handler);
                    handler.endElement(tagName);
                    state = ST_TEXT;
                } else if (isSpace(c)) {
                    // Stray '/' followed by whitespace: ignore the slash.
                    state = (tag.size() > 0) ? ST_AFTER_ATTR_NAME : ST_ATTRS;
                } else {
                    // The '/' was part of an attribute name after all.
                    tag.write('/');
                    tag.write(c);
                    state = ST_ATTRS;
                }
                break;

            case ST_ATTR_EQUALS:
                if (c == '"') {
                    state = ST_ATTR_DQUOTE;
                } else if (c == '\'') {
                    state = ST_ATTR_SQUOTE;
                } else if (isSpace(c)) {
                    // whitespace between '=' and the value
                } else if (c == '>') {
                    // "name=>": empty value, tag ends here.
                    addAttribute(attrName);
                    startElement(handler);
                    state = ST_TEXT;
                } else {
                    tag.write(c);
                    state = ST_ATTR_UNQUOTED;
                }
                break;

            case ST_ATTR_DQUOTE:
                if (c == '"') {
                    addAttribute(attrName);
                    state = ST_ATTRS;
                } else {
                    tag.write(c);
                }
                break;

            case ST_ATTR_SQUOTE:
                if (c == '\'') {
                    addAttribute(attrName);
                    state = ST_ATTRS;
                } else {
                    tag.write(c);
                }
                break;

            case ST_ATTR_UNQUOTED:
                if (isSpace(c)) {
                    addAttribute(attrName);
                    state = ST_ATTRS;
                } else if (c == '/') {
                    state = ST_ATTR_UNQUOTED_SLASH;
                } else if (c == '>') {
                    addAttribute(attrName);
                    startElement(handler);
                    state = ST_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case ST_ATTR_UNQUOTED_SLASH:
                if (c == '>') {
                    // "name=value/>": the trailing '/' is dropped and the
                    // element is reported as a plain start tag.
                    addAttribute(attrName);
                    startElement(handler);
                    state = ST_TEXT;
                } else {
                    // The '/' belongs to the value.
                    tag.write('/');
                    if (isSpace(c)) {
                        addAttribute(attrName);
                        state = ST_ATTRS;
                    } else if (c != '/') {
                        tag.write(c);
                        state = ST_ATTR_UNQUOTED;
                    }
                }
                break;
            }
        }

        // End of input: deliver trailing text, drop any half-read tag.
        flushText(handler);
        tag.reset();
        attributes.clear();
        handler.endDocument();
    }

    /** Advances the comment tracker by one input character. */
    private static int nextCommentState(int commentState, int c) {
        switch (commentState) {
        case CM_OPEN:
            return (c == '-') ? CM_OPEN_DASH : CM_NONE;
        case CM_OPEN_DASH:
            return (c == '-') ? CM_DASH_DASH : CM_NONE;
        case CM_BODY:
            return (c == '-') ? CM_DASH : CM_BODY;
        case CM_DASH:
            return (c == '-') ? CM_DASH_DASH : CM_BODY;
        case CM_DASH_DASH:
            if (c == '>') {
                return CM_NONE;
            }
            return (c == '-') ? CM_DASH_DASH : CM_BODY;
        default:
            return CM_NONE;
        }
    }

    /** Tells whether the tracker is between "&lt;!--" and the closing "--&gt;". */
    private static boolean insideComment(int commentState) {
        return commentState == CM_BODY
            || commentState == CM_DASH
            || commentState == CM_DASH_DASH;
    }

    /** Tells whether c is one of the whitespace characters that separate tokens in a tag. */
    private static boolean isSpace(int c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    }

    /** Returns the contents of the scratch buffer and empties it. */
    private String takeBuffer() {
        String s = tag.toString();
        tag.reset();
        return s;
    }

    /** Delivers buffered character data, if any, to the handler. */
    private void flushText(DocumentHandler handler) throws SAXException {
        if (data.size() > 0) {
            handler.characters(data.toCharArray(), 0, data.size());
            data.reset();
        }
    }

    /** Reports the current start tag and clears the attribute list for the next one. */
    private void startElement(DocumentHandler handler) throws SAXException {
        handler.startElement(tagName, attributes);
        attributes.clear();
    }

    /** Records an attribute whose value is the current scratch buffer. */
    private void addAttribute(String name) {
        // Locale.ENGLISH keeps the case mapping independent of the default locale.
        attributes.addAttribute(name.toLowerCase(java.util.Locale.ENGLISH),
                                "string", takeBuffer());
    }

    /** Records the name in the scratch buffer, if any, as an attribute with an empty value. */
    private void addValuelessAttribute() {
        if (tag.size() > 0) {
            String name = takeBuffer();
            attributes.addAttribute(name.toLowerCase(java.util.Locale.ENGLISH),
                                    "string", "");
        }
    }

}
|