001: /*
002: * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
003: *
004: * Copyright (c) 2001 Brian Pitcher
005: *
006: * Permission is hereby granted, free of charge, to any person obtaining a
007: * copy of this software and associated documentation files (the "Software"),
008: * to deal in the Software without restriction, including without limitation
009: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
010: * and/or sell copies of the Software, and to permit persons to whom the
011: * Software is furnished to do so, subject to the following conditions:
012: *
013: * The above copyright notice and this permission notice shall be included in
014: * all copies or substantial portions of the Software.
015: *
016: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
017: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
018: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
019: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
020: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
021: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
022: * SOFTWARE.
023: */
024:
025: // $Header: /cvsroot/weblech/weblech/src/weblech/spider/HTMLParser.java,v 1.3 2002/06/09 11:02:36 weblech Exp $
026: package weblech.spider;
027:
028: import org.apache.log4j.Category;
029:
030: import java.util.List;
031: import java.util.ArrayList;
032: import java.util.HashSet;
033: import java.util.Set;
034: import java.net.URL;
035: import java.net.MalformedURLException;
036: import java.io.ByteArrayInputStream;
037: import java.io.IOException;
038: import java.io.FileWriter;
039: import java.io.PrintWriter;
040:
041: import weblech.util.Log4j;
042:
043: public class HTMLParser {
044: private final static Category _logClass = Category
045: .getInstance(URLObject.class);
046:
047: private SpiderConfig config;
048:
049: static {
050: Log4j.init();
051: }
052:
053: public HTMLParser(SpiderConfig config) {
054: this .config = config;
055: }
056:
057: public List parseLinksInDocument(URL sourceURL, String textContent) {
058: return parseAsHTML(sourceURL, textContent);
059: }
060:
061: private List parseAsHTML(URL sourceURL, String textContent) {
062: _logClass.debug("parseAsHTML()");
063: ArrayList newURLs = new ArrayList();
064: HashSet newURLSet = new HashSet();
065:
066: extractAttributesFromTags("img", "src", sourceURL, newURLs,
067: newURLSet, textContent);
068: extractAttributesFromTags("a", "href", sourceURL, newURLs,
069: newURLSet, textContent);
070: extractAttributesFromTags("body", "background", sourceURL,
071: newURLs, newURLSet, textContent);
072: extractAttributesFromTags("frame", "src", sourceURL, newURLs,
073: newURLSet, textContent);
074: extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs,
075: newURLSet, textContent);
076: extractAttributesFromTags("A", "HREF", sourceURL, newURLs,
077: newURLSet, textContent);
078: extractAttributesFromTags("BODY", "BACKGROUND", sourceURL,
079: newURLs, newURLSet, textContent);
080: extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs,
081: newURLSet, textContent);
082:
083: if (newURLs.size() == 0) {
084: _logClass
085: .debug("Got 0 new URLs from HTML parse, check HTML\n"
086: + textContent);
087: }
088: _logClass.debug("Returning " + newURLs.size()
089: + " urls extracted from page");
090: return newURLs;
091: }
092:
093: private void extractAttributesFromTags(String tag, String attr,
094: URL sourceURL, List newURLs, Set newURLSet, String input) {
095: _logClass.debug("extractAttributesFromTags(" + tag + ", "
096: + attr + ", ...)");
097:
098: int startPos = 0;
099: String startTag = "<" + tag + " ";
100: String attrStr = attr + "=\"";
101: while (true) {
102: int tagPos = input.indexOf(startTag, startPos);
103: if (tagPos < 0) {
104: return;
105: }
106: int attrPos = input.indexOf(attrStr, tagPos + 1);
107: if (attrPos < 0) {
108: startPos = tagPos + 1;
109: continue;
110: }
111: int nextClosePos = input.indexOf(">", tagPos + 1);
112: if (attrPos < nextClosePos) {
113: // Ooh, found one
114: int closeQuotePos = input.indexOf("\"", attrPos
115: + attrStr.length() + 1);
116: if (closeQuotePos > 0) {
117: String urlStr = input.substring(attrPos
118: + attrStr.length(), closeQuotePos);
119: if (urlStr.indexOf('#') != -1) {
120: urlStr = urlStr.substring(0, urlStr
121: .indexOf('#'));
122: }
123: //_logClass.debug("Found possible URL string: " + URL);
124:
125: if (isMailTo(urlStr)) {
126: logMailURL(urlStr);
127: } else {
128: try {
129:
130: URL u = new URL(sourceURL, urlStr);
131: if (newURLSet.contains(u)) {
132: //_logClass.debug("Already found URL on page: " + u);
133: } else {
134: newURLs.add(u);
135: newURLSet.add(u);
136: //_logClass.debug("Found new URL on page: " + u);
137: }
138: } catch (MalformedURLException murle) {
139: }
140: }
141: }
142: startPos = tagPos + 1;
143: continue;
144: } else {
145: startPos = tagPos + 1;
146: continue;
147: }
148: }
149: }
150:
151: private void logMailURL(String url) {
152: _logClass.debug("logMailURL()");
153:
154: try {
155: FileWriter appendedFile = new FileWriter(config
156: .getMailtoLogFile().toString(), true);
157: PrintWriter pW = new PrintWriter(appendedFile);
158: pW.println(url);
159: pW.flush();
160: pW.close();
161: } catch (IOException ioe) {
162: _logClass.warn("Caught IO exception writing mailto URL:"
163: + ioe.getMessage(), ioe);
164: }
165: }
166:
167: /**
168: * Check if a particular URL looks like it's a mailto: style link.
169: */
170: private boolean isMailTo(String url) {
171: if (url == null) {
172: return false;
173: }
174:
175: url = url.toUpperCase();
176: return (url.indexOf("MAILTO:") != -1);
177: }
178: }
|