001: /*
002: * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
003: *
004: * Copyright (c) 2001 Brian Pitcher
005: *
006: * Permission is hereby granted, free of charge, to any person obtaining a
007: * copy of this software and associated documentation files (the "Software"),
008: * to deal in the Software without restriction, including without limitation
009: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
010: * and/or sell copies of the Software, and to permit persons to whom the
011: * Software is furnished to do so, subject to the following conditions:
012: *
013: * The above copyright notice and this permission notice shall be included in
014: * all copies or substantial portions of the Software.
015: *
016: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
017: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
018: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
019: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
020: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
021: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
022: * SOFTWARE.
023: */
024:
025: // $Header: /cvsroot/weblech/weblech/src/weblech/spider/URLObject.java,v 1.3 2002/06/02 08:00:48 weblech Exp $
026: package weblech.spider;
027:
028: import org.apache.log4j.Category;
029:
030: import java.io.*;
031: import java.net.URL;
032: import java.net.URLEncoder;
033:
034: import weblech.util.Log4j;
035:
036: public class URLObject {
037: private final static Category _logClass = Category
038: .getInstance(URLObject.class);
039:
040: static {
041: Log4j.init();
042: }
043:
044: private final URL sourceURL;
045: private final String contentType;
046: private final byte[] content;
047:
048: private final SpiderConfig config;
049:
050: public URLObject(URL sourceURL, String contentType, byte[] content,
051: SpiderConfig config) {
052: this .sourceURL = sourceURL;
053: this .contentType = contentType;
054: this .content = content;
055: this .config = config;
056: }
057:
058: public URLObject(URL sourceURL, SpiderConfig config) {
059: this .sourceURL = sourceURL;
060: this .config = config;
061:
062: String s = sourceURL.toExternalForm().toLowerCase();
063: if (s.indexOf(".jpg") != -1) {
064: contentType = "image/jpeg";
065: } else if (s.indexOf(".gif") != -1) {
066: contentType = "image/gif";
067: } else {
068: contentType = "text/html";
069: }
070:
071: if (existsOnDisk()) {
072:
073: File f = new File(convertToFileName());
074: if (f.isDirectory()) {
075: f = new File(f, "index.html");
076: }
077: content = new byte[(int) f.length()];
078: try {
079: FileInputStream in = new FileInputStream(f);
080: in.read(content);
081: in.close();
082: } catch (IOException ioe) {
083: _logClass.warn(
084: "IO Exception reading disk version of URL "
085: + sourceURL, ioe);
086: }
087: } else {
088: content = new byte[0];
089: }
090: }
091:
092: public String getContentType() {
093: return contentType;
094: }
095:
096: public boolean isHTML() {
097: return contentType.toLowerCase().startsWith("text/html");
098: }
099:
100: public boolean isXML() {
101: return contentType.toLowerCase().startsWith("text/xml");
102: }
103:
104: public boolean isImage() {
105: return contentType.startsWith("image/");
106: }
107:
108: public String getStringContent() {
109: return new String(content);
110: }
111:
112: private String convertToFileName() {
113: String url = sourceURL.toExternalForm();
114: int httpIdx = url.indexOf("http://");
115: if (httpIdx == 0) {
116: url = url.substring(7);
117: }
118: // Check for at least one slash -- otherwise host name (e.g. sourceforge.net)
119: if (url.indexOf("/") < 0) {
120: url = url + "/";
121: }
122: // If trailing slash, add index.html as default
123: if (url.endsWith("/")) {
124: url = url + "index.html";
125: }
126: url = textReplace("?", URLEncoder.encode("?"), url);
127: url = textReplace("&", URLEncoder.encode("&"), url);
128: return config.getSaveRootDirectory().getPath() + "/" + url;
129: }
130:
131: public boolean existsOnDisk() {
132: File f = new File(convertToFileName());
133: return (f.exists() && !f.isDirectory());
134: }
135:
136: public void writeToFile() {
137: writeToFile(convertToFileName());
138: }
139:
140: public void writeToFile(String fileName) {
141: _logClass.debug("writeToFile(" + fileName + ")");
142: try {
143: File f = new File(fileName);
144: f.getParentFile().mkdirs();
145: FileOutputStream out = new FileOutputStream(fileName);
146: out.write(content);
147: out.flush();
148: out.close();
149: } catch (IOException ioe) {
150: _logClass.warn("IO Exception writing to " + fileName, ioe);
151: }
152: }
153:
154: public String toString() {
155: StringBuffer sb = new StringBuffer();
156: sb.append("URLObject: ");
157: sb.append(contentType);
158: if (false)//isHTML() || isXML())
159: {
160: sb.append("\n");
161: sb.append(getStringContent());
162: }
163: return sb.toString();
164: }
165:
166: private String textReplace(String find, String replace, String input) {
167: int startPos = 0;
168: while (true) {
169: int textPos = input.indexOf(find, startPos);
170: if (textPos < 0) {
171: break;
172: }
173: input = input.substring(0, textPos) + replace
174: + input.substring(textPos + find.length());
175: startPos = textPos + replace.length();
176: }
177: return input;
178: }
179: }
|