001: package net.matuschek.http;
002:
003: /************************************************
004: Copyright (c) 2001/2002 by Daniel Matuschek
005: *************************************************/
006:
007: import java.io.BufferedOutputStream;
008: import java.io.File;
009: import java.io.FileInputStream;
010: import java.io.FileOutputStream;
011: import java.io.IOException;
012: import java.net.URL;
013: import java.util.StringTokenizer;
014:
015: import org.apache.log4j.Category;
016:
017: /**
018: * DocumentManager that will store document contents in a file.
019: *
020: * @author Daniel Matuschek
021: * @version $Revision: 1.11 $
022: */
023: public class HttpDocToFile extends AbstractHttpDocManager {
024: /**
025: * directory where the files will be created
026: */
027: private String baseDir;
028:
029: /**
030: * the object will not store files smaller then this size !
031: */
032: private int minFileSize;
033:
034: /**
035: * defines if special characters in the URL should be replaced
036: * by "normal" characters
037: * @see #setReplaceAllSpecials(boolean)
038: */
039: private boolean replaceAllSpecials = false;
040:
041: /**
042: * defines, if CGIs should be stored on disc.
043: *
044: * @see #setStoreCGI
045: */
046: private boolean storeCGI = true;
047:
048: /** Log4J logging */
049: private Category log;
050:
051: /**
052: * creates a new HttpDocToFile object that will store the
053: * documents in the given directory
054: */
055: public HttpDocToFile(String baseDir) {
056: this .baseDir = baseDir;
057: log = Category.getInstance(getClass().getName());
058: }
059:
060: /**
061: * store document (that means write it to disk)
062: * @param doc the document to store
063: * @exception DocManagerException if the document can't be stored
064: * (some IO error occured)
065: */
066: public void storeDocument(HttpDoc doc) throws DocManagerException {
067: if ((doc == null) || (doc.getContent() == null)) {
068: return;
069: }
070:
071: /*
072: * write file only, if this was NOT a cached document
073: * (in this case we have it already on harddisk)
074: */
075: if (doc.isCached()) {
076: return;
077: }
078:
079: if ((!storeCGI) && (doc.getURL().toString().indexOf('?') >= 0)) {
080: // do not store dynamic pages, because storeCGI is false
081: // and the URL contains a "?"
082: return;
083: }
084:
085: String filename = url2Filename(doc.getURL());
086: if (doc.getContent().length >= minFileSize) {
087: try {
088: createDirs(filename);
089: BufferedOutputStream os = new BufferedOutputStream(
090: new FileOutputStream(filename));
091: os.write(doc.getContent());
092: os.flush();
093: os.close();
094: } catch (IOException e) {
095: throw new DocManagerException(e.getMessage());
096: }
097: }
098: }
099:
100: /**
101: * Gets the cacheFile of the given URL if its document was stored.
102: * @param url
103: * @return cacheFile
104: */
105: protected File getCacheFile(URL url) {
106: // does the file exists on the filesystem ?
107: File cacheFile = new File(url2Filename(url));
108: if (!(cacheFile.exists() && (cacheFile.isFile()))) {
109: return null;
110: }
111: return cacheFile;
112: }
113:
114: /**
115: * Gets the extension of the given URL if its document was stored.
116: * @param url
117: * @return String
118: */
119: protected String getExtension(URL url) {
120: // is it dynamic ?
121: if ((url.toString().indexOf('?') >= 0)
122: || (url.toString().indexOf("cgi") >= 0)) {
123: return null;
124: }
125:
126: // do we have an filename extension ?
127: // without it is not possible to guess the MIME type.
128: String path = url.getPath();
129: String ext = null;
130:
131: if (path.indexOf(".") < 0) {
132: return null;
133: }
134:
135: StringTokenizer st = new StringTokenizer(path, ".");
136: while (st.hasMoreTokens()) {
137: ext = st.nextToken();
138: }
139: // no extension if ext contains a "/"
140: if (ext.indexOf("/") >= 0) {
141: return null;
142: }
143:
144: return ext;
145: }
146:
147: /**
148: * Removes a document that was stored previous from the file system. Because
149: * the HttpDocToFile does not store the HTTP headers, only the Content-Type
150: * header will exists. Even this header may not be correct. It will only use a
151: * simple heuristic to determine the possible MIME type.
152: */
153: public void removeDocument(URL u) {
154: String ext = getExtension(u);
155: if (ext == null)
156: return;
157: File cacheFile = getCacheFile(u);
158: if (cacheFile == null)
159: return;
160:
161: cacheFile.delete();
162: }
163:
164: /**
165: * Gets a document that was stored previous from the file system.
166: * Because the HttpDocToFile does not store the HTTP headers, only
167: * the Content-Type header will exists. Even this header may not
168: * be correct. It will only use a simple heuristic to determine the
169: * possible MIME type.
170: *
171: * @return null, if this document was not stored before or it seems
172: * to be a dynamic document.
173: */
174: public HttpDoc retrieveFromCache(URL u) {
175: String ext = getExtension(u);
176: if (ext == null)
177: return null;
178: File cacheFile = getCacheFile(u);
179: if (cacheFile == null)
180: return null;
181:
182: // create a buffer;
183: long size = cacheFile.length();
184: if (size > Integer.MAX_VALUE) {
185: log.info("File too large");
186: return null;
187: }
188:
189: byte[] buff = new byte[(int) size];
190:
191: // read the file
192: try {
193: FileInputStream fi = new FileInputStream(cacheFile);
194: fi.read(buff);
195: } catch (IOException e) {
196: log
197: .info("Could not read cached document "
198: + e.getMessage());
199: return null;
200: }
201:
202: // create a new HttpDoc object
203: HttpDoc doc = new HttpDoc();
204:
205: // and set the content and the header
206: doc.setHttpCode("HTTP/1.0 200 OK");
207: doc.setContent(buff);
208:
209: // now guess the MIME type
210: String mimetype = null;
211:
212: if (ext.equals("html") || ext.equals("htm")
213: || ext.equals("shtml") || ext.equals("asp")
214: || ext.equals("php") || ext.equals("jsp")) {
215: mimetype = "text/html";
216: } else {
217: mimetype = "application/unknown";
218: }
219:
220: doc.addHeader(new HttpHeader("Content-Type", mimetype));
221: doc.setURL(u);
222: doc.setCached(true);
223:
224: return doc;
225: }
226:
227: /**
228: * gets the value of baseDir
229: * @return the value of baseDir
230: */
231: public String getBaseDir() {
232: return baseDir;
233: }
234:
235: /**
236: * sets the value of basedir
237: * @param baseDir the new value of baseDir
238: */
239: public void setBaseDir(String baseDir) {
240: this .baseDir = baseDir;
241: }
242:
243: /**
244: * converts an URL to a filename http://host/path will
245: * be converted to basedir/host/path
246: * @param URL a URL to convert, must not be null
247: * @return a pathname
248: */
249: protected String url2Filename(URL u) {
250: StringBuffer sb = new StringBuffer();
251:
252: sb.append(baseDir);
253: sb.append(File.separatorChar);
254: sb.append(u.getHost());
255: sb.append(u.getFile());
256:
257: // is there a query part ?
258: // that is something after the file name seperated by ?
259: String query = u.getQuery();
260: if ((query != null) && (!query.equals(""))) {
261: sb.append(File.separatorChar);
262: sb.append(query);
263: }
264:
265: // filename that ends with /
266: // are directories, we will name the file "index.html"
267: if (sb.charAt(sb.length() - 1) == '/') {
268: sb.append("index.html");
269: }
270:
271: // postprocess filename (replace special characters)
272: for (int i = 0; i < sb.length(); i++) {
273: char c = sb.charAt(i);
274: char newc = (char) 0;
275:
276: // replace / by operating system file name separator
277: if (c == '/') {
278: newc = File.separatorChar;
279: }
280:
281: // replace special characters from CGIs
282: if (replaceAllSpecials) {
283: if ((c == '?') || (c == '=') || (c == '&')) {
284: newc = '-';
285: }
286: }
287:
288: if ((newc != (char) 0) && (newc != c)) {
289: sb.setCharAt(i, newc);
290: }
291: }
292:
293: return sb.toString();
294: }
295:
296: /**
297: * creates all directories that are needed to place the
298: * file filename if they don't exists
299: * @param filename the full path name of a file
300: */
301: protected void createDirs(String filename) throws IOException {
302: int pos = -1;
303: // look for the last directory separator in the filename
304: for (int i = filename.length() - 1; i >= 0; i--) {
305: if (filename.charAt(i) == File.separatorChar) {
306: pos = i;
307: i = -1;
308: }
309: }
310: File dir = new File(filename.substring(0, pos));
311: dir.mkdirs();
312: }
313:
314: /**
315: * gets the value of minFileSize. Files smaller then this size
316: * (in Bytes) will not be saved to disk !
317: * @return the value of minFileSize
318: */
319: public int getMinFileSize() {
320: return minFileSize;
321: }
322:
323: /**
324: * sets the value of minFileSize
325: * @param minFileSize the new value of minFileSize
326: * @see #getMinFileSize()
327: */
328: public void setMinFileSize(int minFileSize) {
329: this .minFileSize = minFileSize;
330: }
331:
332: /**
333: * Get the value of replaceAllSpecials.
334: *
335: * if replaceAllSpecials is true, all sepcial characters in the URL
336: * will be replaced by "-". This is useful for operating system that
337: * can't handle files with special characters in the filename (e.g.
338: * Windows)
339: *
340: * @return value of replaceAllSpecials.
341: */
342: public boolean isReplaceAllSpecials() {
343: return replaceAllSpecials;
344: }
345:
346: /**
347: * Set the value of replaceAllSpecials.
348: *
349: * if replaceAllSpecials is true, all sepcial characters in the URL
350: * will be replaced by "-". This is useful for operating system that
351: * can't handle files with special characters in the filename (e.g.
352: * Windows)
353: *
354: * @param v Value to assign to replaceAllSpecials.
355: */
356: public void setReplaceAllSpecials(boolean v) {
357: this .replaceAllSpecials = v;
358: }
359:
360: /**
361: * Get the value of storeCGI
362: *
363: * If this is true, the object will store ALL retrieved documents,
364: * otherwise it will store only documents from URLs that do not
365: * have a "?" in the URL
366: */
367: public boolean getStoreCGI() {
368: return storeCGI;
369: }
370:
371: /**
372: * Set the value of storeCGI.
373: *
374: * If this is true, the object will store ALL retrieved documents,
375: * otherwise it will store only documents from URLs that do not
376: * have a "?" in the URL
377: *
378: * @param v Value to assign to storeCGI.
379: */
380: public void setStoreCGI(boolean v) {
381: this.storeCGI = v;
382: }
383:
384: }
|