001: //////////////////////////////////////////////////////////////////////////////
002: // Copyright (c) Insiders Wissensbasierte Systeme GmbH, Germany
003: //////////////////////////////////////////////////////////////////////////////
004:
005: package net.matuschek.http;
006:
007: import java.io.*;
008: import java.net.*;
009: import java.util.*;
010: import java.util.zip.ZipEntry;
011: import java.util.zip.ZipFile;
012: import java.util.zip.ZipOutputStream;
013:
014: import net.matuschek.util.MD5;
015: import org.apache.log4j.Category;
016:
017: /**
018: * Full implementation of HttpDocManager interface.
019: * Caches documents, links and headers in ZIP-files.
020: * Documents with same content will be detected
021: * and share the same content-storage.
022: *
023: * @author Oliver Schmidt
024: * @version $Revision: 1.2 $
025: */
026: public class HttpDocCache implements HttpDocManager {
027:
028: /** internally used header name to mark duplicates */
029: protected final static String CONTENT_DUPLICATE = "Content-Duplicate";
030:
031: /** use MD5 encoding for filenames */
032: public boolean useMD5 = true;
033:
034: /** log4j logging instance */
035: protected static Category log = Category
036: .getInstance(HttpDocCache.class.getName());
037:
038: /** collection of visited URLs */
039: private Collection urls = new LinkedList();
040:
041: /** storage main directory */
042: protected String storagedir;
043:
044: /** file that holds directory information */
045: protected File storageDirectoryFile = null;
046:
047: /** subdirectory name for links */
048: protected final static String LINKS = "links" + File.separator;
049:
050: /** subdirectory name for content */
051: protected final static String CONTENT = "content" + File.separator;
052:
053: /** subdirectory name for document information */
054: protected final static String DOCUMENTS = "documents"
055: + File.separator;
056:
/**
 * Creates a cache below the given directory. The directory layout is
 * created immediately if it does not exist yet.
 *
 * @param storageDirectory main directory of the cache
 */
public HttpDocCache(String storageDirectory) {
    setStorageDir(storageDirectory);
}

/**
 * Append-mode stream to the directory information file;
 * stays null if the file could not be opened.
 */
private FileOutputStream storageDirectoryStream = null;

/**
 * Sets the storage directory and creates the "documents" and "content"
 * subdirectories if necessary. When MD5 filenames are in use, the
 * directory information file "directory.csv" is opened for appending and
 * a header line is written if the file is new.
 *
 * @param newStoragedir main directory of the cache
 */
private void setStorageDir(String newStoragedir) {
    storagedir = newStoragedir;
    if (!storagedir.endsWith(File.separator)) {
        storagedir = storagedir + File.separator;
    }

    // create the directories, if they do not exist yet.
    File documentsDir = new File(storagedir + DOCUMENTS);
    if (!documentsDir.exists()) {
        documentsDir.mkdirs();
    }
    File contentDir = new File(storagedir + CONTENT);
    if (!contentDir.exists()) {
        contentDir.mkdirs();
    }

    if (useMD5) {
        storageDirectoryFile = new File(storagedir + "directory.csv");
        // Must be decided BEFORE the stream is opened: opening a
        // FileOutputStream in append mode creates the file, so a later
        // exists() check would always be true and the header would
        // never be written.
        boolean needsHeader = !storageDirectoryFile.exists()
            || storageDirectoryFile.length() == 0;
        try {
            storageDirectoryStream = new FileOutputStream(
                storageDirectoryFile.getPath(), true);
            if (needsHeader) {
                storageDirectoryStream.write(("Path,URL" + LF).getBytes());
            }
        } catch (Exception e) {
            // keep the stack trace; the cache still works without the listing
            log.error(e.getMessage(), e);
        }
    }
}

/** Quote character used in the directory information file. */
static final String QUOTE = "\"";

/** Platform line separator used in all text files written by the cache. */
static final String LF = System.getProperty("line.separator");
106:
107: /**
108: * Method store.
109: * stores the document to the storage directory
110: * @param doc the document to be stored
111: * @param links to be stored (optional)
112: * @return String
113: * @throws DocManagerException if the document cannot be written to the directory
114: */
115: public void storeDocument(HttpDoc doc) throws DocManagerException {
116: List links = doc.getLinks();
117:
118: // donīt store cached documents
119: if (doc.isCached()) {
120: return;
121: }
122:
123: // get the content type
124: String filename = generateFilename(doc.getURL()
125: .toExternalForm());
126:
127: String filepath = storagedir + DOCUMENTS + filename;
128: checkStoragePathFor(DOCUMENTS, filename);
129:
130: try {
131: File f = new File(filepath + ".zip");
132: if (!f.exists()) {
133: writeDirectoryInfo(doc, filename);
134: }
135:
136: // write it to the file
137: OutputStream fs = new BufferedOutputStream(
138: new FileOutputStream(f));
139: ZipOutputStream zos = new ZipOutputStream(fs);
140: zos.setLevel(9);
141:
142: try {
143: // writeContentToZipFile(doc, zos);
144: storeContent(doc);
145: writeHeadersToZipFile(doc, zos);
146: writeUrlToZipFile(doc, zos);
147: if (links != null) {
148: writeLinksToZipFile(links, zos);
149: }
150: } catch (Throwable e) {
151: System.out.println(e);
152: } finally {
153: zos.close();
154: fs.close();
155: long date = doc.getDateAsMilliSeconds();
156: f.setLastModified(date > 0 ? date : System
157: .currentTimeMillis());
158: }
159: } catch (IOException ioex) {
160: DocManagerException ex = new DocManagerException(ioex
161: .getMessage());
162: throw ex;
163: }
164: }
165:
166: /**
167: * Write Directory info.
168: * @param doc
169: * @param filename in cache
170: * @throws IOException
171: */
172: protected void writeDirectoryInfo(HttpDoc doc, String filename)
173: throws IOException {
174: if (storageDirectoryFile != null) {
175: synchronized (storageDirectoryFile) {
176: try {
177: String directoryInfo = QUOTE + filename + QUOTE
178: + "," + QUOTE + doc.getURL() + QUOTE + LF;
179: storageDirectoryStream.write(directoryInfo
180: .getBytes());
181: } catch (Exception e) {
182: log.warn(e.getMessage());
183: storageDirectoryStream.close();
184: }
185: }
186: }
187: }
188:
189: /**
190: * Write content to zipFile
191: * @param doc
192: * @param zos
193: * @throws IOException
194: */
195: protected void writeContentToZipFile(HttpDoc doc,
196: ZipOutputStream zos) throws IOException {
197: String contenttype = doc
198: .getHeaderValue(HttpHeader.CONTENT_TYPE);
199: String extension = getExtensionFromContenttype(contenttype);
200: ZipEntry zipEntry = new ZipEntry("content" + extension);
201: long date = doc.getLastModifiedAsMilliSeconds();
202: if (date < 0) {
203: date = doc.getDateAsMilliSeconds();
204: }
205: zipEntry.setTime(date);
206: zos.putNextEntry(zipEntry);
207: zos.write(doc.getContent());
208: zos.closeEntry();
209: }
210:
211: /**
212: * Write headers to zipFile.
213: * @param doc
214: * @param zos
215: * @return ZipEntry
216: * @throws IOException
217: */
218: protected ZipEntry writeHeadersToZipFile(HttpDoc doc,
219: ZipOutputStream zos) throws IOException {
220: StringBuffer comment = new StringBuffer();
221: Vector headers = doc.getHttpHeader();
222: for (Iterator iter = headers.iterator(); iter.hasNext();) {
223: HttpHeader header = (HttpHeader) iter.next();
224: if (!header.getName().equals(CONTENT_DUPLICATE)) {
225: comment.append(header.toString());
226: if (iter.hasNext()) {
227: comment.append(LF);
228: }
229: }
230: }
231: ZipEntry ze = new ZipEntry("header");
232: zos.putNextEntry(ze);
233: zos.write(comment.toString().getBytes());
234: long date = doc.getDateAsMilliSeconds();
235: ze.setTime(date > 0 ? date : System.currentTimeMillis());
236: zos.closeEntry();
237: return ze;
238: }
239:
240: /**
241: * Read headers from ZipFile
242: * @param doc
243: * @param zf
244: * @return boolean
245: * @throws IOException
246: */
247: protected boolean readHeadersFromZipFile(HttpDoc doc, ZipFile zf)
248: throws IOException {
249: ZipEntry ze = zf.getEntry("header");
250: if (ze != null) {
251: InputStream is = zf.getInputStream(ze);
252: BufferedReader reader = new BufferedReader(
253: new InputStreamReader(is));
254: while (reader.ready()) {
255: String line = reader.readLine();
256: int pos = line.indexOf(": ");
257: if (pos >= 0) {
258: String name = line.substring(0, pos);
259: String value = line.substring(pos + 2);
260: HttpHeader header = new HttpHeader(name, value);
261: doc.addHeader(header);
262: }
263: }
264: reader.close();
265: return true;
266: }
267: return false;
268: }
269:
270: /**
271: * Read links from ZipFile
272: * @param doc
273: * @param zf
274: * @return boolean
275: * @throws IOException
276: */
277: protected boolean readLinksFromZipFile(HttpDoc doc, ZipFile zf)
278: throws IOException {
279: ZipEntry ze = zf.getEntry("links");
280: List links = doc.getLinks();
281: if (links == null) {
282: links = new Vector();
283: doc.setLinks(links);
284: } else {
285: links.clear();
286: }
287:
288: if (ze != null) {
289: InputStream is = zf.getInputStream(ze);
290: BufferedReader reader = new BufferedReader(
291: new InputStreamReader(is));
292: while (reader.ready()) {
293: String line = reader.readLine();
294: if (line != null) {
295: URL url = new URL(line);
296: links.add(url);
297: }
298: }
299: reader.close();
300: return true;
301: }
302: return false;
303: }
304:
305: /**
306: * Write Url to ZipFile.
307: * @param doc
308: * @param zos
309: * @return ZipEntry
310: * @throws IOException
311: */
312: protected ZipEntry writeUrlToZipFile(HttpDoc doc,
313: ZipOutputStream zos) throws IOException {
314: String url = doc.getURL().toString();
315: ZipEntry ze = new ZipEntry("url");
316: zos.putNextEntry(ze);
317: zos.write(url.getBytes());
318: long date = doc.getDateAsMilliSeconds();
319: ze.setTime(date > 0 ? date : System.currentTimeMillis());
320: zos.closeEntry();
321: return ze;
322: }
323:
324: /**
325: * Get File of document content users.
326: * @param doc
327: * @return File
328: */
329: private File getContentUsersFile(HttpDoc doc) {
330: File f = null;
331: byte[] content = doc.getContent();
332: if (content.length != 0) {
333: String md5 = doc.getContentMD5();
334: f = contentFile(md5, ".txt");
335: }
336: return f;
337: }
338:
339: /**
340: * Returns URL-String of duplicate content (if found).
341: * @see net.matuschek.http.HttpDocManager#findDuplicate(HttpDoc)
342: */
343: public String findDuplicate(HttpDoc doc) throws IOException {
344: String duplicate = null;
345: File f = getContentUsersFile(doc);
346: if (f != null) {
347: String urlString = doc.getURL().toString();
348: if (f.exists()) {
349: BufferedReader reader = new BufferedReader(
350: new InputStreamReader(new FileInputStream(f)));
351: while (reader.ready()) {
352: String line = reader.readLine();
353: if (line.equals(urlString)) {
354: break;
355: } else if (duplicate == null) {
356: duplicate = line;
357: }
358: }
359: reader.close();
360: }
361: }
362: return duplicate;
363: }
364:
365: /**
366: * Creates a file with a name created by the content, containing the URL.
367: * @param doc
368: */
369: protected void storeContent(HttpDoc doc) throws IOException {
370: if (doc.getContent().length == 0)
371: return;
372: File f = getContentUsersFile(doc);
373: String urlString = doc.getURL().toString();
374: String md5 = doc.getContentMD5();
375:
376: // is content user?
377: boolean found = false;
378: if (f.exists()) {
379: BufferedReader reader = new BufferedReader(
380: new InputStreamReader(new FileInputStream(f)));
381: try {
382: while (reader.ready()) {
383: String line = reader.readLine();
384: if (line.equals(urlString)) {
385: found = true;
386: break;
387: }
388: }
389: } finally {
390: reader.close();
391: }
392: }
393:
394: // write content
395: File fzip = contentFile(md5, ".zip");
396: if (!fzip.exists()) {
397: checkStoragePathFor(CONTENT,
398: useFirstCharactersAsDirectories(md5));
399: OutputStream fs = new BufferedOutputStream(
400: new FileOutputStream(fzip));
401: ZipOutputStream zos = null;
402: try {
403: zos = new ZipOutputStream(fs);
404: zos.setLevel(9);
405: writeContentToZipFile(doc, zos);
406: } finally {
407: if (zos != null) {
408: zos.close();
409: } else {
410: fs.close();
411: }
412: }
413: } else {
414: fzip.setLastModified(System.currentTimeMillis());
415: }
416:
417: // append user
418: if (!found) {
419: FileOutputStream os = new FileOutputStream(f.getPath(),
420: true);
421: try {
422: os.write((urlString + LF).getBytes());
423: } finally {
424: os.close();
425: }
426: }
427: }
428:
429: /**
430: * Write links to ZipFile.
431: * @param links
432: * @param ZipOutputStream
433: */
434: protected void writeLinksToZipFile(List links, ZipOutputStream zs)
435: throws IOException {
436: HashSet storedLinks = new HashSet();
437: ZipEntry zipEntry = new ZipEntry("links");
438: zs.putNextEntry(zipEntry);
439: for (Iterator iter = links.iterator(); iter.hasNext();) {
440: URL url = (URL) iter.next();
441: if (!storedLinks.contains(url)) {
442: zs.write((url.toString() + LF).getBytes());
443: storedLinks.add(url);
444: }
445: }
446: zs.closeEntry();
447: }
448:
449: /**
450: * Collects Urls (duplicates will be skipped).
451: *
452: * @param doc a HttpDoc object to process. This may also be null
453: * @exception DocManagerException will be thrown if an error occurs
454: * while processing the document.
455: * @see net.matuschek.http.HttpDocManager#processDocument(net.matuschek.http.HttpDoc)
456: */
457: public void processDocument(HttpDoc doc) throws DocManagerException {
458: log.info("Processing " + doc.getURL().toExternalForm()
459: + doc.getHttpHeader());
460:
461: // collect URL (only if content is no duplicate)
462: HttpHeader duplicate = doc.getHeader(CONTENT_DUPLICATE);
463: if (duplicate == null) {
464: urls.add(doc.getURL());
465: }
466: }
467:
468: /**
469: * retrieves a document from the cache.
470: * @param url
471: * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
472: */
473: public HttpDoc retrieveFromCache(java.net.URL url) {
474: HttpDoc doc = null;
475: File f = null;
476: try {
477: String filename0 = url.toExternalForm();
478: String filename = generateFilename(filename0) + ".zip";
479: f = new File(storagedir + DOCUMENTS + filename);
480:
481: if (f.exists()) {
482: log.info("retrieve " + f);
483:
484: // create document and read it from file
485: doc = new HttpDoc();
486: doc.setURL(url);
487: ZipFile zf = new ZipFile(f);
488:
489: // read headers
490: readHeadersFromZipFile(doc, zf);
491:
492: // read links
493: readLinksFromZipFile(doc, zf);
494:
495: doc.setCached(true);
496:
497: // read content
498: String md5 = doc.getContentMD5();
499: File contentFile = contentFile(md5, ".zip");
500: if (contentFile.exists()) {
501: ZipFile contentZip = new ZipFile(contentFile);
502: readContentFromZipFile(doc, contentZip);
503: contentZip.close();
504: } else {
505: doc.setContent(new byte[0]);
506: }
507: zf.close();
508: }
509: } catch (Exception e) {
510: log.warn("removing invalid file " + f);
511: f.delete();
512: doc = null;
513: }
514:
515: return doc;
516: }
517:
518: /**
519: * Read content from ZipFile
520: * @param doc
521: * @param contentZip
522: * @throws IOException
523: */
524: protected void readContentFromZipFile(HttpDoc doc,
525: ZipFile contentZip) throws IOException {
526: byte[] content = null;
527: for (Enumeration enumeration = contentZip.entries(); enumeration
528: .hasMoreElements();) {
529: ZipEntry zipEntry = (ZipEntry) enumeration.nextElement();
530: if (zipEntry.getName().startsWith("content")) {
531: InputStream is = contentZip.getInputStream(zipEntry);
532: int length = (int) zipEntry.getSize();
533: content = new byte[length];
534: int startPos = 0;
535: while (startPos < length) {
536: startPos += is.read(content, startPos, length
537: - startPos);
538: }
539: is.close();
540: break;
541: }
542: }
543: doc.setContent(content);
544: }
545:
546: /**
547: * Remove document from cache.
548: * @param url
549: * @see net.matuschek.http.HttpDocManager#removeDocument(URL)
550: */
551: public void removeDocument(URL url) {
552: HttpDoc doc = retrieveFromCache(url);
553:
554: File f = null;
555: try {
556: String filename0 = url.toExternalForm();
557: String filename = generateFilename(filename0) + ".zip";
558:
559: f = new File(storagedir + LINKS + filename);
560: if (f.exists()) {
561: f.delete();
562: }
563:
564: deleteContent(doc);
565: f = new File(storagedir + DOCUMENTS + filename);
566: if (f.exists()) {
567: f.delete();
568: }
569: } catch (Exception ex) {
570: log.error(ex);
571: }
572: }
573:
574: /**
575: * Deletes stored content for the given document
576: * @param document
577: */
578: private void deleteContent(HttpDoc doc) throws IOException {
579: byte[] content = doc.getContent();
580: if (content.length == 0) {
581: return;
582: }
583: String urlString = doc.getURL().toString();
584: String md5 = doc.getContentMD5();
585: File f = contentFile(md5, ".txt");
586: ArrayList entries = new ArrayList();
587: if (f.exists()) {
588: BufferedReader reader = new BufferedReader(
589: new InputStreamReader(new FileInputStream(f)));
590: while (reader.ready()) {
591: String line = reader.readLine();
592: if (!line.equals(urlString)) {
593: entries.add(line);
594: }
595: }
596: reader.close();
597: }
598: if (entries.size() > 0) {
599: FileOutputStream os = new FileOutputStream(f.getPath(),
600: false);
601: for (Iterator iter = entries.iterator(); iter.hasNext();) {
602: String line = (String) iter.next();
603: os.write((line + LF).getBytes());
604: }
605: os.close();
606: } else {
607: f.delete();
608: File fzip = contentFile(md5, ".zip");
609: if (fzip.exists()) {
610: fzip.delete();
611: }
612: }
613: }
614:
615: /**
616: * List collected URLs.
617: * @see java.lang.Object#toString()
618: */
619: public String toString() {
620: StringBuffer sb = new StringBuffer(1000);
621: for (Iterator i = urls.iterator(); i.hasNext();) {
622: sb.append(i.next()).append("\n");
623: }
624: return sb.toString();
625: }
626:
627: /**
628: * Uses the first storageDirDepth characters of filename as paths
629: * @param filename
630: */
631: private final String useFirstCharactersAsDirectories(String filename) {
632: int n = storageDirDepth;
633: if (n > filename.length())
634: n = filename.length();
635: char dir[] = new char[n * 2];
636: for (int i = 0; i < n; i++) {
637: dir[i * 2] = filename.charAt(i);
638: dir[i * 2 + 1] = File.separatorChar;
639: }
640: return new String(dir);
641: }
642:
643: /**
644: * Checks if the storage path for the given file exists and creates it if necessary.
645: * @param subdirectory
646: * @param filename
647: */
648: private final void checkStoragePathFor(String subdirectory,
649: String filename) {
650: if (!subdirectory.endsWith(File.separator)) {
651: subdirectory += File.separator;
652: }
653: String head = filename.substring(0, storageDirDepth * 2);
654: File path = new File(storagedir + subdirectory + head);
655: if (!path.exists()) {
656: path.mkdirs();
657: }
658: }
659:
660: /**
661: * Generate a valid filename for the given docURI.
662: * @param docURI
663: * @return String
664: */
665: protected String generateFilename(String docURI) {
666: if (useMD5) {
667: MD5 md5 = new MD5(docURI);
668: String hex = md5.asHex();
669: if (storageDirDepth > 0) {
670: return useFirstCharactersAsDirectories(hex)
671: + hex.substring(storageDirDepth);
672: }
673: return hex;
674: } else {
675: StringBuffer buf = new StringBuffer(docURI.length());
676:
677: for (int i = 0; i < docURI.length(); i++) {
678: char c = docURI.charAt(i);
679: switch (c) {
680: case '/':
681: buf.append("&slash;");
682: break;
683: case '\\':
684: buf.append("&backslash");
685: break;
686: case ':':
687: buf.append(":");
688: break;
689: case '*':
690: buf.append("&asterisk;");
691: break;
692: case '?':
693: buf.append("&question;");
694: break;
695: case '\"':
696: buf.append(""");
697: break;
698: case '<':
699: buf.append("<");
700: break;
701: case '>':
702: buf.append(">");
703: break;
704: case '|':
705: buf.append("∨");
706: break;
707: default:
708: buf.append(c);
709: break;
710: }
711: }
712: docURI = buf.toString();
713:
714: return docURI;
715: }
716: }
717:
718: /**
719: * Returns a File with the mapping of this content to its URLs.
720: * @param content
721: * @return long
722: */
723: protected File contentFile(String hex, String extension) {
724: return new File(storagedir + CONTENT
725: + useFirstCharactersAsDirectories(hex)
726: + hex.substring(storageDirDepth) + extension);
727: }
728:
729: /**
730: * Close storageDirectory File.
731: * @see net.matuschek.http.HttpDocManager#finish()
732: */
733: public void finish() {
734: if (storageDirectoryStream != null) {
735: try {
736: storageDirectoryStream.close();
737: storageDirectoryStream = null;
738: } catch (IOException e) {
739: e.printStackTrace();
740: }
741: }
742: }
743:
744: /**
745: * Calls finish and super.finalize().
746: * @see java.lang.Object#finalize()
747: */
748: protected void finalize() throws Throwable {
749: finish();
750: super .finalize();
751: }
752:
753: /**
754: * Depth of source set directory.
755: * (depth = number of used subdirectory levels)
756: * The first storageDirDepth characters of file will be used
757: * as directories.
758: */
759: protected int storageDirDepth = 0;
760:
761: /**
762: * Sets the desired directory depth of the source set directory
763: * (depth = number of used subdirectory levels)
764: *
765: * @param desired depth of source set directory.
766: */
767: public void setStorageDirDepth(int depth) {
768: storageDirDepth = depth;
769: }
770:
771: /**
772: * Method getstorageDirDepth.
773: * returns the directory depth of the source set directory
774: * @param desired depth of source set directory.
775: * @return the directory depth of the source set directory
776: */
777: public int getStorageDirDepth() {
778: return storageDirDepth;
779: }
780:
781: /**
782: * Get relevant part of contenttype and get default extension for it.
783: * @param contenttype
784: * @return extension
785: */
786: private String getExtensionFromContenttype(String contenttype) {
787: String extension = null;
788: if (contenttype != null) {
789: String strContentType = null;
790: int pos = contenttype.indexOf(';');
791: if (pos > 0) {
792: strContentType = contenttype.substring(0, pos).trim();
793: } else {
794: strContentType = contenttype.trim();
795: }
796: extension = getDefaultExtension(strContentType);
797: }
798:
799: if (extension == null) {
800: extension = "";
801: } else {
802: extension = "." + extension;
803: }
804: return extension;
805: }
806:
807: /**
808: * Get default extension for given contentType.
809: * @param contentType
810: * @return default extension or null
811: */
812: protected String getDefaultExtension(String contentType) {
813: if (contentType == null) {
814: return null;
815: } else if (contentType.indexOf("text/html") >= 0) {
816: return ".html";
817: } else if (contentType.indexOf("text/") >= 0) {
818: return ".txt";
819: } else {
820: return null;
821: }
822: }
823: }
|