001: /* Created on 2006-okt-03
002: *
003: * Copyright (C) 2006 National Library of Sweden.
004: *
005: * This program is free software; you can redistribute it and/or
006: * modify it under the terms of the GNU Lesser General Public License
007: * as published by the Free Software Foundation; either version 2
008: * of the License, or (at your option) any later version.
009: *
010: * This program is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser General Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser General Public License
016: * along with this program; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018: */
019:
020: package org.archive.crawler.writer;
021:
022: import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
023:
024: import java.io.ByteArrayOutputStream;
025: import java.io.File;
026: import java.io.FileOutputStream;
027: import java.io.IOException;
028: import java.io.OutputStream;
029: import java.net.InetAddress;
030: import java.security.MessageDigest;
031: import java.security.NoSuchAlgorithmException;
032: import java.util.logging.Level;
033: import java.util.logging.Logger;
034:
035: import javax.management.AttributeNotFoundException;
036: import javax.management.MBeanException;
037: import javax.management.ReflectionException;
038:
039: import org.archive.crawler.datamodel.CoreAttributeConstants;
040: import org.archive.crawler.datamodel.CrawlHost;
041: import org.archive.crawler.datamodel.CrawlURI;
042: import org.archive.crawler.framework.Processor;
043: import org.archive.crawler.settings.SimpleType;
044: import org.archive.crawler.settings.Type;
045: import org.archive.io.ReplayInputStream;
046: import org.archive.crawler.writer.Kw3Constants;
047:
048: /**
049: * Processor module that writes the results of successful fetches to
050: * files on disk. These files are MIME-files of the type used by the
051: * Swedish National Library's Kulturarw3 web harvesting [http://www.kb.se/kw3/].
052: *
053: * Each URI gets written to its own file and has a path consisting of:
054: * <ul>
055: * <li> A dir named with the first two chars of the website's md5. </li>
056: * <li> A dir named after the website. </li>
057: * <li> 'current' - a dir indicating that this is the directory being written
058: * to by the ongoing crawl. </li>
059: * <li> A file on the format <md5 of url>.<fetchtime in seconds> </li>
060: * </ul>
061: * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
062: *
063: * The MIME-file itself consists of three parts:
064: * <ul>
065: * <li> 1. ArchiveInfo - Metadata about the file and its content. </li>
066: * <li> 2. Header - The HTTP response header. </li>
067: * <li> 3. Content - The HTTP response content, plus content-type. </li>
068: * </ul>
069: *
070: * @author oskar
071: */
072: public class Kw3WriterProcessor extends Processor implements
073: CoreAttributeConstants, Kw3Constants {
074:
075: private static final long serialVersionUID = 7171448068924684594L;
076:
077: private static String COLON = ":";
078: private static String WS = " ";
079: private static String LF = "\n";
080:
081: /**
082: * Logger.
083: */
084: private static final Logger logger = Logger
085: .getLogger(Kw3WriterProcessor.class.getName());
086:
087: /**
088: * Key to use asking settings for arc path value.
089: */
090: public static final String ATTR_PATH = "path";
091:
092: /**
093: * Default path.
094: */
095: private static final String DEFAULT_PATH = "arcs";
096:
097: /**
098: * Key to use asking settings for max size value.
099: */
100: public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
101:
102: /**
103: * Default max file size.
104: */
105: public static final int DEFAULT_MAX_FILE_SIZE = 10000000;
106:
107: /**
108: * Key to use asking settings if chmod should be execuated .
109: */
110: public static final String ATTR_CHMOD = "chmod";
111:
112: /**
113: * Key to use asking settings for the new chmod value.
114: */
115: public static final String ATTR_CHMOD_VALUE = "chmod-value";
116:
117: /**
118: * Default value for permissions.
119: */
120: public static final String DEFAULT_CHMOD_VALUE = "777";
121:
122: /**
123: * Key for the maximum ARC bytes to write attribute.
124: */
125: public static final String ATTR_MAX_BYTES_WRITTEN = "total-bytes-to-write";
126:
127: /**
128: * Key for the collection attribute.
129: */
130: public static final String ATTR_COLLECTION = "collection";
131:
132: /**
133: * Default value for collection.
134: */
135: public static final String DEFAULT_COLLECTION_VALUE = "kw3";
136:
137: /**
138: * Key for the harvester attribute.
139: */
140: public static final String ATTR_HARVESTER = "harvester";
141:
142: /**
143: * Default value for harvester.
144: */
145: public static final String DEFAULT_HARVESTER_VALUE = "heritrix";
146:
147: private static String BOUNDARY_START = "KulturArw3_";
148:
149: /*
150: * Private members for settings
151: */
152: private File arcsDir;
153:
154: private boolean chmod;
155:
156: private String chmodValue;
157:
158: private int maxSize;
159:
160: private String collection;
161:
162: private String harvester;
163:
164: /**
165: * @param name Name of this processor.
166: */
167: public Kw3WriterProcessor(String name) {
168: super (
169: name,
170: "Kw3Writer processor. "
171: + "A writer that writes files in the MIME format of The "
172: + "Swedish National Library. See this class's javadoc for"
173: + "format exposition.");
174: Type e;
175: e = addElementToDefinition(new SimpleType(ATTR_PATH,
176: "Top-level directory for archive files.", DEFAULT_PATH));
177: e.setOverrideable(false);
178: e = addElementToDefinition(new SimpleType(ATTR_COLLECTION,
179: "Name of collection.", DEFAULT_COLLECTION_VALUE));
180: e.setOverrideable(false);
181: e = addElementToDefinition(new SimpleType(
182: ATTR_HARVESTER,
183: "Name of the harvester that is used for the web harvesting.",
184: DEFAULT_HARVESTER_VALUE));
185: e.setOverrideable(false);
186: e = addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES,
187: "Max size of each file", new Integer(
188: DEFAULT_MAX_FILE_SIZE)));
189: e.setOverrideable(false);
190: e = addElementToDefinition(new SimpleType(
191: ATTR_CHMOD,
192: "Should permissions be changed for the newly created dirs",
193: new Boolean(true)));
194: e.setOverrideable(false);
195: e = addElementToDefinition(new SimpleType(
196: ATTR_CHMOD_VALUE,
197: "What should the permissions be set to."
198: + " Given as three octal digits, as to the UNIX 'chmod' command."
199: + " Ex. 777 for all permissions to everyone.",
200: DEFAULT_CHMOD_VALUE));
201: e.setOverrideable(false);
202: }
203:
204: protected void initialTasks() {
205: try {
206: String arcsDirPath = (String) getAttribute(ATTR_PATH);
207: this .arcsDir = new File(arcsDirPath);
208: if (!this .arcsDir.isAbsolute())
209: this .arcsDir = new File(getController().getDisk(),
210: arcsDirPath);
211:
212: this .collection = (String) getAttribute(ATTR_COLLECTION);
213: this .harvester = (String) getAttribute(ATTR_HARVESTER);
214: this .chmod = (Boolean) getAttribute(ATTR_CHMOD);
215: this .chmodValue = (String) getAttribute(ATTR_CHMOD_VALUE);
216: this .maxSize = (Integer) getAttribute(ATTR_MAX_SIZE_BYTES);
217: } catch (AttributeNotFoundException e) {
218: logger.log(Level.WARNING, "attribute error", e);
219: } catch (MBeanException e) {
220: logger.log(Level.WARNING, "attribute error", e);
221: } catch (ReflectionException e) {
222: logger.log(Level.WARNING, "attribute error", e);
223: }
224: }
225:
226: protected void innerProcess(CrawlURI curi) {
227: // Only successful fetches are written.
228: if (!curi.isSuccess())
229: return;
230: // Only http and https schemes are supported.
231: String scheme = curi.getUURI().getScheme().toLowerCase();
232: if (!"http".equalsIgnoreCase(scheme)
233: && !"https".equalsIgnoreCase(scheme))
234: return;
235:
236: // Write the MIME-file
237: try {
238: writeMimeFile(curi);
239: } catch (IOException e) {
240: logger.log(Level.WARNING, "i/o error", e);
241: }
242: }
243:
244: /*
245: * The actual writing of the Kulturarw3 MIME-file.
246: *
247: * The MIME-file consists of three parts:
248: * 1. ArchiveInfo - Metadata about the file and its content.
249: * 2. Header - The HTTP response header.
250: * 3. Content - The HTTP response content, plus content-type.
251: *
252: * For more on this format, see '?'.
253: */
254: protected void writeMimeFile(CrawlURI curi) throws IOException {
255: ReplayInputStream ris = null;
256: OutputStream out = null;
257:
258: try {
259: String boundary = BOUNDARY_START
260: + stringToMD5(curi.toString());
261: ris = curi.getHttpRecorder().getRecordedInput()
262: .getReplayInputStream();
263: out = initOutputStream(curi);
264:
265: // Part 1: Archive info
266: writeArchiveInfoPart(boundary, curi, ris, out);
267:
268: // Part 2: Header info + HTTP header
269: writeHeaderPart(boundary, ris, out);
270:
271: // Part 3: Content info + HTTP content
272: writeContentPart(boundary, curi, ris, out);
273:
274: // And finally the terminator string
275: String terminator = "\n--" + boundary + "--\n";
276: out.write(terminator.getBytes());
277: } finally {
278: if (ris != null)
279: ris.close();
280: if (out != null)
281: out.close();
282: }
283: }
284:
285: /*
286: * Get the OutputStream for the file to write to.
287: *
288: * It has a path consisting of:
289: * 1. A dir named with the first two chars of the website's md5.
290: * 2. A dir named after the website.
291: * 3. 'current' - a dir indicating that this is the directory being written
292: * to by the ongoing crawl.
293: * 4. A file on the format <md5 of url>.<fetchtime in seconds>
294: *
295: * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
296: */
297: protected OutputStream initOutputStream(CrawlURI curi)
298: throws IOException {
299: String uri = curi.toString();
300: int port = curi.getUURI().getPort();
301: String host = (port == 80 || port <= 0) ? curi.getUURI()
302: .getHost() : curi.getUURI().getHost() + ":" + port;
303: long fetchTime = curi.getLong(A_FETCH_BEGAN_TIME) / 1000;
304:
305: String md5 = stringToMD5(host);
306: File dir = new File(this .arcsDir, md5.substring(0, 2) + "/"
307: + host + "/current");
308: if (!dir.exists()) {
309: dir.mkdirs();
310: if (this .chmod)
311: chmods(dir, this .arcsDir);
312: }
313: md5 = stringToMD5(uri);
314: File arcFile = new File(dir, md5 + "." + fetchTime);
315: return new FastBufferedOutputStream(new FileOutputStream(
316: arcFile));
317: }
318:
319: protected void writeArchiveInfoPart(String boundary, CrawlURI curi,
320: ReplayInputStream ris, OutputStream out) throws IOException {
321: // Get things we need to write in this part
322: String uri = curi.toString();
323: String ip = getHostAddress(curi);
324: long headerLength = ris.getHeaderSize();
325: long contentLength = ris.getContentSize();
326: long archiveTime = System.currentTimeMillis() / 1000; // Fetchtime in seconds
327: int statusCode = curi.getFetchStatus();
328: String headerMd5 = null;
329: Object contentMd5 = null;
330:
331: // Get headerMd5
332: ByteArrayOutputStream baos = new ByteArrayOutputStream();
333: ris.readHeaderTo(baos);
334: headerMd5 = stringToMD5(baos.toString());
335:
336: // Get contentMd5
337: contentMd5 = curi.getContentDigest();
338: if (contentMd5 != null)
339: contentMd5 = getHexString((byte[]) contentMd5);
340:
341: StringBuffer buffer = new StringBuffer();
342: buffer.append("MIME-version: 1.1" + LF);
343: buffer.append("Content-Type: multipart/mixed; boundary="
344: + boundary + LF);
345: buffer.append("HTTP-Part: ArchiveInfo" + LF);
346: buffer.append(COLLECTION_KEY + COLON + WS + this .collection
347: + LF);
348: buffer.append(HARVESTER_KEY + COLON + WS + this .harvester + LF);
349: buffer.append(URL_KEY + COLON + WS + uri + LF);
350: buffer.append(IP_ADDRESS_KEY + COLON + WS + ip + LF);
351: buffer.append(HEADER_LENGTH_KEY + COLON + WS + headerLength
352: + LF);
353: buffer.append(HEADER_MD5_KEY + COLON + WS + headerMd5 + LF);
354: buffer.append(CONTENT_LENGTH_KEY + COLON + WS + contentLength
355: + LF);
356: buffer.append(CONTENT_MD5_KEY + COLON + WS + contentMd5 + LF);
357: buffer.append(ARCHIVE_TIME_KEY + COLON + WS + archiveTime + LF);
358: buffer.append(STATUS_CODE_KEY + COLON + WS + statusCode + LF
359: + LF);
360: out.write(buffer.toString().getBytes());
361: }
362:
363: protected void writeHeaderPart(String boundary,
364: ReplayInputStream ris, OutputStream out) throws IOException {
365: StringBuffer buffer = new StringBuffer();
366: buffer.append("--" + boundary + LF);
367: buffer.append("Content-Type: text/plain; charset=\"US-ascii\""
368: + LF);
369: buffer.append("HTTP-Part: Header" + LF + LF);
370: out.write(buffer.toString().getBytes());
371: ris.readHeaderTo(out);
372: }
373:
374: protected void writeContentPart(String boundary, CrawlURI curi,
375: ReplayInputStream ris, OutputStream out) throws IOException {
376: // Get things we need to write in this part
377: String uri = curi.toString();
378: String contentType = curi.getContentType();
379: long contentLength = ris.getContentSize();
380: // Only write content if there is some
381: if (contentLength == 0)
382: return;
383:
384: StringBuffer buffer = new StringBuffer();
385: buffer.append("--" + boundary + LF);
386: buffer.append("Content-Type: " + contentType + LF);
387: buffer.append("HTTP-Part: Content" + LF + LF);
388: out.write(buffer.toString().getBytes());
389:
390: if (contentLength > this .maxSize) {
391: ris.readContentTo(out, this .maxSize);
392: logger.info(" Truncated url: " + uri + ", Size: "
393: + contentLength + ", Content-type: " + contentType);
394: } else {
395: ris.readContentTo(out);
396: }
397: }
398:
399: // --- Private helper functions --- //
400: /*
401: * Get a MD5 checksum based on a String.
402: */
403: private String stringToMD5(String str) {
404: try {
405: byte b[] = str.getBytes();
406: MessageDigest md = MessageDigest.getInstance("MD5");
407: md.update(b);
408: byte[] digest = md.digest();
409: return getHexString(digest);
410: } catch (NoSuchAlgorithmException e) {
411: logger.log(Level.WARNING, "md5 error", e);
412: }
413: return null;
414: }
415:
416: /*
417: * Fast convert a byte array to a hex string with possible leading zero.
418: */
419: private String getHexString(byte[] b) {
420: StringBuffer sb = new StringBuffer();
421: for (int i = 0; i < b.length; i++) {
422: String tmp = Integer.toHexString(b[i] & 0xff);
423: if (tmp.length() < 2)
424: sb.append("0" + tmp);
425: else
426: sb.append(tmp);
427: }
428: return sb.toString();
429: }
430:
431: /*
432: * Chmods for all newly created directories.
433: */
434: private void chmods(File dir, File arcsDir) {
435: String topdir = arcsDir.getAbsolutePath();
436: chmod(dir, this .chmodValue);
437: File parent = dir.getParentFile();
438: while (!parent.getAbsolutePath().equalsIgnoreCase((topdir))) {
439: chmod(parent, this .chmodValue);
440: parent = parent.getParentFile();
441: }
442:
443: }
444:
445: /*
446: * Chmod for a specific file or directory.
447: */
448: private void chmod(File file, String permissions) {
449: Process proc = null;
450: try {
451: proc = Runtime.getRuntime().exec(
452: "chmod " + permissions + " "
453: + file.getAbsolutePath());
454: proc.waitFor();
455: proc.getInputStream().close();
456: proc.getOutputStream().close();
457: proc.getErrorStream().close();
458: } catch (IOException e) {
459: logger.log(Level.WARNING, "chmod failed", e);
460: } catch (InterruptedException e) {
461: logger.log(Level.WARNING, "chmod failed", e);
462: }
463: }
464:
465: private String getHostAddress(CrawlURI curi) {
466: CrawlHost h = getController().getServerCache().getHostFor(curi);
467: if (h == null) {
468: throw new NullPointerException("Crawlhost is null for "
469: + curi + " " + curi.getVia());
470: }
471: InetAddress a = h.getIP();
472: if (a == null) {
473: throw new NullPointerException(
474: "Address is null for "
475: + curi
476: + " "
477: + curi.getVia()
478: + ". Address "
479: + ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ? "was never looked up."
480: : (System.currentTimeMillis() - h
481: .getIpFetched())
482: + " ms ago."));
483: }
484: return h.getIP().getHostAddress();
485: }
486: }
|