001: /* UriProcessingFormatter.java
002: *
003: * $Id: UriProcessingFormatter.java 4964 2007-03-08 06:56:46Z gojomo $
004: *
005: * Created on Jun 10, 2003
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.io;
026:
027: import it.unimi.dsi.mg4j.util.MutableString;
028:
029: import java.util.logging.Formatter;
030: import java.util.logging.LogRecord;
031:
032: import org.archive.crawler.datamodel.CoreAttributeConstants;
033: import org.archive.crawler.datamodel.CrawlURI;
034: import org.archive.util.ArchiveUtils;
035: import org.archive.util.Base32;
036: import org.archive.util.MimetypeUtils;
037:
038: /**
039: * Formatter for 'crawl.log'. Expects completed CrawlURI as parameter.
040: *
041: * @author gojomo
042: */
043: public class UriProcessingFormatter extends Formatter implements
044: CoreAttributeConstants {
045: private final static String NA = "-";
046: /**
047: * Guess at line length (URIs are assumed avg. of 128 bytes).
048: * Used to preallocated the buffer we accumulate the log line
049: * in. Hopefully we get it right most of the time and no need
050: * to enlarge except in the rare case.
051: */
052: private final static int GUESS_AT_LOG_LENGTH = 17 + 1 + 3 + 1 + 10
053: + 128 + +1 + 10 + 1 + 128 + 1 + 10 + 1 + 3 + 14 + 1 + 32
054: + 4 + 128 + 1;
055:
056: /**
057: * Reuseable assembly buffer.
058: */
059: private final MutableString buffer = new MutableString(
060: GUESS_AT_LOG_LENGTH);
061:
062: public String format(LogRecord lr) {
063: CrawlURI curi = (CrawlURI) lr.getParameters()[0];
064: String length = NA;
065: String mime = null;
066: if (curi.isHttpTransaction()) {
067: if (curi.getContentLength() >= 0) {
068: length = Long.toString(curi.getContentLength());
069: } else if (curi.getContentSize() > 0) {
070: length = Long.toString(curi.getContentSize());
071: }
072: mime = curi.getContentType();
073: } else {
074: if (curi.getContentSize() > 0) {
075: length = Long.toString(curi.getContentSize());
076: }
077: mime = curi.getContentType();
078: }
079: mime = MimetypeUtils.truncate(mime);
080:
081: long time = System.currentTimeMillis();
082: String arcTimeAndDuration;
083: if (curi.containsKey(A_FETCH_COMPLETED_TIME)) {
084: long completedTime = curi.getLong(A_FETCH_COMPLETED_TIME);
085: long beganTime = curi.getLong(A_FETCH_BEGAN_TIME);
086: arcTimeAndDuration = ArchiveUtils.get17DigitDate(beganTime)
087: + "+" + Long.toString(completedTime - beganTime);
088: } else {
089: arcTimeAndDuration = NA;
090: }
091:
092: String via = curi.flattenVia();
093:
094: String digest = curi.getContentDigestSchemeString();
095:
096: String sourceTag = curi.containsKey(A_SOURCE_TAG) ? curi
097: .getString(A_SOURCE_TAG) : null;
098:
099: this .buffer.length(0);
100: return this .buffer.append(ArchiveUtils.getLog17Date(time))
101: .append(" ").append(
102: ArchiveUtils.padTo(curi.getFetchStatus(), 5))
103: .append(" ").append(ArchiveUtils.padTo(length, 10))
104: .append(" ").append(curi.getUURI().toString()).append(
105: " ").append(
106: checkForNull(curi.getPathFromSeed())).append(
107: " ").append(checkForNull(via)).append(" ")
108: .append(mime).append(" ").append("#")
109: // Pad threads to be 3 digits. For Igor.
110: .append(
111: ArchiveUtils.padTo(Integer.toString(curi
112: .getThreadNumber()), 3, '0')).append(
113: " ").append(arcTimeAndDuration).append(" ")
114: .append(checkForNull(digest)).append(" ").append(
115: checkForNull(sourceTag)).append(" ").append(
116: checkForNull(curi.getAnnotations())).append(
117: "\n").toString();
118: }
119:
120: /**
121: * @param str String to check.
122: * @return Return passed string or <code>NA</code> if null.
123: */
124: protected String checkForNull(String str) {
125: return (str == null || str.length() <= 0) ? NA : str;
126: }
127: }
|