0001: /* MirrorWriter
0002: *
0003: * $Id: MirrorWriterProcessor.java 4654 2006-09-25 20:19:54Z paul_jack $
0004: *
0005: * Created on 2004 October 26
0006: *
0007: * Copyright (C) 2004 Internet Archive.
0008: *
0009: * This file is part of the Heritrix web crawler (crawler.archive.org).
0010: *
0011: * Heritrix is free software; you can redistribute it and/or modify
0012: * it under the terms of the GNU Lesser Public License as published by
0013: * the Free Software Foundation; either version 2.1 of the License, or
0014: * any later version.
0015: *
0016: * Heritrix is distributed in the hope that it will be useful,
0017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0019: * GNU Lesser Public License for more details.
0020: *
0021: * You should have received a copy of the GNU Lesser Public License
0022: * along with Heritrix; if not, write to the Free Software
0023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0024: */
0025: package org.archive.crawler.writer;
0026:
0027: import java.io.File;
0028: import java.io.FileOutputStream;
0029: import java.io.FilenameFilter;
0030: import java.io.IOException;
0031: import java.text.NumberFormat;
0032: import java.util.Collections;
0033: import java.util.HashMap;
0034: import java.util.HashSet;
0035: import java.util.Iterator;
0036: import java.util.Map;
0037: import java.util.Set;
0038: import java.util.TreeMap;
0039: import java.util.logging.Level;
0040: import java.util.logging.Logger;
0041:
0042: import javax.management.AttributeNotFoundException;
0043:
0044: import org.archive.crawler.datamodel.CoreAttributeConstants;
0045: import org.archive.crawler.datamodel.CrawlURI;
0046: import org.archive.crawler.framework.Processor;
0047: import org.archive.crawler.settings.ListType;
0048: import org.archive.crawler.settings.RegularExpressionConstraint;
0049: import org.archive.crawler.settings.SimpleType;
0050: import org.archive.crawler.settings.StringList;
0051: import org.archive.crawler.settings.Type;
0052: import org.archive.io.RecordingInputStream;
0053: import org.archive.io.ReplayInputStream;
0054: import org.archive.net.UURI;
0055: import org.archive.util.IoUtils;
0056:
0057: /**
0058: Processor module that writes the results of successful fetches to
0059: files on disk.
0060:
0061: Writes contents of one URI to one file on disk. The files are
0062: arranged in a directory hierarchy based on the URI paths. In that sense
0063: they mirror the file hierarchy that might exist on the servers.
0064: <p>
0065: There are a number of issues involved:
0066: <ul>
0067: <li>
0068: URIs can have arbitrary length, but file systems have length constraints.
0069: </li>
0070: <li>
0071: URIs can contain characters that file systems prohibit.
0072: </li>
0073: <li>
0074: URI paths are case-sensitive, but some file systems are case-insensitive.
0075: </li>
0076: </ul>
0077: This class tries very hard to map each URI into a file system path that
0078: obeys all file system constraints and yet reasonably represents
0079: the original URI.
0080: <p>
0081: There would normally be a single instance of this class per Heritrix
0082: instance. This class is thread-safe; any number of threads can be in its
0083: innerProcess method at once. However, conflicts can still arise in the file
0084: system. For example, if several threads try to create the same directory at
0085: the same time, only one can win. Therefore, there should be at most one
0086: access to a server at a given time.
0087:
0088: @author Howard Lee Gayle
0089: */
0090: public class MirrorWriterProcessor extends Processor implements
0091: CoreAttributeConstants {
0092:
0093: private static final long serialVersionUID = 301407556928389168L;
0094:
0095: /**
0096: * Key to use asking settings for case sensitive option.
0097: */
0098: public static final String ATTR_CASE_SENSITIVE = "case-sensitive";
0099:
0100: /**
0101: * Key to use asking settings for character map.
0102: */
0103: public static final String ATTR_CHAR_MAP = "character-map";
0104:
0105: /**
0106: * Key to use asking settings for content type map.
0107: */
0108: public static final String ATTR_CONTENT_TYPE_MAP = "content-type-map";
0109:
0110: /**
0111: * Key to use asking settings for dot begin replacement.
0112: */
0113: public static final String ATTR_DOT_BEGIN = "dot-begin";
0114:
0115: /**
0116: * Key to use asking settings for dot end replacement.
0117: */
0118: public static final String ATTR_DOT_END = "dot-end";
0119:
0120: /**
0121: * Key to use asking settings for directory file.
0122: */
0123: public static final String ATTR_DIRECTORY_FILE = "directory-file";
0124:
0125: /**
0126: * Key to use asking settings for host directory option.
0127: */
0128: public static final String ATTR_HOST_DIRECTORY = "host-directory";
0129:
0130: /**
0131: * Key to use asking settings for host map.
0132: */
0133: public static final String ATTR_HOST_MAP = "host-map";
0134:
0135: /**
0136: * Key to use asking settings for maximum file system path length.
0137: */
0138: public static final String ATTR_MAX_PATH_LEN = "max-path-length";
0139:
0140: /**
0141: * Key to use asking settings for maximum file system path segment length.
0142: */
0143: public static final String ATTR_MAX_SEG_LEN = "max-segment-length";
0144:
0145: /**
0146: * Key to use asking settings for base directory path value.
0147: */
0148: public static final String ATTR_PATH = "path";
0149:
0150: /**
0151: * Key to use asking settings for port directory option.
0152: */
0153: public static final String ATTR_PORT_DIRECTORY = "port-directory";
0154:
0155: /**
0156: * Key to use asking settings for suffix at end option.
0157: */
0158: public static final String ATTR_SUFFIX_AT_END = "suffix-at-end";
0159:
0160: /**
0161: * Key to use asking settings for too-long directory.
0162: */
0163: public static final String ATTR_TOO_LONG_DIRECTORY = "too-long-directory";
0164:
0165: /**
0166: * Key to use asking settings for underscore set.
0167: */
0168: public static final String ATTR_UNDERSCORE_SET = "underscore-set";
0169:
0170: /** Default value for ATTR_DOT_BEGIN.*/
0171: private static final String DEFAULT_DOT_BEGIN = "%2E";
0172:
0173: /** Default maximum file system path length.*/
0174: private static final int DEFAULT_MAX_PATH_LEN = 1023;
0175:
0176: /** Default maximum file system path segment length.*/
0177: private static final int DEFAULT_MAX_SEG_LEN = 255;
0178:
0179: /** Default value for ATTR_TOO_LONG_DIRECTORY.*/
0180: private static final String DEFAULT_TOO_LONG_DIRECTORY = "LONG";
0181:
0182: /** An empty Map.*/
0183: private static final Map<String, String> EMPTY_MAP = Collections
0184: .unmodifiableMap(new TreeMap<String, String>());
0185:
0186: /**
0187: Regular expression matching a file system path segment.
0188: The intent is one or more non-file-separator characters.
0189: The backslash is to quote File.separator if it's also backslash.
0190: */
0191: private static final String PATH_SEGMENT_RE = "[^\\"
0192: + File.separator + "]+";
0193:
0194: /**
0195: Regular expression constraint on ATTR_DIRECTORY_FILE.
0196: The intent is one non-file-separator character,
0197: followed by zero or more characters.
0198: The backslash is to quote File.separator if it's also backslash.
0199: */
0200: private static final String TOO_LONG_DIRECTORY_RE = "[^\\"
0201: + File.separator + "].*";
0202:
0203: /**
0204: * Logger.
0205: */
0206: private static final Logger logger = Logger
0207: .getLogger(MirrorWriterProcessor.class.getName());
0208:
0209: /**
0210: * @param name Name of this processor.
0211: */
0212: public MirrorWriterProcessor(String name) {
0213: super (
0214: name,
0215: "MirrorWriter processor. "
0216: + "A writer that writes each URL to a file on disk named for "
0217: + "a derivative of the URL.");
0218: Type e; // Current element.
0219: addElementToDefinition(new SimpleType(
0220: ATTR_CASE_SENSITIVE,
0221: "True if the file system is case-sensitive, like UNIX. "
0222: + "False if the file system is case-insensitive, "
0223: + "like Macintosh HFS+ and Windows.",
0224: Boolean.TRUE));
0225: addElementToDefinition(new StringList(
0226: ATTR_CHAR_MAP,
0227: "This list is grouped in pairs. "
0228: + "The first string in each pair must have a length of one. "
0229: + "If it occurs in a URI path, "
0230: + "it is replaced by the second string in the pair. "
0231: + "For UNIX, no character mapping is normally needed. "
0232: + "For Macintosh, the recommended value is [: %%3A]. "
0233: + "For Windows, the recommended value is "
0234: + "[' ' %%20 " %%22 * %%2A : %%3A < %%3C "
0235: + "\\> %%3E ? %%3F \\\\ %%5C ^ %%5E | %%7C]."));
0236: addElementToDefinition(new StringList(
0237: ATTR_CONTENT_TYPE_MAP,
0238: "This list is grouped in pairs. "
0239: + "If the content type of a resource begins (case-insensitive) "
0240: + "with the first string in a pair, the suffix is set to "
0241: + "the second string in the pair, replacing any suffix that may "
0242: + "have been in the URI. For example, to force all HTML files "
0243: + "to have the same suffix, use [text/html html]."));
0244: e = addElementToDefinition(new SimpleType(ATTR_DIRECTORY_FILE,
0245: "Implicitly append this to a URI ending with '/'.",
0246: "index.html"));
0247: e.addConstraint(new RegularExpressionConstraint(
0248: PATH_SEGMENT_RE, Level.SEVERE,
0249: "This must be a simple file name."));
0250: e = addElementToDefinition(new SimpleType(
0251: ATTR_DOT_BEGIN,
0252: "If a segment starts with '.', the '.' is replaced by this.",
0253: DEFAULT_DOT_BEGIN));
0254: e.addConstraint(new RegularExpressionConstraint(
0255: PATH_SEGMENT_RE, Level.SEVERE,
0256: "This must not be empty, and must not contain "
0257: + File.separator));
0258: addElementToDefinition(new SimpleType(
0259: ATTR_DOT_END,
0260: "If a directory name ends with '.' it is replaced by this. "
0261: + "For all file systems except Windows, '.' is recommended. "
0262: + "For Windows, %%2E is recommended.", "."));
0263: addElementToDefinition(new StringList(
0264: ATTR_HOST_MAP,
0265: "This list is grouped in pairs. "
0266: + "If a host name matches (case-insensitive) the first string "
0267: + "in a pair, it is replaced by the second string in the pair. "
0268: + "This can be used for consistency when several names are used "
0269: + "for one host, for example "
0270: + "[12.34.56.78 www42.foo.com]."));
0271: addElementToDefinition(new SimpleType(ATTR_HOST_DIRECTORY,
0272: "Create a subdirectory named for the host in the URI.",
0273: Boolean.TRUE));
0274: addElementToDefinition(new SimpleType(ATTR_PATH,
0275: "Top-level directory for mirror files.", "mirror"));
0276:
0277: // TODO: Add a new Constraint subclass so ATTR_MAX_PATH_LEN and
0278: // ATTR_MAX_SEG_LEN can be constained to reasonable values.
0279: addElementToDefinition(new SimpleType(ATTR_MAX_PATH_LEN,
0280: "Maximum file system path length.", new Integer(
0281: DEFAULT_MAX_PATH_LEN)));
0282: addElementToDefinition(new SimpleType(ATTR_MAX_SEG_LEN,
0283: "Maximum file system path segment length.",
0284: new Integer(DEFAULT_MAX_SEG_LEN)));
0285: addElementToDefinition(new SimpleType(ATTR_PORT_DIRECTORY,
0286: "Create a subdirectory named for the port in the URI.",
0287: Boolean.FALSE));
0288: addElementToDefinition(new SimpleType(
0289: ATTR_SUFFIX_AT_END,
0290: "If true, the suffix is placed at the end of the path, "
0291: + "after the query (if any). If false, the suffix is placed "
0292: + "before the query.", Boolean.TRUE));
0293: e = addElementToDefinition(new SimpleType(
0294: ATTR_TOO_LONG_DIRECTORY,
0295: "If all the directories in the URI would exceed, "
0296: + "or come close to exceeding, the file system maximum "
0297: + "path length, then they are all replaced by this.",
0298: DEFAULT_TOO_LONG_DIRECTORY));
0299: e.addConstraint(new RegularExpressionConstraint(
0300: TOO_LONG_DIRECTORY_RE, Level.SEVERE,
0301: "This must be relative and not empty."));
0302: addElementToDefinition(new StringList(
0303: ATTR_UNDERSCORE_SET,
0304: "If a directory name appears (case-insensitive) in this list "
0305: + "then an underscore is placed before it. "
0306: + "For all file systems except Windows, this is not needed. "
0307: + "For Windows, the following is recommended: "
0308: + "[com1 com2 com3 com4 com5 com6 com7 com8 com9 "
0309: + "lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9 "
0310: + "con nul prn]."));
0311: }
0312:
0313: protected void innerProcess(CrawlURI curi) {
0314: if (!curi.isSuccess()) {
0315: return;
0316: }
0317: UURI uuri = curi.getUURI(); // Current URI.
0318:
0319: // Only http and https schemes are supported.
0320: String scheme = uuri.getScheme();
0321: if (!"http".equalsIgnoreCase(scheme)
0322: && !"https".equalsIgnoreCase(scheme)) {
0323: return;
0324: }
0325: RecordingInputStream recis = curi.getHttpRecorder()
0326: .getRecordedInput();
0327: if (0L == recis.getResponseContentLength()) {
0328: return;
0329: }
0330:
0331: String baseDir = null; // Base directory.
0332: String baseSeg = null; // ATTR_PATH value.
0333: try {
0334: baseSeg = (String) getAttribute(ATTR_PATH, curi);
0335: } catch (AttributeNotFoundException e) {
0336: logger.warning(e.getLocalizedMessage());
0337: return;
0338: }
0339:
0340: // Trim any trailing File.separatorChar characters from baseSeg.
0341: while ((baseSeg.length() > 1)
0342: && baseSeg.endsWith(File.separator)) {
0343: baseSeg = baseSeg.substring(0, baseSeg.length() - 1);
0344: }
0345: if (0 == baseSeg.length()) {
0346: baseDir = getController().getDisk().getPath();
0347: } else if ((new File(baseSeg)).isAbsolute()) {
0348: baseDir = baseSeg;
0349: } else {
0350: baseDir = getController().getDisk().getPath()
0351: + File.separator + baseSeg;
0352: }
0353:
0354: // Already have a path for this URI.
0355: boolean reCrawl = curi.containsKey(A_MIRROR_PATH);
0356:
0357: /*
0358: The file system path, relative to the value of ATTR_PATH, where
0359: this resource should be written. The intent is to
0360: add later a persistent mapping from URI to path.
0361: This will allow a URI to be re-crawled and updated
0362: if it has changed. If the resource has already been fetched
0363: and written to a file before, the path to that file
0364: has already been obtained from the persistent mapping
0365: and placed on the AList by some other module,
0366: such as the frontier.
0367: */
0368: String mps = null;
0369: File destFile = null; // Write resource contents to this file.
0370: try {
0371: if (reCrawl) {
0372: mps = curi.getString(A_MIRROR_PATH);
0373: destFile = new File(baseDir + File.separator + mps);
0374: File parent = destFile.getParentFile();
0375: if (null != parent) {
0376: IoUtils.ensureWriteableDirectory(parent);
0377: }
0378: } else {
0379: URIToFileReturn r = null; // Return from uriToFile().
0380: try {
0381: r = uriToFile(baseDir, curi);
0382: } catch (AttributeNotFoundException e) {
0383: logger.warning(e.getLocalizedMessage());
0384: return;
0385: }
0386: destFile = r.getFile();
0387: mps = r.getRelativePath();
0388: }
0389: logger.info(uuri.toString() + " -> " + destFile.getPath());
0390: writeToPath(recis, destFile);
0391: if (!reCrawl) {
0392: curi.putString(A_MIRROR_PATH, mps);
0393: }
0394: } catch (IOException e) {
0395: curi.addLocalizedError(this .getName(), e, "Mirror");
0396: }
0397: }
0398:
0399: /**
0400: Gets the directory in which the file will reside.
0401: Any directories needed are created.
0402: @param baseDir the path to the starting directory
0403: @param host the host part of the URI, or null if the host name
0404: should not be part of the returned path
0405: @param port the port part of the URI, or -1 if the port
0406: should not be part of the returned path
0407: @param segs all the segments in the URI
0408: @param maxLen the maximum path length allowed to the directory;
0409: this must leave some room for the file itself
0410: @return the directory, or null if maxLen would be exceeded
0411: @throws IOException
0412: if a needed directory could not be created
0413: @throws IOException
0414: if a needed directory is not writeable
0415: @throws IOException
0416: if a non-directory file exists with the same path as a needed directory
0417: */
0418: private URIToFileReturn dirPath(String baseDir, String host,
0419: int port, PathSegment[] segs, int maxLen)
0420: throws IOException {
0421:
0422: // Return value.
0423: URIToFileReturn r = new URIToFileReturn(baseDir, host, port);
0424: r.mkdirs();
0425: for (int i = 0; (segs.length - 1) != i; ++i) {
0426: segs[i].addToPath(r);
0427: if (r.longerThan(maxLen)) {
0428: return null;
0429: }
0430: }
0431: return r;
0432: }
0433:
0434: /**
0435: Ensures that a list contains an even number of elements.
0436: If not, the last element is removed.
0437: @param list the list
0438: */
0439: private void ensurePairs(ListType list) {
0440: if (1 == (list.size() % 2)) {
0441: list.remove(list.size() - 1);
0442: }
0443: }
0444:
0445: /**
0446: Makes a path in which a resource can be stored.
0447: @param baseDir the path to the starting directory
0448: @param curi the URI
0449: @return a path to the file in which to store the resource
0450: @throws AttributeNotFoundException
0451: if a needed setting is missing
0452: @throws IOException
0453: if a needed directory could not be created
0454: @throws IOException
0455: if a needed directory is not writeable
0456: @throws IOException
0457: if a non-directory file exists with the same path as a needed directory
0458: */
0459: private URIToFileReturn uriToFile(String baseDir, CrawlURI curi)
0460: throws AttributeNotFoundException, IOException {
0461: UURI uuri = curi.getUURI(); // Current URI.
0462: String host = null;
0463: Boolean hd = (Boolean) getAttribute(ATTR_HOST_DIRECTORY, curi);
0464: if (hd.booleanValue()) {
0465: host = uuri.getHost();
0466: StringList hostMap = (StringList) getAttribute(
0467: ATTR_HOST_MAP, curi);
0468: if ((null != hostMap) && (hostMap.size() > 1)) {
0469: ensurePairs(hostMap);
0470: Iterator<String> i = hostMap.typesafe().iterator();
0471: for (boolean more = true; more && i.hasNext();) {
0472: String h1 = i.next();
0473: String h2 = i.next();
0474: if (host.equalsIgnoreCase(h1)) {
0475: more = false;
0476: if ((null != h2) && (0 != h2.length())) {
0477: host = h2;
0478: }
0479: }
0480: }
0481: }
0482: }
0483:
0484: int port = ((Boolean) getAttribute(ATTR_PORT_DIRECTORY, curi))
0485: .booleanValue() ? uuri.getPort() : -1;
0486:
0487: String suffix = null; // Replacement suffix.
0488: StringList ctm = (StringList) getAttribute(
0489: ATTR_CONTENT_TYPE_MAP, curi);
0490: if ((null != ctm) && (ctm.size() > 1)) {
0491: ensurePairs(ctm);
0492: String contentType = curi.getContentType().toLowerCase();
0493: Iterator i = ctm.iterator();
0494: for (boolean more = true; more && i.hasNext();) {
0495: String ct = (String) i.next();
0496: String suf = (String) i.next();
0497: if ((null != ct)
0498: && contentType.startsWith(ct.toLowerCase())) {
0499: more = false;
0500: if ((null != suf) && (0 != suf.length())) {
0501: suffix = suf;
0502: }
0503: }
0504: }
0505: }
0506:
0507: int maxSegLen = ((Integer) getAttribute(ATTR_MAX_SEG_LEN, curi))
0508: .intValue();
0509: if (maxSegLen < 2) {
0510: maxSegLen = DEFAULT_MAX_SEG_LEN;
0511: }
0512:
0513: int maxPathLen = ((Integer) getAttribute(ATTR_MAX_PATH_LEN,
0514: curi)).intValue();
0515: if (maxPathLen < 2) {
0516: maxPathLen = DEFAULT_MAX_PATH_LEN;
0517: }
0518:
0519: Map<String, String> characterMap = EMPTY_MAP;
0520: StringList cm = (StringList) getAttribute(ATTR_CHAR_MAP, curi);
0521: if ((null != cm) && (cm.size() > 1)) {
0522: ensurePairs(cm);
0523: characterMap = new HashMap<String, String>(cm.size());
0524: // Above will be half full.
0525: for (Iterator i = cm.iterator(); i.hasNext();) {
0526: String s1 = (String) i.next();
0527: String s2 = (String) i.next();
0528: if ((null != s1) && (1 == s1.length()) && (null != s2)
0529: && (0 != s2.length())) {
0530: characterMap.put(s1, s2);
0531: }
0532: }
0533: }
0534:
0535: String dotBegin = (String) getAttribute(ATTR_DOT_BEGIN, curi);
0536: if (".".equals(dotBegin)) {
0537: dotBegin = null;
0538: }
0539:
0540: String dotEnd = (String) getAttribute(ATTR_DOT_END, curi);
0541: if (".".equals(dotEnd)) {
0542: dotEnd = null;
0543: }
0544:
0545: String tld = (String) getAttribute(ATTR_TOO_LONG_DIRECTORY,
0546: curi);
0547: if ((null == tld) || (0 == tld.length())
0548: || (-1 != tld.indexOf(File.separatorChar))) {
0549: tld = DEFAULT_TOO_LONG_DIRECTORY;
0550: }
0551:
0552: Set<String> underscoreSet = null;
0553: StringList us = (StringList) getAttribute(ATTR_UNDERSCORE_SET,
0554: curi);
0555: if ((null != us) && (0 != us.size())) {
0556: underscoreSet = new HashSet<String>(us.size(), 0.5F);
0557: for (String s : us.typesafe()) {
0558: if ((null != s) && (0 != s.length())) {
0559: underscoreSet.add(s.toLowerCase());
0560: }
0561: }
0562: }
0563:
0564: return uriToFile(curi, host, port, uuri.getPath(), uuri
0565: .getQuery(), suffix, baseDir, maxSegLen, maxPathLen,
0566: ((Boolean) getAttribute(ATTR_CASE_SENSITIVE, curi))
0567: .booleanValue(), (String) getAttribute(
0568: ATTR_DIRECTORY_FILE, curi), characterMap,
0569: dotBegin, dotEnd, tld, ((Boolean) getAttribute(
0570: ATTR_SUFFIX_AT_END, curi)).booleanValue(),
0571: underscoreSet);
0572: }
0573:
0574: /**
0575: Makes a path in which a resource can be stored.
0576: @param curi the URI
0577: @param host the host part of the URI, or null if the host name
0578: should not be part of the returned path
0579: @param port the port part of the URI, or -1 if the port
0580: should not be part of the returned path
0581: @param uriPath the path part of the URI (must be absolute)
0582: @param query the query part of the URI, or null if none
0583: @param suffix if non-null, use this as the suffix in preference to
0584: any suffix that uriPath might have
0585: @param baseDir the path to the starting directory
0586: @param maxSegLen the maximum number of characters allowed in one
0587: file system path segment (component)
0588: @param maxPathLen the maximum number of characters allowed in a
0589: file system path
0590: @param caseSensitive if true, the file system is assumed to be
0591: case-sensitive; otherwise the file system is assumed to be
0592: case-insensitive but case-preserving
0593: @param dirFile the simple file name to append to a URI path
0594: ending in '/'
0595: @param characterMap a map from characters (as length-1 String values) in
0596: the URI path and query to replacement String values
0597: @param dotBegin if non-null, this replaces a '.' at
0598: the beginning of a segment
0599: @param dotEnd if non-null, this replaces a '.' that appears at the end
0600: of a directory name
0601: @param tooLongDir if the path length would exceed or be close to
0602: exceeding maxPathLen then this simple name is used as a directory
0603: under baseDir instead
0604: @param suffixAtEnd if true, the suffix is placed at the end of the
0605: path, after the query (if any); otherwise, the suffix is placed
0606: before the query
0607: @param underscoreSet if non-null and a segment, after conversion
0608: to lower case, is in this set, then prepend an underscore
0609: to the segment
0610: @return a path to the file in which to store the resource
0611: @throws IOException
0612: if a needed directory could not be created
0613: @throws IOException
0614: if a needed directory is not writeable
0615: @throws IOException
0616: if a non-directory file exists with the same path as a needed directory
0617: */
0618: private URIToFileReturn uriToFile(CrawlURI curi, String host,
0619: int port, String uriPath, String query, String suffix,
0620: String baseDir, int maxSegLen, int maxPathLen,
0621: boolean caseSensitive, String dirFile, Map characterMap,
0622: String dotBegin, String dotEnd, String tooLongDir,
0623: boolean suffixAtEnd, Set underscoreSet) throws IOException {
0624: assert (null == host) || (0 != host.length());
0625: assert 0 != uriPath.length();
0626: assert '/' == uriPath.charAt(0) : "uriPath: " + uriPath;
0627: assert -1 == uriPath.indexOf("//") : "uriPath: " + uriPath;
0628: assert -1 == uriPath.indexOf("/./") : "uriPath: " + uriPath;
0629: assert !uriPath.endsWith("/.") : "uriPath: " + uriPath;
0630: assert (null == query) || (-1 == query.indexOf('/')) : "query: "
0631: + query;
0632: assert (null == suffix)
0633: || ((0 != suffix.length()) && (-1 == suffix
0634: .indexOf('/'))) : "suffix: " + suffix;
0635: assert 0 != baseDir.length();
0636: assert maxSegLen > 2 : "maxSegLen: " + maxSegLen;
0637: assert maxPathLen > 1;
0638: assert maxPathLen >= maxSegLen : "maxSegLen: " + maxSegLen
0639: + " maxPathLen: " + maxPathLen;
0640: assert 0 != dirFile.length();
0641: assert -1 == dirFile.indexOf("/") : "dirFile: " + dirFile;
0642: assert null != characterMap;
0643: assert (null == dotBegin) || (0 != dotBegin.length());
0644: assert (null == dotEnd) || !dotEnd.endsWith(".") : "dotEnd: "
0645: + dotEnd;
0646: assert 0 != tooLongDir.length();
0647: assert '/' != tooLongDir.charAt(0) : "tooLongDir: "
0648: + tooLongDir;
0649:
0650: int nSegs = 0; // Number of segments in the URI path.
0651: for (int i = 0; uriPath.length() != i; ++i) {
0652: if ('/' == uriPath.charAt(i)) {
0653: ++nSegs; // Just count slashes.
0654: }
0655: }
0656: assert nSegs > 0 : "uriPath: " + uriPath;
0657: PathSegment[] segs = new PathSegment[nSegs]; // The segments.
0658: int slashIndex = 0; // Index in uriPath of current /.
0659: for (int i = 0; (segs.length - 1) != i; ++i) {
0660: int nsi = uriPath.indexOf('/', slashIndex + 1); // Next index.
0661: assert nsi > slashIndex : "uriPath: " + uriPath;
0662: segs[i] = new DirSegment(uriPath, slashIndex + 1, nsi,
0663: maxSegLen, caseSensitive, curi, characterMap,
0664: dotBegin, dotEnd, underscoreSet);
0665: slashIndex = nsi;
0666: }
0667: if (slashIndex < (uriPath.length() - 1)) {
0668:
0669: // There's something after the last /.
0670: segs[segs.length - 1] = new EndSegment(uriPath,
0671: slashIndex + 1, uriPath.length(), maxSegLen,
0672: caseSensitive, curi, characterMap, dotBegin, query,
0673: suffix, maxPathLen, suffixAtEnd);
0674: } else {
0675:
0676: // The URI ends with a /.
0677: segs[segs.length - 1] = new EndSegment(dirFile, 0, dirFile
0678: .length(), maxSegLen, caseSensitive, curi,
0679: characterMap, null, query, suffix, maxPathLen,
0680: suffixAtEnd);
0681: }
0682: URIToFileReturn r = dirPath(baseDir, host, port, segs,
0683: maxPathLen - maxSegLen);
0684: if (null == r) {
0685:
0686: // The path is too long.
0687: // Replace all the segment directories by tooLongDir.
0688: PathSegment endSegment = segs[segs.length - 1];
0689: segs = new PathSegment[2];
0690: segs[0] = new DirSegment(tooLongDir, 0,
0691: tooLongDir.length(), maxSegLen, caseSensitive,
0692: curi, EMPTY_MAP, null, null, null);
0693: segs[1] = endSegment;
0694: r = dirPath(baseDir, host, port, segs, maxPathLen
0695: - maxSegLen);
0696: }
0697: segs[segs.length - 1].addToPath(r);
0698: return r;
0699: }
0700:
0701: /**
0702: Copies a resource into a file.
0703: A temporary file is created and then atomically renamed to
0704: the destination file.
0705: This prevents leaving a partial file in case of a crash.
0706: @param recis the RecordingInputStream that recorded the contents
0707: of the resource
0708: @param dest the destination file
0709: @throws IOException on I/O error
0710: @throws IOException if
0711: the file rename fails
0712: */
0713: private void writeToPath(RecordingInputStream recis, File dest)
0714: throws IOException {
0715: ReplayInputStream replayis = recis
0716: .getContentReplayInputStream();
0717: File tf = new File(dest.getPath() + "N");
0718: FileOutputStream fos = new FileOutputStream(tf);
0719: try {
0720: replayis.readFullyTo(fos);
0721: } finally {
0722: fos.close();
0723: replayis.close();
0724: }
0725: if (!tf.renameTo(dest)) {
0726: throw new IOException("Can not rename "
0727: + tf.getAbsolutePath() + " to "
0728: + dest.getAbsolutePath());
0729: }
0730:
0731: }
0732:
0733: /**
0734: This class represents one segment (component) of a URI path.
0735: A segment between '/' characters is a directory segment.
0736: The segment after the last '/' is the end segment.
0737: */
0738: abstract class PathSegment {
0739: /**
0740: existsMaybeCaseSensitive return code
0741: for a file that does not exist.
0742: */
0743: protected static final int EXISTS_NOT = 1;
0744:
0745: /**
0746: existsMaybeCaseSensitive return code
0747: for a file that exists.
0748: Furthermore, the comparison is case-sensitive.
0749: */
0750: protected static final int EXISTS_EXACT_MATCH = 2;
0751:
0752: /**
0753: existsMaybeCaseSensitive return code
0754: for a file that exists, using a case-insensitive comparison.
0755: Furthermore, the file would not exist if the comparison
0756: were case-sensitive.
0757: */
0758: protected static final int EXISTS_CASE_INSENSITIVE_MATCH = 3;
0759:
0760: /** The URI, for logging and error reporting.*/
0761: protected CrawlURI curi;
0762:
0763: /**
0764: The main part of this segment.
0765: For a directory segment, that's all there is.
0766: For an end segment, it's the part of the URI after the last '/'
0767: up to but not including the '.' before the suffix (if any).
0768: */
0769: protected LumpyString mainPart = null;
0770:
0771: /**
0772: The maximum number of characters allowed
0773: in one file system path segment.
0774: A URI segment can potentially be much longer,
0775: but we'll trim it to this.
0776: */
0777: protected int maxSegLen;
0778:
0779: /** If true, the file system is assumed to be
0780: case-sensitive; otherwise the file system is assumed to be
0781: case-insensitive.
0782: */
0783: private boolean caseSensitive;
0784:
0785: /**
0786: Creates a new PathSegment.
0787: @param maxSegLen the maximum number of characters
0788: allowed in one path segment
0789: @param caseSensitive if true, the file system is assumed to be
0790: case-sensitive; otherwise the file system is assumed to be
0791: case-insensitive
0792: @param curi the URI
0793: @throws IllegalArgumentException if
0794: maxSegLen is too small
0795: */
0796: PathSegment(int maxSegLen, boolean caseSensitive, CrawlURI curi) {
0797: if (maxSegLen < 2) {
0798: throw new IllegalArgumentException("maxSegLen: "
0799: + maxSegLen);
0800: }
0801: this .maxSegLen = maxSegLen;
0802: this .caseSensitive = caseSensitive;
0803: this .curi = curi;
0804: }
0805:
0806: /**
0807: Adds this segment to a file path.
0808: This is the key method of this class.
0809: It extends the given path by one segment,
0810: named to obey all constraints.
0811: A new directory is created if necessary.
0812: @param currentPath the current path, to which this segment is added
0813: @throws IOException
0814: if a needed directory could not be created
0815: @throws IOException
0816: if a needed directory is not writeable
0817: */
0818: abstract void addToPath(URIToFileReturn currentPath)
0819: throws IOException;
0820:
0821: /**
0822: Checks if a file (including directories) exists.
0823: @param fsf the directory containing the file to be checked
0824: @param segStr the simple file or directory name
0825: @param check the file or directory for which to check
0826: @return EXISTS_NOT if check does not exist,
0827: EXISTS_EXACT_MATCH if check exists with a name that matches
0828: (case-sensitive) segStr, and
0829: EXISTS_CASE_INSENSITIVE_MATCH if check exists
0830: with a name that matches
0831: segStr using a case-insensitive match but not using a
0832: case-sensitive match
0833: */
0834: protected int existsMaybeCaseSensitive(File fsf, String segStr,
0835: File check) {
0836: if (caseSensitive) {
0837: return check.exists() ? EXISTS_EXACT_MATCH : EXISTS_NOT;
0838: }
0839: if (!check.exists()) {
0840: return EXISTS_NOT;
0841: }
0842:
0843: /*
0844: The JVM says the file exists, but the file system is assumed to be
0845: case-insensitive, so do we have an exact match or just a
0846: case-insensitive match? We get an array of all the
0847: file names that match (case-insensitive) the one we're
0848: checking, then we can look for a case-sensitive match.
0849: */
0850: String[] fna = fsf.list(new CaseInsensitiveFilenameFilter(
0851: segStr));
0852: for (int i = 0; fna.length != i; ++i) {
0853: if (segStr.equals(fna[i])) {
0854: return EXISTS_EXACT_MATCH;
0855: }
0856: }
0857: return EXISTS_CASE_INSENSITIVE_MATCH;
0858: }
0859:
/**
 A FilenameFilter that accepts exactly those names that equal a
 target name when case is ignored.
 */
class CaseInsensitiveFilenameFilter implements FilenameFilter {
    /** The file name we're looking for. */
    private String target;

    /**
     Creates a CaseInsensitiveFilenameFilter.
     @param target the file name to look for
     @throws IllegalArgumentException if
     target is null or empty.
     */
    CaseInsensitiveFilenameFilter(String target) {
        // Work out what, if anything, is wrong with the argument.
        String problem = null;
        if (target == null) {
            problem = "target null";
        } else if (target.length() == 0) {
            problem = "target empty";
        }
        if (problem != null) {
            throw new IllegalArgumentException(problem);
        }
        this.target = target;
    }

    /**
     Tests a candidate name against the target; the directory is not
     consulted.
     @param dir the directory in which the file was found (unused)
     @param name the candidate file name
     @return true if and only if name equals the target, ignoring case
     */
    public boolean accept(File dir, String name) {
        final boolean sameIgnoringCase = target.equalsIgnoreCase(name);
        return sameIgnoringCase;
    }
}
0888: }
0889:
0890: /**
0891: This class represents one directory segment (component) of a URI path.
0892: */
0893: class DirSegment extends PathSegment {
0894: /** If a segment name is in this set, prepend an underscore.*/
0895: private Set underscoreSet;
0896:
0897: /**
0898: Creates a DirSegment.
0899: @param uriPath the path part of the URI
0900: @param beginIndex the beginning index, inclusive, of the substring
0901: of uriPath to be used
0902: @param endIndex the ending index, exclusive, of the substring
0903: of uriPath to be used
0904: @param maxSegLen the maximum number of characters allowed in one
0905: file system path segment (component)
0906: @param caseSensitive if true, the file system is assumed to be
0907: case-sensitive; otherwise the file system is assumed to be
0908: case-insensitive but case-preserving
0909: @param curi the URI
0910: @param characterMap a map from characters
0911: (as length-1 String values) in
0912: the URI path and query to replacement String values
0913: @param dotBegin if non-null, this replaces a '.' at
0914: the beginning of the directory name
0915: @param dotEnd if non-null, this replaces a '.'
0916: that appears at the end of a directory name
0917: @param underscoreSet if non-null and a segment, after conversion
0918: to lower case, is in this set, then prepend an underscore
0919: to the segment
0920: @throws IllegalArgumentException if
0921: beginIndex is negative.
0922: @throws IllegalArgumentException if
0923: endIndex is less than beginIndex.
0924: @throws IllegalArgumentException if
0925: maxSegLen is too small.
0926: */
0927: DirSegment(String uriPath, int beginIndex, int endIndex,
0928: int maxSegLen, boolean caseSensitive, CrawlURI curi,
0929: Map characterMap, String dotBegin, String dotEnd,
0930: Set underscoreSet) {
0931: super (maxSegLen, caseSensitive, curi);
0932: mainPart = new LumpyString(uriPath, beginIndex, endIndex,
0933: (null == dotEnd) ? 0 : dotEnd.length(),
0934: this .maxSegLen, characterMap, dotBegin);
0935: if (null != dotEnd) {
0936:
0937: // We might get a segment like /VeryLong............../
0938: // so we have to loop to guarantee the segment doesn't
0939: // end with a dot.
0940: int dl = dotEnd.length();
0941: while (mainPart.endsWith('.')) {
0942:
0943: // Chop off the dot at the end.
0944: mainPart.trimToMax(mainPart.length() - 1);
0945: if ((mainPart.length() + dl) <= this .maxSegLen) {
0946: mainPart.append(dotEnd);
0947: }
0948: }
0949: }
0950: this .underscoreSet = underscoreSet;
0951: }
0952:
0953: void addToPath(URIToFileReturn currentPath) throws IOException {
0954: NumberFormat nf = null;
0955: int startLen = mainPart.length(); // Starting length.
0956: for (int i = 0;; ++i) {
0957: if (0 != i) {
0958:
0959: // Try to create a unique file name by appending a
0960: // number.
0961: if (null == nf) {
0962: nf = NumberFormat.getIntegerInstance();
0963: }
0964: String ending = nf.format(i);
0965: mainPart.trimToMax(Math.min(startLen, maxSegLen
0966: - ending.length()));
0967: mainPart.append(ending);
0968: }
0969: String segStr = mainPart.toString();
0970: if ((null != underscoreSet)
0971: && underscoreSet.contains(segStr.toLowerCase())) {
0972: mainPart.prepend('_');
0973: ++startLen;
0974: mainPart.trimToMax(maxSegLen);
0975: segStr = mainPart.toString();
0976: }
0977: File fsf = currentPath.getFile();
0978: File f = new File(fsf, segStr);
0979: int er = existsMaybeCaseSensitive(fsf, segStr, f);
0980: switch (er) {
0981: case EXISTS_NOT:
0982: if (!f.mkdir()) {
0983: throw new IOException("Can not mkdir "
0984: + f.getAbsolutePath());
0985: }
0986: currentPath.append(f, segStr);
0987: return; // Created new directory.
0988:
0989: case EXISTS_EXACT_MATCH:
0990: if (f.isDirectory()) {
0991: if (!f.canWrite()) {
0992: throw new IOException("Directory "
0993: + f.getAbsolutePath()
0994: + " not writeable.");
0995: }
0996:
0997: /*
0998: A writeable directory already exists.
0999: Assume it's the one we want.
1000: This assumption fails for cases like
1001: http://foo.com/a*256/b.html
1002: followed by
1003: http://foo.com/a*256z/b.html
1004: where a*256 means a sequence of the maximum allowed
1005: number of "a"s.
1006: */
1007: currentPath.append(f, segStr);
1008: return;
1009: }
1010:
1011: /*
1012: A segment already exists but isn't a directory.
1013: This could arise from, for example,
1014: http://foo.com/a*256
1015: followed by
1016: http://foo.com/a*256b/b.html
1017: We need to find a directory we created before in this
1018: situation, or make a new directory with a unique name.
1019: Going around the loop should eventually do that.
1020: */
1021: break;
1022:
1023: case EXISTS_CASE_INSENSITIVE_MATCH:
1024: /*
1025: A segment already exists that's a case-insensitive match
1026: but not an exact match. It may or may not be a directory.
1027: This could arise, on a case-insensitive, case-preserving
1028: file system (such as Macintosh HFS+). For example,
1029: http://foo.com/bar/z.html
1030: followed by
1031: http://foo.com/BAR/z.html
1032: would do it. We want bar and BAR to turn into different
1033: directories.
1034: Going around the loop should eventually do that.
1035: */
1036: break;
1037:
1038: default:
1039: throw new IllegalStateException("Code: " + er);
1040: }
1041: }
1042: }
1043: }
1044:
1045: /**
1046: This class represents the last segment (component) of a URI path.
1047: */
1048: class EndSegment extends PathSegment {
1049: /**
1050: The number of characters in the path up to this EndSegment,
1051: including the final File.separatorChar.
1052: */
1053: private int dirPathLen;
1054:
1055: /**
1056: The maximum number of characters allowed in a file path, minus 1.
1057: The extra 1 is reserved for temporarily appending
1058: a character so an existing file can be replaced atomically,
1059: for example, by writing
1060: <code>foo.htmlN</code>
1061: and then renaming it to
1062: <code>foo.html</code>.
1063: */
1064: private int maxPathLen;
1065:
1066: /** The query part of the URI, or null if none.*/
1067: private LumpyString query = null;
1068:
1069: /**
1070: The suffix, or null if none.
1071: This isn't a LumpyString because we'd only trim a suffix
1072: if space were very, very tight.
1073: */
1074: private String suffix = null;
1075:
1076: /**
1077: True if the suffix goes at the end, after the query.
1078: False if the suffix goes before the query.
1079: */
1080: private boolean suffixAtEnd;
1081:
1082: /** Appended to mainPart if necessary to create a unique file name.*/
1083: private String uniquePart = null;
1084:
1085: /**
1086: Creates an EndSegment.
1087: @param uriPath the path part of the URI
1088: @param beginIndex the beginning index, inclusive, of the substring
1089: of uriPath to be used
1090: @param endIndex the ending index, exclusive, of the substring
1091: of uriPath to be used
1092: @param maxSegLen the maximum number of characters allowed in one
1093: file system path segment (component)
1094: @param caseSensitive if true, the file system is assumed to be
1095: case-sensitive; otherwise the file system is assumed to be
1096: case-insensitive but case-preserving
1097: @param curi the URI
1098: @param characterMap maps characters (as length-1 String values) in
1099: the URI path and query to replacement String values
1100: @param dotBegin if non-null, this replaces a '.' at
1101: the beginning of the segment
1102: @param query the query part of the URI, or null if none
1103: @param suffix if non-null, use this as the suffix in preference to
1104: any suffix that uriPath might have
1105: @param maxPathLen the maximum number of characters allowed in a
1106: file system path
1107: @param suffixAtEnd if true, the suffix is placed at the end of the
1108: path, after the query (if any); otherwise, the suffix is placed
1109: before the query
1110: @throws IllegalArgumentException if
1111: beginIndex is negative.
1112: @throws IllegalArgumentException if
1113: endIndex is less than beginIndex.
1114: @throws IllegalArgumentException if
1115: maxSegLen is too small.
1116: */
1117: EndSegment(String uriPath, int beginIndex, int endIndex,
1118: int maxSegLen, boolean caseSensitive, CrawlURI curi,
1119: Map characterMap, String dotBegin, String query,
1120: String suffix, int maxPathLen, boolean suffixAtEnd) {
1121: super (maxSegLen - 1, caseSensitive, curi);
1122: int mpe = endIndex; // endIndex for the main part (no suffix).
1123: int ldi = uriPath.lastIndexOf('.'); // Index of last dot.
1124: if ((ldi > 0) && (ldi < (endIndex - 1))
1125: && (ldi > beginIndex)) {
1126: mpe = ldi; // uriPath has a suffix.
1127: }
1128: this .suffix = suffix;
1129: if ((null == this .suffix) && (mpe < (endIndex - 1))) {
1130:
1131: // There's no replacement suffix and uriPath has a suffix.
1132: // Run it through a LumpyString to do the character mapping.
1133: LumpyString ls = new LumpyString(uriPath, mpe + 1,
1134: endIndex, 0, this .maxSegLen, characterMap, null);
1135: this .suffix = ls.toString();
1136: }
1137: int pad = ((null == this .suffix) ? 0 : (1 + this .suffix
1138: .length()))
1139: + ((null == query) ? 0 : query.length());
1140: mainPart = new LumpyString(uriPath, beginIndex, mpe, pad,
1141: this .maxSegLen, characterMap, dotBegin);
1142: this .maxPathLen = maxPathLen - 1;
1143: if (null != query) {
1144: this .query = new LumpyString(query, 0, query.length(),
1145: 0, this .maxSegLen, characterMap, null);
1146: }
1147: this .suffixAtEnd = suffixAtEnd;
1148: }
1149:
1150: void addToPath(URIToFileReturn currentPath) {
1151: File fsf = currentPath.getFile();
1152: NumberFormat nf = null;
1153: dirPathLen = 1 + fsf.getPath().length();
1154: for (int i = 0;; ++i) {
1155: if (0 != i) {
1156: if (null == nf) {
1157: nf = NumberFormat.getIntegerInstance();
1158: }
1159: uniquePart = nf.format(i);
1160: }
1161: trimWithPadding((null == uniquePart) ? 0 : uniquePart
1162: .length());
1163: String segStr = joinParts(); // This EndSegment as a String.
1164: File f = new File(fsf, segStr);
1165:
1166: // Code for whether file exists.
1167: int er = existsMaybeCaseSensitive(fsf, segStr, f);
1168: switch (er) {
1169: case EXISTS_NOT:
1170: currentPath.append(f, segStr);
1171: return;
1172:
1173: case EXISTS_EXACT_MATCH:
1174: if (f.isFile()) {
1175: currentPath.append(f, segStr);
1176: return;
1177: }
1178:
1179: /*
1180: A file already exists but isn't an ordinary file.
1181: It might be a directory, special file, named pipe,
1182: whatever.
1183: We need to find an unused file name,
1184: or an ordinary file.
1185: Going around the loop should eventually do that.
1186: */
1187: break;
1188:
1189: case EXISTS_CASE_INSENSITIVE_MATCH:
1190: /*
1191: A file already exists that's a case-insensitive match
1192: but not an exact match.
1193: This could arise, on a case-insensitive, case-preserving
1194: file system (such as Macintosh HFS+). For example,
1195: http://foo.com/files.zip
1196: followed by
1197: http://foo.com/FILES.ZIP
1198: would do it. We want files.zip and FILES.ZIP to turn into
1199: different files. Going around the loop should eventually
1200: do that.
1201: */
1202: break;
1203:
1204: default:
1205: throw new IllegalStateException("Code: " + er);
1206: }
1207: }
1208: }
1209:
1210: /**
1211: Creates a simple file name from the parts of this EndSegment.
1212: @return a simple file name constructed from the main part,
1213: unique part, query, and suffix
1214: */
1215: private String joinParts() {
1216: StringBuffer sb = new StringBuffer(length());
1217: sb.append(mainPart.asStringBuffer());
1218: if (null != uniquePart) {
1219: sb.append(uniquePart);
1220: }
1221: if (suffixAtEnd) {
1222: if (null != query) {
1223: sb.append(query);
1224: }
1225: if (null != suffix) {
1226: sb.append('.');
1227: sb.append(suffix);
1228: }
1229: } else {
1230: if (null != suffix) {
1231: sb.append('.');
1232: sb.append(suffix);
1233: }
1234: if (null != query) {
1235: sb.append(query);
1236: }
1237: }
1238: return sb.toString();
1239: }
1240:
1241: /**
1242: Gets the number of available character positions.
1243: If this EndSegment were converted to a path,
1244: it would have a path length and a segment length.
1245: There are two constraints: maxSegLen and maxPathLen.
1246: The number of character positions available before bumping
1247: into the lower constraint is computed.
1248: @return the number of available positions, which may be negative
1249: */
1250: private int lenAvail() {
1251: int len = length();
1252: return Math.min(maxSegLen - len, maxPathLen - dirPathLen
1253: - len);
1254: }
1255:
1256: /**
1257: Gets the length of the simple file name that would be
1258: created for this EndSegment.
1259: @return the length
1260: */
1261: private int length() {
1262: int r = mainPart.length(); // Return value.
1263: if (null != uniquePart) {
1264: r += uniquePart.length();
1265: }
1266: if (null != query) {
1267: r += query.length();
1268: }
1269: if (null != suffix) {
1270: r += 1 + suffix.length(); // 1 for the '.'
1271: }
1272: return r;
1273: }
1274:
1275: /**
1276: Trims this EndSegment so a given number of characters are available.
1277: After trimming, there will be room for at least
1278: padding more characters before one of the constraints is
1279: encountered.
1280: The choices for trimming, in priority order, are:
1281: <ol>
1282: <li>Shorten the query.</li>
1283: <li>Remove the query.</li>
1284: <li>Shorten the main part.</li>
1285: <li>Shorten the suffix.</li>
1286: </ol>
1287: @param padding the number of character positions that need to be
1288: available
1289: @throws IllegalStateException
1290: if it's impossible to trim enough
1291: */
1292: private void trimWithPadding(int padding) {
1293: assert padding >= 0 : "padding: " + padding;
1294: int la = lenAvail();
1295: if (la >= padding) {
1296: return;
1297: }
1298:
1299: // We need space for (padding - la) characters.
1300: // la might be negative.
1301: if (null != query) {
1302: query.trimToMax(Math.max(0, query.length()
1303: - (padding - la)));
1304: if (0 == query.length()) {
1305: query = null;
1306: }
1307: la = lenAvail();
1308: if (la >= padding) {
1309: return;
1310: }
1311: }
1312: mainPart.trimToMax(Math.max(1, mainPart.length()
1313: - (padding - la)));
1314: la = lenAvail();
1315: if (la >= padding) {
1316: return;
1317: }
1318: if (null != suffix) {
1319: suffix = suffix.substring(0, Math.max(1, suffix
1320: .length()
1321: - (padding - la)));
1322: la = lenAvail();
1323: if (la >= padding) {
1324: return;
1325: }
1326: }
1327: throw new IllegalStateException("Can not trim "
1328: + curi.toString());
1329: }
1330: }
1331:
/**
 This class represents a dynamically growable string
 consisting of substrings ("lumps") that
 are treated atomically. If the string is shortened, then an entire
 lump is removed. The intent is to treat each %XX escape as a lump.
 This class also allows single characters in a source string to be
 re-mapped to a different string, possibly containing more than
 one character.
 Each re-mapped character is also treated as a lump.
 <p>
 For example, suppose part of a URI, between two slashes, is
 <code>/VeryLongString...%3A/</code>.
 We want to create a corresponding file system directory, but the string
 is a little longer than the allowed maximum.
 It's better to trim the entire
 <code>%3A</code>
 off the end than part of it.
 This is especially true if, later, we need to append some digits
 to create a unique directory name.
 So we treat the entire
 <code>%3A</code>
 as one lump.
 */
class LumpyString {
    /**
     Lumps are indicated by an auxiliary array aux[],
     indexed the same as the string. The LUMP_BEGIN bit is set
     for a position in the string at which a lump begins.
     */
    private static final byte LUMP_BEGIN = 0x1;

    /** Bit set for the end of a lump. */
    private static final byte LUMP_END = 0x2;

    /**
     Bit set for all characters in a lump of length greater than 1,
     except the beginning and ending characters.
     */
    private static final byte LUMP_MID = 0x4;

    /** The auxiliary array. */
    private byte[] aux;

    /** Holds the string. */
    private StringBuffer string;

    /**
     Creates a LumpyString.
     Source characters are converted to lumps one after another;
     conversion stops at the first lump that would make the string
     longer than maxLen.
     @param str the source string
     @param beginIndex the beginning index, inclusive, of the substring
     of str to be used
     @param endIndex the ending index, exclusive, of the substring
     of str to be used
     @param padding reserve this many additional character positions
     before dynamic growth is needed
     @param maxLen the maximum string length, regardless of the
     values of beginIndex, endIndex, and padding
     @param characterMap maps from characters in the source string
     (represented as length-one String values) to replacement String
     values (length at least 1).
     Each replacement string is treated as one lump.
     This is intended to cope with characters that a file system
     does not allow.
     @param dotBegin if non-null, this replaces a '.' at
     <code>str[beginIndex]</code>
     @throws IllegalArgumentException if
     beginIndex is negative.
     @throws IllegalArgumentException if
     endIndex is less than beginIndex.
     @throws IllegalArgumentException if
     padding is negative.
     @throws IllegalArgumentException if
     maxLen is less than one.
     @throws IllegalArgumentException if
     characterMap is null.
     @throws IllegalArgumentException if
     dotBegin is non-null but empty.
     */
    LumpyString(String str, int beginIndex, int endIndex,
            int padding, int maxLen, Map characterMap,
            String dotBegin) {
        if (beginIndex < 0) {
            throw new IllegalArgumentException("beginIndex < 0: "
                    + beginIndex);
        }
        if (endIndex < beginIndex) {
            // The two values are separated by a space so the message
            // stays readable.
            throw new IllegalArgumentException(
                    "endIndex < beginIndex beginIndex: "
                    + beginIndex + " endIndex: " + endIndex);
        }
        if (padding < 0) {
            throw new IllegalArgumentException("padding < 0: "
                    + padding);
        }
        if (maxLen < 1) {
            throw new IllegalArgumentException("maxLen < 1: "
                    + maxLen);
        }
        if (null == characterMap) {
            throw new IllegalArgumentException("characterMap null");
        }
        if ((null != dotBegin) && (0 == dotBegin.length())) {
            throw new IllegalArgumentException("dotBegin empty");
        }

        // Initial capacity. Leave some room for %XX lumps.
        // Guaranteed positive.
        int cap = Math.min(2 * (endIndex - beginIndex) + padding
                + 1, maxLen);
        string = new StringBuffer(cap);
        aux = new byte[cap];
        for (int i = beginIndex; i != endIndex; ++i) {
            String s = str.substring(i, i + 1);
            String lump; // Next lump.
            if (".".equals(s) && (i == beginIndex)
                    && (null != dotBegin)) {
                lump = dotBegin;
            } else {
                lump = (String) characterMap.get(s);
            }
            if (null == lump) {
                if ("%".equals(s)
                        && ((endIndex - i) > 2)
                        && (-1 != Character.digit(
                                str.charAt(i + 1), 16))
                        && (-1 != Character.digit(
                                str.charAt(i + 2), 16))) {

                    // %XX escape; treat as one lump.
                    lump = str.substring(i, i + 3);
                    i += 2; // Skip the two hex digits just consumed.
                } else {
                    lump = s;
                }
            }
            if ((string.length() + lump.length()) > maxLen) {
                // This lump doesn't fit; the rest of the source is dropped.
                assert checkInvariants();
                return;
            }
            append(lump);
        }
        assert checkInvariants();
    }

    /**
     Converts this LumpyString to a String.
     @return the current string contents
     */
    public String toString() {
        assert checkInvariants();
        return string.toString();
    }

    /**
     Appends one lump to the end of this string.
     No maximum length is enforced here.
     @param lump the lump (substring) to append
     @throws IllegalArgumentException if
     lump is null or empty.
     */
    void append(String lump) {
        if (null == lump) {
            throw new IllegalArgumentException("lump null");
        }
        int lumpLen = lump.length();
        if (0 == lumpLen) {
            throw new IllegalArgumentException("lump empty");
        }
        int pos = string.length(); // Current end of string.
        ensureCapacity(pos + lumpLen);
        if (1 == lumpLen) {
            // A single character both begins and ends its lump.
            aux[pos] = LUMP_BEGIN | LUMP_END;
        } else {
            assert lumpLen > 1;
            aux[pos] = LUMP_BEGIN;
            ++pos;
            for (int i = lumpLen - 2; 0 != i; --i) {
                aux[pos] = LUMP_MID;
                ++pos;
            }
            aux[pos] = LUMP_END;
        }
        string.append(lump);
        assert checkInvariants();
    }

    /**
     Returns the string as a StringBuffer.
     This is the internal buffer itself, not a copy, so
     the caller should <em>not</em> modify the return value.
     @return the string
     */
    StringBuffer asStringBuffer() {
        return string;
    }

    /**
     Tests if this string ends with a character.
     @param ch the character to test for
     @return true if and only if this string ends with ch
     */
    boolean endsWith(char ch) {
        assert checkInvariants();
        int len = string.length();
        return (0 != len) && (string.charAt(len - 1) == ch);
    }

    /**
     Prepends one character, as a lump, to this string.
     @param ch the character to prepend
     */
    void prepend(char ch) {
        assert checkInvariants();
        int oldLen = string.length();
        ensureCapacity(1 + oldLen);
        string.insert(0, ch);

        // Shift the lump markers right by one to stay aligned
        // with the characters.
        System.arraycopy(aux, 0, aux, 1, oldLen);
        aux[0] = LUMP_BEGIN | LUMP_END;
        assert checkInvariants();
    }

    /**
     Gets the length of this string.
     @return the number of characters in this string
     */
    int length() {
        assert checkInvariants();
        return string.length();
    }

    /**
     If necessary, trims this string to a maximum length.
     Any trimming is done by removing one or more complete
     lumps from the end of this string.
     @param maxLen the new maximum length.
     After trimming, the actual length of this string will be
     at most maxLen.
     @throws IllegalArgumentException if
     maxLen is negative.
     */
    void trimToMax(int maxLen) {
        if (maxLen < 0) {
            throw new IllegalArgumentException("maxLen < 0: "
                    + maxLen);
        }
        assert checkInvariants();
        int cl = string.length(); // Current length.
        if (cl > maxLen) {
            int nl = maxLen; // New length.

            // Back up until the new last character ends a lump,
            // so no lump is cut in half.
            while ((0 != nl)
                    && (LUMP_END != (aux[nl - 1] & LUMP_END))) {
                --nl;
            }
            for (int i = nl; i != cl; ++i) {
                aux[i] = 0;
            }
            string.setLength(nl);
        }
        assert checkInvariants();
    }

    /**
     Checks some assertions on the instance variables.
     The intended usage is
     <code>assert checkInvariants();</code>
     so that if assertions are off, no call is made.
     @return true
     */
    private boolean checkInvariants() {

        // There's an aux[] element for every character in the StringBuffer.
        assert aux.length >= string.length() : "aux.length: "
                + aux.length + " string.length(): "
                + string.length();

        // The first character starts a lump.
        assert (0 == string.length())
                || (LUMP_BEGIN == (aux[0] & LUMP_BEGIN)) : "aux[0]: "
                + aux[0];

        // The last character ends a lump.
        assert (0 == string.length())
                || (LUMP_END == (aux[string.length() - 1] & LUMP_END)) : "aux[end]: "
                + aux[string.length() - 1];
        return true;
    }

    /**
     Ensures that the capacity is at least equal to the specified minimum.
     The auxiliary array grows by doubling; the StringBuffer is asked
     for the same minimum.
     @param minCapacity the minimum desired capacity
     */
    private void ensureCapacity(int minCapacity) {
        assert checkInvariants();
        if (minCapacity > aux.length) {
            int nc = 2 * aux.length; // New capacity.
            while (nc < minCapacity) {
                nc *= 2;
            }
            byte[] oldAux = aux;
            aux = new byte[nc];
            System.arraycopy(oldAux, 0, aux, 0, string.length());
        }
        string.ensureCapacity(minCapacity);
        assert checkInvariants();
    }
}
1636:
/**
 This class is returned by uriToFile.
 It represents a file system path, both as a File and as
 a path relative to the base directory.
 */
class URIToFileReturn {
    /** The file system path as a File.*/
    private File filePath;

    /** The relative path from baseDir.*/
    private StringBuffer relativePath = new StringBuffer(255);

    /**
     Creates a URIToFileReturn.
     @param baseDir the path to the starting directory;
     one trailing File.separator, if present, is ignored
     @param host the host part of the URI, or null if the host name
     should not be part of the path
     @param port the port part of the URI; a value of 0 or less
     (conventionally -1) leaves the port out of the path
     */
    URIToFileReturn(String baseDir, String host, int port) {

        // The initial path.
        StringBuffer startPath = new StringBuffer(
                baseDir.length() + 32);
        startPath.append(baseDir);
        if (baseDir.endsWith(File.separator)) {
            assert 1 != baseDir.length();
            startPath.deleteCharAt(startPath.length() - 1);
        }
        if (null != host) {
            startPath.append(File.separatorChar);
            startPath.append(host);
            appendRelative(host);
        }
        if (port > 0) {
            startPath.append(File.separatorChar);
            startPath.append(port);

            // Without a host directory the port is the first relative
            // segment, so it must not be preceded by a separator.
            appendRelative(Integer.toString(port));
        }
        filePath = new File(startPath.toString());
    }

    /**
     Appends one more segment to this path.
     @param f a File representing the path with the next segment added
     @param nextSegment the next segment
     */
    void append(File f, String nextSegment) {
        filePath = f;
        appendRelative(nextSegment);
    }

    /**
     Gets this path as a File.
     @return this path
     */
    File getFile() {
        return filePath;
    }

    /**
     Gets this path as a relative path from the base directory.
     @return the relative path
     */
    String getRelativePath() {
        return relativePath.toString();
    }

    /**
     Tests if this path is longer than a given value.
     @param maxLen the value to test
     @return true if and only if this path is longer than maxLen
     */
    boolean longerThan(int maxLen) {
        return filePath.getPath().length() > maxLen;
    }

    /**
     Creates all directories in this path as needed.
     @throws IOException
     if a needed directory could not be created
     @throws IOException
     if a needed directory is not writeable
     @throws IOException
     if a non-directory file exists
     with the same path as a needed directory
     */
    void mkdirs() throws IOException {
        if (!filePath.exists()) {

            // File.mkdirs also returns false when the directory appeared
            // after the exists() check above; that is not a failure as
            // long as the directory is there now.
            if (!filePath.mkdirs() && !filePath.isDirectory()) {
                throw new IOException("Can not mkdir "
                        + filePath.getAbsolutePath());
            }
        } else if (!filePath.canWrite()) {
            throw new IOException("Directory "
                    + filePath.getAbsolutePath()
                    + " not writeable.");
        } else if (!filePath.isDirectory()) {
            throw new IOException("File "
                    + filePath.getAbsolutePath()
                    + " is not a directory.");
        }
    }

    /**
     Adds a segment to the relative path, inserting a separator
     only between segments and never at the front.
     @param segment the text to add
     */
    private void appendRelative(String segment) {
        if (0 != relativePath.length()) {
            relativePath.append(File.separatorChar);
        }
        relativePath.append(segment);
    }
}
1746: }
|