/* RegexpLineIterator
02: *
03: * $Id: RegexpLineIterator.java 4650 2006-09-25 18:09:42Z paul_jack $
04: *
05: * Created on Jul 27, 2004
06: *
07: * Copyright (C) 2004 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.util.iterator;
26:
27: import java.util.Iterator;
28: import java.util.logging.Logger;
29: import java.util.regex.Matcher;
30: import java.util.regex.Pattern;
31:
32: /**
33: * Utility class providing an Iterator interface over line-oriented
34: * text input. By providing regexps indicating lines to ignore
35: * (such as pure whitespace or comments), lines to consider input, and
36: * what to return from the input lines (such as a whitespace-trimmed
37: * non-whitespace token with optional trailing comment), this can
38: * be configured to handle a number of formats.
39: *
40: * The public static members provide pattern configurations that will
41: * be helpful in a wide variety of contexts.
42: *
43: * @author gojomo
44: */
45: public class RegexpLineIterator extends
46: TransformingIteratorWrapper<String, String> {
47: private static final Logger logger = Logger
48: .getLogger(RegexpLineIterator.class.getName());
49:
50: public static final String COMMENT_LINE = "\\s*(#.*)?";
51: public static final String NONWHITESPACE_ENTRY_TRAILING_COMMENT = "^\\s*(\\S+)\\s*(#.*)?$";
52: public static final String TRIMMED_ENTRY_TRAILING_COMMENT = "^\\s*([^#]+?)\\s*(#.*)?$";
53:
54: public static final String ENTRY = "$1";
55:
56: protected Matcher ignoreLine = null;
57: protected Matcher extractLine = null;
58: protected String outputTemplate = null;
59:
60: public RegexpLineIterator(Iterator<String> inner, String ignore,
61: String extract, String replace) {
62: this .inner = inner;
63: ignoreLine = Pattern.compile(ignore).matcher("");
64: extractLine = Pattern.compile(extract).matcher("");
65: outputTemplate = replace;
66: }
67:
68: /**
69: * Loads next item into lookahead spot, if available. Skips
70: * lines matching ignoreLine; extracts desired portion of
71: * lines matching extractLine; informationally reports any
72: * lines matching neither.
73: *
74: * @return whether any item was loaded into next field
75: */
76: protected String transform(String line) {
77: ignoreLine.reset(line);
78: if (ignoreLine.matches()) {
79: return null;
80: }
81: extractLine.reset(line);
82: if (extractLine.matches()) {
83: StringBuffer output = new StringBuffer();
84: // TODO: consider if a loop that find()s all is more
85: // generally useful here
86: extractLine.appendReplacement(output, outputTemplate);
87: return output.toString();
88: }
89: // no match; possibly error
90: logger.info("nonsense line: " + line);
91: return null;
92: }
93: }
|