001: /* RegexRule
002: *
003: * Created on Oct 6, 2004
004: *
005: * Copyright (C) 2004 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.url.canonicalize;
024:
025: import java.util.logging.Logger;
026: import java.util.regex.Matcher;
027:
028: import org.archive.crawler.settings.SimpleType;
029: import org.archive.util.TextUtils;
030:
031: /**
032: * General conversion rule.
033: * @author stack
034: * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
035: */
036: public class RegexRule extends BaseRule {
037:
038: private static final long serialVersionUID = -2658094415450237847L;
039:
040: protected static Logger logger = Logger.getLogger(BaseRule.class
041: .getName());
042: private static final String DESCRIPTION = "General regex rule. "
043: + "Specify a matching regex and a format string used outputting"
044: + " result if a match was found. If problem compiling regex or"
045: + " interpreting format, problem is logged, and this rule does"
046: + " nothing. See User Manual for example usage.";
047: private static final String ATTR_REGEX = "matching-regex";
048: private static final String ATTR_FORMAT = "format";
049: private static final String ATTR_COMMENT = "comment";
050:
051: public RegexRule(String name) {
052: this (name, "(.*)", "${1}");
053: }
054:
055: protected RegexRule(String name, String defaultRegex,
056: String defaultFormat) {
057: super (name, DESCRIPTION);
058: addElementToDefinition(new SimpleType(
059: ATTR_REGEX,
060: "Java regular expression. If the regex matches, we'll rewrite"
061: + " the passed url using the specified format pattern.",
062: defaultRegex));
063: addElementToDefinition(new SimpleType(
064: ATTR_FORMAT,
065: "Pattern to use rewriting matched"
066: + "url. Use '${1}' to match first regex group, '${2}' for"
067: + "next group, etc.", defaultFormat));
068: addElementToDefinition(new SimpleType(ATTR_COMMENT,
069: "Free-text comment on why this rule was added.", ""));
070: }
071:
072: public String canonicalize(String url, Object context) {
073: String regex = getNullOrAttribute(ATTR_REGEX, context);
074: if (regex == null) {
075: return url;
076: }
077: String format = getNullOrAttribute(ATTR_FORMAT, context);
078: if (format == null) {
079: return url;
080: }
081: Matcher matcher = TextUtils.getMatcher(regex, url);
082: String retVal;
083: if (matcher == null || !matcher.matches()) {
084: retVal = url;
085: } else {
086: StringBuffer buffer = new StringBuffer(url.length() * 2);
087: format(matcher, format, buffer);
088: retVal = buffer.toString();
089: }
090: TextUtils.recycleMatcher(matcher);
091: return retVal;
092: }
093:
094: /**
095: * @param matcher Matched matcher.
096: * @param format Output format specifier.
097: * @param buffer Buffer to append output to.
098: */
099: protected void format(Matcher matcher, String format,
100: StringBuffer buffer) {
101: for (int i = 0; i < format.length(); i++) {
102: switch (format.charAt(i)) {
103: case '\\':
104: if ((i + 1) < format.length()
105: && format.charAt(i + 1) == '$') {
106: // Don't write the escape character in output.
107: continue;
108: }
109:
110: case '$':
111: // Check to see if its not been escaped.
112: if (i == 0 || (i > 0 && (format.charAt(i - 1) != '\\'))) {
113: // Looks like we have a matching group specifier in
114: // our format string, something like '$2' or '${2}'.
115: int start = i + 1;
116: boolean curlyBraceStart = false;
117: if (format.charAt(start) == '{') {
118: start++;
119: curlyBraceStart = true;
120: }
121: int j = start;
122: for (; j < format.length()
123: && Character.isDigit(format.charAt(j)); j++) {
124: // While a digit, increment.
125: }
126: if (j > start) {
127: int groupIndex = Integer.parseInt(format
128: .substring(start, j));
129: if (groupIndex >= 0 && groupIndex < 256) {
130: String g = null;
131: try {
132: g = matcher.group(groupIndex);
133: } catch (IndexOutOfBoundsException e) {
134: logger
135: .warning("IndexOutOfBoundsException"
136: + " getting group "
137: + groupIndex
138: + " from "
139: + matcher.group(0)
140: + " with format of "
141: + format);
142: }
143: if (g != null) {
144: buffer.append(g);
145: }
146: // Skip closing curly bracket if one.
147: if (curlyBraceStart
148: && format.charAt(j) == '}') {
149: j++;
150: }
151: // Update the loop index so that we skip over
152: // the ${x} group item.
153: i = (j - 1);
154: // Don't fall through to the default.
155: continue;
156: }
157: }
158:
159: }
160: // Let fall through to default rule. The '$' was escaped.
161:
162: default:
163: buffer.append(format.charAt(i));
164: }
165: }
166: }
167:
168: protected String getNullOrAttribute(String name, Object context) {
169: try {
170: return (String) getAttribute(context, name);
171: } catch (Exception e) {
172: logger.severe(e.getMessage());
173: return null;
174: }
175: }
176: }
|