001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.util;
018:
019: import org.apache.regexp.RE;
020: import org.apache.regexp.RECompiler;
021: import org.apache.regexp.REProgram;
022:
023: import java.util.HashMap;
024: import java.util.Map;
025:
026: /**
027: * This class is an utility class that perform wildcard-patterns matching and isolation.
028: *
029: * @version $Id: WildcardMatcherHelper.java 448573 2006-09-21 14:52:23Z anathaniel $
030: */
031: public class WildcardMatcherHelper {
032: //~ Static fields/initializers -----------------------------------------------------------------
033:
034: /** Default path separator: "/" */
035: public static final char ESC = '\\';
036:
037: /** Default path separator: "/" */
038: public static final char PATHSEP = '/';
039:
040: /** Default path separator: "/" */
041: public static final char STAR = '*';
042:
043: //~ Methods ------------------------------------------------------------------------------------
044:
045: /**
046: * Match a pattern agains a string and isolates wildcard replacement into a <code>Map</code>.
047: * <br>
048: * Here is how the matching algorithm works:
049: *
050: * <ul>
051: * <li>
052: * The '*' character, meaning that zero or more characters (excluding the path separator '/')
053: * are to be matched.
054: * </li>
055: * <li>
056: * The '**' sequence, meaning that zero or more characters (including the path separator '/')
057: * are to be matched.
058: * </li>
059: * <li>
060: * The '\*' sequence is honored as a litteral '*' character, not a wildcard
061: * </li>
062: * </ul>
063: * <br>
064: * When more than two '*' characters, not separated by another character, are found their value is
065: * considered as '**' and immediate succeeding '*' are skipped.
066: * <br>
067: * The '**' wildcard is greedy and thus the following sample matches as {"foo/bar","baz","bug"}:
068: * <dl>
069: * <dt>pattern</dt>
070: * <dd>STAR,STAR,PATHSEP,STAR,PATHSEP,STAR,STAR (why can't I express it litterally?)</dt>
071: * <dt>string</dt>
072: * <dd>foo/bar/baz/bug</dt>
073: * </dl>
074: * The first '**' in the pattern will suck up as much as possible without making the match fail.
075: *
076: * @param pat The pattern string.
077: * @param str The string to math agains the pattern
078: *
079: * @return a <code>Map</code> containing the representation of the extracted pattern. The extracted patterns are
080: * keys in the <code>Map</code> from left to right beginning with "1" for te left most, "2" for the next,
081: * a.s.o. The key "0" is the string itself. If the return value is null, string does not match to the
082: * pattern .
083: */
084: public static Map match(final String pat, final String str) {
085: Matcher matcher;
086: synchronized (cache) {
087: matcher = (Matcher) cache.get(pat);
088: if (matcher == null) {
089: matcher = new Matcher(pat);
090: cache.put(pat, matcher);
091: }
092: }
093:
094: String[] list = matcher.getMatches(str);
095: if (list == null)
096: return null;
097:
098: int n = list.length;
099: Map map = new HashMap(n * 2 + 1);
100: for (int i = 0; i < n; i++) {
101: map.put(String.valueOf(i), list[i]);
102: }
103:
104: return map;
105: }
106:
107: /** Cache for compiled pattern matchers */
108: private static final Map cache = new HashMap();
109:
110: //~ Inner Classes ------------------------------------------------------------------------------
111:
112: /**
113: * The private matcher class
114: */
115: private static class Matcher {
116:
117: /** Regexp to split constant parts from front and back leaving wildcards in the middle. */
118: private static final REProgram splitter;
119:
120: static {
121: final String fixedRE = "([^*\\\\]*)";
122: final String wcardRE = "(.*[*\\\\])";
123: final String splitRE = "^" + fixedRE + wcardRE + fixedRE
124: + "$";
125: splitter = new RECompiler().compile(splitRE);
126: }
127:
128: /** Wildcard types to short-cut simple '*' and "**' matches. */
129: private static final int WC_CONST = 0;
130: private static final int WC_STAR = 1;
131: private static final int WC_STARSTAR = 2;
132: private static final int WC_REGEXP = 3;
133:
134: //~ Instance fields ------------------------------------------------------------------------
135:
136: // All fields declared final to emphasize requirement to be thread-safe.
137:
138: /** Fixed text at start of pattern. */
139: private final String prefix;
140:
141: /** Fixed text at end of pattern. */
142: private final String suffix;
143:
144: /** Length of prefix and suffix. */
145: private final int fixlen;
146:
147: /** Wildcard type of pattern. */
148: private final int wctype;
149:
150: /** Compiled regexp equivalent to wildcard pattern between prefix and suffix. */
151: private final REProgram regexp;
152:
153: //~ Constructors ---------------------------------------------------------------------------
154:
155: /**
156: * Creates a new Matcher object.
157: *
158: * @param pat The pattern
159: * @param str The string
160: */
161: Matcher(final String pat) {
162: RE re = new RE(splitter);
163:
164: if (re.match(pat)) {
165:
166: // Split pattern into (foo/)(*)(/bar).
167:
168: prefix = re.getParen(1);
169: String wildcard = re.getParen(2);
170: String tail = re.getParen(3);
171:
172: // If wildcard ends with \ then add the first char of postfix to wildcard.
173: if (tail.length() != 0
174: && wildcard.charAt(wildcard.length() - 1) == ESC) {
175: wildcard = wildcard + tail.substring(0, 1);
176: suffix = tail.substring(1);
177: } else {
178: suffix = tail;
179: }
180:
181: // Use short-cuts for single * or ** wildcards
182:
183: if (wildcard.equals("*")) {
184: wctype = WC_STAR;
185: regexp = null;
186: } else if (wildcard.equals("**")) {
187: wctype = WC_STARSTAR;
188: regexp = null;
189: } else {
190: wctype = WC_REGEXP;
191: regexp = compileRegexp(wildcard);
192: }
193: } else {
194: // Pattern is a constant without '*' or '\'.
195: prefix = pat;
196: suffix = "";
197: wctype = WC_CONST;
198: regexp = null;
199: }
200:
201: fixlen = prefix.length() + suffix.length();
202: }
203:
204: //~ Methods --------------------------------------------------------------------------------
205:
206: /**
207: * Match string against pattern.
208: *
209: * @param str The string
210: * @return list of wildcard matches, null if match failed
211: */
212: String[] getMatches(final String str) {
213:
214: // Protect against 'foo' matching 'foo*foo'.
215: if (str.length() < fixlen)
216: return null;
217:
218: if (!str.startsWith(prefix))
219: return null;
220:
221: if (!str.endsWith(suffix))
222: return null;
223:
224: String infix = str.substring(prefix.length(), str.length()
225: - suffix.length());
226:
227: if (wctype == WC_REGEXP) {
228: RE re = new RE(regexp);
229: if (!re.match(infix))
230: return null;
231:
232: int n = re.getParenCount();
233: String[] list = new String[n];
234: list[0] = str;
235: for (int i = 1; i < n; i++)
236: list[i] = re.getParen(i);
237: return list;
238: }
239:
240: if (wctype == WC_CONST) {
241: if (infix.length() != 0)
242: return null;
243: return new String[] { str };
244: }
245:
246: if (wctype == WC_STAR) {
247: if (infix.indexOf(PATHSEP) != -1)
248: return null;
249: }
250:
251: return new String[] { str, infix };
252: }
253: }
254:
255: /**
256: * Compile wildcard pattern into regexp pattern.
257: *
258: * @param pat The wildcard pattern
259: * @return compiled regexp program.
260: */
261: private static REProgram compileRegexp(String pat) {
262: StringBuffer repat = new StringBuffer(pat.length() * 6);
263: repat.append('^');
264:
265: // Add an extra character to allow unchecked wcpat[i+1] accesses.
266: // Unterminated ESC sequences are silently handled as '\\'.
267: char[] wcpat = (pat + ESC).toCharArray();
268: for (int i = 0, n = pat.length(); i < n; i++) {
269: char ch = wcpat[i];
270:
271: if (ch == STAR) {
272: if (wcpat[i + 1] != STAR) {
273: repat.append("([^/]*)");
274: continue;
275: }
276:
277: // Handle two and more '*' as single '**'.
278: while (wcpat[i + 1] == STAR)
279: i++;
280: repat.append("(.*)");
281: continue;
282: }
283:
284: // Match ESC+ESC and ESC+STAR as literal ESC and STAR which needs to be escaped
285: // in regexp. Match ESC+other as two characters ESC+other where other may also
286: // need to be escaped in regexp.
287: if (ch == ESC) {
288: ch = wcpat[++i];
289: if (ch != ESC && ch != STAR)
290: repat.append("\\\\");
291: }
292:
293: if (ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z'
294: || ch >= '0' && ch <= '9' || ch == '/') {
295: repat.append(ch);
296: continue;
297: }
298:
299: repat.append('\\');
300: repat.append(ch);
301: }
302: repat.append('$');
303:
304: return new RECompiler().compile(repat.toString());
305: }
306: }
|