001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.xerces.utils.regex;
059:
060: import java.text.CharacterIterator;
061:
062: public final class REUtil {
063: private REUtil() {
064: }
065:
066: static final int composeFromSurrogates(int high, int low) {
067: return 0x10000 + ((high - 0xd800) << 10) + low - 0xdc00;
068: }
069:
070: static final boolean isLowSurrogate(int ch) {
071: return (ch & 0xfc00) == 0xdc00;
072: }
073:
074: static final boolean isHighSurrogate(int ch) {
075: return (ch & 0xfc00) == 0xd800;
076: }
077:
078: static final String decomposeToSurrogates(int ch) {
079: char[] chs = new char[2];
080: ch -= 0x10000;
081: chs[0] = (char) ((ch >> 10) + 0xd800);
082: chs[1] = (char) ((ch & 0x3ff) + 0xdc00);
083: return new String(chs);
084: }
085:
086: static final String substring(CharacterIterator iterator,
087: int begin, int end) {
088: char[] src = new char[end - begin];
089: for (int i = 0; i < src.length; i++)
090: src[i] = iterator.setIndex(i + begin);
091: return new String(src);
092: }
093:
094: // ================================================================
095:
096: static final int getOptionValue(int ch) {
097: int ret = 0;
098: switch (ch) {
099: case 'i':
100: ret = RegularExpression.IGNORE_CASE;
101: break;
102: case 'm':
103: ret = RegularExpression.MULTIPLE_LINES;
104: break;
105: case 's':
106: ret = RegularExpression.SINGLE_LINE;
107: break;
108: case 'x':
109: ret = RegularExpression.EXTENDED_COMMENT;
110: break;
111: case 'u':
112: ret = RegularExpression.USE_UNICODE_CATEGORY;
113: break;
114: case 'w':
115: ret = RegularExpression.UNICODE_WORD_BOUNDARY;
116: break;
117: case 'F':
118: ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION;
119: break;
120: case 'H':
121: ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
122: break;
123: case 'X':
124: ret = RegularExpression.XMLSCHEMA_MODE;
125: break;
126: case ',':
127: ret = RegularExpression.SPECIAL_COMMA;
128: break;
129: default:
130: }
131: return ret;
132: }
133:
134: static final int parseOptions(String opts) throws ParseException {
135: if (opts == null)
136: return 0;
137: int options = 0;
138: for (int i = 0; i < opts.length(); i++) {
139: int v = getOptionValue(opts.charAt(i));
140: if (v == 0)
141: throw new ParseException("Unknown Option: "
142: + opts.substring(i), -1);
143: options |= v;
144: }
145: return options;
146: }
147:
148: static final String createOptionString(int options) {
149: StringBuffer sb = new StringBuffer(9);
150: if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0)
151: sb.append((char) 'F');
152: if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0)
153: sb.append((char) 'H');
154: if ((options & RegularExpression.XMLSCHEMA_MODE) != 0)
155: sb.append((char) 'X');
156: if ((options & RegularExpression.IGNORE_CASE) != 0)
157: sb.append((char) 'i');
158: if ((options & RegularExpression.MULTIPLE_LINES) != 0)
159: sb.append((char) 'm');
160: if ((options & RegularExpression.SINGLE_LINE) != 0)
161: sb.append((char) 's');
162: if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0)
163: sb.append((char) 'u');
164: if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0)
165: sb.append((char) 'w');
166: if ((options & RegularExpression.EXTENDED_COMMENT) != 0)
167: sb.append((char) 'x');
168: if ((options & RegularExpression.SPECIAL_COMMA) != 0)
169: sb.append((char) ',');
170: return sb.toString().intern();
171: }
172:
173: // ================================================================
174:
175: static String stripExtendedComment(String regex) {
176: int len = regex.length();
177: StringBuffer buffer = new StringBuffer(len);
178: int offset = 0;
179: while (offset < len) {
180: int ch = regex.charAt(offset++);
181: // Skips a white space.
182: if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r'
183: || ch == ' ')
184: continue;
185:
186: if (ch == '#') { // Skips chracters between '#' and a line end.
187: while (offset < len) {
188: ch = regex.charAt(offset++);
189: if (ch == '\r' || ch == '\n')
190: break;
191: }
192: continue;
193: }
194:
195: int next; // Strips an escaped white space.
196: if (ch == '\\' && offset < len) {
197: if ((next = regex.charAt(offset)) == '#'
198: || next == '\t' || next == '\n' || next == '\f'
199: || next == '\r' || next == ' ') {
200: buffer.append((char) next);
201: offset++;
202: } else { // Other escaped character.
203: buffer.append((char) '\\');
204: buffer.append((char) next);
205: offset++;
206: }
207: } else
208: // As is.
209: buffer.append((char) ch);
210: }
211: return buffer.toString();
212: }
213:
214: // ================================================================
215:
216: /**
217: * Sample entry.
218: * <div>Usage: <KBD>org.apache.xerces.utils.regex.REUtil <regex> <string></KBD></div>
219: */
220: public static void main(String[] argv) {
221: String pattern = null;
222: try {
223: String options = "";
224: String target = null;
225: if (argv.length == 0) {
226: System.out
227: .println("Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String");
228: System.exit(0);
229: }
230: for (int i = 0; i < argv.length; i++) {
231: if (argv[i].length() == 0 || argv[i].charAt(0) != '-') {
232: if (pattern == null)
233: pattern = argv[i];
234: else if (target == null)
235: target = argv[i];
236: else
237: System.err.println("Unnecessary: " + argv[i]);
238: } else if (argv[i].equals("-i")) {
239: options += "i";
240: } else if (argv[i].equals("-m")) {
241: options += "m";
242: } else if (argv[i].equals("-s")) {
243: options += "s";
244: } else if (argv[i].equals("-u")) {
245: options += "u";
246: } else if (argv[i].equals("-w")) {
247: options += "w";
248: } else if (argv[i].equals("-X")) {
249: options += "X";
250: } else {
251: System.err.println("Unknown option: " + argv[i]);
252: }
253: }
254: RegularExpression reg = new RegularExpression(pattern,
255: options);
256: System.out.println("RegularExpression: " + reg);
257: Match match = new Match();
258: reg.matches(target, match);
259: for (int i = 0; i < match.getNumberOfGroups(); i++) {
260: if (i == 0)
261: System.out
262: .print("Matched range for the whole pattern: ");
263: else
264: System.out.print("[" + i + "]: ");
265: if (match.getBeginning(i) < 0)
266: System.out.println("-1");
267: else {
268: System.out.print(match.getBeginning(i) + ", "
269: + match.getEnd(i) + ", ");
270: System.out.println("\"" + match.getCapturedText(i)
271: + "\"");
272: }
273: }
274: } catch (ParseException pe) {
275: if (pattern == null) {
276: pe.printStackTrace();
277: } else {
278: System.err
279: .println("org.apache.xerces.utils.regex.ParseException: "
280: + pe.getMessage());
281: String indent = " ";
282: System.err.println(indent + pattern);
283: int loc = pe.getLocation();
284: if (loc >= 0) {
285: System.err.print(indent);
286: for (int i = 0; i < loc; i++)
287: System.err.print("-");
288: System.err.println("^");
289: }
290: }
291: } catch (Exception e) {
292: e.printStackTrace();
293: }
294: }
295:
296: static final int CACHESIZE = 20;
297: static RegularExpression[] regexCache = new RegularExpression[CACHESIZE];
298:
299: /**
300: * Creates a RegularExpression instance.
301: * This method caches created instances.
302: *
303: * @see org.apache.xerces.utils.regex.RegularExpression#RegularExpression(java.lang.String, java.lang.String)
304: */
305: public static RegularExpression createRegex(String pattern,
306: String options) throws ParseException {
307: RegularExpression re = null;
308: int intOptions = REUtil.parseOptions(options);
309: synchronized (REUtil.regexCache) {
310: int i;
311: for (i = 0; i < REUtil.CACHESIZE; i++) {
312: re = REUtil.regexCache[i];
313: if (re == null) {
314: i = -1;
315: break;
316: }
317: if (re.equals(pattern, intOptions))
318: break;
319: }
320: if (re != null) {
321: if (i != 0) {
322: System.arraycopy(REUtil.regexCache, 0,
323: REUtil.regexCache, 1, i);
324: REUtil.regexCache[0] = re;
325: }
326: } else {
327: re = new RegularExpression(pattern, options);
328: System.arraycopy(REUtil.regexCache, 0,
329: REUtil.regexCache, 1, REUtil.CACHESIZE - 1);
330: REUtil.regexCache[0] = re;
331: }
332: }
333: return re;
334: }
335:
336: /**
337: *
338: * @see org.apache.xerces.utils.regex.RegularExpression#matches(java.lang.String)
339: */
340: public static boolean matches(String regex, String target)
341: throws ParseException {
342: return REUtil.createRegex(regex, null).matches(target);
343: }
344:
345: /**
346: *
347: * @see org.apache.xerces.utils.regex.RegularExpression#matches(java.lang.String)
348: */
349: public static boolean matches(String regex, String options,
350: String target) throws ParseException {
351: return REUtil.createRegex(regex, options).matches(target);
352: }
353:
354: // ================================================================
355:
356: /**
357: *
358: */
359: public static String quoteMeta(String literal) {
360: int len = literal.length();
361: StringBuffer buffer = null;
362: for (int i = 0; i < len; i++) {
363: int ch = literal.charAt(i);
364: if (".*+?{[()|\\^$".indexOf(ch) >= 0) {
365: if (buffer == null) {
366: buffer = new StringBuffer(i + (len - i) * 2);
367: if (i > 0)
368: buffer.append(literal.substring(0, i));
369: }
370: buffer.append((char) '\\');
371: } else if (buffer != null)
372: buffer.append((char) ch);
373: }
374: return buffer != null ? buffer.toString() : literal;
375: }
376:
377: // ================================================================
378:
379: static void dumpString(String v) {
380: for (int i = 0; i < v.length(); i++) {
381: System.out.print(Integer.toHexString(v.charAt(i)));
382: System.out.print(" ");
383: }
384: System.out.println();
385: }
386: }
|