001: /*
002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
003: *
004: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
005: *
006: * The contents of this file are subject to the terms of either the GNU
007: * General Public License Version 2 only ("GPL") or the Common
008: * Development and Distribution License("CDDL") (collectively, the
009: * "License"). You may not use this file except in compliance with the
010: * License. You can obtain a copy of the License at
011: * http://www.netbeans.org/cddl-gplv2.html
012: * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
013: * specific language governing permissions and limitations under the
014: * License. When distributing the software, include this License Header
015: * Notice in each file and include the License file at
016: * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
017: * particular file as subject to the "Classpath" exception as provided
018: * by Sun in the GPL Version 2 section of the License file that
019: * accompanied this code. If applicable, add the following below the
020: * License Header, with the fields enclosed by brackets [] replaced by
021: * your own identifying information:
022: * "Portions Copyrighted [year] [name of copyright owner]"
023: *
024: * Contributor(s):
025: *
026: * Portions Copyrighted 2007 Sun Microsystems, Inc.
027: */
028:
029: package org.netbeans.modules.search;
030:
031: import java.util.regex.Pattern;
032:
/**
 * Translator of <em>simple patterns</em> to regular expressions.
 * A simple pattern supports only three special characters:
 * <ul>
 *     <li>{@code '*'} - stands for any sequence of characters
 *         (zero or more)</li>
 *     <li>{@code '?'} - stands for exactly one character</li>
 *     <li>{@code '\\'} - quotes the following character, i.e. makes it
 *         a literal character</li>
 * </ul>
 * All other characters are taken literally and are escaped in the resulting
 * regular expression if necessary.
 *
 * @author  Marian Petras
 */
final class RegexpMaker {

    /** regular expression representing a set of word characters */
    private static final String WORD_CHARS_EXPR
            = "[\\p{javaLetterOrDigit}_]";                              //NOI18N
    /**
     * regular expression representing negative lookbehind
     * for a {@linkplain #WORD_CHARS_EXPR word character}
     */
    private static final String CHECK_NOT_AFTER_WORD_CHAR
            = "(?<!" + WORD_CHARS_EXPR + ")";                           //NOI18N
    /**
     * regular expression representing negative lookahead
     * for a {@linkplain #WORD_CHARS_EXPR word character}
     */
    private static final String CHECK_NOT_BEFORE_WORD_CHAR
            = "(?!" + WORD_CHARS_EXPR + ")";                            //NOI18N

    /**
     * matches single patterns that need no translation
     * (ASCII letters, digits and spaces only); compiled once
     */
    private static final Pattern TRIVIAL_PATTERN
            = Pattern.compile("[a-zA-Z0-9 ]*");                         //NOI18N
    /**
     * matches pattern lists that need no translation
     * (ASCII letters and digits only - a space is a separator in a list);
     * compiled once
     */
    private static final Pattern TRIVIAL_PATTERN_LIST
            = Pattern.compile("[a-zA-Z0-9]*");                          //NOI18N

    /** Utility class - not to be instantiated. */
    private RegexpMaker() {
    }

    /**
     * Translates the given simple pattern to a regular expression,
     * with the <i>Whole Words</i> option switched off.
     *
     * @param  simplePattern  pattern to be translated
     * @return  regular expression corresponding to the simple pattern
     */
    static String makeRegexp(String simplePattern) {

        /* This method is currently used only in tests. */

        return makeRegexp(simplePattern, false);
    }

    /**
     * Translates the given simple pattern to a regular expression.
     * <p>
     * If <i>Whole Words</i> is off, the metacharacters translate to
     * {@code '.'}-based expressions and stars at the very beginning and at
     * the very end of the pattern are ignored. If <i>Whole Words</i> is on,
     * the metacharacters translate to expressions based on the word-character
     * class, and lookbehind/lookahead checks are added so that the match does
     * not start or end in the middle of a word.
     *
     * @param  simplePattern  pattern to be translated
     * @param  wholeWords  whether the <i>Whole Words</i> option is selected
     * @return  regular expression corresponding to the simple pattern
     */
    static String makeRegexp(String simplePattern, boolean wholeWords) {
        if (simplePattern.length() == 0) {              //trivial case
            return simplePattern;
        }

        if (!wholeWords
                && TRIVIAL_PATTERN.matcher(simplePattern).matches()) {
            return simplePattern;                       //trivial case
        }

        final StringBuilder buf
                = new StringBuilder(simplePattern.length() + 16);
        boolean quoted = false;         //previous character was a quoting '\'
        boolean starPresent = false;    //pending metachars contain a '*'
        int minCount = 0;               //number of pending '?' metachars

        boolean bufIsEmpty = true;      //nothing matching text appended yet
        char lastInputChar = '*';       //might be any other non-word character

        for (int i = 0; i < simplePattern.length(); i++) {
            final char c = simplePattern.charAt(i);

            if (quoted) {
                /* pending metachars were flushed when '\' was processed */
                assert !starPresent && (minCount == 0);
            } else {
                if (c == '?') {
                    minCount++;
                    continue;
                }
                if (c == '*') {
                    starPresent = true;
                    continue;
                }

                /* flush the pending sequence of metacharacters, if any: */
                if (starPresent || (minCount != 0)) {
                    if (wholeWords && bufIsEmpty && !starPresent) {
                        buf.append(CHECK_NOT_AFTER_WORD_CHAR);
                    }
                    bufIsEmpty &= !addMetachars(buf, starPresent, minCount,
                                                wholeWords, !bufIsEmpty);
                    starPresent = false;
                    minCount = 0;
                }

                if (c == '\\') {
                    quoted = true;
                    continue;
                }
            }

            /* 'c' is a literal character (either quoted or an ordinary one) */
            if (wholeWords && bufIsEmpty && isWordChar(c)) {
                buf.append(CHECK_NOT_AFTER_WORD_CHAR);
            }
            appendLiteral(buf, c);
            lastInputChar = c;
            bufIsEmpty = false;
            quoted = false;
        }

        if (quoted) {
            /* trailing backslash with nothing to quote - a literal backslash */
            assert !starPresent && (minCount == 0);
            buf.append('\\').append('\\');
            lastInputChar = '\\';
        } else if (starPresent || (minCount != 0)) {
            if (wholeWords && !starPresent && bufIsEmpty) {
                buf.append(CHECK_NOT_AFTER_WORD_CHAR);
            }
            addMetachars(buf, starPresent, minCount, wholeWords, false);
            if (wholeWords && !starPresent) {
                buf.append(CHECK_NOT_BEFORE_WORD_CHAR);
            }
            lastInputChar = '*';    //might be any other non-word character
        }
        if (wholeWords && isWordChar(lastInputChar)) {
            buf.append(CHECK_NOT_BEFORE_WORD_CHAR);
        }
        return buf.toString();
    }

    /**
     * Checks whether the given character is a word character.
     *
     * @param  c  character to be checked
     * @return  {@code true} if the character is a word character,
     *          {@code false} otherwise
     * @see  #WORD_CHARS_EXPR
     */
    private static boolean isWordChar(char c) {
        /* not necessary - just for performance */
        if ((c == '*') || (c == '\\')) {
            return false;
        }

        assert "[\\p{javaLetterOrDigit}_]".equals(WORD_CHARS_EXPR)      //NOI18N
               : "update implementation of method isWordChar(char)";    //NOI18N
        return (c == '_') || Character.isLetterOrDigit(c);
    }

    /**
     * Generates the part of a regular expression, that represents a sequence
     * of simple expression's metacharacters {@code '*'} and {@code '?'},
     * and adds it to the given string buffer.
     *
     * @param  buf  string buffer to which the new part is to be added
     * @param  starPresent  whether the sequence contained at least one
     *                      {@code '*'} character
     * @param  minCount  number of {@code '?'} characters in the sequence
     * @param  wholeWords  whether the <i>Whole Words</i> option is selected
     * @param  middle  whether the metachars are to be placed in the middle
     *                 (i.e. not in the beginning or at the end) of the search
     *                 expression
     * @return  {@code true} if something was added to the string buffer,
     *          {@code false} if the buffer was not modified
     */
    private static boolean addMetachars(final StringBuilder buf,
                                        final boolean starPresent,
                                        final int minCount,
                                        final boolean wholeWords,
                                        final boolean middle) {
        assert starPresent || (minCount != 0);

        /*
         * If 'Whole Words' is not activated, ignore stars in the beginning
         * and at the end of the expression:
         */
        final boolean star = starPresent && (wholeWords || middle);

        if ((minCount == 0) && !star) {
            return false;
        }

        if (wholeWords) {
            buf.append(WORD_CHARS_EXPR);
        } else {
            buf.append('.');
        }
        switch (minCount) {
            case 0:
                assert star;
                buf.append('*');
                break;
            case 1:
                if (star) {
                    buf.append('+');
                }
                break;
            default:
                if (wholeWords) {
                    /* the character class is long - use a counted quantifier */
                    buf.append('{').append(minCount);
                    if (star) {
                        buf.append(',');
                    }
                    buf.append('}');
                } else {
                    /* one dot has already been appended above */
                    for (int i = 1; i < minCount; i++) {
                        buf.append('.');
                    }
                    if (star) {
                        buf.append('+');
                    }
                }
                break;
        }
        if (star && middle) {
            buf.append('?');    //use reluctant variant of the quantifier
        }
        return true;
    }

    /**
     * Translates the given simple pattern (or several patterns) to a single
     * regular expression. The individual patterns in the list are separated
     * by commas and/or spaces and are joined with {@code '|'} in the
     * resulting expression. Each {@code '?'} is translated to {@code "."}
     * and each sequence of stars to {@code ".*"}.
     *
     * @param  simplePatternList  pattern list to be translated
     * @return  regular expression corresponding to the simple pattern
     *          (or to the list of simple patterns)
     */
    static String makeMultiRegexp(String simplePatternList) {
        if (simplePatternList.length() == 0) {              //trivial case
            return simplePatternList;
        }

        if (TRIVIAL_PATTERN_LIST.matcher(simplePatternList).matches()) {
            return simplePatternList;                       //trivial case
        }

        final StringBuilder buf
                = new StringBuilder(simplePatternList.length() + 16);
        boolean lastWasSeparator = false;
        boolean quoted = false;
        boolean starPresent = false;
        for (int i = 0; i < simplePatternList.length(); i++) {
            final char c = simplePatternList.charAt(i);

            if (quoted) {
                appendLiteral(buf, c);
                quoted = false;
                continue;
            }

            if ((c == ',') || (c == ' ')) {
                if (starPresent) {
                    buf.append('.').append('*');
                    starPresent = false;
                }
                lastWasSeparator = true;
                continue;
            }

            /* start of another pattern (but not the first one)? */
            if (lastWasSeparator && (buf.length() != 0)) {
                buf.append('|');
            }
            lastWasSeparator = false;

            if (c == '?') {
                buf.append('.');
            } else if (c == '*') {
                starPresent = true;
            } else {
                if (starPresent) {
                    buf.append('.').append('*');
                    starPresent = false;
                }
                if (c == '\\') {
                    quoted = true;
                } else {
                    appendLiteral(buf, c);
                }
            }
        }

        if (quoted) {
            /* trailing backslash with nothing to quote - a literal backslash */
            buf.append('\\').append('\\');
        } else if (starPresent) {
            buf.append('.').append('*');
        }
        return buf.toString();
    }

    /**
     * Appends the given character to the buffer such that it is interpreted
     * literally in a regular expression, i.e. preceded by a backslash unless
     * it is a {@linkplain #isSimpleCharacter simple character}.
     *
     * @param  buf  buffer to append the character to
     * @param  c  character to be appended
     */
    private static void appendLiteral(StringBuilder buf, char c) {
        if (!isSimpleCharacter(c)) {
            buf.append('\\');
        }
        buf.append(c);
    }

    /**
     * Checks whether the given character may be put to a regular expression
     * without being escaped by a backslash. Such characters are the space,
     * ASCII digits, ASCII letters and all non-ASCII characters.
     * (ASCII letters must not be escaped because a backslash followed
     * by a letter would not denote the letter itself.)
     *
     * @param  c  character to be checked
     * @return  {@code true} if the character needs no escaping,
     *          {@code false} otherwise
     */
    private static boolean isSimpleCharacter(char c) {
        return (c == ' ')                           //space
               || (c > 0x7f)                        //non-ASCII
               || ((c >= '0') && (c <= '9'))        //'0' .. '9'
               || ((c >= 'a') && (c <= 'z'))        //'a' .. 'z'
               || ((c >= 'A') && (c <= 'Z'));       //'A' .. 'Z'
    }

}
|