001: /*
002: * $Id: Perl5Substitution.java,v 1.13 2003/11/07 20:16:25 dfs Exp $
003: *
004: * ====================================================================
005: * The Apache Software License, Version 1.1
006: *
007: * Copyright (c) 2000 The Apache Software Foundation. All rights
008: * reserved.
009: *
010: * Redistribution and use in source and binary forms, with or without
011: * modification, are permitted provided that the following conditions
012: * are met:
013: *
014: * 1. Redistributions of source code must retain the above copyright
015: * notice, this list of conditions and the following disclaimer.
016: *
017: * 2. Redistributions in binary form must reproduce the above copyright
018: * notice, this list of conditions and the following disclaimer in
019: * the documentation and/or other materials provided with the
020: * distribution.
021: *
022: * 3. The end-user documentation included with the redistribution,
023: * if any, must include the following acknowledgment:
024: * "This product includes software developed by the
025: * Apache Software Foundation (http://www.apache.org/)."
026: * Alternately, this acknowledgment may appear in the software itself,
027: * if and wherever such third-party acknowledgments normally appear.
028: *
029: * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
030: * must not be used to endorse or promote products derived from this
031: * software without prior written permission. For written
032: * permission, please contact apache@apache.org.
033: *
034: * 5. Products derived from this software may not be called "Apache"
035: * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
036: * name, without prior written permission of the Apache Software Foundation.
037: *
038: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
039: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
040: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
041: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
042: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
043: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
044: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
045: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
046: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
047: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
048: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
049: * SUCH DAMAGE.
050: * ====================================================================
051: *
052: * This software consists of voluntary contributions made by many
053: * individuals on behalf of the Apache Software Foundation. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.oro.text.regex;
059:
060: import java.util.*;
061:
062: /**
063: * Perl5Substitution implements a Substitution consisting of a
064: * literal string, but allowing Perl5 variable interpolation referencing
065: * saved groups in a match. This class is intended for use with
066: * {@link Util#substitute Util.substitute}.
067: * <p>
068: * The substitution string may contain variable interpolations referring
069: * to the saved parenthesized groups of the search pattern.
070: * A variable interpolation is denoted by <b>$1</b>, or <b>$2</b>,
071: * or <b>$3</b>, etc. If you want such expressions to be
072: * interpreted literally, you should set the <b> numInterpolations </b>
073: * parameter to <b> INTERPOLATE_NONE </b>. It is easiest to explain
074: * what an interpolated variable does by giving an example:
075: * <ul>
076: * Suppose you have the pattern <b>b\d+:</b> and you want to substitute
077: * the <b>b</b>'s for <b>a</b>'s and the colon for a dash in parts of
078: * your input matching the pattern. You can do this by changing the
079: * pattern to <b>b(\d+):</b> and using the substitution expression
080: * <b>a$1-</b>. When a substitution is made, the <b>$1</b> means
081: * "Substitute whatever was matched by the first saved group of the
082: * matching pattern." An input of <b>b123:</b> after substitution
083: * would yield a result of <b>a123-</b>. But there's a little more
084: * to be aware of. If you set the <b>numInterpolations</b> parameter to
085: * <b>INTERPOLATE_ALL</b>, then every time a match is found, the
086: * interpolation variables are computed relative to that match.
087: * But if <b>numInterpolations</b> is set to some positive integer, then
088: * only the interpolation variables for the first <b>numInterpolations</b>
089: * matches are computed relative to the most recent match. After that,
090: * the remaining substitutions have their variable interpolations performed
091: * relative to the <b> numInterpolations </b>'th match. So using the
092: * previously mentioned pattern and substitution expression, if you have
093: * an input of <pre><b>Tank b123: 85 Tank b256: 32 Tank b78: 22</b></pre>
094: * and use a <b> numInterpolations </b> value of <b>INTERPOLATE_ALL</b> and
095: * <b> numSubs </b> value (see
096: * {@link Util#substitute Util.substitute})
097: * of <b> SUBSTITUTE_ALL</b>, then your result will be:
098: * <pre><b>Tank a123- 85 Tank a256- 32 Tank a78- 22</b></pre>
099: * But if you set <b> numInterpolations </b> to 2 and keep
100: * <b> numSubs </b> with a value of <b>SUBSTITUTE_ALL</b>, your result is:
101: * <pre><b>Tank a123- 85 Tank a256- 32 Tank a256- 22</b></pre>
102: * Notice how the last substitution uses the same value for <b>$1</b>
103: * as the second substitution.
104: * </ul>
105: * <p>
106: * A final thing to keep in mind is that if you use an interpolation variable
107: * that corresponds to a group not contained in the match, then it is
108: * interpreted as the empty string. So given the regular expression from the
109: * example, and a substitution expression of <b>a$2-</b>, the result
110: * of the last sample input would be:
111: * <pre><b>Tank a- 85 Tank a- 32 Tank a- 22</b></pre>
112: * The special substitution <b>$&</b> will interpolate the entire portion
113: * of the input matched by the regular expression. <b>$0</b> will
114: * do the same, but it is recommended that it be avoided because the
115: * latest versions of Perl use <b>$0</b> to store the program name rather
116: * than duplicate the behavior of <b>$&</b>.
117: * Also, the result of substituting $ followed by a non-positive integer
118: * is undefined. In order to include a $ in a substitution, it should
119: * be escaped with a backslash (e.g., <b>"\\$0"</b>).
120: * <p>
121: * Perl5 double-quoted string case modification is also supported in
122: * the substitution. The following escape sequences are supported:
123: * <dl compact>
124: * <dt> \\U <dd> make substitution uppercase until end of substitution or \\E
125: * <dt> \\u <dd> make next character uppercase
126: * <dt> \\L <dd> make substitution lowercase until end of substitution or \\E
127: * <dt> \\l <dd> make next character lowercase
128: * <dt> \\E <dd> mark the end of the case modification
129: * </dl>
130: * The double backslashes are shown to remind you that to make a
131: * backslash get past Java's string handling and appear as a backslash
132: * to the substitution, you must escape the backslash.
133: *
134: * @version @version@
135: * @since 1.1
136: * @see Substitution
137: * @see Util
138: * @see Util#substitute
139: * @see Substitution
140: * @see StringSubstitution
141: */
142: public class Perl5Substitution extends StringSubstitution {
143: /**
144: * A constant used when creating a Perl5Substitution indicating that
145: * interpolation variables should be computed relative to the most
146: * recent pattern match.
147: */
148: public static final int INTERPOLATE_ALL = 0;
149:
150: /**
151: * A constant used when creating a Perl5Substitution indicating that
152: * interpolation variables should be interpreted literally, effectively
153: * disabling interpolation.
154: */
155: public static final int INTERPOLATE_NONE = -1;
156:
157: /**
158: * The initial size and unit of growth for the
159: * {@link #_subOpCodes _subOpCodes} array.
160: */
161: private static final int __OPCODE_STORAGE_SIZE = 32;
162:
163: /**
164: * The maximum number of groups supported by interpolation.
165: */
166: private static final int __MAX_GROUPS = Character.MAX_VALUE;
167:
168: /**
169: * A constant declaring opcode for copy operation.
170: */
171: static final int _OPCODE_COPY = -1;
172:
173: /**
174: * A constant declaring opcode for lowercase char operation.
175: */
176: static final int _OPCODE_LOWERCASE_CHAR = -2;
177:
178: /**
179: * A constant declaring opcode for uppercase char operation.
180: */
181: static final int _OPCODE_UPPERCASE_CHAR = -3;
182:
183: /**
184: * A constant declaring opcode for lowercase mode operation.
185: */
186: static final int _OPCODE_LOWERCASE_MODE = -4;
187:
188: /**
189: * A constant declaring opcode for lowercase mode operation.
190: */
191: static final int _OPCODE_UPPERCASE_MODE = -5;
192:
193: /**
194: * A constant declaring opcode for lowercase mode operation.
195: */
196: static final int _OPCODE_ENDCASE_MODE = -6;
197:
198: int _numInterpolations;
199: int[] _subOpcodes;
200: int _subOpcodesCount;
201: char[] _substitutionChars;
202:
203: transient String _lastInterpolation;
204:
205: private static final boolean __isInterpolationCharacter(char ch) {
206: return (Character.isDigit(ch) || ch == '&');
207: }
208:
209: private void __addElement(int value) {
210: int len = _subOpcodes.length;
211: if (_subOpcodesCount == len) {
212: int[] newarray = new int[len + __OPCODE_STORAGE_SIZE];
213: System.arraycopy(_subOpcodes, 0, newarray, 0, len);
214: _subOpcodes = newarray;
215: }
216: _subOpcodes[_subOpcodesCount++] = value;
217: }
218:
219: private void __parseSubs(String sub) {
220: boolean saveDigits, escapeMode, caseMode;
221: int posParam;
222: int offset;
223:
224: char[] subChars = _substitutionChars = sub.toCharArray();
225: int subLength = subChars.length;
226:
227: _subOpcodes = new int[__OPCODE_STORAGE_SIZE];
228: _subOpcodesCount = 0;
229:
230: posParam = 0;
231: offset = -1;
232: saveDigits = false;
233: escapeMode = false;
234: caseMode = false;
235:
236: for (int current = 0; current < subLength; current++) {
237: char c = subChars[current];
238: char nextc;
239: int next = current + 1;
240:
241: // Save digits
242: if (saveDigits) {
243: int digit = Character.digit(c, 10);
244:
245: if (digit > -1) {
246: if (posParam <= __MAX_GROUPS) {
247: posParam *= 10;
248: posParam += digit;
249: }
250: if (next == subLength) {
251: __addElement(posParam);
252: }
253: continue;
254: } else if (c == '&') {
255: if (/*current > 0 &&*/subChars[current - 1] == '$') {
256: __addElement(0);
257: posParam = 0;
258: saveDigits = false;
259: continue;
260: }
261: }
262:
263: __addElement(posParam);
264: posParam = 0;
265: saveDigits = false;
266: }
267:
268: if ((c != '$' && c != '\\') || escapeMode) {
269: escapeMode = false;
270: if (offset < 0) {
271: offset = current;
272: __addElement(_OPCODE_COPY);
273: __addElement(offset);
274: }
275: if (next == subLength) {
276: __addElement(next - offset);
277: }
278: continue;
279: }
280:
281: if (offset >= 0) {
282: __addElement(current - offset);
283: offset = -1;
284: }
285:
286: // Only do positional and escapes if we have a next char
287: if (next == subLength)
288: continue;
289: nextc = subChars[next];
290:
291: // Positional params
292: if (c == '$') {
293: saveDigits = __isInterpolationCharacter(nextc);
294: } else if (c == '\\') { // Escape codes
295: if (nextc == 'l') {
296: if (!caseMode) {
297: __addElement(_OPCODE_LOWERCASE_CHAR);
298: current++;
299: }
300: } else if (nextc == 'u') {
301: if (!caseMode) {
302: __addElement(_OPCODE_UPPERCASE_CHAR);
303: current++;
304: }
305: } else if (nextc == 'L') {
306: __addElement(_OPCODE_LOWERCASE_MODE);
307: current++;
308: caseMode = true;
309: } else if (nextc == 'U') {
310: __addElement(_OPCODE_UPPERCASE_MODE);
311: current++;
312: caseMode = true;
313: } else if (nextc == 'E') {
314: __addElement(_OPCODE_ENDCASE_MODE);
315: current++;
316: caseMode = false;
317: } else {
318: escapeMode = true;
319: }
320: }
321: }
322: }
323:
324: String _finalInterpolatedSub(MatchResult result) {
325: StringBuffer buffer = new StringBuffer(10);
326: _calcSub(buffer, result);
327: return buffer.toString();
328: }
329:
330: void _calcSub(StringBuffer buffer, MatchResult result) {
331: int size, offset, count, caseMode;
332: char[] sub, str, match;
333: int[] subOpcodes = _subOpcodes;
334:
335: caseMode = 0;
336:
337: str = _substitutionChars;
338: match = result.group(0).toCharArray();
339:
340: size = _subOpcodesCount;
341:
342: for (int element = 0; element < size; element++) {
343: int value = subOpcodes[element];
344:
345: // If we have a group, set up interpolation, else
346: // interpret op code.
347: if (value >= 0 && value < result.groups()) {
348: int end, len;
349: offset = result.begin(value);
350:
351: if (offset < 0)
352: continue;
353:
354: end = result.end(value);
355:
356: if (end < 0)
357: continue;
358:
359: len = result.length();
360:
361: if (offset >= len || end > len || offset >= end)
362: continue;
363:
364: count = end - offset;
365: sub = match;
366: } else if (value == _OPCODE_COPY) {
367: element++;
368: if (element >= size)
369: continue;
370: offset = subOpcodes[element];
371: element++;
372: if (element >= size)
373: continue;
374: count = subOpcodes[element];
375: sub = str;
376: } else if (value == _OPCODE_LOWERCASE_CHAR
377: || value == _OPCODE_UPPERCASE_CHAR) {
378: if (caseMode != _OPCODE_LOWERCASE_MODE
379: && caseMode != _OPCODE_UPPERCASE_MODE)
380: caseMode = value;
381: continue;
382: } else if (value == _OPCODE_LOWERCASE_MODE
383: || value == _OPCODE_UPPERCASE_MODE) {
384: caseMode = value;
385: continue;
386: } else if (value == _OPCODE_ENDCASE_MODE) {
387: caseMode = 0;
388: continue;
389: } else
390: continue;
391:
392: // Apply modes to buf
393: if (caseMode == _OPCODE_LOWERCASE_CHAR) {
394: buffer.append(Character.toLowerCase(sub[offset++]));
395: buffer.append(sub, offset, --count);
396: caseMode = 0;
397: } else if (caseMode == _OPCODE_UPPERCASE_CHAR) {
398: buffer.append(Character.toUpperCase(sub[offset++]));
399: buffer.append(sub, offset, --count);
400: caseMode = 0;
401: } else if (caseMode == _OPCODE_LOWERCASE_MODE) {
402: for (int end = offset + count; offset < end;) {
403: buffer.append(Character.toLowerCase(sub[offset++]));
404: }
405: } else if (caseMode == _OPCODE_UPPERCASE_MODE) {
406: for (int end = offset + count; offset < end;) {
407: buffer.append(Character.toUpperCase(sub[offset++]));
408: }
409: } else
410: buffer.append(sub, offset, count);
411:
412: }
413: }
414:
415: /**
416: * Default constructor initializing substitution to a zero length
417: * String and the number of interpolations to
418: * {@link #INTERPOLATE_ALL}.
419: */
420: public Perl5Substitution() {
421: this ("", INTERPOLATE_ALL);
422: }
423:
424: /**
425: * Creates a Perl5Substitution using the specified substitution
426: * and setting the number of interpolations to
427: * {@link #INTERPOLATE_ALL}.
428: * <p>
429: * @param substitution The string to use as a substitution.
430: */
431: public Perl5Substitution(String substitution) {
432: this (substitution, INTERPOLATE_ALL);
433: }
434:
435: /**
436: * Creates a Perl5Substitution using the specified substitution
437: * and setting the number of interpolations to the specified value.
438: * <p>
439: * @param substitution The string to use as a substitution.
440: * @param numInterpolations
441: * If set to <b>INTERPOLATE_NONE</b>, interpolation variables are
442: * interpreted literally and not as references to the saved
443: * parenthesized groups of a pattern match. If set to
444: * <b> INTERPOLATE_ALL </b>, all variable interpolations
445: * are computed relative to the pattern match responsible for
446: * the current substitution. If set to a positive integer,
447: * the first <b> numInterpolations </b> substitutions have
448: * their variable interpolation performed relative to the
449: * most recent match, but the remaining substitutions have
450: * their variable interpolations performed relative to the
451: * <b> numInterpolations </b>'th match.
452: */
453: public Perl5Substitution(String substitution, int numInterpolations) {
454: setSubstitution(substitution, numInterpolations);
455: }
456:
457: /**
458: * Sets the substitution represented by this Perl5Substitution, also
459: * setting the number of interpolations to
460: * {@link #INTERPOLATE_ALL}.
461: * You should use this method in order to avoid repeatedly allocating new
462: * Perl5Substitutions. It is recommended that you allocate a single
463: * Perl5Substitution and reuse it by using this method when appropriate.
464: * <p>
465: * @param substitution The string to use as a substitution.
466: */
467: public void setSubstitution(String substitution) {
468: setSubstitution(substitution, INTERPOLATE_ALL);
469: }
470:
471: /**
472: * Sets the substitution represented by this Perl5Substitution, also
473: * setting the number of interpolations to the specified value.
474: * You should use this method in order to avoid repeatedly allocating new
475: * Perl5Substitutions. It is recommended that you allocate a single
476: * Perl5Substitution and reuse it by using this method when appropriate.
477: * <p>
478: * @param substitution The string to use as a substitution.
479: * @param numInterpolations
480: * If set to <b>INTERPOLATE_NONE</b>, interpolation variables are
481: * interpreted literally and not as references to the saved
482: * parenthesized groups of a pattern match. If set to
483: * <b> INTERPOLATE_ALL </b>, all variable interpolations
484: * are computed relative to the pattern match responsible for
485: * the current substitution. If set to a positive integer,
486: * the first <b> numInterpolations </b> substitutions have
487: * their variable interpolation performed relative to the
488: * most recent match, but the remaining substitutions have
489: * their variable interpolations performed relative to the
490: * <b> numInterpolations </b>'th match.
491: */
492: public void setSubstitution(String substitution,
493: int numInterpolations) {
494: super .setSubstitution(substitution);
495: _numInterpolations = numInterpolations;
496:
497: if (numInterpolations != INTERPOLATE_NONE
498: && (substitution.indexOf('$') != -1 || substitution
499: .indexOf('\\') != -1))
500: __parseSubs(substitution);
501: else
502: _subOpcodes = null;
503: _lastInterpolation = null;
504: }
505:
506: /**
507: * Appends the substitution to a buffer containing the original input
508: * with substitutions applied for the pattern matches found so far.
509: * See
510: * {@link Substitution#appendSubstitution Substitution.appendSubstition()}
511: * for more details regarding the expected behavior of this method.
512: * <p>
513: * @param appendBuffer The buffer containing the new string resulting
514: * from performing substitutions on the original input.
515: * @param match The current match causing a substitution to be made.
516: * @param substitutionCount The number of substitutions that have been
517: * performed so far by Util.substitute.
518: * @param originalInput The original input upon which the substitutions are
519: * being performed. This is a read-only parameter and is not modified.
520: * @param matcher The PatternMatcher used to find the current match.
521: * @param pattern The Pattern used to find the current match.
522: */
523: public void appendSubstitution(StringBuffer appendBuffer,
524: MatchResult match, int substitutionCount,
525: PatternMatcherInput originalInput, PatternMatcher matcher,
526: Pattern pattern) {
527: if (_subOpcodes == null) {
528: super .appendSubstitution(appendBuffer, match,
529: substitutionCount, originalInput, matcher, pattern);
530: return;
531: }
532:
533: if (_numInterpolations < 1
534: || substitutionCount < _numInterpolations)
535: _calcSub(appendBuffer, match);
536: else {
537: if (substitutionCount == _numInterpolations)
538: _lastInterpolation = _finalInterpolatedSub(match);
539: appendBuffer.append(_lastInterpolation);
540: }
541: }
542:
543: }
|