// Pattern helper : Pattern « Regular Expressions


 

/*

 * Static String formatting and query routines.

 * Copyright (C) 2001-2005 Stephen Ostermiller

 * http://ostermiller.org/contact.pl?regarding=Java+Utilities

 *

 * This program is free software; you can redistribute it and/or modify

 * it under the terms of the GNU General Public License as published by

 * the Free Software Foundation; either version 2 of the License, or

 * (at your option) any later version.

 *

 * This program is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 * GNU General Public License for more details.

 *

 * See COPYING.TXT for details.

 */





import java.util.HashMap;

import java.util.regex.Pattern;



/**

 * Utilities for String formatting, manipulation, and queries.

 * More information about this class is available from <a target="_top" href=

 * "http://ostermiller.org/utils/StringHelper.html">ostermiller.org</a>.

 *

 * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities

 * @since ostermillerutils 1.00.00

 */

public class StringHelper {

  /** Inline flags for case-sensitive patterns: DOTALL, so '.' also matches line terminators. */
  private static final String CASE_SENSITIVE_FLAGS = "(?s)";

  /** Inline flags for case-insensitive patterns: CASE_INSENSITIVE, UNICODE_CASE and DOTALL. */
  private static final String IGNORE_CASE_FLAGS = "(?i)(?u)(?s)";

  /** Matches any run of characters (including none); used where the input is unconstrained. */
  private static final String ANYTHING = ".*";

  /** Anchors the match at the very beginning of the input. */
  private static final String INPUT_START = "\\A";

  /** Anchors the match at the very end of the input. */
  private static final String INPUT_END = "\\z";

  /**
   * Build a regular expression that is each of the terms or'd together.
   * Every term is escaped so that it is matched literally, and the whole
   * alternation is wrapped in a non-capturing group.
   *
   * @param terms a list of search terms.
   * @param sb place to build the regular expression.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  private static void buildFindAnyPattern(String[] terms, StringBuffer sb){
    if (terms.length == 0) {
      throw new IllegalArgumentException("There must be at least one term to find.");
    }
    sb.append("(?:");
    for (int i=0; i<terms.length; i++){
      if (i>0) {
        sb.append("|");
      }
      sb.append("(?:");
      sb.append(escapeRegularExpressionLiteral(terms[i]));
      sb.append(")");
    }
    sb.append(")");
  }

  /**
   * Assemble and compile <code>prefix + (term1|term2|...) + suffix</code>.
   * Shared by all of the public pattern factory methods so that they differ
   * only in the flags and anchors they supply.
   *
   * @param prefix inline flags and leading anchor or wildcard.
   * @param terms Array of search strings, matched literally.
   * @param suffix trailing anchor or wildcard.
   * @return the compiled pattern.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   */
  private static Pattern compileAnyPattern(String prefix, String[] terms, String suffix){
    StringBuffer sb = new StringBuffer();
    sb.append(prefix);
    buildFindAnyPattern(terms, sb);
    sb.append(suffix);
    return Pattern.compile(sb.toString());
  }

  /**
   * Compile a pattern that will match a string if the string
   * contains any of the given terms.
   * <p>
   * Usage:<br>
   * <code>boolean b = getContainsAnyPattern(terms).matcher(s).matches();</code>
   * <p>
   * If multiple strings are matched against the same set of terms,
   * it is more efficient to reuse the pattern returned by this function.
   *
   * @param terms Array of search strings.
   * @return Compiled pattern that can be used to match a string to see if it contains any of the terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static Pattern getContainsAnyPattern(String[] terms){
    return compileAnyPattern(CASE_SENSITIVE_FLAGS + ANYTHING, terms, ANYTHING);
  }

  /**
   * Compile a pattern that will match a string if the string
   * equals any of the given terms.
   * <p>
   * Usage:<br>
   * <code>boolean b = getEqualsAnyPattern(terms).matcher(s).matches();</code>
   * <p>
   * If multiple strings are matched against the same set of terms,
   * it is more efficient to reuse the pattern returned by this function.
   *
   * @param terms Array of search strings.
   * @return Compiled pattern that can be used to match a string to see if it equals any of the terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static Pattern getEqualsAnyPattern(String[] terms){
    return compileAnyPattern(CASE_SENSITIVE_FLAGS + INPUT_START, terms, INPUT_END);
  }

  /**
   * Compile a pattern that will match a string if the string
   * starts with any of the given terms.
   * <p>
   * Usage:<br>
   * <code>boolean b = getStartsWithAnyPattern(terms).matcher(s).matches();</code>
   * <p>
   * If multiple strings are matched against the same set of terms,
   * it is more efficient to reuse the pattern returned by this function.
   *
   * @param terms Array of search strings.
   * @return Compiled pattern that can be used to match a string to see if it starts with any of the terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static Pattern getStartsWithAnyPattern(String[] terms){
    return compileAnyPattern(CASE_SENSITIVE_FLAGS + INPUT_START, terms, ANYTHING);
  }

  /**
   * Compile a pattern that will match a string if the string
   * ends with any of the given terms.
   * <p>
   * Usage:<br>
   * <code>boolean b = getEndsWithAnyPattern(terms).matcher(s).matches();</code>
   * <p>
   * If multiple strings are matched against the same set of terms,
   * it is more efficient to reuse the pattern returned by this function.
   *
   * @param terms Array of search strings.
   * @return Compiled pattern that can be used to match a string to see if it ends with any of the terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static Pattern getEndsWithAnyPattern(String[] terms){
    return compileAnyPattern(CASE_SENSITIVE_FLAGS + ANYTHING, terms, INPUT_END);
  }

  /**
   * Compile a pattern that will match a string if the string
   * contains any of the given terms.
   * <p>
   * Case is ignored when matching using Unicode case rules.
   * <p>
   * Usage:<br>
   * <code>boolean b = getContainsAnyIgnoreCasePattern(terms).matcher(s).matches();</code>
   * <p>
   * If multiple strings are matched against the same set of terms,
   * it is more efficient to reuse the pattern returned by this function.
   *
   * @param terms Array of search strings.
   * @return Compiled pattern that can be used to match a string to see if it contains any of the terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static Pattern getContainsAnyIgnoreCasePattern(String[] terms){
    return compileAnyPattern(IGNORE_CASE_FLAGS + ANYTHING, terms, ANYTHING);
  }

  /**
   * Compile a pattern that will match a string if the string
   * equals any of the given terms.
   * <p>
   * Case is ignored when matching using Unicode case rules.
   * <p>
   * Usage:<br>
   * <code>boolean b = getEqualsAnyIgnoreCasePattern(terms).matcher(s).matches();</code>
   * <p>
   * If multiple strings are matched against the same set of terms,
   * it is more efficient to reuse the pattern returned by this function.
   *
   * @param terms Array of search strings.
   * @return Compiled pattern that can be used to match a string to see if it equals any of the terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static Pattern getEqualsAnyIgnoreCasePattern(String[] terms){
    return compileAnyPattern(IGNORE_CASE_FLAGS + INPUT_START, terms, INPUT_END);
  }

  /**
   * Compile a pattern that will match a string if the string
   * starts with any of the given terms.
   * <p>
   * Case is ignored when matching using Unicode case rules.
   * <p>
   * Usage:<br>
   * <code>boolean b = getStartsWithAnyIgnoreCasePattern(terms).matcher(s).matches();</code>
   * <p>
   * If multiple strings are matched against the same set of terms,
   * it is more efficient to reuse the pattern returned by this function.
   *
   * @param terms Array of search strings.
   * @return Compiled pattern that can be used to match a string to see if it starts with any of the terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static Pattern getStartsWithAnyIgnoreCasePattern(String[] terms){
    return compileAnyPattern(IGNORE_CASE_FLAGS + INPUT_START, terms, ANYTHING);
  }

  /**
   * Compile a pattern that will match a string if the string
   * ends with any of the given terms.
   * <p>
   * Case is ignored when matching using Unicode case rules.
   * <p>
   * Usage:<br>
   * <code>boolean b = getEndsWithAnyIgnoreCasePattern(terms).matcher(s).matches();</code>
   * <p>
   * If multiple strings are matched against the same set of terms,
   * it is more efficient to reuse the pattern returned by this function.
   *
   * @param terms Array of search strings.
   * @return Compiled pattern that can be used to match a string to see if it ends with any of the terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   * @throws NullPointerException if terms or any of its elements is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static Pattern getEndsWithAnyIgnoreCasePattern(String[] terms){
    return compileAnyPattern(IGNORE_CASE_FLAGS + ANYTHING, terms, INPUT_END);
  }

  /**
   * Tests to see if the given string contains any of the given terms.
   * <p>
   * A single regular expression covering all of the terms is compiled
   * and matched against the string, rather than testing the string
   * against each of the terms in turn.
   * <p>
   * This is a convenience method.  If multiple strings are tested against
   * the same set of terms, it is more efficient not to compile the regular
   * expression multiple times.
   * @see #getContainsAnyPattern(String[])
   *
   * @param s String that may contain any of the given terms.
   * @param terms list of substrings that may be contained in the given string.
   * @return true iff one of the terms is a substring of the given string.
   * @throws IllegalArgumentException if the length of terms is zero.
   *
   * @since ostermillerutils 1.02.25
   */
  public static boolean containsAny(String s, String[] terms){
    return getContainsAnyPattern(terms).matcher(s).matches();
  }

  /**
   * Tests to see if the given string equals any of the given terms.
   * <p>
   * A single regular expression covering all of the terms is compiled
   * and matched against the string, rather than testing the string
   * against each of the terms in turn.
   * <p>
   * This is a convenience method.  If multiple strings are tested against
   * the same set of terms, it is more efficient not to compile the regular
   * expression multiple times.
   * @see #getEqualsAnyPattern(String[])
   *
   * @param s String that may equal any of the given terms.
   * @param terms list of strings that may equal the given string.
   * @return true iff one of the terms is equal to the given string.
   * @throws IllegalArgumentException if the length of terms is zero.
   *
   * @since ostermillerutils 1.02.25
   */
  public static boolean equalsAny(String s, String[] terms){
    return getEqualsAnyPattern(terms).matcher(s).matches();
  }

  /**
   * Tests to see if the given string starts with any of the given terms.
   * <p>
   * A single regular expression covering all of the terms is compiled
   * and matched against the string, rather than testing the string
   * against each of the terms in turn.
   * <p>
   * This is a convenience method.  If multiple strings are tested against
   * the same set of terms, it is more efficient not to compile the regular
   * expression multiple times.
   * @see #getStartsWithAnyPattern(String[])
   *
   * @param s String that may start with any of the given terms.
   * @param terms list of prefixes that the given string may start with.
   * @return true iff the given string starts with one of the given terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   *
   * @since ostermillerutils 1.02.25
   */
  public static boolean startsWithAny(String s, String[] terms){
    return getStartsWithAnyPattern(terms).matcher(s).matches();
  }

  /**
   * Tests to see if the given string ends with any of the given terms.
   * <p>
   * A single regular expression covering all of the terms is compiled
   * and matched against the string, rather than testing the string
   * against each of the terms in turn.
   * <p>
   * This is a convenience method.  If multiple strings are tested against
   * the same set of terms, it is more efficient not to compile the regular
   * expression multiple times.
   * @see #getEndsWithAnyPattern(String[])
   *
   * @param s String that may end with any of the given terms.
   * @param terms list of suffixes that the given string may end with.
   * @return true iff the given string ends with one of the given terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   *
   * @since ostermillerutils 1.02.25
   */
  public static boolean endsWithAny(String s, String[] terms){
    return getEndsWithAnyPattern(terms).matcher(s).matches();
  }

  /**
   * Tests to see if the given string contains any of the given terms.
   * <p>
   * Case is ignored when matching using Unicode case rules.
   * <p>
   * A single regular expression covering all of the terms is compiled
   * and matched against the string, rather than testing the string
   * against each of the terms in turn.
   * <p>
   * This is a convenience method.  If multiple strings are tested against
   * the same set of terms, it is more efficient not to compile the regular
   * expression multiple times.
   * @see #getContainsAnyIgnoreCasePattern(String[])
   *
   * @param s String that may contain any of the given terms.
   * @param terms list of substrings that may be contained in the given string.
   * @return true iff one of the terms is a substring of the given string.
   * @throws IllegalArgumentException if the length of terms is zero.
   *
   * @since ostermillerutils 1.02.25
   */
  public static boolean containsAnyIgnoreCase(String s, String[] terms){
    return getContainsAnyIgnoreCasePattern(terms).matcher(s).matches();
  }

  /**
   * Tests to see if the given string equals any of the given terms.
   * <p>
   * Case is ignored when matching using Unicode case rules.
   * <p>
   * A single regular expression covering all of the terms is compiled
   * and matched against the string, rather than testing the string
   * against each of the terms in turn.
   * <p>
   * This is a convenience method.  If multiple strings are tested against
   * the same set of terms, it is more efficient not to compile the regular
   * expression multiple times.
   * @see #getEqualsAnyIgnoreCasePattern(String[])
   *
   * @param s String that may equal any of the given terms.
   * @param terms list of strings that may equal the given string.
   * @return true iff one of the terms is equal to the given string.
   * @throws IllegalArgumentException if the length of terms is zero.
   *
   * @since ostermillerutils 1.02.25
   */
  public static boolean equalsAnyIgnoreCase(String s, String[] terms){
    return getEqualsAnyIgnoreCasePattern(terms).matcher(s).matches();
  }

  /**
   * Tests to see if the given string starts with any of the given terms.
   * <p>
   * Case is ignored when matching using Unicode case rules.
   * <p>
   * A single regular expression covering all of the terms is compiled
   * and matched against the string, rather than testing the string
   * against each of the terms in turn.
   * <p>
   * This is a convenience method.  If multiple strings are tested against
   * the same set of terms, it is more efficient not to compile the regular
   * expression multiple times.
   * @see #getStartsWithAnyIgnoreCasePattern(String[])
   *
   * @param s String that may start with any of the given terms.
   * @param terms list of prefixes that the given string may start with.
   * @return true iff the given string starts with one of the given terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   *
   * @since ostermillerutils 1.02.25
   */
  public static boolean startsWithAnyIgnoreCase(String s, String[] terms){
    return getStartsWithAnyIgnoreCasePattern(terms).matcher(s).matches();
  }

  /**
   * Tests to see if the given string ends with any of the given terms.
   * <p>
   * Case is ignored when matching using Unicode case rules.
   * <p>
   * A single regular expression covering all of the terms is compiled
   * and matched against the string, rather than testing the string
   * against each of the terms in turn.
   * <p>
   * This is a convenience method.  If multiple strings are tested against
   * the same set of terms, it is more efficient not to compile the regular
   * expression multiple times.
   * @see #getEndsWithAnyIgnoreCasePattern(String[])
   *
   * @param s String that may end with any of the given terms.
   * @param terms list of suffixes that the given string may end with.
   * @return true iff the given string ends with one of the given terms.
   * @throws IllegalArgumentException if the length of terms is zero.
   *
   * @since ostermillerutils 1.02.25
   */
  public static boolean endsWithAnyIgnoreCase(String s, String[] terms){
    return getEndsWithAnyIgnoreCasePattern(terms).matcher(s).matches();
  }

  /**
   * Decide whether a character must be preceded by a backslash to be
   * matched literally by a regular expression.
   * <p>
   * ASCII letters and digits are never escaped, because a backslash before
   * a letter either denotes an escaped construct or is an error.  Surrogate
   * code units (U+D800 through U+DFFF) are not escaped either: they are never
   * regular expression metacharacters, and inserting a backslash between the
   * two halves of a surrogate pair would make the pattern see two unpaired
   * surrogates instead of one supplementary character, which defeats Unicode
   * case-insensitive matching for that character.
   *
   * @param c character to examine.
   * @return true if the character should be escaped.
   */
  private static boolean needsRegularExpressionEscape(char c){
    if ((c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z')){
      return false;
    }
    if (c>='\uD800' && c<='\uDFFF'){
      // half of a surrogate pair (or an unpaired surrogate): keep as is
      return false;
    }
    return true;
  }

  /**
   * Escapes characters that have special meaning to
   * regular expressions
   * <p>
   * Every character other than the ASCII letters and digits and the
   * surrogate code units is preceded by a backslash.  Surrogates are left
   * alone so that supplementary characters stay intact in the result.
   * If nothing needs to be escaped the argument itself is returned.
   *
   * @param s String to be escaped
   * @return escaped String
   * @throws NullPointerException if s is null.
   *
   * @since ostermillerutils 1.02.25
   */
  public static String escapeRegularExpressionLiteral(String s){
    // According to the documentation in the Pattern class:
    //
    // The backslash character ('\') serves to introduce escaped constructs,
    // as defined in the table above, as well as to quote characters that
    // otherwise would be interpreted as unescaped constructs. Thus the
    // expression \\ matches a single backslash and \{ matches a left brace.
    //
    // It is an error to use a backslash prior to any alphabetic character
    // that does not denote an escaped construct; these are reserved for future
    // extensions to the regular-expression language. A backslash may be used
    // prior to a non-alphabetic character regardless of whether that character
    // is part of an unescaped construct.
    //
    // As a result, escape everything except [0-9a-zA-Z] (and surrogates,
    // see needsRegularExpressionEscape).

    int length = s.length();
    // first pass: count the escapes so that the common "nothing to do"
    // case allocates nothing and the buffer can be sized exactly.
    int escapeCount = 0;
    for (int i=0; i<length; i++){
      if (needsRegularExpressionEscape(s.charAt(i))){
        escapeCount++;
      }
    }
    if (escapeCount == 0){
      // nothing to escape in the string
      return s;
    }
    StringBuffer sb = new StringBuffer(length + escapeCount);
    for (int i=0; i<length; i++){
      char c = s.charAt(i);
      if (needsRegularExpressionEscape(c)){
        sb.append('\\');
      }
      sb.append(c);
    }
    return sb.toString();
  }
}
// Pattern helper : Pattern « Regular Expressions « Java