0001: package it.unimi.dsi.mg4j.util;
0002:
0003: import it.unimi.dsi.fastutil.chars.CharList;
0004:
0005: /*
0006: * MG4J: Managing Gigabytes for Java
0007: *
0008: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
0009: *
0010: * This library is free software; you can redistribute it and/or modify it
0011: * under the terms of the GNU Lesser General Public License as published by the Free
0012: * Software Foundation; either version 2.1 of the License, or (at your option)
0013: * any later version.
0014: *
0015: * This library is distributed in the hope that it will be useful, but
0016: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
0017: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
0018: * for more details.
0019: *
0020: * You should have received a copy of the GNU Lesser General Public License
0021: * along with this program; if not, write to the Free Software
0022: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
0023: *
0024: */
0025:
0026: /** QuickSearch matching against a constant string.
0027: *
0028: * <P>The {@linkplain java.util.regex regular expression facilities} of the Java API
0029: * are a powerful tool; however, when searching for a <em>constant</em> pattern
0030: * many algorithms can increase of orders magnitude the speed of a search.
0031: *
0032: * <P>This class provides constant-pattern text search facilities by
0033: * implementing Sunday's QuickSearch (a simplified but very effective variant
0034: * of the Boyer—Moore search algorithm) using <a href="http://vigna.dsi.unimi.it/papers.php#BoVMSJ"><em>compact
0035: * approximators</em></A>, a randomised data structure that can accomodate in a
0036: * small space (but in an approximated way) the bad-character shift table of a
0037: * large alphabet such as Unicode.
0038: *
0039: * <P>Since a large subset of US-ASCII is used in all languages (e.g.,
0040: * whitespace, punctuation, etc.), this class caches separately the shifts for
0041: * the first 128 Unicode characters, resulting in very good performance even on
0042: * text in pure US-ASCII.
0043: *
0044: * <P>Note that the {@link MutableString#indexOf(MutableString) indexOf}
0045: * methods of {@link MutableString} use a even more simplified variant of
0046: * QuickSearch which is less efficient, but has a smaller setup time and does
0047: * not generate any object. The search facilities provided by this class are
0048: * targeted at searches on very large texts, repeated searches with the same
0049: * pattern, and case-insensitive searches.
0050: *
0051: * <P>Instances of this class are immutable and thread-safe.
0052: *
0053: * <P>This class is experimental: APIs could change with the next release.
0054: *
0055: * @author Sebastiano Vigna
0056: * @author Paolo Boldi
0057: * @since 0.6
0058: * @deprecated Moved to <code>dsiutils</code>.
0059: */
0060:
0061: @Deprecated
0062: public class TextPattern implements java.io.Serializable, CharSequence {
0063: private static final long serialVersionUID = 1L;
0064:
0065: /** Enables case-insensitive matching.
0066: *
0067: * <P>By default, case-insensitive matching assumes that only characters in
0068: * the ASCII charset are being matched. Unicode-aware case-insensitive
0069: * matching can be enabled by specifying the UNICODE_CASE flag in
0070: * conjunction with this flag.
0071: *
0072: * <P>Case-insensitivity involves a performance drop.
0073: */
0074: public final static int CASE_INSENSITIVE = 1;
0075:
0076: /** Enables Unicode-aware case folding.
0077: *
0078: * <P>When this flag is specified then case-insensitive matching, when enabled
0079: * by the CASE_INSENSITIVE flag, is done in a manner consistent with the
0080: * Unicode Standard. By default, case-insensitive matching assumes that
0081: * only characters in the ASCII charset are being matched.
0082: *
0083: * <P>Unicode-aware case folding is very expensive (two method calls per
0084: * examined non-ASCII character).
0085: */
0086: public final static int UNICODE_CASE = 2;
0087:
0088: /** The square of the golden ratio multiplied by 2<sup>32</sup>. */
0089: private final static int PHI2 = 1640531525;
0090:
0091: /** The pattern backing array. */
0092: protected char[] pattern;
0093:
0094: /** The compact approximator containing the bad-character shifts; its lenght is a power of 2. */
0095: private transient int[] badCharShift;
0096:
0097: /** The bad-character shift for US-ASCII. */
0098: private transient int[] asciiBadCharShift = new int[128];
0099:
0100: /** A bit mask equal to the length of {@link #badCharShift} minus 1. */
0101: private transient int mask;
0102:
0103: /** A cached shift value for computing one of the hashes. It is equal to 16 minus the base 2 logarithm of the length of {@link #badCharShift}. */
0104: private transient int hashShift;
0105:
0106: /** Whether this pattern is case sensitive. */
0107: private boolean caseSensitive;
0108: /** Whether this pattern uses optimised ASCII downcasing (as opposed to the correct Unicode downcasing procedure). */
0109: private boolean asciiCase;
0110:
0111: /** Creates a new case-sensitive {@link TextPattern} object that can be used to search for the given pattern.
0112: *
0113: * @param pattern the constant pattern to search for.
0114: */
0115: public TextPattern(final CharSequence pattern) {
0116: this (pattern, 0);
0117: }
0118:
0119: /** Creates a new {@link TextPattern} object that can be used to search for the given pattern.
0120: *
0121: * @param pattern the constant pattern to search for.
0122: * @param flags a bit mask that may include {@link #CASE_INSENSITIVE} and {@link #UNICODE_CASE}.
0123: */
0124: public TextPattern(final CharSequence pattern, final int flags) {
0125: this .pattern = new char[pattern.length()];
0126: MutableString.getChars(pattern, 0, this .pattern.length,
0127: this .pattern, 0);
0128: caseSensitive = (flags & CASE_INSENSITIVE) == 0;
0129: asciiCase = (flags & UNICODE_CASE) == 0;
0130: if (!caseSensitive) {
0131: int i = this .pattern.length;
0132: if (asciiCase)
0133: while (i-- != 0)
0134: this .pattern[i] = asciiToLowerCase(this .pattern[i]);
0135: else
0136: while (i-- != 0)
0137: this .pattern[i] = unicodeToLowerCase(this .pattern[i]);
0138: }
0139: compile();
0140: }
0141:
0142: /** Returns whether this pattern is case insensitive.
0143: *
0144: */
0145: public boolean caseInsensitive() {
0146: return !caseSensitive;
0147: }
0148:
0149: /** Returns whether this pattern uses Unicode case folding.
0150: *
0151: */
0152: public boolean unicodeCase() {
0153: return !asciiCase;
0154: }
0155:
0156: /** A fast, optimised method to lower case just ASCII letters.
0157: *
0158: * @param c a character.
0159: * @return the character <code>c</code>, downcased if it lies between <samp>A</samp> and <samp>Z</samp>.
0160: */
0161: private static char asciiToLowerCase(final char c) {
0162: return c >= 'A' && c <= 'Z' ? (char) (c + 32) : c;
0163: }
0164:
0165: /** A method to downcase correctly Unicode characters optimised for ASCII letters.
0166: *
0167: * @param c a character.
0168: * @return the character <code>c</code>, downcased.
0169: */
0170: private static char unicodeToLowerCase(final char c) {
0171: if (c < 128)
0172: return c >= 'A' && c <= 'Z' ? (char) (c + 32) : c;
0173: return Character.toLowerCase(Character.toUpperCase(c));
0174: }
0175:
0176: private static final int MIN_COUNT = 8;
0177:
0178: /** Fills the search data structures using the pattern contained in {@link #pattern}. */
0179: private void compile() {
0180: final char[] p = pattern;
0181: final int n = p.length;
0182: int[] s = asciiBadCharShift;
0183:
0184: char c;
0185:
0186: // We make two passes on the pattern, so to approximate first the number of non-US-ASCII characters.
0187:
0188: int h, i, j = 0, k = 0, l;
0189: int[] max = new int[MIN_COUNT];
0190:
0191: i = s.length;
0192: while (i-- != 0)
0193: s[i] = n;
0194:
0195: i = MIN_COUNT;
0196: while (i-- != 0)
0197: max[i] = Integer.MAX_VALUE;
0198:
0199: i = n;
0200: while (i-- != 0) {
0201: c = p[j++];
0202: if (c < 128)
0203: s[c] = i;
0204: else {
0205: k++;
0206:
0207: /* Bar-Yossef-Jayram-Kumar-Sivakumar-Trevisan's improvement on
0208: Flajolet-Martin's algorithm for approximating the number of
0209: distinct elements in a stream. We map each character with a
0210: hash function on the unit interval, keep track of the smallest
0211: MIN_COUNT elements, and then approximate the number of distinct
0212: characters with MIN_COUNT divided by the largest element we
0213: have (of course, everything is done in fixed point arithmetic
0214: on the interval 0-2^32). If the approximation is less than the
0215: number of characters, we use it. */
0216:
0217: h = c * PHI2;
0218: l = MIN_COUNT;
0219: while (l-- != 0)
0220: if (h > max[l])
0221: break;
0222:
0223: if (++l < MIN_COUNT && h != max[l]) {
0224: System.arraycopy(max, l, max, l + 1, MIN_COUNT - l
0225: - 1);
0226: max[l] = h;
0227: }
0228: }
0229: }
0230:
0231: //System.err.println( k );
0232: //System.err.println( (int)( MIN_COUNT * 0x100000000L / ( 0x80000000L + M[ MIN_COUNT - 1 ] ) ) );
0233:
0234: k = Math
0235: .min(
0236: k,
0237: (int) (MIN_COUNT * 0x100000000L / (0x80000000L + max[MIN_COUNT - 1])));
0238:
0239: /* We do not use less than 2^7 entries, so to compensate the setup
0240: overhead with a more precise approximator for very small patterns. If,
0241: however, there are no non-US-ASCII characters, we use a single-bucket
0242: approximator (so to avoid special cases in the algorithm). In any case,
0243: we do not use approximators with more than 2^16 entries. */
0244: final int log2m = k == 0 ? 0 : Math.min(Math.max(Fast
0245: .mostSignificantBit(k * 3) + 1, 7), 16);
0246: final int m = (1 << log2m) - 1;
0247: final int hs = 16 - log2m;
0248:
0249: /* For the characters outside Unicode, we build a compact approximator
0250: with two hash functions h_0 and h_1. The approximator stores a function
0251: f from the character set to integers in a table s. The value of the
0252: function on c can be obtained by maximising the values s[h_0(c)] and
0253: s[h_1(c)]. When storing a value, instead, we minimise s[h_0(c)] and
0254: s[h_1(c)] with f(c). As a result, the value stored is smaller than or
0255: equal to the actual value f(c). By tuning the size of s and the number
0256: of hash functions one can get a desired error precision. */
0257:
0258: s = new int[m + 1];
0259:
0260: i = m + 1;
0261: while (i-- != 0)
0262: s[i] = n;
0263:
0264: i = n;
0265: j = 0;
0266: while (i-- != 0) {
0267: c = p[j++];
0268: if (c >= 128) {
0269: s[c * c & m] = i;
0270: s[(c * PHI2) >> hs & m] = i;
0271: }
0272: }
0273:
0274: this .badCharShift = s;
0275: this .mask = m;
0276: this .hashShift = hs;
0277:
0278: /*
0279: for( i = 0; i <= m; i++ ) System.err.print( "" + s[ i ] + ", " );
0280: System.err.println();
0281:
0282: for( c = 'a'; c <= 'z'; c++ ) System.err.print( c + ":" + Math.max( s[ c * c & m ], s[ ( c * PHI2 ) >> hs & m ] ) + ", " );
0283: System.err.println();
0284:
0285: for( c = 'A'; c <= 'Z'; c++ ) System.err.print( c + ":" + Math.max( s[ c * c & m ], s[ ( c * PHI2 ) >> hs & m ] ) + ", " );
0286: System.err.println();
0287:
0288: MutableString msp = new MutableString( pattern );
0289:
0290: for( c = 'a'; c <= 'z'; c++ ) if ( msp.indexOf( c ) == -1 && Math.max( s[ c * c & m ], s[ ( c * PHI2 ) >> hs & m ] ) != n )
0291: System.err.println( c + ":" + Math.max( s[ c * c & m ], s[ ( c * PHI2 ) >> hs & m ] ) + ", " );
0292:
0293: for( c = 'A'; c <= 'Z'; c++ ) if ( msp.indexOf( c ) == -1 && Math.max( s[ c * c & m ], s[ ( c * PHI2 ) >> hs & m ] ) != n )
0294: System.err.println( c + ":" + Math.max( s[ c * c & m ], s[ ( c * PHI2 ) >> hs & m ] ) + ", " );
0295: */
0296:
0297: }
0298:
0299: public int length() {
0300: return pattern.length;
0301: }
0302:
0303: public char charAt(final int i) {
0304: return pattern[i];
0305: }
0306:
0307: public CharSequence subSequence(final int from, final int to) {
0308: return new MutableString(pattern, from, to - from + 1);
0309: }
0310:
0311: /** Returns the index of the first occurrence of this one-character pattern
0312: * in the specified character array, starting at the specified index.
0313: *
0314: * <P>This method is a fallback for searches on one-character patterns.
0315: *
0316: * @param array the character array to look in.
0317: * @param from the index from which the search must start.
0318: * @param to the index at which the search must end.
0319: * @return the index of the first occurrence of this pattern or
0320: * <code>-1</code>, if this pattern never appears with index greater than
0321: * or equal to <code>from</code>.
0322: */
0323: private int indexOf(final char[] array, final int from, final int to) {
0324: final char[] a = array;
0325: final int c = pattern[0];
0326:
0327: int i = from < 0 ? -1 : from - 1;
0328:
0329: if (caseSensitive) {
0330: while (++i < to)
0331: if (a[i] == c)
0332: return i;
0333: return -1;
0334: } else if (asciiCase) {
0335: while (++i < to)
0336: if (asciiToLowerCase(a[i]) == c)
0337: return i;
0338: return -1;
0339: } else {
0340: while (++i < to)
0341: if (unicodeToLowerCase(a[i]) == c)
0342: return i;
0343: return -1;
0344: }
0345: }
0346:
0347: /** Returns the index of the first occurrence of this one-character pattern
0348: * in the specified character sequence, starting at the specified index.
0349: *
0350: * <P>This method is a fallback for searches on one-character patterns.
0351: *
0352: * @param s the character sequence to look in.
0353: * @param from the index from which the search must start.
0354: * @return the index of the first occurrence of this pattern or
0355: * <code>-1</code>, if the this pattern never appears with index greater than
0356: * or equal to <code>from</code>.
0357: */
0358: private int indexOf(final CharSequence s, final int from,
0359: final int to) {
0360: final int c = pattern[0];
0361:
0362: int i = from < 0 ? -1 : from - 1;
0363:
0364: if (caseSensitive) {
0365: while (++i < to)
0366: if (s.charAt(i) == c)
0367: return i;
0368: return -1;
0369: } else if (asciiCase) {
0370: while (++i < to)
0371: if (asciiToLowerCase(s.charAt(i)) == c)
0372: return i;
0373: return -1;
0374: } else {
0375: while (++i < to)
0376: if (unicodeToLowerCase(s.charAt(i)) == c)
0377: return i;
0378: return -1;
0379: }
0380: }
0381:
0382: /** Returns the index of the first occurrence of this one-character pattern
0383: * in the specified byte array, seen as an ISO-8859-1 string,
0384: * starting at the specified index.
0385: *
0386: * <P>This method is a fallback for searches on one-character patterns.
0387: *
0388: * @param array the byte array to look in.
0389: * @param from the index from which the search must start.
0390: * @return the index of the first occurrence of this pattern or
0391: * <code>-1</code>, if this pattern never appears with index greater than
0392: * or equal to <code>from</code>.
0393: */
0394: private int indexOf(final byte[] array, final int from, final int to) {
0395: final byte[] a = array;
0396: final int c = pattern[0];
0397:
0398: int i = from < 0 ? -1 : from - 1;
0399:
0400: if (caseSensitive) {
0401: while (++i < to)
0402: if ((a[i] & 0xFF) == c)
0403: return i;
0404: return -1;
0405: } else {
0406: while (++i < to)
0407: if (asciiToLowerCase((char) (a[i] & 0xFF)) == c)
0408: return i;
0409: return -1;
0410: }
0411: }
0412:
0413: /** Returns the index of the first occurrence of this pattern in the given character array.
0414: *
0415: * @param array the character array to look in.
0416: * @return the index of the first occurrence of this pattern contained in the
0417: * given array, or <code>-1</code>, if the pattern cannot be found.
0418: */
0419:
0420: public int search(final char[] array) {
0421: return search(array, 0, array.length);
0422: }
0423:
0424: /** Returns the index of the first occurrence of this pattern in the given character array starting from a given index.
0425: *
0426: * @param array the character array to look in.
0427: * @param from the index from which the search must start.
0428: * @return the index of the first occurrence of this pattern contained in the
0429: * subarray starting from <code>from</code> (inclusive), or
0430: * <code>-1</code>, if the pattern cannot be found.
0431: */
0432: public int search(char[] array, int from) {
0433: return search(array, from, array.length);
0434: }
0435:
0436: /** Returns the index of the first occurrence of this pattern in the given character array between given indices.
0437: *
0438: * @param a the character array to look in.
0439: * @param from the index from which the search must start.
0440: * @param to the index at which the search must end.
0441: * @return the index of the first occurrence of this pattern contained in the
0442: * subarray starting from <code>from</code> (inclusive) up to <code>to</code>
0443: * (exclusive) characters, or <code>-1</code>, if the pattern cannot be found.
0444: */
0445:
0446: public int search(final char[] a, final int from, final int to) {
0447:
0448: final int n = pattern.length;
0449:
0450: if (n == 0)
0451: return from > to ? to : (from < 0 ? 0 : from);
0452: if (n == 1)
0453: return indexOf(a, from, to);
0454:
0455: final char[] p = pattern;
0456: final char last = p[n - 1];
0457: final int m1 = to - 1;
0458: final int[] shift = badCharShift;
0459: final int[] asciiShift = asciiBadCharShift;
0460: final int m = mask;
0461: final int hs = hashShift;
0462:
0463: int i = (from < 0 ? 0 : from) + n - 1, j, k;
0464: char c;
0465:
0466: if (caseSensitive) {
0467: while (i < m1) {
0468: if (a[i] == last) {
0469: j = n - 1;
0470: k = i;
0471: while (j-- != 0 && a[--k] == p[j])
0472: ;
0473: if (j < 0)
0474: return k;
0475: }
0476: if ((c = a[++i]) < 128)
0477: i += asciiShift[c];
0478: else {
0479: j = shift[c * c & m];
0480: k = shift[(c * PHI2) >> hs & m];
0481: i += j > k ? j : k;
0482: }
0483: }
0484:
0485: if (i == m1) {
0486: j = n;
0487: while (j-- != 0 && a[i--] == p[j])
0488: ;
0489: if (j < 0)
0490: return i + 1;
0491: }
0492: return -1;
0493: } else if (asciiCase) {
0494: while (i < m1) {
0495: if (asciiToLowerCase(a[i]) == last) {
0496: j = n - 1;
0497: k = i;
0498: while (j-- != 0 && asciiToLowerCase(a[--k]) == p[j])
0499: ;
0500: if (j < 0)
0501: return k;
0502: }
0503: if ((c = asciiToLowerCase(a[++i])) < 128)
0504: i += asciiShift[c];
0505: else {
0506: j = shift[c * c & m];
0507: k = shift[(c * PHI2) >> hs & m];
0508: i += j > k ? j : k;
0509: }
0510: }
0511:
0512: if (i == m1) {
0513: j = n;
0514: while (j-- != 0 && asciiToLowerCase(a[i--]) == p[j])
0515: ;
0516: if (j < 0)
0517: return i + 1;
0518: }
0519: return -1;
0520: } else {
0521: while (i < m1) {
0522: if (unicodeToLowerCase(a[i]) == last) {
0523: j = n - 1;
0524: k = i;
0525: while (j-- != 0
0526: && unicodeToLowerCase(a[--k]) == p[j])
0527: ;
0528: if (j < 0)
0529: return k;
0530: }
0531: if ((c = unicodeToLowerCase(a[++i])) < 128)
0532: i += asciiShift[c];
0533: else {
0534: j = shift[c * c & m];
0535: k = shift[(c * PHI2) >> hs & m];
0536: i += j > k ? j : k;
0537: }
0538: }
0539:
0540: if (i == m1) {
0541: j = n;
0542: while (j-- != 0 && unicodeToLowerCase(a[i--]) == p[j])
0543: ;
0544: if (j < 0)
0545: return i + 1;
0546: }
0547: return -1;
0548: }
0549: }
0550:
0551: /** Returns the index of the first occurrence of this pattern in the given character sequence.
0552: *
0553: * @param s the character sequence to look in.
0554: * @return the index of the first occurrence of this pattern contained in the
0555: * given character sequence, or <code>-1</code>, if the pattern cannot be found.
0556: */
0557:
0558: public int search(final CharSequence s) {
0559: return search(s, 0, s.length());
0560: }
0561:
0562: /** Returns the index of the first occurrence of this pattern in the given character sequence starting from a given index.
0563: *
0564: * @param s the character array to look in.
0565: * @param from the index from which the search must start.
0566: * @return the index of the first occurrence of this pattern contained in the
0567: * subsequence starting from <code>from</code> (inclusive), or
0568: * <code>-1</code>, if the pattern cannot be found.
0569: */
0570: public int search(final CharSequence s, final int from) {
0571: return search(s, from, s.length());
0572: }
0573:
0574: /** Returns the index of the first occurrence of this pattern in the given character sequence between given indices.
0575: *
0576: * @param s the character array to look in.
0577: * @param from the index from which the search must start.
0578: * @param to the index at which the search must end.
0579: * @return the index of the first occurrence of this pattern contained in the
0580: * subsequence starting from <code>from</code> (inclusive) up to <code>to</code>
0581: * (exclusive) characters, or <code>-1</code>, if the pattern cannot be found.
0582: */
0583:
0584: public int search(final CharSequence s, final int from, final int to) {
0585:
0586: final int n = pattern.length;
0587:
0588: if (n == 0)
0589: return from > to ? to : (from < 0 ? 0 : from);
0590: if (n == 1)
0591: return indexOf(s, from, to);
0592:
0593: final char[] p = pattern;
0594: final char last = p[n - 1];
0595: final int m1 = to - 1;
0596: final int[] shift = badCharShift;
0597: final int[] asciiShift = asciiBadCharShift;
0598: final int m = mask;
0599: final int hs = hashShift;
0600:
0601: int i = (from < 0 ? 0 : from) + n - 1, j, k;
0602: char c;
0603:
0604: if (caseSensitive) {
0605: while (i < m1) {
0606: if (s.charAt(i) == last) {
0607: j = n - 1;
0608: k = i;
0609: while (j-- != 0 && s.charAt(--k) == p[j])
0610: ;
0611: if (j < 0)
0612: return k;
0613: }
0614: if ((c = s.charAt(++i)) < 128)
0615: i += asciiShift[c];
0616: else {
0617: j = shift[c * c & m];
0618: k = shift[(c * PHI2) >> hs & m];
0619: i += j > k ? j : k;
0620: }
0621: }
0622:
0623: if (i == m1) {
0624: j = n;
0625: while (j-- != 0 && s.charAt(i--) == p[j])
0626: ;
0627: if (j < 0)
0628: return i + 1;
0629: }
0630: return -1;
0631: } else if (asciiCase) {
0632: while (i < m1) {
0633: if (asciiToLowerCase(s.charAt(i)) == last) {
0634: j = n - 1;
0635: k = i;
0636: while (j-- != 0
0637: && asciiToLowerCase(s.charAt(--k)) == p[j])
0638: ;
0639: if (j < 0)
0640: return k;
0641: }
0642: if ((c = asciiToLowerCase(s.charAt(++i))) < 128)
0643: i += asciiShift[c];
0644: else {
0645: j = shift[c * c & m];
0646: k = shift[(c * PHI2) >> hs & m];
0647: i += j > k ? j : k;
0648: }
0649: }
0650:
0651: if (i == m1) {
0652: j = n;
0653: while (j-- != 0
0654: && asciiToLowerCase(s.charAt(i--)) == p[j])
0655: ;
0656: if (j < 0)
0657: return i + 1;
0658: }
0659: return -1;
0660: } else {
0661: while (i < m1) {
0662: if (unicodeToLowerCase(s.charAt(i)) == last) {
0663: j = n - 1;
0664: k = i;
0665: while (j-- != 0
0666: && unicodeToLowerCase(s.charAt(--k)) == p[j])
0667: ;
0668: if (j < 0)
0669: return k;
0670: }
0671: if ((c = unicodeToLowerCase(s.charAt(++i))) < 128)
0672: i += asciiShift[c];
0673: else {
0674: j = shift[c * c & m];
0675: k = shift[(c * PHI2) >> hs & m];
0676: i += j > k ? j : k;
0677: }
0678: }
0679:
0680: if (i == m1) {
0681: j = n;
0682: while (j-- != 0
0683: && unicodeToLowerCase(s.charAt(i--)) == p[j])
0684: ;
0685: if (j < 0)
0686: return i + 1;
0687: }
0688: return -1;
0689: }
0690: }
0691:
0692: /** Returns the index of the first occurrence of this pattern in the given byte array.
0693: *
0694: * @param a the byte array to look in.
0695: * @return the index of the first occurrence of this pattern contained in the
0696: * given byte array, or <code>-1</code>, if the pattern cannot be found.
0697: */
0698:
0699: public int search(final byte[] a) {
0700: return search(a, 0, a.length);
0701: }
0702:
0703: /** Returns the index of the first occurrence of this pattern in the given byte array starting from a given index.
0704: *
0705: * @param a the byte array to look in.
0706: * @param from the index from which the search must start.
0707: * @return the index of the first occurrence of this pattern contained in the
0708: * array fragment starting from <code>from</code> (inclusive), or
0709: * <code>-1</code>, if the pattern cannot be found.
0710: */
0711: public int search(final byte[] a, final int from) {
0712: return search(a, from, a.length);
0713: }
0714:
0715: /** Returns the index of the first occurrence of this pattern in the given byte array between given indices.
0716: *
0717: * @param a the byte array to look in.
0718: * @param from the index from which the search must start.
0719: * @param to the index at which the search must end.
0720: * @return the index of the first occurrence of this pattern contained in the
0721: * array fragment starting from <code>from</code> (inclusive) up to <code>to</code>
0722: * (exclusive) characters, or <code>-1</code>, if the pattern cannot be found.
0723: * TODO: this must be tested
0724: */
0725: public int search(final byte[] a, final int from, final int to) {
0726:
0727: final int n = pattern.length;
0728:
0729: if (n == 0)
0730: return from > to ? to : (from < 0 ? 0 : from);
0731: if (n == 1)
0732: return indexOf(a, from, to);
0733:
0734: final char[] p = pattern;
0735: final char last = p[n - 1];
0736: final int m1 = to - 1;
0737: final int[] shift = badCharShift;
0738: final int[] asciiShift = asciiBadCharShift;
0739: final int m = mask;
0740: final int hs = hashShift;
0741:
0742: int i = (from < 0 ? 0 : from) + n - 1, j, k;
0743: char c;
0744:
0745: if (caseSensitive) {
0746: while (i < m1) {
0747: if ((a[i] & 0xFF) == last) {
0748: j = n - 1;
0749: k = i;
0750: while (j-- != 0 && (a[--k] & 0xFF) == p[j])
0751: ;
0752: if (j < 0)
0753: return k;
0754: }
0755: if ((c = (char) (a[++i] & 0xFF)) < 128)
0756: i += asciiShift[c];
0757: else {
0758: j = shift[c * c & m];
0759: k = shift[(c * PHI2) >> hs & m];
0760: i += j > k ? j : k;
0761: }
0762: }
0763:
0764: if (i == m1) {
0765: j = n;
0766: while (j-- != 0 && (a[i--] & 0xFF) == p[j])
0767: ;
0768: if (j < 0)
0769: return i + 1;
0770: }
0771: return -1;
0772: } else if (asciiCase) {
0773: while (i < m1) {
0774: if (asciiToLowerCase((char) (a[i] & 0xFF)) == last) {
0775: j = n - 1;
0776: k = i;
0777: while (j-- != 0
0778: && asciiToLowerCase((char) (a[--k] & 0xFF)) == p[j])
0779: ;
0780: if (j < 0)
0781: return k;
0782: }
0783: if ((c = asciiToLowerCase((char) (a[++i] & 0xFF))) < 128)
0784: i += asciiShift[c];
0785: else {
0786: j = shift[c * c & m];
0787: k = shift[(c * PHI2) >> hs & m];
0788: i += j > k ? j : k;
0789: }
0790: }
0791:
0792: if (i == m1) {
0793: j = n;
0794: while (j-- != 0
0795: && asciiToLowerCase((char) (a[i--] & 0xFF)) == p[j])
0796: ;
0797: if (j < 0)
0798: return i + 1;
0799: }
0800: return -1;
0801: } else {
0802: while (i < m1) {
0803: if (unicodeToLowerCase((char) (a[i] & 0xFF)) == last) {
0804: j = n - 1;
0805: k = i;
0806: while (j-- != 0
0807: && unicodeToLowerCase((char) (a[--k] & 0xFF)) == p[j])
0808: ;
0809: if (j < 0)
0810: return k;
0811: }
0812: if ((c = unicodeToLowerCase((char) (a[++i] & 0xFF))) < 128)
0813: i += asciiShift[c];
0814: else {
0815: j = shift[c * c & m];
0816: k = shift[(c * PHI2) >> hs & m];
0817: i += j > k ? j : k;
0818: }
0819: }
0820:
0821: if (i == m1) {
0822: j = n;
0823: while (j-- != 0
0824: && unicodeToLowerCase((char) (a[i--] & 0xFF)) == p[j])
0825: ;
0826: if (j < 0)
0827: return i + 1;
0828: }
0829: return -1;
0830: }
0831: }
0832:
0833: /** Returns the index of the first occurrence of this pattern in the given character list.
0834: *
0835: * @param list the character list to look in.
0836: * @return the index of the first occurrence of this pattern contained in the
0837: * given list, or <code>-1</code>, if the pattern cannot be found.
0838: */
0839:
0840: public int search(final CharList list) {
0841: return search(list, 0, list.size());
0842: }
0843:
0844: /** Returns the index of the first occurrence of this pattern in the given character list starting from a given index.
0845: *
0846: * @param list the character list to look in.
0847: * @param from the index from which the search must start.
0848: * @return the index of the first occurrence of this pattern contained in the
0849: * sublist starting from <code>from</code> (inclusive), or
0850: * <code>-1</code>, if the pattern cannot be found.
0851: */
0852: public int search(final CharList list, int from) {
0853: return search(list, from, list.size());
0854: }
0855:
0856: /** Returns the index of the first occurrence of this pattern in the given character list between given indices.
0857: *
0858: * @param list the character list to look in.
0859: * @param from the index from which the search must start.
0860: * @param to the index at which the search must end.
0861: * @return the index of the first occurrence of this pattern contained in the
0862: * sublist starting from <code>from</code> (inclusive) up to <code>to</code>
0863: * (exclusive) characters, or <code>-1</code>, if the pattern cannot be found.
0864: */
0865:
0866: public int search(final CharList list, final int from, final int to) {
0867:
0868: final int n = pattern.length;
0869:
0870: if (n == 0)
0871: return from > to ? to : (from < 0 ? 0 : from);
0872: if (n == 1)
0873: return list.subList(from, to).indexOf(pattern[0]);
0874:
0875: final char[] p = pattern;
0876: final char last = p[n - 1];
0877: final int m1 = to - 1;
0878: final int[] shift = badCharShift;
0879: final int[] asciiShift = asciiBadCharShift;
0880: final int m = mask;
0881: final int hs = hashShift;
0882:
0883: int i = (from < 0 ? 0 : from) + n - 1, j, k;
0884: char c;
0885:
0886: if (caseSensitive) {
0887: while (i < m1) {
0888: if (list.getChar(i) == last) {
0889: j = n - 1;
0890: k = i;
0891: while (j-- != 0 && list.getChar(--k) == p[j])
0892: ;
0893: if (j < 0)
0894: return k;
0895: }
0896: if ((c = list.getChar(++i)) < 128)
0897: i += asciiShift[c];
0898: else {
0899: j = shift[c * c & m];
0900: k = shift[(c * PHI2) >> hs & m];
0901: i += j > k ? j : k;
0902: }
0903: }
0904:
0905: if (i == m1) {
0906: j = n;
0907: while (j-- != 0 && list.getChar(i--) == p[j])
0908: ;
0909: if (j < 0)
0910: return i + 1;
0911: }
0912: return -1;
0913: } else if (asciiCase) {
0914: while (i < m1) {
0915: if (asciiToLowerCase(list.getChar(i)) == last) {
0916: j = n - 1;
0917: k = i;
0918: while (j-- != 0
0919: && asciiToLowerCase(list.getChar(--k)) == p[j])
0920: ;
0921: if (j < 0)
0922: return k;
0923: }
0924: if ((c = asciiToLowerCase(list.getChar(++i))) < 128)
0925: i += asciiShift[c];
0926: else {
0927: j = shift[c * c & m];
0928: k = shift[(c * PHI2) >> hs & m];
0929: i += j > k ? j : k;
0930: }
0931: }
0932:
0933: if (i == m1) {
0934: j = n;
0935: while (j-- != 0
0936: && asciiToLowerCase(list.getChar(i--)) == p[j])
0937: ;
0938: if (j < 0)
0939: return i + 1;
0940: }
0941: return -1;
0942: } else {
0943: while (i < m1) {
0944: if (unicodeToLowerCase(list.getChar(i)) == last) {
0945: j = n - 1;
0946: k = i;
0947: while (j-- != 0
0948: && unicodeToLowerCase(list.getChar(--k)) == p[j])
0949: ;
0950: if (j < 0)
0951: return k;
0952: }
0953: if ((c = unicodeToLowerCase(list.getChar(++i))) < 128)
0954: i += asciiShift[c];
0955: else {
0956: j = shift[c * c & m];
0957: k = shift[(c * PHI2) >> hs & m];
0958: i += j > k ? j : k;
0959: }
0960: }
0961:
0962: if (i == m1) {
0963: j = n;
0964: while (j-- != 0
0965: && unicodeToLowerCase(list.getChar(i--)) == p[j])
0966: ;
0967: if (j < 0)
0968: return i + 1;
0969: }
0970: return -1;
0971: }
0972: }
0973:
0974: /** Compares this text pattern to another object.
0975: *
0976: * <P>This method will return <code>true</code> iff its argument
0977: * is a <code>TextPattern</code> containing the same constant pattern with the same flags set.
0978: *
0979: * @param o an object.
0980: * @return true if the argument is a <code>TextPattern</code>s that contains the same constant pattern of this text pattern
0981: * and has the same flags set.
0982: */
0983: final public boolean equals(final Object o) {
0984: if (o instanceof TextPattern) {
0985: TextPattern p = (TextPattern) o;
0986: return caseSensitive == p.caseSensitive
0987: && asciiCase == p.asciiCase
0988: && java.util.Arrays.equals(p.pattern, pattern);
0989: }
0990: return false;
0991: }
0992:
0993: /** Returns a hash code for this text pattern.
0994: *
0995: * <P>The hash code of a text pattern is the same as that of a
0996: * <code>String</code> with the same content (suitably lower cased, if the pattern is case insensitive).
0997: *
0998: * @return a hash code array for this object.
0999: * @see java.lang.String#hashCode()
1000: */
1001: final public int hashCode() {
1002: final char[] a = pattern;
1003: final int l = a.length;
1004: int h;
1005: for (int i = h = 0; i < l; i++)
1006: h = 31 * h + a[i];
1007: return h;
1008: }
1009:
1010: final public String toString() {
1011: return new String(pattern);
1012: }
1013: }
|