0001: /*
0002: *******************************************************************************
0003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */
0007:
0008: package com.ibm.icu.text;
0009:
0010: import java.text.CharacterIterator;
0011: import java.text.StringCharacterIterator;
0012: import java.util.Locale;
0013:
0014: import com.ibm.icu.impl.CharacterIteratorWrapper;
0015: import com.ibm.icu.impl.NormalizerImpl;
0016: import com.ibm.icu.lang.UCharacter;
0017: import com.ibm.icu.util.ULocale;
0018:
0019: /**
0020: * <p>
0021: * <code>StringSearch</code> is the concrete subclass of
0022: * <code>SearchIterator</code> that provides language-sensitive text searching
0023: * based on the comparison rules defined in a {@link RuleBasedCollator} object.
0024: * </p>
0025: * <p>
0026: * <code>StringSearch</code> uses a version of the fast Boyer-Moore search
0027: * algorithm that has been adapted to work with the large character set of
0028: * Unicode. Refer to
0029: * <a href="http://icu.sourceforge.net/docs/papers/efficient_text_searching_in_java.html">
0030: * "Efficient Text Searching in Java"</a>, published in the
0031: * <i>Java Report</i> on February, 1999, for further information on the
0032: * algorithm.
0033: * </p>
0034: * <p>
0035: * Users are also strongly encouraged to read the section on
0036: * <a href="http://icu.sourceforge.net/userguide/searchString.html">
0037: * String Search</a> and
0038: * <a href="http://icu.sourceforge.net/userguide/Collate_Intro.html">
0039: * Collation</a> in the user guide before attempting to use this class.
0040: * </p>
0041: * <p>
 * String searching gets a little complicated when accents are encountered at
0043: * match boundaries. If a match is found and it has preceding or trailing
0044: * accents not part of the match, the result returned will include the
0045: * preceding accents up to the first base character, if the pattern searched
0046: * for starts an accent. Likewise,
0047: * if the pattern ends with an accent, all trailing accents up to the first
0048: * base character will be included in the result.
0049: * </p>
0050: * <p>
0051: * For example, if a match is found in target text "a\u0325\u0300" for
0052: * the pattern
0053: * "a\u0325", the result returned by StringSearch will be the index 0 and
0054: * length 3 <0, 3>. If a match is found in the target
0055: * "a\u0325\u0300"
0056: * for the pattern "\u0300", then the result will be index 1 and length 2
0057: * <1, 2>.
0058: * </p>
0059: * <p>
0060: * In the case where the decomposition mode is on for the RuleBasedCollator,
 * all matches that start or end with an accent will have their results include
0062: * preceding or following accents respectively. For example, if pattern "a" is
0063: * looked for in the target text "á\u0325", the result will be
0064: * index 0 and length 2 <0, 2>.
0065: * </p>
0066: * <p>
0067: * The StringSearch class provides two options to handle accent matching
0068: * described below:
0069: * </p>
0070: * <p>
0071: * Let S' be the sub-string of a text string S between the offsets start and
0072: * end <start, end>.
0073: * <br>
0074: * A pattern string P matches a text string S at the offsets <start,
0075: * length>
0076: * <br>
0077: * if
0078: * <pre>
0079: * option 1. P matches some canonical equivalent string of S'. Suppose the
0080: * RuleBasedCollator used for searching has a collation strength of
0081: * TERTIARY, all accents are non-ignorable. If the pattern
0082: * "a\u0300" is searched in the target text
0083: * "a\u0325\u0300",
0084: * a match will be found, since the target text is canonically
0085: * equivalent to "a\u0300\u0325"
0086: * option 2. P matches S' and if P starts or ends with a combining mark,
0087: * there exists no non-ignorable combining mark before or after S'
0088: * in S respectively. Following the example above, the pattern
0089: * "a\u0300" will not find a match in "a\u0325\u0300",
0090: * since
0091: * there exists a non-ignorable accent '\u0325' in the middle of
0092: * 'a' and '\u0300'. Even with a target text of
0093: * "a\u0300\u0325" a match will not be found because of the
0094: * non-ignorable trailing accent \u0325.
0095: * </pre>
0096: * Option 2. will be the default mode for dealing with boundary accents unless
0097: * specified via the API setCanonical(boolean).
0098: * One restriction is to be noted for option 1. Currently there are no
 * composite characters that consist of a character with combining class > 0
0100: * before a character with combining class == 0. However, if such a character
0101: * exists in the future, the StringSearch may not work correctly with option 1
0102: * when such characters are encountered.
0103: * </p>
0104: * <p>
0105: * <tt>SearchIterator</tt> provides APIs to specify the starting position
0106: * within the text string to be searched, e.g. <tt>setIndex</tt>,
0107: * <tt>preceding</tt> and <tt>following</tt>. Since the starting position will
0108: * be set as it is specified, please take note that there are some dangerous
0109: * positions which the search may render incorrect results:
0110: * <ul>
0111: * <li> The midst of a substring that requires decomposition.
0112: * <li> If the following match is to be found, the position should not be the
0113: * second character which requires to be swapped with the preceding
0114: * character. Vice versa, if the preceding match is to be found,
0115: * position to search from should not be the first character which
0116: * requires to be swapped with the next character. E.g certain Thai and
0117: * Lao characters require swapping.
0118: * <li> If a following pattern match is to be found, any position within a
0119: * contracting sequence except the first will fail. Vice versa if a
 * preceding pattern match is to be found, an invalid starting point
0121: * would be any character within a contracting sequence except the last.
0122: * </ul>
0123: * </p>
0124: * <p>
0125: * Though collator attributes will be taken into consideration while
0126: * performing matches, there are no APIs provided in StringSearch for setting
0127: * and getting the attributes. These attributes can be set by getting the
0128: * collator from <tt>getCollator</tt> and using the APIs in
0129: * <tt>com.ibm.icu.text.Collator</tt>. To update StringSearch to the new
0130: * collator attributes, <tt>reset()</tt> or
0131: * <tt>setCollator(RuleBasedCollator)</tt> has to be called.
0132: * </p>
0133: * <p>
0134: * Consult the
0135: * <a href="http://icu.sourceforge.net/userguide/searchString.html">
0136: * String Search</a> user guide and the <code>SearchIterator</code>
0137: * documentation for more information and examples of use.
0138: * </p>
0139: * <p>
0140: * This class is not subclassable
0141: * </p>
0142: * @see SearchIterator
0143: * @see RuleBasedCollator
0144: * @author Laura Werner, synwee
0145: * @stable ICU 2.0
0146: */
0147: // internal notes: all methods do not guarantee the correct status of the
0148: // characteriterator. the caller has to maintain the original index position
0149: // if necessary. methods could change the index position as it deems fit
0150: public final class StringSearch extends SearchIterator {
0151:
0152: // public constructors --------------------------------------------------
0153:
0154: /**
0155: * Initializes the iterator to use the language-specific rules defined in
0156: * the argument collator to search for argument pattern in the argument
0157: * target text. The argument breakiter is used to define logical matches.
0158: * See super class documentation for more details on the use of the target
0159: * text and BreakIterator.
0160: * @param pattern text to look for.
0161: * @param target target text to search for pattern.
0162: * @param collator RuleBasedCollator that defines the language rules
0163: * @param breakiter A {@link BreakIterator} that is used to determine the
0164: * boundaries of a logical match. This argument can be null.
0165: * @exception IllegalArgumentException thrown when argument target is null,
0166: * or of length 0
0167: * @see BreakIterator
0168: * @see RuleBasedCollator
0169: * @see SearchIterator
0170: * @stable ICU 2.0
0171: */
    public StringSearch(String pattern, CharacterIterator target,
        RuleBasedCollator collator, BreakIterator breakiter) {
        // Superclass stores target in targetText and validates it
        // (see @exception in the javadoc above).
        super (target, breakiter);
        // Cache the valid iteration range of the target; a
        // CharacterIterator need not begin at index 0.
        m_textBeginOffset_ = targetText.getBeginIndex();
        m_textLimitOffset_ = targetText.getEndIndex();
        m_collator_ = collator;
        m_colEIter_ = m_collator_.getCollationElementIterator(target);
        // Utility iterator reused throughout for temporary collation
        // element iteration; starts out over the empty string.
        m_utilColEIter_ = collator.getCollationElementIterator("");
        // Mask keeping only the weights significant at the collator's
        // current strength.
        m_ceMask_ = getMask(m_collator_.getStrength());
        m_isCanonicalMatch_ = false; // default: exact (option 2) matching
        m_pattern_ = new Pattern(pattern);
        m_matchedIndex_ = DONE; // no match found yet

        // Builds the internal pattern tables from the current pattern
        // and collator state.
        initialize();
    }
0187:
0188: /**
0189: * Initializes the iterator to use the language-specific rules defined in
0190: * the argument collator to search for argument pattern in the argument
0191: * target text. No BreakIterators are set to test for logical matches.
0192: * @param pattern text to look for.
0193: * @param target target text to search for pattern.
0194: * @param collator RuleBasedCollator that defines the language rules
0195: * @exception IllegalArgumentException thrown when argument target is null,
0196: * or of length 0
0197: * @see RuleBasedCollator
0198: * @see SearchIterator
0199: * @stable ICU 2.0
0200: */
    public StringSearch(String pattern, CharacterIterator target,
        RuleBasedCollator collator) {
        // Delegates with a default character BreakIterator, so every
        // character boundary is an acceptable logical match boundary.
        // NOTE(review): the javadoc above says "No BreakIterators are
        // set", which does not match this delegation — confirm intent.
        this (pattern, target, collator, BreakIterator
                .getCharacterInstance());
    }
0206:
0207: /**
0208: * Initializes the iterator to use the language-specific rules and
0209: * break iterator rules defined in the argument locale to search for
0210: * argument pattern in the argument target text.
0211: * See super class documentation for more details on the use of the target
0212: * text and BreakIterator.
0213: * @param pattern text to look for.
0214: * @param target target text to search for pattern.
0215: * @param locale locale to use for language and break iterator rules
0216: * @exception IllegalArgumentException thrown when argument target is null,
0217: * or of length 0. ClassCastException thrown if the collator for
0218: * the specified locale is not a RuleBasedCollator.
0219: * @see BreakIterator
0220: * @see RuleBasedCollator
0221: * @see SearchIterator
0222: * @stable ICU 2.0
0223: */
    public StringSearch(String pattern, CharacterIterator target,
        Locale locale) {
        // Bridge to the ULocale overload, which resolves the collator
        // and break iterator for the locale.
        this (pattern, target, ULocale.forLocale(locale));
    }
0228:
0229: /**
0230: * Initializes the iterator to use the language-specific rules and
0231: * break iterator rules defined in the argument locale to search for
0232: * argument pattern in the argument target text.
0233: * See super class documentation for more details on the use of the target
0234: * text and BreakIterator.
0235: * @param pattern text to look for.
0236: * @param target target text to search for pattern.
0237: * @param locale ulocale to use for language and break iterator rules
0238: * @exception IllegalArgumentException thrown when argument target is null,
0239: * or of length 0. ClassCastException thrown if the collator for
0240: * the specified locale is not a RuleBasedCollator.
0241: * @see BreakIterator
0242: * @see RuleBasedCollator
0243: * @see SearchIterator
0244: * @draft ICU 3.2
0245: * @provisional This API might change or be removed in a future release.
0246: */
    public StringSearch(String pattern, CharacterIterator target,
        ULocale locale) {
        // A ClassCastException propagates if the locale's collator is
        // not a RuleBasedCollator, as documented above.
        this (pattern, target, (RuleBasedCollator) Collator
                .getInstance(locale), BreakIterator
                .getCharacterInstance(locale));
    }
0253:
0254: /**
0255: * Initializes the iterator to use the language-specific rules and
0256: * break iterator rules defined in the default locale to search for
0257: * argument pattern in the argument target text.
0258: * See super class documentation for more details on the use of the target
0259: * text and BreakIterator.
0260: * @param pattern text to look for.
0261: * @param target target text to search for pattern.
0262: * @exception IllegalArgumentException thrown when argument target is null,
0263: * or of length 0. ClassCastException thrown if the collator for
0264: * the default locale is not a RuleBasedCollator.
0265: * @see BreakIterator
0266: * @see RuleBasedCollator
0267: * @see SearchIterator
0268: * @stable ICU 2.0
0269: */
    public StringSearch(String pattern, String target) {
        // Wraps the String target in a StringCharacterIterator and uses
        // the default locale's collator and character break iterator.
        this (pattern, new StringCharacterIterator(target),
                (RuleBasedCollator) Collator.getInstance(),
                BreakIterator.getCharacterInstance());
    }
0275:
0276: // public getters -----------------------------------------------------
0277:
0278: /**
0279: * <p>
0280: * Gets the RuleBasedCollator used for the language rules.
0281: * </p>
0282: * <p>
0283: * Since StringSearch depends on the returned RuleBasedCollator, any
0284: * changes to the RuleBasedCollator result should follow with a call to
0285: * either StringSearch.reset() or
0286: * StringSearch.setCollator(RuleBasedCollator) to ensure the correct
0287: * search behaviour.
0288: * </p>
0289: * @return RuleBasedCollator used by this StringSearch
0290: * @see RuleBasedCollator
0291: * @see #setCollator
0292: * @stable ICU 2.0
0293: */
    public RuleBasedCollator getCollator() {
        // Returns the live collator, not a copy; per the javadoc above,
        // callers mutating it must follow with reset() or setCollator().
        return m_collator_;
    }
0297:
0298: /**
0299: * Returns the pattern for which StringSearch is searching for.
0300: * @return the pattern searched for
0301: * @stable ICU 2.0
0302: */
    public String getPattern() {
        // The pattern string is held by the internal Pattern object.
        return m_pattern_.targetText;
    }
0306:
0307: /**
0308: * Return the index in the target text where the iterator is currently
0309: * positioned at.
0310: * If the iteration has gone past the end of the target text or past
0311: * the beginning for a backwards search, {@link #DONE} is returned.
0312: * @return index in the target text where the iterator is currently
0313: * positioned at
0314: * @stable ICU 2.8
0315: */
0316: public int getIndex() {
0317: int result = m_colEIter_.getOffset();
0318: if (isOutOfBounds(m_textBeginOffset_, m_textLimitOffset_,
0319: result)) {
0320: return DONE;
0321: }
0322: return result;
0323: }
0324:
0325: /**
0326: * Determines whether canonical matches (option 1, as described in the
0327: * class documentation) is set.
0328: * See setCanonical(boolean) for more information.
0329: * @see #setCanonical
0330: * @return true if canonical matches is set, false otherwise
0331: * @stable ICU 2.8
0332: */
    public boolean isCanonical() {
        // True when option 1 (canonical-equivalent) matching is enabled.
        return m_isCanonicalMatch_;
    }
0336:
0337: // public setters -----------------------------------------------------
0338:
0339: /**
0340: * <p>
0341: * Sets the RuleBasedCollator to be used for language-specific searching.
0342: * </p>
0343: * <p>
0344: * This method causes internal data such as Boyer-Moore shift tables
0345: * to be recalculated, but the iterator's position is unchanged.
0346: * </p>
0347: * @param collator to use for this StringSearch
0348: * @exception IllegalArgumentException thrown when collator is null
0349: * @see #getCollator
0350: * @stable ICU 2.0
0351: */
0352: public void setCollator(RuleBasedCollator collator) {
0353: if (collator == null) {
0354: throw new IllegalArgumentException(
0355: "Collator can not be null");
0356: }
0357: m_collator_ = collator;
0358: m_ceMask_ = getMask(m_collator_.getStrength());
0359: // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
0360: initialize();
0361: m_colEIter_.setCollator(m_collator_);
0362: m_utilColEIter_.setCollator(m_collator_);
0363: }
0364:
0365: /**
0366: * <p>
0367: * Set the pattern to search for.
0368: * </p>
0369: * <p>
0370: * This method causes internal data such as Boyer-Moore shift tables
0371: * to be recalculated, but the iterator's position is unchanged.
0372: * </p>
0373: * @param pattern for searching
0374: * @see #getPattern
0375: * @exception IllegalArgumentException thrown if pattern is null or of
0376: * length 0
0377: * @stable ICU 2.0
0378: */
0379: public void setPattern(String pattern) {
0380: if (pattern == null || pattern.length() <= 0) {
0381: throw new IllegalArgumentException(
0382: "Pattern to search for can not be null or of length 0");
0383: }
0384: m_pattern_.targetText = pattern;
0385: initialize();
0386: }
0387:
0388: /**
0389: * Set the target text to be searched. Text iteration will hence begin at
0390: * the start of the text string. This method is useful if you want to
0391: * re-use an iterator to search within a different body of text.
0392: * @param text new text iterator to look for match,
0393: * @exception IllegalArgumentException thrown when text is null or has
0394: * 0 length
0395: * @see #getTarget
0396: * @stable ICU 2.8
0397: */
    public void setTarget(CharacterIterator text) {
        // Superclass validates the new text and resets iteration state.
        super .setTarget(text);
        // Re-cache the valid iteration range of the new target.
        m_textBeginOffset_ = targetText.getBeginIndex();
        m_textLimitOffset_ = targetText.getEndIndex();
        // Point the collation element iterator at the new text.
        m_colEIter_.setText(targetText);
    }
0404:
0405: /**
0406: * <p>
0407: * Sets the position in the target text which the next search will start
0408: * from to the argument. This method clears all previous states.
0409: * </p>
0410: * <p>
0411: * This method takes the argument position and sets the position in the
0412: * target text accordingly, without checking if position is pointing to a
0413: * valid starting point to begin searching.
0414: * </p>
0415: * <p>
0416: * Search positions that may render incorrect results are highlighted in
0417: * the class documentation.
0418: * </p>
0419: * @param position index to start next search from.
0420: * @exception IndexOutOfBoundsException thrown if argument position is out
0421: * of the target text range.
0422: * @see #getIndex
0423: * @stable ICU 2.8
0424: */
    public void setIndex(int position) {
        // Superclass range-checks position (may throw
        // IndexOutOfBoundsException, per the javadoc above).
        super .setIndex(position);
        // Clear any previous match so the next search starts fresh.
        m_matchedIndex_ = DONE;
        // Position the collation element iterator exactly at position,
        // without adjusting to a safe collation boundary.
        m_colEIter_.setExactOffset(position);
    }
0430:
0431: /**
0432: * <p>
0433: * Set the canonical match mode. See class documentation for details.
0434: * The default setting for this property is false.
0435: * </p>
0436: * @param allowCanonical flag indicator if canonical matches are allowed
0437: * @see #isCanonical
0438: * @stable ICU 2.8
0439: */
0440: public void setCanonical(boolean allowCanonical) {
0441: m_isCanonicalMatch_ = allowCanonical;
0442: if (m_isCanonicalMatch_ == true) {
0443: if (m_canonicalPrefixAccents_ == null) {
0444: m_canonicalPrefixAccents_ = new StringBuffer();
0445: } else {
0446: m_canonicalPrefixAccents_.delete(0,
0447: m_canonicalPrefixAccents_.length());
0448: }
0449: if (m_canonicalSuffixAccents_ == null) {
0450: m_canonicalSuffixAccents_ = new StringBuffer();
0451: } else {
0452: m_canonicalSuffixAccents_.delete(0,
0453: m_canonicalSuffixAccents_.length());
0454: }
0455: }
0456: }
0457:
0458: // public miscellaneous methods -----------------------------------------
0459:
0460: /**
0461: * <p>
0462: * Resets the search iteration. All properties will be reset to the
0463: * default value.
0464: * </p>
0465: * <p>
0466: * Search will begin at the start of the target text if a forward iteration
0467: * is initiated before a backwards iteration. Otherwise if a
0468: * backwards iteration is initiated before a forwards iteration, the search
0469: * will begin at the end of the target text.
0470: * </p>
0471: * <p>
0472: * Canonical match option will be reset to false, ie an exact match.
0473: * </p>
0474: * @stable ICU 2.8
0475: */
    public void reset() {
        // reset is setting the attributes that are already in string search,
        // hence all attributes in the collator should be retrieved without any
        // problems
        super .reset();
        m_isCanonicalMatch_ = false; // back to exact (option 2) matching
        // Re-read the strength in case the caller mutated the collator.
        m_ceMask_ = getMask(m_collator_.getStrength());
        // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
        initialize();
        // Re-sync both collation element iterators with the (possibly
        // mutated) collator.
        m_colEIter_.setCollator(m_collator_);
        m_colEIter_.reset();
        m_utilColEIter_.setCollator(m_collator_);
    }
0489:
0490: // protected methods -----------------------------------------------------
0491:
0492: /**
0493: * <p>
0494: * Concrete method to provide the mechanism
0495: * for finding the next <b>forwards</b> match in the target text.
0496: * See super class documentation for its use.
0497: * </p>
0498: * @param start index in the target text at which the forwards search
0499: * should begin.
0500: * @return the starting index of the next forwards match if found, DONE
0501: * otherwise
0502: * @see #handlePrevious(int)
0503: * @see #DONE
0504: * @stable ICU 2.8
0505: */
    protected int handleNext(int start) {
        if (m_pattern_.m_CELength_ == 0) {
            // Empty pattern: every character boundary is a zero-length
            // match.
            matchLength = 0;
            if (m_matchedIndex_ == DONE && start == m_textBeginOffset_) {
                // First forwards call: match at the very beginning.
                m_matchedIndex_ = start;
                return m_matchedIndex_;
            }

            targetText.setIndex(start);
            char ch = targetText.current();
            // ch can never be done, it is handled by next()
            char ch2 = targetText.next();
            if (ch2 == CharacterIterator.DONE) {
                m_matchedIndex_ = DONE;
            } else {
                m_matchedIndex_ = targetText.getIndex();
            }
            if (UTF16.isLeadSurrogate(ch)
                    && UTF16.isTrailSurrogate(ch2)) {
                // Never match in the middle of a surrogate pair; skip
                // past the trail surrogate too.
                targetText.next();
                m_matchedIndex_ = targetText.getIndex();
            }
        } else {
            if (matchLength <= 0) {
                // we must have reversed direction after we reached the start
                // of the target text
                // see SearchIterator next(), it checks the bounds and returns
                // if it exceeds the range. It does not allow setting of
                // m_matchedIndex
                if (start == m_textBeginOffset_) {
                    m_matchedIndex_ = DONE;
                } else {
                    // for boundary check purposes. this will ensure that the
                    // next match will not precede the current offset
                    // note search->matchedIndex will always be set to something
                    // in the code
                    m_matchedIndex_ = start - 1;
                }
            }

            // status checked below
            if (m_isCanonicalMatch_) {
                // can't use exact here since extra accents are allowed.
                handleNextCanonical(start);
            } else {
                handleNextExact(start);
            }
        }
        // Leave the target iterator at the match position, or at the
        // limit when no match was found.
        if (m_matchedIndex_ == DONE) {
            targetText.setIndex(m_textLimitOffset_);
        } else {
            targetText.setIndex(m_matchedIndex_);
        }
        return m_matchedIndex_;
    }
0561:
0562: /**
0563: * <p>
0564: * Concrete method to provide the mechanism
0565: * for finding the next <b>backwards</b> match in the target text.
0566: * See super class documentation for its use.
0567: * </p>
0568: * @param start index in the target text at which the backwards search
0569: * should begin.
0570: * @return the starting index of the next backwards match if found, DONE
0571: * otherwise
0572: * @see #handleNext(int)
0573: * @see #DONE
0574: * @stable ICU 2.8
0575: */
    protected int handlePrevious(int start) {
        if (m_pattern_.m_CELength_ == 0) {
            // Empty pattern: every character boundary is a zero-length
            // match.
            matchLength = 0;
            // start can never be DONE or 0, it is handled in previous
            targetText.setIndex(start);
            char ch = targetText.previous();
            if (ch == CharacterIterator.DONE) {
                m_matchedIndex_ = DONE;
            } else {
                m_matchedIndex_ = targetText.getIndex();
                if (UTF16.isTrailSurrogate(ch)) {
                    // Never match in the middle of a surrogate pair; step
                    // back over the lead surrogate as well.
                    if (UTF16.isLeadSurrogate(targetText.previous())) {
                        m_matchedIndex_ = targetText.getIndex();
                    }
                }
            }
        } else {
            if (matchLength == 0) {
                // NOTE(review): handleNext performs the equivalent check
                // as matchLength <= 0 — confirm whether matchLength can
                // legitimately be negative here.
                // we must have reversed direction after we reached the end
                // of the target text
                // see SearchIterator next(), it checks the bounds and returns
                // if it exceeds the range. It does not allow setting of
                // m_matchedIndex
                m_matchedIndex_ = DONE;
            }
            if (m_isCanonicalMatch_) {
                // can't use exact here since extra accents are allowed.
                handlePreviousCanonical(start);
            } else {
                handlePreviousExact(start);
            }
        }

        // Leave the target iterator at the match position, or at the
        // beginning when no match was found.
        if (m_matchedIndex_ == DONE) {
            targetText.setIndex(m_textBeginOffset_);
        } else {
            targetText.setIndex(m_matchedIndex_);
        }
        return m_matchedIndex_;
    }
0616:
0617: // private static inner classes ----------------------------------------
0618:
0619: private static class Pattern {
0620: // protected methods -----------------------------------------------
0621:
0622: /**
0623: * Pattern string
0624: */
0625: protected String targetText;
0626: /**
0627: * Array containing the collation elements of targetText
0628: */
0629: protected int m_CE_[];
0630: /**
0631: * Number of collation elements in m_CE_
0632: */
0633: protected int m_CELength_;
0634: /**
0635: * Flag indicator if targetText starts with an accent
0636: */
0637: protected boolean m_hasPrefixAccents_;
0638: /**
0639: * Flag indicator if targetText ends with an accent
0640: */
0641: protected boolean m_hasSuffixAccents_;
0642: /**
0643: * Default number of characters to shift for Boyer Moore
0644: */
0645: protected int m_defaultShiftSize_;
0646: /**
0647: * Number of characters to shift for Boyer Moore, depending on the
0648: * source text to search
0649: */
0650: protected char m_shift_[];
0651: /**
0652: * Number of characters to shift backwards for Boyer Moore, depending
0653: * on the source text to search
0654: */
0655: protected char m_backShift_[];
0656:
0657: // protected constructors ------------------------------------------
0658:
0659: /**
0660: * Empty constructor
0661: */
0662: protected Pattern(String pattern) {
0663: targetText = pattern;
0664: m_CE_ = new int[INITIAL_ARRAY_SIZE_];
0665: m_CELength_ = 0;
0666: m_hasPrefixAccents_ = false;
0667: m_hasSuffixAccents_ = false;
0668: m_defaultShiftSize_ = 1;
0669: m_shift_ = new char[MAX_TABLE_SIZE_];
0670: m_backShift_ = new char[MAX_TABLE_SIZE_];
0671: }
0672: };
0673:
0674: // private data members ------------------------------------------------
0675:
    /**
     * target text begin offset. Each targetText has a valid contiguous region
     * to iterate and this data member is the offset to the first such
     * character in the region.
     */
    private int m_textBeginOffset_;
    /**
     * target text limit offset. Each targetText has a valid contiguous region
     * to iterate and this data member is the offset to 1 after the last such
     * character in the region.
     */
    private int m_textLimitOffset_;
    /**
     * Upon completion of a search, m_matchedIndex_ will store the starting
     * offset in m_text for the match. The value DONE is the default value.
     * If we are not at the start of the text or the end of the text and
     * m_matchedIndex_ is DONE, it means that we cannot find any more matches
     * in that particular direction.
     */
    private int m_matchedIndex_;
    /**
     * Current pattern to search for
     */
    private Pattern m_pattern_;
    /**
     * Collator whose rules are used to perform the search
     */
    private RuleBasedCollator m_collator_;
    /**
     * The collation element iterator for the text source.
     */
    private CollationElementIterator m_colEIter_;
    /**
     * Utility collation element iterator, used throughout the program for
     * temporary iteration.
     */
    private CollationElementIterator m_utilColEIter_;
    /**
     * The mask used on the collation elements to retrieve the valid strength
     * weight
     */
    private int m_ceMask_;
    /**
     * Buffer storing accents preceding a match during a canonical search
     */
    private StringBuffer m_canonicalPrefixAccents_;
    /**
     * Buffer storing accents following a match during a canonical search
     */
    private StringBuffer m_canonicalSuffixAccents_;
    /**
     * Flag to indicate if canonical search is to be done.
     * E.g looking for "a\u0300" in "a\u0318\u0300" will yield the match at 0.
     */
    private boolean m_isCanonicalMatch_;
    /**
     * Size of the shift tables
     */
    private static final int MAX_TABLE_SIZE_ = 257;
    /**
     * Initial array size
     */
    private static final int INITIAL_ARRAY_SIZE_ = 256;
    /**
     * Utility mask
     */
    private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
    /**
     * Utility mask
     */
    private static final int LAST_BYTE_MASK_ = 0xff;
    /**
     * Utility buffer for return values and temporary storage
     */
    private int m_utilBuffer_[] = new int[2];
0751:
0752: // private methods -------------------------------------------------------
0753:
0754: /**
0755: * Hash a collation element from its full size (32 bits) down into a
0756: * value that can be used as an index into the shift tables. Right
0757: * now we do a modulus by the size of the hash table.
0758: * @param ce collation element
0759: * @return collapsed version of the collation element
0760: */
0761: private static final int hash(int ce) {
0762: // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
0763: // well with the new collation where most of the latin 1 characters
0764: // are of the value xx000xxx. their hashes will most of the time be 0
0765: // to be discussed on the hash algo.
0766: return CollationElementIterator.primaryOrder(ce)
0767: % MAX_TABLE_SIZE_;
0768: }
0769:
0770: /**
0771: * Gets the fcd value for a character at the argument index.
0772: * This method takes into accounts of the supplementary characters.
0773: * Note this method changes the offset in the character iterator.
0774: * @param str UTF16 string where character for fcd retrieval resides
0775: * @param offset position of the character whose fcd is to be retrieved
0776: * @return fcd value
0777: */
0778: private static final char getFCD(CharacterIterator str, int offset) {
0779: str.setIndex(offset);
0780: char ch = str.current();
0781: char result = NormalizerImpl.getFCD16(ch);
0782:
0783: if ((result != 0) && (str.getEndIndex() != offset + 1)
0784: && UTF16.isLeadSurrogate(ch)) {
0785: ch = str.next();
0786: if (UTF16.isTrailSurrogate(ch)) {
0787: result = NormalizerImpl.getFCD16FromSurrogatePair(
0788: result, ch);
0789: } else {
0790: result = 0;
0791: }
0792: }
0793: return result;
0794: }
0795:
0796: /**
0797: * Gets the fcd value for a character at the argument index.
0798: * This method takes into accounts of the supplementary characters.
0799: * @param str UTF16 string where character for fcd retrieval resides
0800: * @param offset position of the character whose fcd is to be retrieved
0801: * @return fcd value
0802: */
0803: private static final char getFCD(String str, int offset) {
0804: char ch = str.charAt(offset);
0805: char result = NormalizerImpl.getFCD16(ch);
0806:
0807: if ((result != 0) && (str.length() != offset + 1)
0808: && UTF16.isLeadSurrogate(ch)) {
0809: ch = str.charAt(offset + 1);
0810: if (UTF16.isTrailSurrogate(ch)) {
0811: result = NormalizerImpl.getFCD16FromSurrogatePair(
0812: result, ch);
0813: } else {
0814: result = 0;
0815: }
0816: }
0817: return result;
0818: }
0819:
0820: /**
0821: * Getting the modified collation elements taking into account the collation
0822: * attributes
0823: * @param ce
0824: * @return the modified collation element
0825: */
    private final int getCE(int ce) {
        // note for tertiary we can't use the collator->tertiaryMask, that
        // is a preprocessed mask that takes into account case options. since
        // we are only concerned with exact matches, we don't need that.
        ce &= m_ceMask_; // drop weights below the collator's strength

        if (m_collator_.isAlternateHandlingShifted()) {
            // alternate handling here, since only the 16 most significant
            // digits is only used, we can safely do a compare without masking
            // if the ce is a variable, we mask and get only the primary values
            // no shifting to quartenary is required since all primary values
            // less than variabletop will need to be masked off anyway.
            if ((m_collator_.m_variableTopValue_ << 16) > ce) {
                if (m_collator_.getStrength() == Collator.QUATERNARY) {
                    // variable CE at quaternary strength: keep primary only
                    ce = CollationElementIterator.primaryOrder(ce);
                } else {
                    // variable CE below quaternary strength: ignore it
                    ce = CollationElementIterator.IGNORABLE;
                }
            }
        }

        return ce;
    }
0849:
0850: /**
0851: * Appends a int to a int array, increasing the size of the array when
0852: * we are out of space.
0853: * @param offset in array to append to
0854: * @param value to append
0855: * @param array to append to
0856: * @return the array appended to, this could be a new and bigger array
0857: */
0858: private static final int[] append(int offset, int value,
0859: int array[]) {
0860: if (offset >= array.length) {
0861: int temp[] = new int[offset + INITIAL_ARRAY_SIZE_];
0862: System.arraycopy(array, 0, temp, 0, array.length);
0863: array = temp;
0864: }
0865: array[offset] = value;
0866: return array;
0867: }
0868:
0869: /**
0870: * Initializing the ce table for a pattern. Stores non-ignorable collation
0871: * keys. Table size will be estimated by the size of the pattern text.
0872: * Table expansion will be perform as we go along. Adding 1 to ensure that
0873: * the table size definitely increases.
0874: * Internal method, status assumed to be a success.
0875: * @return total number of expansions
0876: */
0877: private final int initializePatternCETable() {
0878: m_utilColEIter_.setText(m_pattern_.targetText);
0879:
0880: int offset = 0;
0881: int result = 0;
0882: int ce = m_utilColEIter_.next();
0883:
0884: while (ce != CollationElementIterator.NULLORDER) {
0885: int newce = getCE(ce);
0886: if (newce != CollationElementIterator.IGNORABLE) {
0887: m_pattern_.m_CE_ = append(offset, newce,
0888: m_pattern_.m_CE_);
0889: offset++;
0890: }
0891: result += m_utilColEIter_.getMaxExpansion(ce) - 1;
0892: ce = m_utilColEIter_.next();
0893: }
0894:
0895: m_pattern_.m_CE_ = append(offset, 0, m_pattern_.m_CE_);
0896: m_pattern_.m_CELength_ = offset;
0897:
0898: return result;
0899: }
0900:
0901: /**
0902: * Initializes the pattern struct.
0903: * Internal method, status assumed to be success.
0904: * @return expansionsize the total expansion size of the pattern
0905: */
0906: private final int initializePattern() {
0907: m_pattern_.m_hasPrefixAccents_ = (getFCD(m_pattern_.targetText,
0908: 0) >> SECOND_LAST_BYTE_SHIFT_) != 0;
0909: m_pattern_.m_hasSuffixAccents_ = (getFCD(m_pattern_.targetText,
0910: m_pattern_.targetText.length() - 1) & LAST_BYTE_MASK_) != 0;
0911: // since intializePattern is an internal method status is a success.
0912: return initializePatternCETable();
0913: }
0914:
0915: /**
0916: * Initializing shift tables, with the default values.
0917: * If a corresponding default value is 0, the shift table is not set.
0918: * @param shift table for forwards shift
0919: * @param backshift table for backwards shift
0920: * @param cetable table containing pattern ce
0921: * @param cesize size of the pattern ces
0922: * @param expansionsize total size of the expansions
0923: * @param defaultforward the default forward value
0924: * @param defaultbackward the default backward value
0925: */
0926: private final void setShiftTable(char shift[], char backshift[],
0927: int cetable[], int cesize, int expansionsize,
0928: char defaultforward, char defaultbackward) {
0929: // estimate the value to shift. to do that we estimate the smallest
0930: // number of characters to give the relevant ces, ie approximately
0931: // the number of ces minus their expansion, since expansions can come
0932: // from a character.
0933: for (int count = 0; count < MAX_TABLE_SIZE_; count++) {
0934: shift[count] = defaultforward;
0935: }
0936: cesize--; // down to the last index
0937: for (int count = 0; count < cesize; count++) {
0938: // number of ces from right of array to the count
0939: int temp = defaultforward - count - 1;
0940: shift[hash(cetable[count])] = temp > 1 ? ((char) temp) : 1;
0941: }
0942: shift[hash(cetable[cesize])] = 1;
0943: // for ignorables we just shift by one. see test examples.
0944: shift[hash(0)] = 1;
0945:
0946: for (int count = 0; count < MAX_TABLE_SIZE_; count++) {
0947: backshift[count] = defaultbackward;
0948: }
0949: for (int count = cesize; count > 0; count--) {
0950: // the original value count does not seem to work
0951: backshift[hash(cetable[count])] = (char) (count > expansionsize ? count
0952: - expansionsize
0953: : 1);
0954: }
0955: backshift[hash(cetable[0])] = 1;
0956: backshift[hash(0)] = 1;
0957: }
0958:
0959: /**
0960: * <p>Building of the pattern collation element list and the Boyer Moore
0961: * StringSearch table.</p>
0962: * <p>The canonical match will only be performed after the default match
0963: * fails.</p>
0964: * <p>For both cases we need to remember the size of the composed and
0965: * decomposed versions of the string. Since the Boyer-Moore shift
0966: * calculations shifts by a number of characters in the text and tries to
0967: * match the pattern from that offset, the shift value can not be too large
0968: * in case we miss some characters. To choose a right shift size, we
0969: * estimate the NFC form of the and use its size as a shift guide. The NFC
0970: * form should be the small possible representation of the pattern. Anyways,
0971: * we'll err on the smaller shift size. Hence the calculation for
0972: * minlength. Canonical match will be performed slightly differently. We'll
0973: * split the pattern into 3 parts, the prefix accents (PA), the middle
0974: * string bounded by the first and last base character (MS), the ending
0975: * accents (EA). Matches will be done on MS first, and only when we match
0976: * MS then some processing will be required for the prefix and end accents
0977: * in order to determine if they match PA and EA. Hence the default shift
0978: * values for the canonical match will take the size of either end's accent
0979: * into consideration. Forwards search will take the end accents into
0980: * consideration for the default shift values and the backwards search will
0981: * take the prefix accents into consideration.</p>
0982: * <p>If pattern has no non-ignorable ce, we return a illegal argument
0983: * error.</p>
0984: */
0985: private final void initialize() {
0986: int expandlength = initializePattern();
0987: if (m_pattern_.m_CELength_ > 0) {
0988: char minlength = (char) (m_pattern_.m_CELength_ > expandlength ? m_pattern_.m_CELength_
0989: - expandlength
0990: : 1);
0991: m_pattern_.m_defaultShiftSize_ = minlength;
0992: setShiftTable(m_pattern_.m_shift_, m_pattern_.m_backShift_,
0993: m_pattern_.m_CE_, m_pattern_.m_CELength_,
0994: expandlength, minlength, minlength);
0995: } else {
0996: m_pattern_.m_defaultShiftSize_ = 0;
0997: }
0998: }
0999:
1000: /**
1001: * Determine whether the search text bounded by the offset start and end is
1002: * one or more whole units of text as determined by the breakiterator in
1003: * StringSearch.
1004: * @param start target text start offset
1005: * @param end target text end offset
1006: */
1007: private final boolean isBreakUnit(int start, int end) {
1008: if (breakIterator != null) {
1009: int startindex = breakIterator.first();
1010: int endindex = breakIterator.last();
1011:
1012: // out-of-range indexes are never boundary positions
1013: if (start < startindex || start > endindex
1014: || end < startindex || end > endindex) {
1015: return false;
1016: }
1017: // otherwise, we can use following() on the position before the
1018: // specified one and return true of the position we get back is the
1019: // one the user specified
1020: boolean result = (start == startindex || breakIterator
1021: .following(start - 1) == start)
1022: && (end == endindex || breakIterator
1023: .following(end - 1) == end);
1024: if (result) {
1025: // iterates the individual ces
1026: m_utilColEIter_.setText(new CharacterIteratorWrapper(
1027: targetText), start);
1028: for (int count = 0; count < m_pattern_.m_CELength_; count++) {
1029: int ce = getCE(m_utilColEIter_.next());
1030: if (ce == CollationElementIterator.IGNORABLE) {
1031: count--;
1032: continue;
1033: }
1034: if (ce != m_pattern_.m_CE_[count]) {
1035: return false;
1036: }
1037: }
1038: int nextce = m_utilColEIter_.next();
1039: while (m_utilColEIter_.getOffset() == end
1040: && getCE(nextce) == CollationElementIterator.IGNORABLE) {
1041: nextce = m_utilColEIter_.next();
1042: }
1043: if (nextce != CollationElementIterator.NULLORDER
1044: && m_utilColEIter_.getOffset() == end) {
1045: // extra collation elements at the end of the match
1046: return false;
1047: }
1048: }
1049: return result;
1050: }
1051: return true;
1052: }
1053:
1054: /**
1055: * Getting the next base character offset if current offset is an accent,
1056: * or the current offset if the current character contains a base character.
1057: * accents the following base character will be returned
1058: * @param text string
1059: * @param textoffset current offset
1060: * @param textlength length of text string
1061: * @return the next base character or the current offset
1062: * if the current character is contains a base character.
1063: */
1064: private final int getNextBaseOffset(CharacterIterator text,
1065: int textoffset) {
1066: if (textoffset < text.getEndIndex()) {
1067: while (text.getIndex() < text.getEndIndex()) {
1068: int result = textoffset;
1069: if ((getFCD(text, textoffset++) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1070: return result;
1071: }
1072: }
1073: return text.getEndIndex();
1074: }
1075: return textoffset;
1076: }
1077:
1078: /**
1079: * Gets the next base character offset depending on the string search
1080: * pattern data
1081: * @param textoffset one offset away from the last character
1082: * to search for.
1083: * @return start index of the next base character or the current offset
1084: * if the current character is contains a base character.
1085: */
1086: private final int getNextBaseOffset(int textoffset) {
1087: if (m_pattern_.m_hasSuffixAccents_
1088: && textoffset < m_textLimitOffset_) {
1089: targetText.setIndex(textoffset);
1090: targetText.previous();
1091: if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
1092: return getNextBaseOffset(targetText, textoffset);
1093: }
1094: }
1095: return textoffset;
1096: }
1097:
1098: /**
1099: * Shifting the collation element iterator position forward to prepare for
1100: * a following match. If the last character is a unsafe character, we'll
1101: * only shift by 1 to capture contractions, normalization etc.
1102: * Internal method, status assumed to be success.
1103: * @param textoffset start text position to do search
1104: * @param ce the text ce which failed the match.
1105: * @param patternceindex index of the ce within the pattern ce buffer which
1106: * failed the match
1107: * @return final offset
1108: */
1109: private int shiftForward(int textoffset, int ce, int patternceindex)
1110:
1111: {
1112: if (ce != CollationElementIterator.NULLORDER) {
1113: int shift = m_pattern_.m_shift_[hash(ce)];
1114: // this is to adjust for characters in the middle of the
1115: // substring for matching that failed.
1116: int adjust = m_pattern_.m_CELength_ - patternceindex;
1117: if (adjust > 1 && shift >= adjust) {
1118: shift -= adjust - 1;
1119: }
1120: textoffset += shift;
1121: } else {
1122: textoffset += m_pattern_.m_defaultShiftSize_;
1123: }
1124:
1125: textoffset = getNextBaseOffset(textoffset);
1126: // check for unsafe characters
1127: // * if it is the start or middle of a contraction: to be done after
1128: // a initial match is found
1129: // * thai or lao base consonant character: similar to contraction
1130: // * high surrogate character: similar to contraction
1131: // * next character is a accent: shift to the next base character
1132: return textoffset;
1133: }
1134:
1135: /**
1136: * Gets the offset to the next safe point in text.
1137: * ie. not the middle of a contraction, swappable characters or
1138: * supplementary characters.
1139: * @param textoffset offset in string
1140: * @param end offset in string
1141: * @return offset to the next safe character
1142: */
1143: private final int getNextSafeOffset(int textoffset, int end) {
1144: int result = textoffset; // first contraction character
1145: targetText.setIndex(result);
1146: while (result != end
1147: && m_collator_.isUnsafe(targetText.current())) {
1148: result++;
1149: targetText.setIndex(result);
1150: }
1151: return result;
1152: }
1153:
1154: /**
1155: * This checks for accents in the potential match started with a composite
1156: * character.
1157: * This is really painful... we have to check that composite character do
1158: * not have any extra accents. We have to normalize the potential match and
1159: * find the immediate decomposed character before the match.
1160: * The first composite character would have been taken care of by the fcd
1161: * checks in checkForwardExactMatch.
1162: * This is the slow path after the fcd of the first character and
1163: * the last character has been checked by checkForwardExactMatch and we
1164: * determine that the potential match has extra non-ignorable preceding
1165: * ces.
1166: * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
1167: * checkExtraMatchAccent should fail since there is a middle ring in
1168: * \u01FA Note here that accents checking are slow and cautioned in the API
1169: * docs.
1170: * Internal method, status assumed to be a success, caller should check
1171: * status before calling this method
1172: * @param start index of the potential unfriendly composite character
1173: * @param end index of the potential unfriendly composite character
1174: * @return true if there is non-ignorable accents before at the beginning
1175: * of the match, false otherwise.
1176: */
1177: private final boolean checkExtraMatchAccents(int start, int end) {
1178: boolean result = false;
1179: if (m_pattern_.m_hasPrefixAccents_) {
1180: targetText.setIndex(start);
1181:
1182: if (UTF16.isLeadSurrogate(targetText.next())) {
1183: if (!UTF16.isTrailSurrogate(targetText.next())) {
1184: targetText.previous();
1185: }
1186: }
1187: // we are only concerned with the first composite character
1188: String str = getString(targetText, start, end);
1189: if (Normalizer.quickCheck(str, Normalizer.NFD, 0) == Normalizer.NO) {
1190: int safeoffset = getNextSafeOffset(start, end);
1191: if (safeoffset != end) {
1192: safeoffset++;
1193: }
1194: String decomp = Normalizer.decompose(str.substring(0,
1195: safeoffset - start), false);
1196: m_utilColEIter_.setText(decomp);
1197: int firstce = m_pattern_.m_CE_[0];
1198: boolean ignorable = true;
1199: int ce = CollationElementIterator.IGNORABLE;
1200: int offset = 0;
1201: while (ce != firstce) {
1202: offset = m_utilColEIter_.getOffset();
1203: if (ce != firstce
1204: && ce != CollationElementIterator.IGNORABLE) {
1205: ignorable = false;
1206: }
1207: ce = m_utilColEIter_.next();
1208: }
1209: m_utilColEIter_.setExactOffset(offset); // back up 1 to the
1210: m_utilColEIter_.previous(); // right offset
1211: offset = m_utilColEIter_.getOffset();
1212: result = !ignorable
1213: && (UCharacter.getCombiningClass(UTF16.charAt(
1214: decomp, offset)) != 0);
1215: }
1216: }
1217:
1218: return result;
1219: }
1220:
1221: /**
1222: * Used by exact matches, checks if there are accents before the match.
1223: * This is really painful... we have to check that composite characters at
1224: * the start of the matches have to not have any extra accents.
1225: * We check the FCD of the character first, if it starts with an accent and
1226: * the first pattern ce does not match the first ce of the character, we
1227: * bail.
1228: * Otherwise we try normalizing the first composite
1229: * character and find the immediate decomposed character before the match to
1230: * see if it is an non-ignorable accent.
1231: * Now normalizing the first composite character is enough because we ensure
1232: * that when the match is passed in here with extra beginning ces, the
1233: * first or last ce that match has to occur within the first character.
1234: * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
1235: * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
1236: * Note here that accents checking are slow and cautioned in the API docs.
1237: * @param start offset
1238: * @param end offset
1239: * @return true if there are accents on either side of the match,
1240: * false otherwise
1241: */
1242: private final boolean hasAccentsBeforeMatch(int start, int end) {
1243: if (m_pattern_.m_hasPrefixAccents_) {
1244: // we have been iterating forwards previously
1245: boolean ignorable = true;
1246: int firstce = m_pattern_.m_CE_[0];
1247: m_colEIter_.setExactOffset(start);
1248: int ce = getCE(m_colEIter_.next());
1249: while (ce != firstce) {
1250: if (ce != CollationElementIterator.IGNORABLE) {
1251: ignorable = false;
1252: }
1253: ce = getCE(m_colEIter_.next());
1254: }
1255: if (!ignorable && m_colEIter_.isInBuffer()) {
1256: // within normalization buffer, discontiguous handled here
1257: return true;
1258: }
1259:
1260: // within text
1261: boolean accent = (getFCD(targetText, start) >> SECOND_LAST_BYTE_SHIFT_) != 0;
1262: if (!accent) {
1263: return checkExtraMatchAccents(start, end);
1264: }
1265: if (!ignorable) {
1266: return true;
1267: }
1268: if (start > m_textBeginOffset_) {
1269: targetText.setIndex(start);
1270: targetText.previous();
1271: if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
1272: m_colEIter_.setExactOffset(start);
1273: ce = m_colEIter_.previous();
1274: if (ce != CollationElementIterator.NULLORDER
1275: && ce != CollationElementIterator.IGNORABLE) {
1276: return true;
1277: }
1278: }
1279: }
1280: }
1281:
1282: return false;
1283: }
1284:
1285: /**
1286: * Used by exact matches, checks if there are accents bounding the match.
1287: * Note this is the initial boundary check. If the potential match
1288: * starts or ends with composite characters, the accents in those
1289: * characters will be determined later.
1290: * Not doing backwards iteration here, since discontiguos contraction for
1291: * backwards collation element iterator, use up too many characters.
1292: * E.g. looking for \u030A ring in \u01FA A ring above and acute,
1293: * should fail since there is a acute at the end of \u01FA
1294: * Note here that accents checking are slow and cautioned in the API docs.
1295: * @param start offset of match
1296: * @param end end offset of the match
1297: * @return true if there are accents on either side of the match,
1298: * false otherwise
1299: */
1300: private final boolean hasAccentsAfterMatch(int start, int end) {
1301: if (m_pattern_.m_hasSuffixAccents_) {
1302: targetText.setIndex(end);
1303: if (end > m_textBeginOffset_
1304: && UTF16.isTrailSurrogate(targetText.previous())) {
1305: if (targetText.getIndex() > m_textBeginOffset_
1306: && !UTF16
1307: .isLeadSurrogate(targetText.previous())) {
1308: targetText.next();
1309: }
1310: }
1311: if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
1312: int firstce = m_pattern_.m_CE_[0];
1313: m_colEIter_.setExactOffset(start);
1314: while (getCE(m_colEIter_.next()) != firstce) {
1315: }
1316: int count = 1;
1317: while (count < m_pattern_.m_CELength_) {
1318: if (getCE(m_colEIter_.next()) == CollationElementIterator.IGNORABLE) {
1319: count--;
1320: }
1321: count++;
1322: }
1323: int ce = getCE(m_colEIter_.next());
1324: if (ce != CollationElementIterator.NULLORDER
1325: && ce != CollationElementIterator.IGNORABLE) {
1326: if (m_colEIter_.getOffset() <= end) {
1327: return true;
1328: }
1329: if ((getFCD(targetText, end) >> SECOND_LAST_BYTE_SHIFT_) != 0) {
1330: return true;
1331: }
1332: }
1333: }
1334: }
1335: return false;
1336: }
1337:
1338: /**
1339: * Checks if the offset runs out of the text string range
1340: * @param textstart offset of the first character in the range
1341: * @param textlimit limit offset of the text string range
1342: * @param offset to test
1343: * @return true if offset is out of bounds, false otherwise
1344: */
1345: private static final boolean isOutOfBounds(int textstart,
1346: int textlimit, int offset) {
1347: return offset < textstart || offset > textlimit;
1348: }
1349:
1350: /**
1351: * Checks for identical match
1352: * @param strsrch string search data
1353: * @param start offset of possible match
1354: * @param end offset of possible match
1355: * @return true if identical match is found
1356: */
1357: private final boolean checkIdentical(int start, int end) {
1358: if (m_collator_.getStrength() != Collator.IDENTICAL) {
1359: return true;
1360: }
1361:
1362: String textstr = getString(targetText, start, end - start);
1363: if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) {
1364: textstr = Normalizer.decompose(textstr, false);
1365: }
1366: String patternstr = m_pattern_.targetText;
1367: if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) {
1368: patternstr = Normalizer.decompose(patternstr, false);
1369: }
1370: return textstr.equals(patternstr);
1371: }
1372:
1373: /**
1374: * Checks to see if the match is repeated
1375: * @param start new match start index
1376: * @param limit new match limit index
1377: * @return true if the the match is repeated, false otherwise
1378: */
1379: private final boolean checkRepeatedMatch(int start, int limit) {
1380: if (m_matchedIndex_ == DONE) {
1381: return false;
1382: }
1383: int end = limit - 1; // last character in the match
1384: int lastmatchend = m_matchedIndex_ + matchLength - 1;
1385: if (!isOverlapping()) {
1386: return (start >= m_matchedIndex_ && start <= lastmatchend)
1387: || (end >= m_matchedIndex_ && end <= lastmatchend)
1388: || (start <= m_matchedIndex_ && end >= lastmatchend);
1389:
1390: }
1391: return start <= m_matchedIndex_ && end >= lastmatchend;
1392: }
1393:
1394: /**
1395: * Checks match for contraction.
1396: * If the match ends with a partial contraction we fail.
1397: * If the match starts too far off (because of backwards iteration) we try
1398: * to chip off the extra characters depending on whether a breakiterator
1399: * has been used.
1400: * Temporary utility buffer used to return modified start and end.
1401: * @param start offset of potential match, to be modified if necessary
1402: * @param end offset of potential match, to be modified if necessary
1403: * @return true if match passes the contraction test, false otherwise.
1404: */
1405: private final boolean checkNextExactContractionMatch(int start,
1406: int end) {
1407: // This part checks if either ends of the match contains potential
1408: // contraction. If so we'll have to iterate through them
1409: char endchar = 0;
1410: if (end < m_textLimitOffset_) {
1411: targetText.setIndex(end);
1412: endchar = targetText.current();
1413: }
1414: char poststartchar = 0;
1415: if (start + 1 < m_textLimitOffset_) {
1416: targetText.setIndex(start + 1);
1417: poststartchar = targetText.current();
1418: }
1419: if (m_collator_.isUnsafe(endchar)
1420: || m_collator_.isUnsafe(poststartchar)) {
1421: // expansion prefix, what's left to iterate
1422: int bufferedCEOffset = m_colEIter_.m_CEBufferOffset_;
1423: boolean hasBufferedCE = bufferedCEOffset > 0;
1424: m_colEIter_.setExactOffset(start);
1425: int temp = start;
1426: while (bufferedCEOffset > 0) {
1427: // getting rid of the redundant ce, caused by setOffset.
1428: // since backward contraction/expansion may have extra ces if
1429: // we are in the normalization buffer, hasAccentsBeforeMatch
1430: // would have taken care of it.
1431: // E.g. the character \u01FA will have an expansion of 3, but
1432: // if we are only looking for acute and ring \u030A and \u0301,
1433: // we'll have to skip the first ce in the expansion buffer.
1434: m_colEIter_.next();
1435: if (m_colEIter_.getOffset() != temp) {
1436: start = temp;
1437: temp = m_colEIter_.getOffset();
1438: }
1439: bufferedCEOffset--;
1440: }
1441:
1442: int count = 0;
1443: while (count < m_pattern_.m_CELength_) {
1444: int ce = getCE(m_colEIter_.next());
1445: if (ce == CollationElementIterator.IGNORABLE) {
1446: continue;
1447: }
1448: if (hasBufferedCE && count == 0
1449: && m_colEIter_.getOffset() != temp) {
1450: start = temp;
1451: temp = m_colEIter_.getOffset();
1452: }
1453: if (ce != m_pattern_.m_CE_[count]) {
1454: end++;
1455: end = getNextBaseOffset(end);
1456: m_utilBuffer_[0] = start;
1457: m_utilBuffer_[1] = end;
1458: return false;
1459: }
1460: count++;
1461: }
1462: }
1463: m_utilBuffer_[0] = start;
1464: m_utilBuffer_[1] = end;
1465: return true;
1466: }
1467:
1468: /**
1469: * Checks and sets the match information if found.
1470: * Checks
1471: * <ul>
1472: * <li> the potential match does not repeat the previous match
1473: * <li> boundaries are correct
1474: * <li> exact matches has no extra accents
1475: * <li> identical matchesb
1476: * <li> potential match does not end in the middle of a contraction
1477: * </ul>
1478: * Otherwise the offset will be shifted to the next character.
1479: * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
1480: * more fitting result value.
1481: * Uses the temporary utility buffer for storing the modified textoffset.
1482: * @param textoffset offset in the collation element text.
1483: * @return true if the match is valid, false otherwise
1484: */
1485: private final boolean checkNextExactMatch(int textoffset) {
1486: int start = m_colEIter_.getOffset();
1487: if (!checkNextExactContractionMatch(start, textoffset)) {
1488: // returns the modified textoffset
1489: m_utilBuffer_[0] = m_utilBuffer_[1];
1490: return false;
1491: }
1492:
1493: start = m_utilBuffer_[0];
1494: textoffset = m_utilBuffer_[1];
1495: // this totally matches, however we need to check if it is repeating
1496: if (!isBreakUnit(start, textoffset)
1497: || checkRepeatedMatch(start, textoffset)
1498: || hasAccentsBeforeMatch(start, textoffset)
1499: || !checkIdentical(start, textoffset)
1500: || hasAccentsAfterMatch(start, textoffset)) {
1501: textoffset++;
1502: textoffset = getNextBaseOffset(textoffset);
1503: m_utilBuffer_[0] = textoffset;
1504: return false;
1505: }
1506:
1507: // totally match, we will get rid of the ending ignorables.
1508: m_matchedIndex_ = start;
1509: matchLength = textoffset - start;
1510: return true;
1511: }
1512:
1513: /**
1514: * Getting the previous base character offset, or the current offset if the
1515: * current character is a base character
1516: * @param text the source text to work on
1517: * @param textoffset one offset after the current character
1518: * @return the offset of the next character after the base character or the
1519: * first composed character with accents
1520: */
1521: private final int getPreviousBaseOffset(CharacterIterator text,
1522: int textoffset) {
1523: if (textoffset > m_textBeginOffset_) {
1524: while (true) {
1525: int result = textoffset;
1526: text.setIndex(result);
1527: if (UTF16.isTrailSurrogate(text.previous())) {
1528: if (text.getIndex() != text.getBeginIndex()
1529: && !UTF16.isLeadSurrogate(text.previous())) {
1530: text.next();
1531: }
1532: }
1533: textoffset = text.getIndex();
1534: char fcd = getFCD(text, textoffset);
1535: if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1536: if ((fcd & LAST_BYTE_MASK_) != 0) {
1537: return textoffset;
1538: }
1539: return result;
1540: }
1541: if (textoffset == m_textBeginOffset_) {
1542: return m_textBeginOffset_;
1543: }
1544: }
1545: }
1546: return textoffset;
1547: }
1548:
1549: /**
1550: * Getting the indexes of the accents that are not blocked in the argument
1551: * accent array
1552: * @param accents accents in nfd.
1553: * @param accentsindex array to store the indexes of accents in accents that
1554: * are not blocked
1555: * @return the length of populated accentsindex
1556: */
1557: private int getUnblockedAccentIndex(StringBuffer accents,
1558: int accentsindex[]) {
1559: int index = 0;
1560: int length = accents.length();
1561: int cclass = 0;
1562: int result = 0;
1563: while (index < length) {
1564: int codepoint = UTF16.charAt(accents, index);
1565: int tempclass = UCharacter.getCombiningClass(codepoint);
1566: if (tempclass != cclass) {
1567: cclass = tempclass;
1568: accentsindex[result] = index;
1569: result++;
1570: }
1571: if (UCharacter.isSupplementary(codepoint)) {
1572: index += 2;
1573: } else {
1574: index++;
1575: }
1576: }
1577: accentsindex[result] = length;
1578: return result;
1579: }
1580:
1581: /**
1582: * Appends 3 StringBuffer/CharacterIterator together into a destination
1583: * string buffer.
1584: * @param source1 string buffer
1585: * @param source2 character iterator
1586: * @param start2 start of the character iterator to merge
1587: * @param end2 end of the character iterator to merge
1588: * @param source3 string buffer
1589: * @return appended string buffer
1590: */
1591: private static final StringBuffer merge(StringBuffer source1,
1592: CharacterIterator source2, int start2, int end2,
1593: StringBuffer source3) {
1594: StringBuffer result = new StringBuffer();
1595: if (source1 != null && source1.length() != 0) {
1596: // jdk 1.3.1 does not have append(StringBuffer) yet
1597: if (com.ibm.icu.impl.ICUDebug.isJDK14OrHigher) {
1598: result.append(source1);
1599: } else {
1600: result.append(source1.toString());
1601: }
1602: }
1603: source2.setIndex(start2);
1604: while (source2.getIndex() < end2) {
1605: result.append(source2.current());
1606: source2.next();
1607: }
1608: if (source3 != null && source3.length() != 0) {
1609: // jdk 1.3.1 does not have append(StringBuffer) yet
1610: if (com.ibm.icu.impl.ICUDebug.isJDK14OrHigher) {
1611: result.append(source3);
1612: } else {
1613: result.append(source3.toString());
1614: }
1615: }
1616: return result;
1617: }
1618:
1619: /**
1620: * Running through a collation element iterator to see if the contents
1621: * matches pattern in string search data
1622: * @param coleiter collation element iterator to test
1623: * @return true if a match if found, false otherwise
1624: */
1625: private final boolean checkCollationMatch(
1626: CollationElementIterator coleiter) {
1627: int patternceindex = m_pattern_.m_CELength_;
1628: int offset = 0;
1629: while (patternceindex > 0) {
1630: int ce = getCE(coleiter.next());
1631: if (ce == CollationElementIterator.IGNORABLE) {
1632: continue;
1633: }
1634: if (ce != m_pattern_.m_CE_[offset]) {
1635: return false;
1636: }
1637: offset++;
1638: patternceindex--;
1639: }
1640: return true;
1641: }
1642:
1643: /**
1644: * Rearranges the front accents to try matching.
1645: * Prefix accents in the text will be grouped according to their combining
1646: * class and the groups will be mixed and matched to try find the perfect
1647: * match with the pattern.
1648: * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1649: * step 1: split "\u030A\u0301" into 6 other type of potential accent
1650: * substrings "\u030A", "\u0301", "\u0325", "\u030A\u0301",
1651: * "\u030A\u0325", "\u0301\u0325".
1652: * step 2: check if any of the generated substrings matches the pattern.
1653: * Internal method, status is assumed to be success, caller has to check
1654: * status before calling this method.
1655: * @param start first offset of the accents to start searching
1656: * @param end start of the last accent set
1657: * @return DONE if a match is not found, otherwise return the starting
1658: * offset of the match. Note this start includes all preceding
1659: * accents.
1660: */
1661: private int doNextCanonicalPrefixMatch(int start, int end) {
1662: if ((getFCD(targetText, start) & LAST_BYTE_MASK_) == 0) {
1663: // die... failed at a base character
1664: return DONE;
1665: }
1666:
1667: start = targetText.getIndex(); // index changed by fcd
1668: int offset = getNextBaseOffset(targetText, start);
1669: start = getPreviousBaseOffset(start);
1670:
1671: StringBuffer accents = new StringBuffer();
1672: String accentstr = getString(targetText, start, offset - start);
1673: // normalizing the offensive string
1674: if (Normalizer.quickCheck(accentstr, Normalizer.NFD, 0) == Normalizer.NO) {
1675: accentstr = Normalizer.decompose(accentstr, false);
1676: }
1677: accents.append(accentstr);
1678:
1679: int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
1680: int accentsize = getUnblockedAccentIndex(accents, accentsindex);
1681: int count = (2 << (accentsize - 1)) - 1;
1682: while (count > 0) {
1683: // copy the base characters
1684: m_canonicalPrefixAccents_.delete(0,
1685: m_canonicalPrefixAccents_.length());
1686: int k = 0;
1687: for (; k < accentsindex[0]; k++) {
1688: m_canonicalPrefixAccents_.append(accents.charAt(k));
1689: }
1690: // forming all possible canonical rearrangement by dropping
1691: // sets of accents
1692: for (int i = 0; i <= accentsize - 1; i++) {
1693: int mask = 1 << (accentsize - i - 1);
1694: if ((count & mask) != 0) {
1695: for (int j = accentsindex[i]; j < accentsindex[i + 1]; j++) {
1696: m_canonicalPrefixAccents_.append(accents
1697: .charAt(j));
1698: }
1699: }
1700: }
1701: StringBuffer match = merge(m_canonicalPrefixAccents_,
1702: targetText, offset, end, m_canonicalSuffixAccents_);
1703:
1704: // if status is a failure, ucol_setText does nothing.
1705: // run the collator iterator through this match
1706: m_utilColEIter_.setText(match.toString());
1707: if (checkCollationMatch(m_utilColEIter_)) {
1708: return start;
1709: }
1710: count--;
1711: }
1712: return DONE;
1713: }
1714:
1715: /**
1716: * Gets the offset to the safe point in text before textoffset.
1717: * ie. not the middle of a contraction, swappable characters or
1718: * supplementary characters.
1719: * @param start offset in string
1720: * @param textoffset offset in string
1721: * @return offset to the previous safe character
1722: */
1723: private final int getPreviousSafeOffset(int start, int textoffset) {
1724: int result = textoffset; // first contraction character
1725: targetText.setIndex(textoffset);
1726: while (result >= start
1727: && m_collator_.isUnsafe(targetText.previous())) {
1728: result = targetText.getIndex();
1729: }
1730: if (result != start) {
1731: // the first contraction character is consider unsafe here
1732: result = targetText.getIndex(); // originally result --;
1733: }
1734: return result;
1735: }
1736:
1737: /**
* Takes the rearranged end accents and tries matching. If match failed at
* a separate preceding set of accents (separated from the rearranged ones
* by at least a base character) then we rearrange the preceding accents
* and try matching again.
1742: * We allow skipping of the ends of the accent set if the ces do not match.
1743: * However if the failure is found before the accent set, it fails.
1744: * Internal method, status assumed to be success, caller has to check
1745: * status before calling this method.
1746: * @param textoffset of the start of the rearranged accent
1747: * @return DONE if a match is not found, otherwise return the starting
1748: * offset of the match. Note this start includes all preceding
1749: * accents.
1750: */
private int doNextCanonicalSuffixMatch(int textoffset) {
    int safelength = 0;
    StringBuffer safetext;
    int safeoffset = m_textBeginOffset_;

    // If the rearranged suffix accents start with an unsafe character,
    // build a merged "safe" buffer covering [safeoffset, textoffset)
    // plus the rearranged accents, so contractions are not split.
    if (textoffset != m_textBeginOffset_
        && m_canonicalSuffixAccents_.length() > 0
        && m_collator_.isUnsafe(m_canonicalSuffixAccents_
                                        .charAt(0))) {
        safeoffset = getPreviousSafeOffset(m_textBeginOffset_,
                                            textoffset);
        safelength = textoffset - safeoffset;
        safetext = merge(null, targetText, safeoffset, textoffset,
                            m_canonicalSuffixAccents_);
    } else {
        safetext = m_canonicalSuffixAccents_;
    }

    // if status is a failure, ucol_setText does nothing
    CollationElementIterator coleiter = m_utilColEIter_;
    coleiter.setText(safetext.toString());
    // status checked in loop below

    // match pattern CEs backwards, starting from the last pattern CE
    int ceindex = m_pattern_.m_CELength_ - 1;
    boolean isSafe = true; // indication flag for position in safe zone

    while (ceindex >= 0) {
        int textce = coleiter.previous();
        if (textce == CollationElementIterator.NULLORDER) {
            // check if we have passed the safe buffer; if so, fall back
            // to the main text iterator positioned at safeoffset
            if (coleiter == m_colEIter_) {
                return DONE;
            }
            coleiter = m_colEIter_;
            if (safetext != m_canonicalSuffixAccents_) {
                safetext.delete(0, safetext.length());
            }
            coleiter.setExactOffset(safeoffset);
            // status checked at the start of the loop
            isSafe = false;
            continue;
        }
        textce = getCE(textce);
        if (textce != CollationElementIterator.IGNORABLE
            && textce != m_pattern_.m_CE_[ceindex]) {
            // do the beginning stuff
            int failedoffset = coleiter.getOffset();
            if (isSafe && failedoffset >= safelength) {
                // alas... no hope. failed at rearranged accent set
                return DONE;
            } else {
                if (isSafe) {
                    // convert offset in the safe buffer back to an
                    // offset in the real text
                    failedoffset += safeoffset;
                }

                // try rearranging the front accents
                int result = doNextCanonicalPrefixMatch(
                        failedoffset, textoffset);
                if (result != DONE) {
                    // if status is a failure, ucol_setOffset does nothing
                    m_colEIter_.setExactOffset(result);
                }
                return result;
            }
        }
        if (textce == m_pattern_.m_CE_[ceindex]) {
            ceindex--;
        }
    }
    // set offset here
    if (isSafe) {
        int result = coleiter.getOffset();
        // sets the text iterator with the correct expansion and offset
        int leftoverces = coleiter.m_CEBufferOffset_;
        if (result >= safelength) {
            // match begins inside the rearranged accents; report the
            // original text offset instead of a safe-buffer offset
            result = textoffset;
        } else {
            result += safeoffset;
        }
        m_colEIter_.setExactOffset(result);
        m_colEIter_.m_CEBufferOffset_ = leftoverces;
        return result;
    }

    return coleiter.getOffset();
}
1837:
1838: /**
1839: * Trying out the substring and sees if it can be a canonical match.
1840: * This will try normalizing the end accents and arranging them into
1841: * canonical equivalents and check their corresponding ces with the pattern
1842: * ce.
1843: * Suffix accents in the text will be grouped according to their combining
1844: * class and the groups will be mixed and matched to try find the perfect
1845: * match with the pattern.
1846: * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1847: * step 1: split "\u030A\u0301" into 6 other type of potential accent
1848: * substrings
1849: * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1850: * "\u0301\u0325".
1851: * step 2: check if any of the generated substrings matches the pattern.
1852: * @param textoffset end offset in the collation element text that ends with
1853: * the accents to be rearranged
1854: * @return true if the match is valid, false otherwise
1855: */
private boolean doNextCanonicalMatch(int textoffset) {
    int offset = m_colEIter_.getOffset();
    targetText.setIndex(textoffset);
    // back up over a surrogate pair so we never inspect a lone trail
    // surrogate's FCD value
    if (UTF16.isTrailSurrogate(targetText.previous())
        && targetText.getIndex() > m_textBeginOffset_) {
        if (!UTF16.isLeadSurrogate(targetText.previous())) {
            targetText.next();
        }
    }
    if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
        // match ends on a base character: only prefix-accent
        // rearrangement can still rescue the match
        if (m_pattern_.m_hasPrefixAccents_) {
            offset = doNextCanonicalPrefixMatch(offset, textoffset);
            if (offset != DONE) {
                m_colEIter_.setExactOffset(offset);
                return true;
            }
        }
        return false;
    }

    if (!m_pattern_.m_hasSuffixAccents_) {
        return false;
    }

    StringBuffer accents = new StringBuffer();
    // offset to the last base character in substring to search
    int baseoffset = getPreviousBaseOffset(targetText, textoffset);
    // normalizing the offensive string
    String accentstr = getString(targetText, baseoffset, textoffset
                                                            - baseoffset);
    if (Normalizer.quickCheck(accentstr, Normalizer.NFD, 0) == Normalizer.NO) {
        accentstr = Normalizer.decompose(accentstr, false);
    }
    accents.append(accentstr);
    // status checked in loop below

    int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
    int size = getUnblockedAccentIndex(accents, accentsindex);

    // 2 power n - 1 plus the full set of accents
    int count = (2 << (size - 1)) - 1;
    while (count > 0) {
        m_canonicalSuffixAccents_.delete(0,
                m_canonicalSuffixAccents_.length());
        // copy the base characters
        for (int k = 0; k < accentsindex[0]; k++) {
            m_canonicalSuffixAccents_.append(accents.charAt(k));
        }
        // forming all possible canonical rearrangement by dropping
        // sets of accents; bit i of count selects accent group i
        for (int i = 0; i <= size - 1; i++) {
            int mask = 1 << (size - i - 1);
            if ((count & mask) != 0) {
                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j++) {
                    m_canonicalSuffixAccents_.append(accents
                            .charAt(j));
                }
            }
        }
        // try this suffix-accent combination against the pattern
        offset = doNextCanonicalSuffixMatch(baseoffset);
        if (offset != DONE) {
            return true; // match found
        }
        count--;
    }
    return false;
}
1923:
1924: /**
* Gets the previous base character offset depending on the string search
* pattern data
* @param textoffset current offset, current character
* @return the offset of the previous base character, or textoffset itself
*         when no adjustment is needed (no prefix accents in the pattern,
*         or the character at textoffset is already a base character)
1931: */
1932: private final int getPreviousBaseOffset(int textoffset) {
1933: if (m_pattern_.m_hasPrefixAccents_
1934: && textoffset > m_textBeginOffset_) {
1935: int offset = textoffset;
1936: if ((getFCD(targetText, offset) >> SECOND_LAST_BYTE_SHIFT_) != 0) {
1937: return getPreviousBaseOffset(targetText, textoffset);
1938: }
1939: }
1940: return textoffset;
1941: }
1942:
1943: /**
1944: * Checks match for contraction.
1945: * If the match ends with a partial contraction we fail.
1946: * If the match starts too far off (because of backwards iteration) we try
1947: * to chip off the extra characters.
1948: * Uses the temporary util buffer for return values of the modified start
1949: * and end.
1950: * @param start offset of potential match, to be modified if necessary
1951: * @param end offset of potential match, to be modified if necessary
1952: * @return true if match passes the contraction test, false otherwise.
1953: */
private boolean checkNextCanonicalContractionMatch(int start,
                                                    int end) {
    // This part checks if either ends of the match contains potential
    // contraction. If so we'll have to iterate through them
    char schar = 0;
    char echar = 0;
    if (end < m_textLimitOffset_) {
        targetText.setIndex(end);
        echar = targetText.current();
    }
    if (start < m_textLimitOffset_) {
        targetText.setIndex(start + 1);
        schar = targetText.current();
    }
    if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
        // leftover CEs in the iterator's expansion buffer from the
        // previous positioning
        int expansion = m_colEIter_.m_CEBufferOffset_;
        boolean hasExpansion = expansion > 0;
        m_colEIter_.setExactOffset(start);
        int temp = start;
        while (expansion > 0) {
            // getting rid of the redundant ce, caused by setOffset.
            // since backward contraction/expansion may have extra ces if
            // we are in the normalization buffer, hasAccentsBeforeMatch
            // would have taken care of it.
            // E.g. the character \u01FA will have an expansion of 3, but
            // if we are only looking for acute and ring \u030A and \u0301,
            // we'll have to skip the first ce in the expansion buffer.
            m_colEIter_.next();
            if (m_colEIter_.getOffset() != temp) {
                // iterator advanced past the expansion source; track the
                // tightened match start
                start = temp;
                temp = m_colEIter_.getOffset();
            }
            expansion--;
        }

        // re-verify the whole pattern CE sequence from the adjusted start
        int count = 0;
        while (count < m_pattern_.m_CELength_) {
            int ce = getCE(m_colEIter_.next());
            // status checked below, note that if status is a failure
            // ucol_next returns UCOL_NULLORDER
            if (ce == CollationElementIterator.IGNORABLE) {
                continue;
            }
            if (hasExpansion && count == 0
                && m_colEIter_.getOffset() != temp) {
                start = temp;
                temp = m_colEIter_.getOffset();
            }

            if (count == 0 && ce != m_pattern_.m_CE_[0]) {
                // accents may have extra starting ces, this occurs when a
                // pure accent pattern is matched without rearrangement
                // text \u0325\u0300 and looking for \u0300
                int expected = m_pattern_.m_CE_[0];
                if ((getFCD(targetText, start) & LAST_BYTE_MASK_) != 0) {
                    // skip over the leading accent CEs until the first
                    // expected pattern CE (or the end of the match)
                    ce = getCE(m_colEIter_.next());
                    while (ce != expected
                            && ce != CollationElementIterator.NULLORDER
                            && m_colEIter_.getOffset() <= end) {
                        ce = getCE(m_colEIter_.next());
                    }
                }
            }
            if (ce != m_pattern_.m_CE_[count]) {
                // mismatch: push the candidate end forward to the next
                // base character and report failure via m_utilBuffer_
                end++;
                end = getNextBaseOffset(end);
                m_utilBuffer_[0] = start;
                m_utilBuffer_[1] = end;
                return false;
            }
            count++;
        }
    }
    // success: hand the (possibly adjusted) bounds back to the caller
    m_utilBuffer_[0] = start;
    m_utilBuffer_[1] = end;
    return true;
}
2031:
2032: /**
2033: * Checks and sets the match information if found.
2034: * Checks
2035: * <ul>
2036: * <li> the potential match does not repeat the previous match
2037: * <li> boundaries are correct
2038: * <li> potential match does not end in the middle of a contraction
2039: * <li> identical matches
2040: * </ul>
2041: * Otherwise the offset will be shifted to the next character.
2042: * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
2043: * more fitting result value.
2044: * Uses the temporary utility buffer for storing the modified textoffset.
2045: * @param textoffset offset in the collation element text.
2046: * @return true if the match is valid, false otherwise
2047: */
private boolean checkNextCanonicalMatch(int textoffset) {
    // to ensure that the start and ends are not composite characters
    // if we have a canonical accent match
    if ((m_pattern_.m_hasSuffixAccents_ && m_canonicalSuffixAccents_
            .length() != 0)
        || (m_pattern_.m_hasPrefixAccents_ && m_canonicalPrefixAccents_
                .length() != 0)) {
        // accents were rearranged; the match is accepted as-is and the
        // start is pulled back over any preceding accents
        m_matchedIndex_ = getPreviousBaseOffset(m_colEIter_
                .getOffset());
        matchLength = textoffset - m_matchedIndex_;
        return true;
    }

    int start = m_colEIter_.getOffset();
    if (!checkNextCanonicalContractionMatch(start, textoffset)) {
        // return the modified textoffset
        // (contraction check wrote the new end into m_utilBuffer_[1])
        m_utilBuffer_[0] = m_utilBuffer_[1];
        return false;
    }
    start = m_utilBuffer_[0];
    textoffset = m_utilBuffer_[1];
    start = getPreviousBaseOffset(start);
    // this totally matches, however we need to check if it is repeating
    if (checkRepeatedMatch(start, textoffset)
        || !isBreakUnit(start, textoffset)
        || !checkIdentical(start, textoffset)) {
        // rejected: resume the search from the next base character and
        // pass that offset back through the utility buffer
        textoffset++;
        textoffset = getNextBaseOffset(targetText, textoffset);
        m_utilBuffer_[0] = textoffset;
        return false;
    }

    m_matchedIndex_ = start;
    matchLength = textoffset - start;
    return true;
}
2084:
2085: /**
2086: * Shifting the collation element iterator position forward to prepare for
* a preceding match. If the first character is an unsafe character, we'll
2088: * only shift by 1 to capture contractions, normalization etc.
2089: * @param textoffset start text position to do search
2090: * @param ce the text ce which failed the match.
2091: * @param patternceindex index of the ce within the pattern ce buffer which
2092: * failed the match
2093: * @return final offset
2094: */
2095: private int reverseShift(int textoffset, int ce, int patternceindex) {
2096: if (isOverlapping()) {
2097: if (textoffset != m_textLimitOffset_) {
2098: textoffset--;
2099: } else {
2100: textoffset -= m_pattern_.m_defaultShiftSize_;
2101: }
2102: } else {
2103: if (ce != CollationElementIterator.NULLORDER) {
2104: int shift = m_pattern_.m_backShift_[hash(ce)];
2105:
2106: // this is to adjust for characters in the middle of the substring
2107: // for matching that failed.
2108: int adjust = patternceindex;
2109: if (adjust > 1 && shift > adjust) {
2110: shift -= adjust - 1;
2111: }
2112: textoffset -= shift;
2113: } else {
2114: textoffset -= m_pattern_.m_defaultShiftSize_;
2115: }
2116: }
2117:
2118: textoffset = getPreviousBaseOffset(textoffset);
2119: return textoffset;
2120: }
2121:
2122: /**
2123: * Checks match for contraction.
2124: * If the match starts with a partial contraction we fail.
2125: * Uses the temporary utility buffer to return the modified start and end.
2126: * @param start offset of potential match, to be modified if necessary
2127: * @param end offset of potential match, to be modified if necessary
2128: * @return true if match passes the contraction test, false otherwise.
2129: */
private boolean checkPreviousExactContractionMatch(int start,
                                                    int end) {
    // This part checks if either ends of the match contains potential
    // contraction. If so we'll have to iterate through them
    char echar = 0;
    if (end < m_textLimitOffset_) {
        targetText.setIndex(end);
        echar = targetText.current();
    }
    char schar = 0;
    if (start + 1 < m_textLimitOffset_) {
        targetText.setIndex(start + 1);
        schar = targetText.current();
    }
    if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
        // expansion suffix, what's left to iterate
        int expansion = m_colEIter_.m_CEBufferSize_
                                        - m_colEIter_.m_CEBufferOffset_;
        boolean hasExpansion = expansion > 0;
        m_colEIter_.setExactOffset(end);
        int temp = end;
        while (expansion > 0) {
            // getting rid of the redundant ce
            // since forward contraction/expansion may have extra ces
            // if we are in the normalization buffer, hasAccentsBeforeMatch
            // would have taken care of it.
            // E.g. the character \u01FA will have an expansion of 3, but if
            // we are only looking for A ring A\u030A, we'll have to skip the
            // last ce in the expansion buffer
            m_colEIter_.previous();
            if (m_colEIter_.getOffset() != temp) {
                end = temp;
                temp = m_colEIter_.getOffset();
            }
            expansion--;
        }

        // re-verify the whole pattern CE sequence backwards from end
        int count = m_pattern_.m_CELength_;
        while (count > 0) {
            int ce = getCE(m_colEIter_.previous());
            // status checked below, note that if status is a failure
            // ucol_previous returns UCOL_NULLORDER
            if (ce == CollationElementIterator.IGNORABLE) {
                continue;
            }
            // NOTE(review): 'count == 0' can never hold inside this
            // 'while (count > 0)' loop (count descends from
            // m_CELength_), so this end-adjustment branch is
            // unreachable. The forward variant uses an ascending
            // counter where the analogous 'count == 0' check is live —
            // this looks like a copy-over; confirm intent upstream.
            if (hasExpansion && count == 0
                && m_colEIter_.getOffset() != temp) {
                end = temp;
                temp = m_colEIter_.getOffset();
            }
            if (ce != m_pattern_.m_CE_[count - 1]) {
                // mismatch: retreat the candidate start to the previous
                // base character and report failure via m_utilBuffer_
                start--;
                start = getPreviousBaseOffset(targetText, start);
                m_utilBuffer_[0] = start;
                m_utilBuffer_[1] = end;
                return false;
            }
            count--;
        }
    }
    // success: hand the (possibly adjusted) bounds back to the caller
    m_utilBuffer_[0] = start;
    m_utilBuffer_[1] = end;
    return true;
}
2194:
2195: /**
2196: * Checks and sets the match information if found.
2197: * Checks
2198: * <ul>
2199: * <li> the current match does not repeat the last match
2200: * <li> boundaries are correct
2201: * <li> exact matches has no extra accents
2202: * <li> identical matches
2203: * </ul>
2204: * Otherwise the offset will be shifted to the preceding character.
2205: * Uses the temporary utility buffer to store the modified textoffset.
2206: * @param textoffset offset in the collation element text. the returned value
2207: * will be the truncated start offset of the match or the new start
2208: * search offset.
2209: * @return true if the match is valid, false otherwise
2210: */
2211: private final boolean checkPreviousExactMatch(int textoffset) {
2212: // to ensure that the start and ends are not composite characters
2213: int end = m_colEIter_.getOffset();
2214: if (!checkPreviousExactContractionMatch(textoffset, end)) {
2215: return false;
2216: }
2217: textoffset = m_utilBuffer_[0];
2218: end = m_utilBuffer_[1];
2219:
2220: // this totally matches, however we need to check if it is repeating
2221: // the old match
2222: if (checkRepeatedMatch(textoffset, end)
2223: || !isBreakUnit(textoffset, end)
2224: || hasAccentsBeforeMatch(textoffset, end)
2225: || !checkIdentical(textoffset, end)
2226: || hasAccentsAfterMatch(textoffset, end)) {
2227: textoffset--;
2228: textoffset = getPreviousBaseOffset(targetText, textoffset);
2229: m_utilBuffer_[0] = textoffset;
2230: return false;
2231: }
2232: m_matchedIndex_ = textoffset;
2233: matchLength = end - textoffset;
2234: return true;
2235: }
2236:
2237: /**
2238: * Rearranges the end accents to try matching.
2239: * Suffix accents in the text will be grouped according to their combining
2240: * class and the groups will be mixed and matched to try find the perfect
2241: * match with the pattern.
2242: * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2243: * step 1: split "\u030A\u0301" into 6 other type of potential accent
2244: * substrings
2245: * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2246: * "\u0301\u0325".
2247: * step 2: check if any of the generated substrings matches the pattern.
2248: * @param start offset of the first base character
2249: * @param end start of the last accent set
2250: * @return DONE if a match is not found, otherwise return the ending
2251: * offset of the match. Note this start includes all following
2252: * accents.
2253: */
private int doPreviousCanonicalSuffixMatch(int start, int end) {
    targetText.setIndex(end);
    // back up over a surrogate pair so we never inspect a lone trail
    // surrogate's FCD value
    if (UTF16.isTrailSurrogate(targetText.previous())
        && targetText.getIndex() > m_textBeginOffset_) {
        if (!UTF16.isLeadSurrogate(targetText.previous())) {
            targetText.next();
        }
    }
    if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
        // die... failed at a base character
        // (nothing trailing to rearrange)
        return DONE;
    }
    end = getNextBaseOffset(targetText, end);

    // collect the trailing accent run [offset, end) and NFD-normalize it
    StringBuffer accents = new StringBuffer();
    int offset = getPreviousBaseOffset(targetText, end);
    // normalizing the offensive string
    String accentstr = getString(targetText, offset, end - offset);
    if (Normalizer.quickCheck(accentstr, Normalizer.NFD, 0) == Normalizer.NO) {
        accentstr = Normalizer.decompose(accentstr, false);
    }
    accents.append(accentstr);

    int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
    int accentsize = getUnblockedAccentIndex(accents, accentsindex);
    // enumerate every non-empty subset (2^accentsize - 1) of the
    // unblocked accent groups, largest selection first
    int count = (2 << (accentsize - 1)) - 1;
    while (count > 0) {
        m_canonicalSuffixAccents_.delete(0,
                m_canonicalSuffixAccents_.length());
        // copy the base characters
        for (int k = 0; k < accentsindex[0]; k++) {
            m_canonicalSuffixAccents_.append(accents.charAt(k));
        }
        // forming all possible canonical rearrangement by dropping
        // sets of accents; bit i of count selects accent group i
        for (int i = 0; i <= accentsize - 1; i++) {
            int mask = 1 << (accentsize - i - 1);
            if ((count & mask) != 0) {
                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j++) {
                    m_canonicalSuffixAccents_.append(accents
                            .charAt(j));
                }
            }
        }
        // candidate = rearranged prefix accents + text [start, offset)
        // + selected suffix accents
        StringBuffer match = merge(m_canonicalPrefixAccents_,
                targetText, start, offset,
                m_canonicalSuffixAccents_);
        // run the collator iterator through this match
        // if status is a failure ucol_setText does nothing
        m_utilColEIter_.setText(match.toString());
        if (checkCollationMatch(m_utilColEIter_)) {
            return end;
        }
        count--;
    }
    return DONE;
}
2311:
2312: /**
* Takes the rearranged start accents and tries matching. If match failed
* at a separate following set of accents (separated from the rearranged
* ones by at least a base character) then we rearrange the preceding
* accents and try matching again.
2317: * We allow skipping of the ends of the accent set if the ces do not match.
2318: * However if the failure is found before the accent set, it fails.
2319: * Internal method, status assumed to be success, caller has to check
2320: * status before calling this method.
2321: * @param textoffset of the ends of the rearranged accent
2322: * @return DONE if a match is not found, otherwise return the ending offset
2323: * of the match. Note this start includes all following accents.
2324: */
private int doPreviousCanonicalPrefixMatch(int textoffset) {
    // int safelength = 0;
    StringBuffer safetext;
    int safeoffset = textoffset;

    // If the rearranged prefix accents end with an unsafe character,
    // build a merged "safe" buffer: the accents plus the text up to the
    // next safe offset, so contractions are not split.
    if (textoffset > m_textBeginOffset_
        && m_collator_
                .isUnsafe(m_canonicalPrefixAccents_
                        .charAt(m_canonicalPrefixAccents_
                                .length() - 1))) {
        safeoffset = getNextSafeOffset(textoffset,
                                        m_textLimitOffset_);
        //safelength = safeoffset - textoffset;
        safetext = merge(m_canonicalPrefixAccents_, targetText,
                            textoffset, safeoffset, null);
    } else {
        safetext = m_canonicalPrefixAccents_;
    }

    // if status is a failure, ucol_setText does nothing
    CollationElementIterator coleiter = m_utilColEIter_;
    coleiter.setText(safetext.toString());
    // status checked in loop below

    // match pattern CEs forwards, starting from the first pattern CE
    int ceindex = 0;
    boolean isSafe = true; // safe zone indication flag for position
    int prefixlength = m_canonicalPrefixAccents_.length();

    while (ceindex < m_pattern_.m_CELength_) {
        int textce = coleiter.next();
        if (textce == CollationElementIterator.NULLORDER) {
            // check if we have passed the safe buffer; if so, continue
            // with the main text iterator at safeoffset
            if (coleiter == m_colEIter_) {
                return DONE;
            }
            if (safetext != m_canonicalPrefixAccents_) {
                safetext.delete(0, safetext.length());
            }
            coleiter = m_colEIter_;
            coleiter.setExactOffset(safeoffset);
            // status checked at the start of the loop
            isSafe = false;
            continue;
        }
        textce = getCE(textce);
        if (textce != CollationElementIterator.IGNORABLE
            && textce != m_pattern_.m_CE_[ceindex]) {
            // do the beginning stuff
            int failedoffset = coleiter.getOffset();
            if (isSafe && failedoffset <= prefixlength) {
                // alas... no hope. failed at rearranged accent set
                return DONE;
            } else {
                if (isSafe) {
                    // convert offset in the safe buffer back to an
                    // offset in the real text
                    failedoffset = safeoffset - failedoffset;
                    if (safetext != m_canonicalPrefixAccents_) {
                        safetext.delete(0, safetext.length());
                    }
                }

                // try rearranging the end accents
                int result = doPreviousCanonicalSuffixMatch(
                        textoffset, failedoffset);
                if (result != DONE) {
                    // if status is a failure, ucol_setOffset does nothing
                    m_colEIter_.setExactOffset(result);
                }
                return result;
            }
        }
        if (textce == m_pattern_.m_CE_[ceindex]) {
            ceindex++;
        }
    }
    // set offset here
    if (isSafe) {
        int result = coleiter.getOffset();
        // sets the text iterator here with the correct expansion and offset
        int leftoverces = coleiter.m_CEBufferSize_
                                    - coleiter.m_CEBufferOffset_;
        if (result <= prefixlength) {
            // match ends inside the rearranged accents; report the
            // original text offset instead of a safe-buffer offset
            result = textoffset;
        } else {
            result = textoffset + (safeoffset - result);
        }
        m_colEIter_.setExactOffset(result);
        m_colEIter_.m_CEBufferOffset_ = m_colEIter_.m_CEBufferSize_
                                                    - leftoverces;
        return result;
    }

    return coleiter.getOffset();
}
2418:
2419: /**
2420: * Trying out the substring and sees if it can be a canonical match.
2421: * This will try normalizing the starting accents and arranging them into
2422: * canonical equivalents and check their corresponding ces with the pattern
2423: * ce.
2424: * Prefix accents in the text will be grouped according to their combining
2425: * class and the groups will be mixed and matched to try find the perfect
2426: * match with the pattern.
2427: * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2428: * step 1: split "\u030A\u0301" into 6 other type of potential accent
2429: * substrings
2430: * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2431: * "\u0301\u0325".
2432: * step 2: check if any of the generated substrings matches the pattern.
2433: * @param textoffset start offset in the collation element text that starts
2434: * with the accents to be rearranged
2435: * @return true if the match is valid, false otherwise
2436: */
private boolean doPreviousCanonicalMatch(int textoffset) {
    int offset = m_colEIter_.getOffset();
    if ((getFCD(targetText, textoffset) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
        // match starts on a base character: only suffix-accent
        // rearrangement can still rescue the match
        if (m_pattern_.m_hasSuffixAccents_) {
            offset = doPreviousCanonicalSuffixMatch(textoffset,
                                                    offset);
            if (offset != DONE) {
                m_colEIter_.setExactOffset(offset);
                return true;
            }
        }
        return false;
    }

    if (!m_pattern_.m_hasPrefixAccents_) {
        return false;
    }

    StringBuffer accents = new StringBuffer();
    // offset to the last base character in substring to search
    int baseoffset = getNextBaseOffset(targetText, textoffset);
    // normalizing the offensive string
    String textstr = getString(targetText, textoffset, baseoffset
                                                        - textoffset);
    if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) {
        textstr = Normalizer.decompose(textstr, false);
    }
    accents.append(textstr);
    // status checked in loop

    int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
    int size = getUnblockedAccentIndex(accents, accentsindex);

    // 2 power n - 1 plus the full set of accents
    int count = (2 << (size - 1)) - 1;
    while (count > 0) {
        m_canonicalPrefixAccents_.delete(0,
                m_canonicalPrefixAccents_.length());
        // copy the base characters
        for (int k = 0; k < accentsindex[0]; k++) {
            m_canonicalPrefixAccents_.append(accents.charAt(k));
        }
        // forming all possible canonical rearrangement by dropping
        // sets of accents; bit i of count selects accent group i
        for (int i = 0; i <= size - 1; i++) {
            int mask = 1 << (size - i - 1);
            if ((count & mask) != 0) {
                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j++) {
                    m_canonicalPrefixAccents_.append(accents
                            .charAt(j));
                }
            }
        }
        // try this prefix-accent combination against the pattern
        offset = doPreviousCanonicalPrefixMatch(baseoffset);
        if (offset != DONE) {
            return true; // match found
        }
        count--;
    }
    return false;
}
2498:
2499: /**
2500: * Checks match for contraction.
2501: * If the match starts with a partial contraction we fail.
2502: * Uses the temporary utility buffer to return the modified start and end.
2503: * @param start offset of potential match, to be modified if necessary
2504: * @param end offset of potential match, to be modified if necessary
2505: * @return true if match passes the contraction test, false otherwise.
2506: */
2507: private boolean checkPreviousCanonicalContractionMatch(int start,
2508: int end) {
2509: int temp = end;
2510: // This part checks if either ends of the match contains potential
2511: // contraction. If so we'll have to iterate through them
2512: char echar = 0;
2513: char schar = 0;
2514: if (end < m_textLimitOffset_) {
2515: targetText.setIndex(end);
2516: echar = targetText.current();
2517: }
2518: if (start + 1 < m_textLimitOffset_) {
2519: targetText.setIndex(start + 1);
2520: schar = targetText.current();
2521: }
2522: if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
2523: int expansion = m_colEIter_.m_CEBufferSize_
2524: - m_colEIter_.m_CEBufferOffset_;
2525: boolean hasExpansion = expansion > 0;
2526: m_colEIter_.setExactOffset(end);
2527: while (expansion > 0) {
2528: // getting rid of the redundant ce
2529: // since forward contraction/expansion may have extra ces
2530: // if we are in the normalization buffer, hasAccentsBeforeMatch
2531: // would have taken care of it.
2532: // E.g. the character \u01FA will have an expansion of 3, but
2533: // if we are only looking for A ring A\u030A, we'll have to
2534: // skip the last ce in the expansion buffer
2535: m_colEIter_.previous();
2536: if (m_colEIter_.getOffset() != temp) {
2537: end = temp;
2538: temp = m_colEIter_.getOffset();
2539: }
2540: expansion--;
2541: }
2542:
2543: int count = m_pattern_.m_CELength_;
2544: while (count > 0) {
2545: int ce = getCE(m_colEIter_.previous());
2546: // status checked below, note that if status is a failure
2547: // previous() returns NULLORDER
2548: if (ce == CollationElementIterator.IGNORABLE) {
2549: continue;
2550: }
2551: if (hasExpansion && count == 0
2552: && m_colEIter_.getOffset() != temp) {
2553: end = temp;
2554: temp = m_colEIter_.getOffset();
2555: }
2556: if (count == m_pattern_.m_CELength_
2557: && ce != m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]) {
2558: // accents may have extra starting ces, this occurs when a
2559: // pure accent pattern is matched without rearrangement
2560: int expected = m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1];
2561: targetText.setIndex(end);
2562: if (UTF16.isTrailSurrogate(targetText.previous())) {
2563: if (targetText.getIndex() > m_textBeginOffset_
2564: && !UTF16.isLeadSurrogate(targetText
2565: .previous())) {
2566: targetText.next();
2567: }
2568: }
2569: end = targetText.getIndex();
2570: if ((getFCD(targetText, end) & LAST_BYTE_MASK_) != 0) {
2571: ce = getCE(m_colEIter_.previous());
2572: while (ce != expected
2573: && ce != CollationElementIterator.NULLORDER
2574: && m_colEIter_.getOffset() <= start) {
2575: ce = getCE(m_colEIter_.previous());
2576: }
2577: }
2578: }
2579: if (ce != m_pattern_.m_CE_[count - 1]) {
2580: start--;
2581: start = getPreviousBaseOffset(start);
2582: m_utilBuffer_[0] = start;
2583: m_utilBuffer_[1] = end;
2584: return false;
2585: }
2586: count--;
2587: }
2588: }
2589: m_utilBuffer_[0] = start;
2590: m_utilBuffer_[1] = end;
2591: return true;
2592: }
2593:
2594: /**
2595: * Checks and sets the match information if found.
2596: * Checks
2597: * <ul>
2598: * <li> the potential match does not repeat the previous match
2599: * <li> boundaries are correct
2600: * <li> potential match does not end in the middle of a contraction
2601: * <li> identical matches
2602: * </ul>
2603: * Otherwise the offset will be shifted to the next character.
2604: * Uses the temporary utility buffer for storing the modified textoffset.
2605: * @param textoffset offset in the collation element text. the returned
2606: * value will be the truncated start offset of the match or the
2607: * new start search offset.
2608: * @return true if the match is valid, false otherwise
2609: */
2610: private boolean checkPreviousCanonicalMatch(int textoffset) {
2611: // to ensure that the start and ends are not composite characters
2612: // if we have a canonical accent match
2613: if (m_pattern_.m_hasSuffixAccents_
2614: && m_canonicalSuffixAccents_.length() != 0
2615: || m_pattern_.m_hasPrefixAccents_
2616: && m_canonicalPrefixAccents_.length() != 0) {
2617: m_matchedIndex_ = textoffset;
2618: matchLength = getNextBaseOffset(m_colEIter_.getOffset())
2619: - textoffset;
2620: return true;
2621: }
2622:
2623: int end = m_colEIter_.getOffset();
2624: if (!checkPreviousCanonicalContractionMatch(textoffset, end)) {
2625: // storing the modified textoffset
2626: return false;
2627: }
2628: textoffset = m_utilBuffer_[0];
2629: end = m_utilBuffer_[1];
2630: end = getNextBaseOffset(end);
2631: // this totally matches, however we need to check if it is repeating
2632: if (checkRepeatedMatch(textoffset, end)
2633: || !isBreakUnit(textoffset, end)
2634: || !checkIdentical(textoffset, end)) {
2635: textoffset--;
2636: textoffset = getPreviousBaseOffset(textoffset);
2637: m_utilBuffer_[0] = textoffset;
2638: return false;
2639: }
2640:
2641: m_matchedIndex_ = textoffset;
2642: matchLength = end - textoffset;
2643: return true;
2644: }
2645:
2646: /**
2647: * Method that does the next exact match
2648: * @param start the offset to start shifting from and performing the
2649: * next exact match
2650: */
2651: private void handleNextExact(int start) {
2652: int textoffset = shiftForward(start,
2653: CollationElementIterator.NULLORDER,
2654: m_pattern_.m_CELength_);
2655: int targetce = CollationElementIterator.IGNORABLE;
2656: while (textoffset <= m_textLimitOffset_) {
2657: m_colEIter_.setExactOffset(textoffset);
2658: int patternceindex = m_pattern_.m_CELength_ - 1;
2659: boolean found = false;
2660: int lastce = CollationElementIterator.NULLORDER;
2661:
2662: while (true) {
2663: // finding the last pattern ce match, imagine composite
2664: // characters. for example: search for pattern A in text \u00C0
2665: // we'll have to skip \u0300 the grave first before we get to A
2666: targetce = m_colEIter_.previous();
2667: if (targetce == CollationElementIterator.NULLORDER) {
2668: found = false;
2669: break;
2670: }
2671: targetce = getCE(targetce);
2672: if (targetce == CollationElementIterator.IGNORABLE
2673: && m_colEIter_.isInBuffer()) {
2674: // this is for the text \u0315\u0300 that requires
2675: // normalization and pattern \u0300, where \u0315 is ignorable
2676: continue;
2677: }
2678: if (lastce == CollationElementIterator.NULLORDER
2679: || lastce == CollationElementIterator.IGNORABLE) {
2680: lastce = targetce;
2681: }
2682: if (targetce == m_pattern_.m_CE_[patternceindex]) {
2683: // the first ce can be a contraction
2684: found = true;
2685: break;
2686: }
2687: if (m_colEIter_.m_CEBufferOffset_ <= 0) {
2688: found = false;
2689: break;
2690: }
2691: }
2692:
2693: while (found && patternceindex > 0) {
2694: targetce = m_colEIter_.previous();
2695: if (targetce == CollationElementIterator.NULLORDER) {
2696: found = false;
2697: break;
2698: }
2699: targetce = getCE(targetce);
2700: if (targetce == CollationElementIterator.IGNORABLE) {
2701: continue;
2702: }
2703:
2704: patternceindex--;
2705: found = found
2706: && targetce == m_pattern_.m_CE_[patternceindex];
2707: }
2708:
2709: if (!found) {
2710: textoffset = shiftForward(textoffset, lastce,
2711: patternceindex);
2712: // status checked at loop.
2713: patternceindex = m_pattern_.m_CELength_;
2714: continue;
2715: }
2716:
2717: if (checkNextExactMatch(textoffset)) {
2718: // status checked in ucol_setOffset
2719: return;
2720: }
2721: textoffset = m_utilBuffer_[0];
2722: }
2723: setMatchNotFound();
2724: }
2725:
2726: /**
2727: * Method that does the next canonical match
2728: * @param start the offset to start shifting from and performing the
2729: * next canonical match
2730: */
2731: private void handleNextCanonical(int start) {
2732: boolean hasPatternAccents = m_pattern_.m_hasSuffixAccents_
2733: || m_pattern_.m_hasPrefixAccents_;
2734:
2735: // shifting it check for setting offset
2736: // if setOffset is called previously or there was no previous match, we
2737: // leave the offset as it is.
2738: int textoffset = shiftForward(start,
2739: CollationElementIterator.NULLORDER,
2740: m_pattern_.m_CELength_);
2741: m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_
2742: .length());
2743: m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_
2744: .length());
2745: int targetce = CollationElementIterator.IGNORABLE;
2746:
2747: while (textoffset <= m_textLimitOffset_) {
2748: m_colEIter_.setExactOffset(textoffset);
2749: int patternceindex = m_pattern_.m_CELength_ - 1;
2750: boolean found = false;
2751: int lastce = CollationElementIterator.NULLORDER;
2752:
2753: while (true) {
2754: // finding the last pattern ce match, imagine composite characters
2755: // for example: search for pattern A in text \u00C0
2756: // we'll have to skip \u0300 the grave first before we get to A
2757: targetce = m_colEIter_.previous();
2758: if (targetce == CollationElementIterator.NULLORDER) {
2759: found = false;
2760: break;
2761: }
2762: targetce = getCE(targetce);
2763: if (lastce == CollationElementIterator.NULLORDER
2764: || lastce == CollationElementIterator.IGNORABLE) {
2765: lastce = targetce;
2766: }
2767: if (targetce == m_pattern_.m_CE_[patternceindex]) {
2768: // the first ce can be a contraction
2769: found = true;
2770: break;
2771: }
2772: if (m_colEIter_.m_CEBufferOffset_ <= 0) {
2773: found = false;
2774: break;
2775: }
2776: }
2777:
2778: while (found && patternceindex > 0) {
2779: targetce = m_colEIter_.previous();
2780: if (targetce == CollationElementIterator.NULLORDER) {
2781: found = false;
2782: break;
2783: }
2784: targetce = getCE(targetce);
2785: if (targetce == CollationElementIterator.IGNORABLE) {
2786: continue;
2787: }
2788:
2789: patternceindex--;
2790: found = found
2791: && targetce == m_pattern_.m_CE_[patternceindex];
2792: }
2793:
2794: // initializing the rearranged accent array
2795: if (hasPatternAccents && !found) {
2796: found = doNextCanonicalMatch(textoffset);
2797: }
2798:
2799: if (!found) {
2800: textoffset = shiftForward(textoffset, lastce,
2801: patternceindex);
2802: // status checked at loop
2803: patternceindex = m_pattern_.m_CELength_;
2804: continue;
2805: }
2806:
2807: if (checkNextCanonicalMatch(textoffset)) {
2808: return;
2809: }
2810: textoffset = m_utilBuffer_[0];
2811: }
2812: setMatchNotFound();
2813: }
2814:
2815: /**
2816: * Method that does the previous exact match
2817: * @param start the offset to start shifting from and performing the
2818: * previous exact match
2819: */
2820: private void handlePreviousExact(int start) {
2821: int textoffset = reverseShift(start,
2822: CollationElementIterator.NULLORDER,
2823: m_pattern_.m_CELength_);
2824: while (textoffset >= m_textBeginOffset_) {
2825: m_colEIter_.setExactOffset(textoffset);
2826: int patternceindex = 1;
2827: int targetce = CollationElementIterator.IGNORABLE;
2828: boolean found = false;
2829: int firstce = CollationElementIterator.NULLORDER;
2830:
2831: while (true) {
2832: // finding the first pattern ce match, imagine composite
2833: // characters. for example: search for pattern \u0300 in text
2834: // \u00C0, we'll have to skip A first before we get to
2835: // \u0300 the grave accent
2836: targetce = m_colEIter_.next();
2837: if (targetce == CollationElementIterator.NULLORDER) {
2838: found = false;
2839: break;
2840: }
2841: targetce = getCE(targetce);
2842: if (firstce == CollationElementIterator.NULLORDER
2843: || firstce == CollationElementIterator.IGNORABLE) {
2844: firstce = targetce;
2845: }
2846: if (targetce == CollationElementIterator.IGNORABLE) {
2847: continue;
2848: }
2849: if (targetce == m_pattern_.m_CE_[0]) {
2850: found = true;
2851: break;
2852: }
2853: if (m_colEIter_.m_CEBufferOffset_ == -1
2854: || m_colEIter_.m_CEBufferOffset_ == m_colEIter_.m_CEBufferSize_) {
2855: // checking for accents in composite character
2856: found = false;
2857: break;
2858: }
2859: }
2860:
2861: targetce = firstce;
2862:
2863: while (found && patternceindex < m_pattern_.m_CELength_) {
2864: targetce = m_colEIter_.next();
2865: if (targetce == CollationElementIterator.NULLORDER) {
2866: found = false;
2867: break;
2868: }
2869: targetce = getCE(targetce);
2870: if (targetce == CollationElementIterator.IGNORABLE) {
2871: continue;
2872: }
2873:
2874: found = found
2875: && targetce == m_pattern_.m_CE_[patternceindex];
2876: patternceindex++;
2877: }
2878:
2879: if (!found) {
2880: textoffset = reverseShift(textoffset, targetce,
2881: patternceindex);
2882: patternceindex = 0;
2883: continue;
2884: }
2885:
2886: if (checkPreviousExactMatch(textoffset)) {
2887: return;
2888: }
2889: textoffset = m_utilBuffer_[0];
2890: }
2891: setMatchNotFound();
2892: }
2893:
2894: /**
2895: * Method that does the previous canonical match
2896: * @param start the offset to start shifting from and performing the
2897: * previous canonical match
2898: */
2899: private void handlePreviousCanonical(int start) {
2900: boolean hasPatternAccents = m_pattern_.m_hasSuffixAccents_
2901: || m_pattern_.m_hasPrefixAccents_;
2902:
2903: // shifting it check for setting offset
2904: // if setOffset is called previously or there was no previous match, we
2905: // leave the offset as it is.
2906: int textoffset = reverseShift(start,
2907: CollationElementIterator.NULLORDER,
2908: m_pattern_.m_CELength_);
2909: m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_
2910: .length());
2911: m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_
2912: .length());
2913:
2914: while (textoffset >= m_textBeginOffset_) {
2915: m_colEIter_.setExactOffset(textoffset);
2916: int patternceindex = 1;
2917: int targetce = CollationElementIterator.IGNORABLE;
2918: boolean found = false;
2919: int firstce = CollationElementIterator.NULLORDER;
2920:
2921: while (true) {
2922: // finding the first pattern ce match, imagine composite
2923: // characters. for example: search for pattern \u0300 in text
2924: // \u00C0, we'll have to skip A first before we get to
2925: // \u0300 the grave accent
2926: targetce = m_colEIter_.next();
2927: if (targetce == CollationElementIterator.NULLORDER) {
2928: found = false;
2929: break;
2930: }
2931: targetce = getCE(targetce);
2932: if (firstce == CollationElementIterator.NULLORDER
2933: || firstce == CollationElementIterator.IGNORABLE) {
2934: firstce = targetce;
2935: }
2936:
2937: if (targetce == m_pattern_.m_CE_[0]) {
2938: // the first ce can be a contraction
2939: found = true;
2940: break;
2941: }
2942: if (m_colEIter_.m_CEBufferOffset_ == -1
2943: || m_colEIter_.m_CEBufferOffset_ == m_colEIter_.m_CEBufferSize_) {
2944: // checking for accents in composite character
2945: found = false;
2946: break;
2947: }
2948: }
2949:
2950: targetce = firstce;
2951:
2952: while (found && patternceindex < m_pattern_.m_CELength_) {
2953: targetce = m_colEIter_.next();
2954: if (targetce == CollationElementIterator.NULLORDER) {
2955: found = false;
2956: break;
2957: }
2958: targetce = getCE(targetce);
2959: if (targetce == CollationElementIterator.IGNORABLE) {
2960: continue;
2961: }
2962:
2963: found = found
2964: && targetce == m_pattern_.m_CE_[patternceindex];
2965: patternceindex++;
2966: }
2967:
2968: // initializing the rearranged accent array
2969: if (hasPatternAccents && !found) {
2970: found = doPreviousCanonicalMatch(textoffset);
2971: }
2972:
2973: if (!found) {
2974: textoffset = reverseShift(textoffset, targetce,
2975: patternceindex);
2976: patternceindex = 0;
2977: continue;
2978: }
2979:
2980: if (checkPreviousCanonicalMatch(textoffset)) {
2981: return;
2982: }
2983: textoffset = m_utilBuffer_[0];
2984: }
2985: setMatchNotFound();
2986: }
2987:
2988: /**
2989: * Gets a substring out of a CharacterIterator
2990: * @param text CharacterIterator
2991: * @param start start offset
2992: * @param length of substring
2993: * @return substring from text starting at start and length length
2994: */
2995: private static final String getString(CharacterIterator text,
2996: int start, int length) {
2997: StringBuffer result = new StringBuffer(length);
2998: int offset = text.getIndex();
2999: text.setIndex(start);
3000: for (int i = 0; i < length; i++) {
3001: result.append(text.current());
3002: text.next();
3003: }
3004: text.setIndex(offset);
3005: return result.toString();
3006: }
3007:
3008: /**
3009: * Getting the mask for collation strength
3010: * @param strength collation strength
3011: * @return collation element mask
3012: */
3013: private static final int getMask(int strength) {
3014: switch (strength) {
3015: case Collator.PRIMARY:
3016: return RuleBasedCollator.CE_PRIMARY_MASK_;
3017: case Collator.SECONDARY:
3018: return RuleBasedCollator.CE_SECONDARY_MASK_
3019: | RuleBasedCollator.CE_PRIMARY_MASK_;
3020: default:
3021: return RuleBasedCollator.CE_TERTIARY_MASK_
3022: | RuleBasedCollator.CE_SECONDARY_MASK_
3023: | RuleBasedCollator.CE_PRIMARY_MASK_;
3024: }
3025: }
3026:
3027: /**
3028: * Sets match not found
3029: */
3030: private void setMatchNotFound() {
3031: // this method resets the match result regardless of the error status.
3032: m_matchedIndex_ = DONE;
3033: setMatchLength(0);
3034: }
3035: }
|