0001: //##header
0002: /*
0003: *******************************************************************************
0004: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0005: * others. All Rights Reserved. *
0006: *******************************************************************************
0007: */
0008: package com.ibm.icu.text;
0009:
0010: import java.text.*;
0011: import com.ibm.icu.lang.*;
0012:
0013: import java.io.IOException;
0014:
0015: import com.ibm.icu.impl.CollectionUtilities;
0016: import com.ibm.icu.impl.NormalizerImpl;
0017: import com.ibm.icu.impl.Utility;
0018: import com.ibm.icu.impl.UCharacterProperty;
0019: import com.ibm.icu.impl.UBiDiProps;
0020: import com.ibm.icu.impl.UCaseProps;
0021: import com.ibm.icu.impl.UPropertyAliases;
0022: import com.ibm.icu.impl.SortedSetRelation;
0023: import com.ibm.icu.impl.RuleCharacterIterator;
0024:
0025: import com.ibm.icu.util.Freezable;
0026: import com.ibm.icu.util.ULocale;
0027: import com.ibm.icu.util.VersionInfo;
0028:
0029: import com.ibm.icu.text.BreakIterator;
0030:
0031: import java.util.Map;
0032: import java.util.HashMap;
0033: import java.util.MissingResourceException;
0034: import java.util.TreeSet;
0035: import java.util.Iterator;
0036: import java.util.Collection;
0037:
0038: /**
0039: * A mutable set of Unicode characters and multicharacter strings. Objects of this class
0040: * represent <em>character classes</em> used in regular expressions.
0041: * A character class specifies a subset of Unicode code points. Legal
0042: * code points are U+0000 to U+10FFFF, inclusive.
0043: *
0044: * <p>The UnicodeSet class is not designed to be subclassed.
0045: *
0046: * <p><code>UnicodeSet</code> supports two APIs. The first is the
0047: * <em>operand</em> API that allows the caller to modify the value of
0048: * a <code>UnicodeSet</code> object. It conforms to Java 2's
0049: * <code>java.util.Set</code> interface, although
0050: * <code>UnicodeSet</code> does not actually implement that
0051: * interface. All methods of <code>Set</code> are supported, with the
0052: * modification that they take a character range or single character
0053: * instead of an <code>Object</code>, and they take a
0054: * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
0055: * operand API may be thought of in terms of boolean logic: a boolean
0056: * OR is implemented by <code>add</code>, a boolean AND is implemented
0057: * by <code>retain</code>, a boolean XOR is implemented by
0058: * <code>complement</code> taking an argument, and a boolean NOT is
0059: * implemented by <code>complement</code> with no argument. In terms
0060: * of traditional set theory function names, <code>add</code> is a
0061: * union, <code>retain</code> is an intersection, <code>remove</code>
0062: * is an asymmetric difference, and <code>complement</code> with no
0063: * argument is a set complement with respect to the superset range
0064: * <code>MIN_VALUE-MAX_VALUE</code>
0065: *
0066: * <p>The second API is the
0067: * <code>applyPattern()</code>/<code>toPattern()</code> API from the
0068: * <code>java.text.Format</code>-derived classes. Unlike the
0069: * methods that add characters, add categories, and control the logic
0070: * of the set, the method <code>applyPattern()</code> sets all
0071: * attributes of a <code>UnicodeSet</code> at once, based on a
0072: * string pattern.
0073: *
0074: * <p><b>Pattern syntax</b></p>
0075: *
0076: * Patterns are accepted by the constructors and the
0077: * <code>applyPattern()</code> methods and returned by the
0078: * <code>toPattern()</code> method. These patterns follow a syntax
0079: * similar to that employed by version 8 regular expression character
0080: * classes. Here are some simple examples:
0081: *
0082: * <blockquote>
0083: * <table>
0084: * <tr align="top">
0085: * <td nowrap valign="top" align="left"><code>[]</code></td>
0086: * <td valign="top">No characters</td>
0087: * </tr><tr align="top">
0088: * <td nowrap valign="top" align="left"><code>[a]</code></td>
0089: * <td valign="top">The character 'a'</td>
0090: * </tr><tr align="top">
0091: * <td nowrap valign="top" align="left"><code>[ae]</code></td>
0092: * <td valign="top">The characters 'a' and 'e'</td>
0093: * </tr>
0094: * <tr>
0095: * <td nowrap valign="top" align="left"><code>[a-e]</code></td>
0096: * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
0097: * point order</td>
0098: * </tr>
0099: * <tr>
0100: * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
0101: * <td valign="top">The character U+4E01</td>
0102: * </tr>
0103: * <tr>
0104: * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
0105: * <td valign="top">The character 'a' and the multicharacter strings "ab" and
0106: * "ac"</td>
0107: * </tr>
0108: * <tr>
0109: * <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
0110: * <td valign="top">All characters in the general category Uppercase Letter</td>
0111: * </tr>
0112: * </table>
0113: * </blockquote>
0114: *
0115: * Any character may be preceded by a backslash in order to remove any special
0116: * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
0117: * ignored, unless they are escaped.
0118: *
0119: * <p>Property patterns specify a set of characters having a certain
0120: * property as defined by the Unicode standard. Both the POSIX-like
0121: * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
0122: * complete list of supported property patterns, see the User's Guide
0123: * for UnicodeSet at
0124: * <a href="http://icu.sourceforge.net/userguide/unicodeSet.html">
0125: * http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
0126: * Actual determination of property data is defined by the underlying
0127: * Unicode database as implemented by UCharacter.
0128: *
0129: * <p>Patterns specify individual characters, ranges of characters, and
0130: * Unicode property sets. When elements are concatenated, they
0131: * specify their union. To complement a set, place a '^' immediately
0132: * after the opening '['. Property patterns are inverted by modifying
0133: * their delimiters; "[:^foo:]" and "\P{foo}". In any other location,
0134: * '^' has no special meaning.
0135: *
0136: * <p>Ranges are indicated by placing a '-' between two
0137: * characters, as in "a-z". This specifies the range of all
0138: * characters from the left to the right, in Unicode order. If the
0139: * left character is greater than or equal to the
0140: * right character it is a syntax error. If a '-' occurs as the first
0141: * character after the opening '[' or '[^', or if it occurs as the
0142: * last character before the closing ']', then it is taken as a
0143: * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
0144: * set of three characters, 'a', 'b', and '-'.
0145: *
0146: * <p>Sets may be intersected using the '&' operator or the asymmetric
0147: * set difference may be taken using the '-' operator, for example,
0148: * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
0149: * with values less than 4096. Operators ('&' and '-') have equal
0150: * precedence and bind left-to-right. Thus
0151: * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
0152: * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
0153: * difference; intersection is commutative.
0154: *
0155: * <table>
0156: * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
0157: * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
0158: * through 'z' and all letters in between, in Unicode order
0159: * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
0160: * all characters but 'a' through 'z',
0161: * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
0162: * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
0163: * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
0164: * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
0165: * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
0166: * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
0167: * <td>The asymmetric difference of sets specified by <em>pat1</em> and
0168: * <em>pat2</em>
0169: * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
0170: * <td>The set of characters having the specified
0171: * Unicode property; in
0172: * this case, Unicode uppercase letters
0173: * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
0174: * <td>The set of characters <em>not</em> having the given
0175: * Unicode property
0176: * </table>
0177: *
0178: * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
0179: *
0180: * <p><b>Formal syntax</b></p>
0181: *
0182: * <blockquote>
0183: * <table>
0184: * <tr align="top">
0185: * <td nowrap valign="top" align="right"><code>pattern := </code></td>
0186: * <td valign="top"><code>('[' '^'? item* ']') |
0187: * property</code></td>
0188: * </tr>
0189: * <tr align="top">
0190: * <td nowrap valign="top" align="right"><code>item := </code></td>
0191: * <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
0192: * </code></td>
0193: * </tr>
0194: * <tr align="top">
0195: * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td>
0196: * <td valign="top"><code>pattern | pattern-expr pattern |
0197: * pattern-expr op pattern<br>
0198: * </code></td>
0199: * </tr>
0200: * <tr align="top">
0201: * <td nowrap valign="top" align="right"><code>op := </code></td>
0202: * <td valign="top"><code>'&' | '-'<br>
0203: * </code></td>
0204: * </tr>
0205: * <tr align="top">
0206: * <td nowrap valign="top" align="right"><code>special := </code></td>
0207: * <td valign="top"><code>'[' | ']' | '-'<br>
0208: * </code></td>
0209: * </tr>
0210: * <tr align="top">
0211: * <td nowrap valign="top" align="right"><code>char := </code></td>
0212: * <td valign="top"><em>any character that is not</em><code> special<br>
0213: * | ('\\' </code><em>any character</em><code>)<br>
0214: * | ('\u' hex hex hex hex)<br>
0215: * </code></td>
0216: * </tr>
0217: * <tr align="top">
0218: * <td nowrap valign="top" align="right"><code>hex := </code></td>
0219: * <td valign="top"><em>any character for which
0220: * </em><code>Character.digit(c, 16)</code><em>
0221: * returns a non-negative result</em></td>
0222: * </tr>
0223: * <tr>
0224: * <td nowrap valign="top" align="right"><code>property := </code></td>
0225: * <td valign="top"><em>a Unicode property set pattern</em></td>
0226: * </tr>
0227: * </table>
0228: * <br>
0229: * <table border="1">
0230: * <tr>
0231: * <td>Legend: <table>
0232: * <tr>
0233: * <td nowrap valign="top"><code>a := b</code></td>
0234: * <td width="20" valign="top"> </td>
0235: * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
0236: * </tr>
0237: * <tr>
0238: * <td nowrap valign="top"><code>a?</code></td>
0239: * <td valign="top"></td>
0240: * <td valign="top">zero or one instance of <code>a</code><br>
0241: * </td>
0242: * </tr>
0243: * <tr>
0244: * <td nowrap valign="top"><code>a*</code></td>
0245: * <td valign="top"></td>
0246: * <td valign="top">zero or more instances of <code>a</code><br>
0247: * </td>
0248: * </tr>
0249: * <tr>
0250: * <td nowrap valign="top"><code>a | b</code></td>
0251: * <td valign="top"></td>
0252: * <td valign="top">either <code>a</code> or <code>b</code><br>
0253: * </td>
0254: * </tr>
0255: * <tr>
0256: * <td nowrap valign="top"><code>'a'</code></td>
0257: * <td valign="top"></td>
0258: * <td valign="top">the literal string between the quotes </td>
0259: * </tr>
0260: * </table>
0261: * </td>
0262: * </tr>
0263: * </table>
0264: * </blockquote>
0265: * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
0266: *
0267: * @author Alan Liu
0268: * @stable ICU 2.0
0269: * @see UnicodeSetIterator
0270: */
0271: public class UnicodeSet extends UnicodeFilter implements Freezable {
0272:
0273: private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
0274: private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
0275: // 110000 for codepoints
0276:
0277: /**
0278: * Minimum value that can be stored in a UnicodeSet.
0279: * @stable ICU 2.0
0280: */
0281: public static final int MIN_VALUE = LOW;
0282:
0283: /**
0284: * Maximum value that can be stored in a UnicodeSet.
0285: * @stable ICU 2.0
0286: */
0287: public static final int MAX_VALUE = HIGH - 1;
0288:
0289: private int len; // length used; list may be longer to minimize reallocs
0290: private int[] list; // MUST be terminated with HIGH
0291: private int[] rangeList; // internal buffer
0292: private int[] buffer; // internal buffer
0293:
0294: // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
0295: // is not private so that UnicodeSetIterator can get access
0296: TreeSet strings = new TreeSet();
0297:
0298: /**
0299: * The pattern representation of this set. This may not be the
0300: * most economical pattern. It is the pattern supplied to
0301: * applyPattern(), with variables substituted and whitespace
0302: * removed. For sets constructed without applyPattern(), or
0303: * modified using the non-pattern API, this string will be null,
0304: * indicating that toPattern() must generate a pattern
0305: * representation from the inversion list.
0306: */
0307: private String pat = null;
0308:
0309: private static final int START_EXTRA = 16; // initial storage. Must be >= 0
0310: private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
0311:
0312: // Special property set IDs
0313: private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF]
0314: private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
0315: private static final String ASSIGNED = "Assigned"; // [:^Cn:]
0316:
0317: /**
0318: * A set of all characters _except_ the second through last characters of
0319: * certain ranges. These ranges are ranges of characters whose
0320: * properties are all exactly alike, e.g. CJK Ideographs from
0321: * U+4E00 to U+9FA5.
0322: */
0323: private static UnicodeSet INCLUSIONS[] = null;
0324:
0325: //----------------------------------------------------------------
0326: // Public API
0327: //----------------------------------------------------------------
0328:
/**
 * Creates a set that holds no code points and no strings.
 * @stable ICU 2.0
 */
public UnicodeSet() {
    // Reserve START_EXTRA spare slots; the single slot in use holds the
    // mandatory HIGH terminator.
    list = new int[START_EXTRA + 1];
    list[0] = HIGH;
    len = 1;
}
0337:
/**
 * Creates a set with the same contents as an existing set.
 * The work is delegated to {@link #set(UnicodeSet)}.
 *
 * @param other the set to copy
 * @stable ICU 2.0
 */
public UnicodeSet(UnicodeSet other) {
    this.set(other);
}
0345:
/**
 * Constructs a set containing the given range of code points.
 * If <code>start &gt; end</code> the range is empty and the resulting
 * set is expected to be empty; the range itself is applied by
 * {@link #complement(int, int)} on a freshly constructed empty set.
 *
 * @param start first character, inclusive, of range
 * @param end last character, inclusive, of range
 * @stable ICU 2.0
 */
public UnicodeSet(int start, int end) {
    this();
    // Complementing a range on an empty set leaves exactly that range.
    complement(start, end);
}
0358:
/**
 * Creates a set described by a pattern string. The pattern language is
 * explained in the class description. The pattern is applied with the
 * IGNORE_SPACE option, so whitespace in the pattern is ignored.
 *
 * @param pattern a string specifying what characters are in the set
 * @exception java.lang.IllegalArgumentException if the pattern contains
 * a syntax error.
 * @stable ICU 2.0
 */
public UnicodeSet(String pattern) {
    this();
    this.applyPattern(pattern, null, null, IGNORE_SPACE);
}
0371:
0372: /**
0373: * Constructs a set from the given pattern. See the class description
0374: * for the syntax of the pattern language.
0375: * @param pattern a string specifying what characters are in the set
0376: * @param ignoreWhitespace if true, ignore characters for which
0377: * UCharacterProperty.isRuleWhiteSpace() returns true
0378: * @exception java.lang.IllegalArgumentException if the pattern contains
0379: * a syntax error.
0380: * @stable ICU 2.0
0381: */
0382: public UnicodeSet(String pattern, boolean ignoreWhitespace) {
0383: this ();
0384: applyPattern(pattern, null, null,
0385: ignoreWhitespace ? IGNORE_SPACE : 0);
0386: }
0387:
/**
 * Creates a set described by a pattern string, parsed with the given
 * option bitmask. The pattern language is explained in the class
 * description.
 *
 * @param pattern a string specifying what characters are in the set
 * @param options a bitmask indicating which options to apply.
 * Valid options are IGNORE_SPACE and CASE.
 * @exception java.lang.IllegalArgumentException if the pattern contains
 * a syntax error.
 * @internal
 * @deprecated This API is ICU internal only.
 */
public UnicodeSet(String pattern, int options) {
    this();
    this.applyPattern(pattern, null, null, options);
}
0403:
/**
 * Creates a set by parsing a pattern that starts at a given position of
 * a larger string. The pattern language is explained in the class
 * description. The pattern is applied with the IGNORE_SPACE option.
 *
 * @param pattern a string specifying what characters are in the set
 * @param pos on input, the position in pattern at which to start parsing.
 * On output, the position after the last character parsed.
 * @param symbols a symbol table mapping variables to char[] arrays
 * and chars to UnicodeSets
 * @exception java.lang.IllegalArgumentException if the pattern
 * contains a syntax error.
 * @stable ICU 2.0
 */
public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) {
    this();
    this.applyPattern(pattern, pos, symbols, IGNORE_SPACE);
}
0421:
/**
 * Creates a set by parsing a pattern that starts at a given position of
 * a larger string, using the given option bitmask. The pattern language
 * is explained in the class description.
 *
 * @param pattern a string specifying what characters are in the set
 * @param pos on input, the position in pattern at which to start parsing.
 * On output, the position after the last character parsed.
 * @param symbols a symbol table mapping variables to char[] arrays
 * and chars to UnicodeSets
 * @param options a bitmask indicating which options to apply.
 * Valid options are IGNORE_SPACE and CASE.
 * @exception java.lang.IllegalArgumentException if the pattern
 * contains a syntax error.
 * @draft ICU 3.2
 * @provisional This API might change or be removed in a future release.
 */
public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols,
        int options) {
    this();
    this.applyPattern(pattern, pos, symbols, options);
}
0442:
0443: /**
0444: * Return a new set that is equivalent to this one.
0445: * @stable ICU 2.0
0446: */
0447: public Object clone() {
0448: UnicodeSet result = new UnicodeSet(this );
0449: result.frozen = this .frozen;
0450: return result;
0451: }
0452:
0453: /**
0454: * Make this object represent the range <code>start - end</code>.
0455: * If <code>end > start</code> then this object is set to an
0456: * an empty range.
0457: *
0458: * @param start first character in the set, inclusive
0459: * @param end last character in the set, inclusive
0460: * @stable ICU 2.0
0461: */
0462: public UnicodeSet set(int start, int end) {
0463: checkFrozen();
0464: clear();
0465: complement(start, end);
0466: return this ;
0467: }
0468:
0469: /**
0470: * Make this object represent the same set as <code>other</code>.
0471: * @param other a <code>UnicodeSet</code> whose value will be
0472: * copied to this object
0473: * @stable ICU 2.0
0474: */
0475: public UnicodeSet set(UnicodeSet other) {
0476: checkFrozen();
0477: list = (int[]) other.list.clone();
0478: len = other.len;
0479: pat = other.pat;
0480: strings = (TreeSet) other.strings.clone();
0481: return this ;
0482: }
0483:
0484: /**
0485: * Modifies this set to represent the set specified by the given pattern.
0486: * See the class description for the syntax of the pattern language.
0487: * Whitespace is ignored.
0488: * @param pattern a string specifying what characters are in the set
0489: * @exception java.lang.IllegalArgumentException if the pattern
0490: * contains a syntax error.
0491: * @stable ICU 2.0
0492: */
0493: public final UnicodeSet applyPattern(String pattern) {
0494: checkFrozen();
0495: return applyPattern(pattern, null, null, IGNORE_SPACE);
0496: }
0497:
0498: /**
0499: * Modifies this set to represent the set specified by the given pattern,
0500: * optionally ignoring whitespace.
0501: * See the class description for the syntax of the pattern language.
0502: * @param pattern a string specifying what characters are in the set
0503: * @param ignoreWhitespace if true then characters for which
0504: * UCharacterProperty.isRuleWhiteSpace() returns true are ignored
0505: * @exception java.lang.IllegalArgumentException if the pattern
0506: * contains a syntax error.
0507: * @stable ICU 2.0
0508: */
0509: public UnicodeSet applyPattern(String pattern,
0510: boolean ignoreWhitespace) {
0511: checkFrozen();
0512: return applyPattern(pattern, null, null,
0513: ignoreWhitespace ? IGNORE_SPACE : 0);
0514: }
0515:
0516: /**
0517: * Modifies this set to represent the set specified by the given pattern,
0518: * optionally ignoring whitespace.
0519: * See the class description for the syntax of the pattern language.
0520: * @param pattern a string specifying what characters are in the set
0521: * @param options a bitmask indicating which options to apply.
0522: * Valid options are IGNORE_SPACE and CASE.
0523: * @exception java.lang.IllegalArgumentException if the pattern
0524: * contains a syntax error.
0525: * @internal
0526: * @deprecated This API is ICU internal only.
0527: */
0528: public UnicodeSet applyPattern(String pattern, int options) {
0529: checkFrozen();
0530: return applyPattern(pattern, null, null, options);
0531: }
0532:
0533: /**
0534: * Return true if the given position, in the given pattern, appears
0535: * to be the start of a UnicodeSet pattern.
0536: * @stable ICU 2.0
0537: */
0538: public static boolean resemblesPattern(String pattern, int pos) {
0539: return ((pos + 1) < pattern.length() && pattern.charAt(pos) == '[')
0540: || resemblesPropertyPattern(pattern, pos);
0541: }
0542:
0543: /**
0544: * Append the <code>toPattern()</code> representation of a
0545: * string to the given <code>StringBuffer</code>.
0546: */
0547: private static void _appendToPat(StringBuffer buf, String s,
0548: boolean escapeUnprintable) {
0549: for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
0550: _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
0551: }
0552: }
0553:
0554: /**
0555: * Append the <code>toPattern()</code> representation of a
0556: * character to the given <code>StringBuffer</code>.
0557: */
0558: private static void _appendToPat(StringBuffer buf, int c,
0559: boolean escapeUnprintable) {
0560: if (escapeUnprintable && Utility.isUnprintable(c)) {
0561: // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
0562: // unprintable
0563: if (Utility.escapeUnprintable(buf, c)) {
0564: return;
0565: }
0566: }
0567: // Okay to let ':' pass through
0568: switch (c) {
0569: case '[': // SET_OPEN:
0570: case ']': // SET_CLOSE:
0571: case '-': // HYPHEN:
0572: case '^': // COMPLEMENT:
0573: case '&': // INTERSECTION:
0574: case '\\': //BACKSLASH:
0575: case '{':
0576: case '}':
0577: case '$':
0578: case ':':
0579: buf.append('\\');
0580: break;
0581: default:
0582: // Escape whitespace
0583: if (UCharacterProperty.isRuleWhiteSpace(c)) {
0584: buf.append('\\');
0585: }
0586: break;
0587: }
0588: UTF16.append(buf, c);
0589: }
0590:
0591: /**
0592: * Returns a string representation of this set. If the result of
0593: * calling this function is passed to a UnicodeSet constructor, it
0594: * will produce another set that is equal to this one.
0595: * @stable ICU 2.0
0596: */
0597: public String toPattern(boolean escapeUnprintable) {
0598: StringBuffer result = new StringBuffer();
0599: return _toPattern(result, escapeUnprintable).toString();
0600: }
0601:
0602: /**
0603: * Append a string representation of this set to result. This will be
0604: * a cleaned version of the string passed to applyPattern(), if there
0605: * is one. Otherwise it will be generated.
0606: */
0607: private StringBuffer _toPattern(StringBuffer result,
0608: boolean escapeUnprintable) {
0609: if (pat != null) {
0610: int i;
0611: int backslashCount = 0;
0612: for (i = 0; i < pat.length();) {
0613: int c = UTF16.charAt(pat, i);
0614: i += UTF16.getCharCount(c);
0615: if (escapeUnprintable && Utility.isUnprintable(c)) {
0616: // If the unprintable character is preceded by an odd
0617: // number of backslashes, then it has been escaped.
0618: // Before unescaping it, we delete the final
0619: // backslash.
0620: if ((backslashCount % 2) == 1) {
0621: result.setLength(result.length() - 1);
0622: }
0623: Utility.escapeUnprintable(result, c);
0624: backslashCount = 0;
0625: } else {
0626: UTF16.append(result, c);
0627: if (c == '\\') {
0628: ++backslashCount;
0629: } else {
0630: backslashCount = 0;
0631: }
0632: }
0633: }
0634: return result;
0635: }
0636:
0637: return _generatePattern(result, escapeUnprintable, true);
0638: }
0639:
0640: /**
0641: * Generate and append a string representation of this set to result.
0642: * This does not use this.pat, the cleaned up copy of the string
0643: * passed to applyPattern().
0644: * @param result the buffer into which to generate the pattern
0645: * @param escapeUnprintable escape unprintable characters if true
0646: * @stable ICU 2.0
0647: */
0648: public StringBuffer _generatePattern(StringBuffer result,
0649: boolean escapeUnprintable) {
0650: return _generatePattern(result, escapeUnprintable, true);
0651: }
0652:
0653: /**
0654: * Generate and append a string representation of this set to result.
0655: * This does not use this.pat, the cleaned up copy of the string
0656: * passed to applyPattern().
0657: * @param includeStrings if false, doesn't include the strings.
0658: * @internal
0659: * @deprecated This API is ICU internal only.
0660: */
0661: public StringBuffer _generatePattern(StringBuffer result,
0662: boolean escapeUnprintable, boolean includeStrings) {
0663: result.append('[');
0664:
0665: // // Check against the predefined categories. We implicitly build
0666: // // up ALL category sets the first time toPattern() is called.
0667: // for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
0668: // if (this.equals(getCategorySet(cat))) {
0669: // result.append(':');
0670: // result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
0671: // return result.append(":]");
0672: // }
0673: // }
0674:
0675: int count = getRangeCount();
0676:
0677: // If the set contains at least 2 intervals and includes both
0678: // MIN_VALUE and MAX_VALUE, then the inverse representation will
0679: // be more economical.
0680: if (count > 1 && getRangeStart(0) == MIN_VALUE
0681: && getRangeEnd(count - 1) == MAX_VALUE) {
0682:
0683: // Emit the inverse
0684: result.append('^');
0685:
0686: for (int i = 1; i < count; ++i) {
0687: int start = getRangeEnd(i - 1) + 1;
0688: int end = getRangeStart(i) - 1;
0689: _appendToPat(result, start, escapeUnprintable);
0690: if (start != end) {
0691: if ((start + 1) != end) {
0692: result.append('-');
0693: }
0694: _appendToPat(result, end, escapeUnprintable);
0695: }
0696: }
0697: }
0698:
0699: // Default; emit the ranges as pairs
0700: else {
0701: for (int i = 0; i < count; ++i) {
0702: int start = getRangeStart(i);
0703: int end = getRangeEnd(i);
0704: _appendToPat(result, start, escapeUnprintable);
0705: if (start != end) {
0706: if ((start + 1) != end) {
0707: result.append('-');
0708: }
0709: _appendToPat(result, end, escapeUnprintable);
0710: }
0711: }
0712: }
0713:
0714: if (includeStrings && strings.size() > 0) {
0715: Iterator it = strings.iterator();
0716: while (it.hasNext()) {
0717: result.append('{');
0718: _appendToPat(result, (String) it.next(),
0719: escapeUnprintable);
0720: result.append('}');
0721: }
0722: }
0723: return result.append(']');
0724: }
0725:
0726: /**
0727: * Returns the number of elements in this set (its cardinality)
0728: * Note than the elements of a set may include both individual
0729: * codepoints and strings.
0730: *
0731: * @return the number of elements in this set (its cardinality).
0732: * @stable ICU 2.0
0733: */
0734: public int size() {
0735: int n = 0;
0736: int count = getRangeCount();
0737: for (int i = 0; i < count; ++i) {
0738: n += getRangeEnd(i) - getRangeStart(i) + 1;
0739: }
0740: return n + strings.size();
0741: }
0742:
0743: /**
0744: * Returns <tt>true</tt> if this set contains no elements.
0745: *
0746: * @return <tt>true</tt> if this set contains no elements.
0747: * @stable ICU 2.0
0748: */
0749: public boolean isEmpty() {
0750: return len == 1 && strings.size() == 0;
0751: }
0752:
0753: /**
0754: * Implementation of UnicodeMatcher API. Returns <tt>true</tt> if
0755: * this set contains any character whose low byte is the given
0756: * value. This is used by <tt>RuleBasedTransliterator</tt> for
0757: * indexing.
0758: * @stable ICU 2.0
0759: */
0760: public boolean matchesIndexValue(int v) {
0761: /* The index value v, in the range [0,255], is contained in this set if
0762: * it is contained in any pair of this set. Pairs either have the high
0763: * bytes equal, or unequal. If the high bytes are equal, then we have
0764: * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
0765: * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
0766: * Then v is contained if xx <= v || v <= yy. (This is identical to the
0767: * time zone month containment logic.)
0768: */
0769: for (int i = 0; i < getRangeCount(); ++i) {
0770: int low = getRangeStart(i);
0771: int high = getRangeEnd(i);
0772: if ((low & ~0xFF) == (high & ~0xFF)) {
0773: if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
0774: return true;
0775: }
0776: } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
0777: return true;
0778: }
0779: }
0780: if (strings.size() != 0) {
0781: Iterator it = strings.iterator();
0782: while (it.hasNext()) {
0783: String s = (String) it.next();
0784: //if (s.length() == 0) {
0785: // // Empty strings match everything
0786: // return true;
0787: //}
0788: // assert(s.length() != 0); // We enforce this elsewhere
0789: int c = UTF16.charAt(s, 0);
0790: if ((c & 0xFF) == v) {
0791: return true;
0792: }
0793: }
0794: }
0795: return false;
0796: }
0797:
/**
 * Implementation of UnicodeMatcher.matches(). Always matches the
 * longest possible multichar string.
 *
 * <p>The direction of matching is derived from the arguments: the match
 * runs forward when <code>offset[0] &lt; limit</code> and backward
 * otherwise. Multicharacter strings are tried first; only if none of
 * them matches completely is the decision delegated to
 * <code>super.matches()</code>.
 *
 * @param text the text being matched
 * @param offset single-element in/out array; <code>offset[0]</code> is
 * the position to match at and, when a string matches completely, is
 * advanced (or moved back, in the reverse direction) by the length of
 * the longest matching string
 * @param limit the position at which matching must stop
 * @param incremental if true, a string that matches all the way up to
 * <code>limit</code> yields <code>U_PARTIAL_MATCH</code>
 * @return <code>U_MATCH</code>, <code>U_PARTIAL_MATCH</code> or
 * <code>U_MISMATCH</code>, or whatever <code>super.matches()</code>
 * returns when no string matches
 * @stable ICU 2.0
 */
public int matches(Replaceable text, int[] offset, int limit,
        boolean incremental) {

    if (offset[0] == limit) {
        // Strings, if any, have length != 0, so we don't worry
        // about them here. If we ever allow zero-length strings
        // we must check for them here.
        if (contains(UnicodeMatcher.ETHER)) {
            return incremental ? U_PARTIAL_MATCH : U_MATCH;
        } else {
            return U_MISMATCH;
        }
    } else {
        if (strings.size() != 0) { // try strings first

            // might separate forward and backward loops later
            // for now they are combined

            // TODO Improve efficiency of this, at least in the forward
            // direction, if not in both. In the forward direction we
            // can assume the strings are sorted.

            Iterator it = strings.iterator();
            boolean forward = offset[0] < limit;

            // firstChar is the leftmost char to match in the
            // forward direction or the rightmost char to match in
            // the reverse direction.
            char firstChar = text.charAt(offset[0]);

            // If there are multiple strings that can match we
            // return the longest match.
            int highWaterLength = 0;

            while (it.hasNext()) {
                String trial = (String) it.next();

                //if (trial.length() == 0) {
                // return U_MATCH; // null-string always matches
                //}
                // assert(trial.length() != 0); // We ensure this elsewhere

                // Compare against the end of the string that is reached
                // first in the current direction.
                char c = trial.charAt(forward ? 0
                        : trial.length() - 1);

                // Strings are sorted, so we can optimize in the
                // forward direction.
                if (forward && c > firstChar)
                    break;
                if (c != firstChar)
                    continue;

                // len is 0 on mismatch, otherwise the number of chars of
                // trial that were matched before running into limit.
                int len = matchRest(text, offset[0], limit, trial);

                if (incremental) {
                    // Number of chars available between offset[0] and limit.
                    int maxLen = forward ? limit - offset[0]
                            : offset[0] - limit;
                    if (len == maxLen) {
                        // We have successfully matched but only up to limit.
                        return U_PARTIAL_MATCH;
                    }
                }

                if (len == trial.length()) {
                    // We have successfully matched the whole string.
                    if (len > highWaterLength) {
                        highWaterLength = len;
                    }
                    // In the forward direction we know strings
                    // are sorted so we can bail early.
                    if (forward && len < highWaterLength) {
                        break;
                    }
                    continue;
                }
            }

            // We've checked all strings without a partial match.
            // If we have full matches, return the longest one.
            if (highWaterLength != 0) {
                offset[0] += forward ? highWaterLength
                        : -highWaterLength;
                return U_MATCH;
            }
        }
        // No string matched completely; let the superclass decide.
        return super .matches(text, offset, limit, incremental);
    }
}
0891:
0892: /**
0893: * Returns the longest match for s in text at the given position.
0894: * If limit > start then match forward from start+1 to limit
0895: * matching all characters except s.charAt(0). If limit < start,
0896: * go backward starting from start-1 matching all characters
0897: * except s.charAt(s.length()-1). This method assumes that the
0898: * first character, text.charAt(start), matches s, so it does not
0899: * check it.
0900: * @param text the text to match
0901: * @param start the first character to match. In the forward
0902: * direction, text.charAt(start) is matched against s.charAt(0).
0903: * In the reverse direction, it is matched against
0904: * s.charAt(s.length()-1).
0905: * @param limit the limit offset for matching, either last+1 in
0906: * the forward direction, or last-1 in the reverse direction,
0907: * where last is the index of the last character to match.
0908: * @return If part of s matches up to the limit, return |limit -
0909: * start|. If all of s matches before reaching the limit, return
0910: * s.length(). If there is a mismatch between s and text, return
0911: * 0
0912: */
0913: private static int matchRest(Replaceable text, int start,
0914: int limit, String s) {
0915: int maxLen;
0916: int slen = s.length();
0917: if (start < limit) {
0918: maxLen = limit - start;
0919: if (maxLen > slen)
0920: maxLen = slen;
0921: for (int i = 1; i < maxLen; ++i) {
0922: if (text.charAt(start + i) != s.charAt(i))
0923: return 0;
0924: }
0925: } else {
0926: maxLen = start - limit;
0927: if (maxLen > slen)
0928: maxLen = slen;
0929: --slen; // <=> slen = s.length() - 1;
0930: for (int i = 1; i < maxLen; ++i) {
0931: if (text.charAt(start - i) != s.charAt(slen - i))
0932: return 0;
0933: }
0934: }
0935: return maxLen;
0936: }
0937:
0938: //#ifndef FOUNDATION
0939: /**
0940: * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. For now, an internal routine.
0941: * @internal
0942: * @deprecated This API is ICU internal only.
0943: */
0944: public int matchesAt(CharSequence text, int offset) {
0945: int len = -1;
0946: strings: if (strings.size() != 0) {
0947: char firstChar = text.charAt(offset);
0948: String trial = null;
0949: // find the first string starting with firstChar
0950: Iterator it = strings.iterator();
0951: while (it.hasNext()) {
0952: trial = (String) it.next();
0953: char firstStringChar = trial.charAt(0);
0954: if (firstStringChar < firstChar)
0955: continue;
0956: if (firstStringChar > firstChar)
0957: break strings;
0958: }
0959: // now keep checking string until we get the longest one
0960: while (true) {
0961: int tempLen = CollectionUtilities.matchesAt(text,
0962: offset, trial);
0963: if (len > tempLen)
0964: break strings;
0965: len = tempLen;
0966: if (!it.hasNext())
0967: break;
0968: trial = (String) it.next();
0969: }
0970: }
0971: if (len < 2) {
0972: int cp = UTF16.charAt(text, offset);
0973: if (contains(cp)) {
0974: len = UTF16.getCharCount(cp);
0975: }
0976: }
0977: return offset + len;
0978: }
0979:
0980: //#endif
0981:
0982: /**
0983: * Implementation of UnicodeMatcher API. Union the set of all
0984: * characters that may be matched by this object into the given
0985: * set.
0986: * @param toUnionTo the set into which to union the source characters
0987: * @stable ICU 2.2
0988: */
0989: public void addMatchSetTo(UnicodeSet toUnionTo) {
0990: toUnionTo.addAll(this );
0991: }
0992:
0993: /**
0994: * Returns the index of the given character within this set, where
0995: * the set is ordered by ascending code point. If the character
0996: * is not in this set, return -1. The inverse of this method is
0997: * <code>charAt()</code>.
0998: * @return an index from 0..size()-1, or -1
0999: * @stable ICU 2.0
1000: */
1001: public int indexOf(int c) {
1002: if (c < MIN_VALUE || c > MAX_VALUE) {
1003: throw new IllegalArgumentException("Invalid code point U+"
1004: + Utility.hex(c, 6));
1005: }
1006: int i = 0;
1007: int n = 0;
1008: for (;;) {
1009: int start = list[i++];
1010: if (c < start) {
1011: return -1;
1012: }
1013: int limit = list[i++];
1014: if (c < limit) {
1015: return n + c - start;
1016: }
1017: n += limit - start;
1018: }
1019: }
1020:
1021: /**
1022: * Returns the character at the given index within this set, where
1023: * the set is ordered by ascending code point. If the index is
1024: * out of range, return -1. The inverse of this method is
1025: * <code>indexOf()</code>.
1026: * @param index an index from 0..size()-1
1027: * @return the character at the given index, or -1.
1028: * @stable ICU 2.0
1029: */
1030: public int charAt(int index) {
1031: if (index >= 0) {
1032: // len2 is the largest even integer <= len, that is, it is len
1033: // for even values and len-1 for odd values. With odd values
1034: // the last entry is UNICODESET_HIGH.
1035: int len2 = len & ~1;
1036: for (int i = 0; i < len2;) {
1037: int start = list[i++];
1038: int count = list[i++] - start;
1039: if (index < count) {
1040: return start + index;
1041: }
1042: index -= count;
1043: }
1044: }
1045: return -1;
1046: }
1047:
1048: /**
1049: * Adds the specified range to this set if it is not already
1050: * present. If this set already contains the specified range,
1051: * the call leaves this set unchanged. If <code>end > start</code>
1052: * then an empty range is added, leaving the set unchanged.
1053: *
1054: * @param start first character, inclusive, of range to be added
1055: * to this set.
1056: * @param end last character, inclusive, of range to be added
1057: * to this set.
1058: * @stable ICU 2.0
1059: */
1060: public UnicodeSet add(int start, int end) {
1061: checkFrozen();
1062: return add_unchecked(start, end);
1063: }
1064:
1065: // for internal use, after checkFrozen has been called
1066: private UnicodeSet add_unchecked(int start, int end) {
1067: if (start < MIN_VALUE || start > MAX_VALUE) {
1068: throw new IllegalArgumentException("Invalid code point U+"
1069: + Utility.hex(start, 6));
1070: }
1071: if (end < MIN_VALUE || end > MAX_VALUE) {
1072: throw new IllegalArgumentException("Invalid code point U+"
1073: + Utility.hex(end, 6));
1074: }
1075: if (start < end) {
1076: add(range(start, end), 2, 0);
1077: } else if (start == end) {
1078: add(start);
1079: }
1080: return this ;
1081: }
1082:
1083: // /**
1084: // * Format out the inversion list as a string, for debugging. Uncomment when
1085: // * needed.
1086: // */
1087: // public final String dump() {
1088: // StringBuffer buf = new StringBuffer("[");
1089: // for (int i=0; i<len; ++i) {
1090: // if (i != 0) buf.append(", ");
1091: // int c = list[i];
1092: // //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
1093: // // buf.append((char) c);
1094: // //} else {
1095: // buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
1096: // //}
1097: // }
1098: // buf.append("]");
1099: // return buf.toString();
1100: // }
1101:
1102: /**
1103: * Adds the specified character to this set if it is not already
1104: * present. If this set already contains the specified character,
1105: * the call leaves this set unchanged.
1106: * @stable ICU 2.0
1107: */
1108: public final UnicodeSet add(int c) {
1109: checkFrozen();
1110: return add_unchecked(c);
1111: }
1112:
// for internal use only, after checkFrozen has been called
//
// Adds the single code point c to the inversion list in place. Throws
// IllegalArgumentException if c is outside MIN_VALUE..MAX_VALUE. When c
// is already present the method returns immediately without touching
// pat; otherwise pat is reset to null. Always returns this.
private final UnicodeSet add_unchecked(int c) {
    if (c < MIN_VALUE || c > MAX_VALUE) {
        throw new IllegalArgumentException("Invalid code point U+"
                + Utility.hex(c, 6));
    }

    // find smallest i such that c < list[i]
    // if odd, then it is IN the set
    // if even, then it is OUT of the set
    int i = findCodePoint(c);

    // already in set?
    if ((i & 1) != 0)
        return this ;

    // HIGH is 0x110000
    // assert(list[len-1] == HIGH);

    // empty = [HIGH]
    // [start_0, limit_0, start_1, limit_1, HIGH]

    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
    // ^
    // list[i]

    // i == 0 means c is before the first range

    if (c == list[i] - 1) {
        // c is before start of next range
        list[i] = c;
        // if we touched the HIGH mark, then add a new one
        if (c == MAX_VALUE) {
            ensureCapacity(len + 1);
            list[len++] = HIGH;
        }
        if (i > 0 && c == list[i - 1]) {
            // collapse adjacent ranges

            // [..., start_k-1, c, c, limit_k, ..., HIGH]
            // ^
            // list[i]
            // Drop the two equal entries list[i-1] and list[i] by
            // shifting the tail of the list down two slots.
            System.arraycopy(list, i + 1, list, i - 1, len - i - 1);
            len -= 2;
        }
    }

    else if (i > 0 && c == list[i - 1]) {
        // c is after end of prior range
        list[i - 1]++;
        // no need to check for collapse here
    }

    else {
        // At this point we know the new char is not adjacent to
        // any existing ranges, and it is not 10FFFF.

        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
        // ^
        // list[i]

        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
        // ^
        // list[i]

        // Don't use ensureCapacity() to save on copying.
        // NOTE: This has no measurable impact on performance,
        // but it might help in some usage patterns.
        if (len + 2 > list.length) {
            // Grow and open the two-slot gap at i in a single pass.
            int[] temp = new int[len + 2 + GROW_EXTRA];
            if (i != 0)
                System.arraycopy(list, 0, temp, 0, i);
            System.arraycopy(list, i, temp, i + 2, len - i);
            list = temp;
        } else {
            // Enough room: shift the tail up two slots in place.
            System.arraycopy(list, i, list, i + 2, len - i);
        }

        list[i] = c;
        list[i + 1] = c + 1;
        len += 2;
    }

    // The set changed, so pat is no longer valid.
    pat = null;
    return this ;
}
1199:
1200: /**
1201: * Adds the specified multicharacter to this set if it is not already
1202: * present. If this set already contains the multicharacter,
1203: * the call leaves this set unchanged.
1204: * Thus "ch" => {"ch"}
1205: * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1206: * @param s the source string
1207: * @return this object, for chaining
1208: * @stable ICU 2.0
1209: */
1210: public final UnicodeSet add(String s) {
1211: checkFrozen();
1212: int cp = getSingleCP(s);
1213: if (cp < 0) {
1214: strings.add(s);
1215: pat = null;
1216: } else {
1217: add_unchecked(cp, cp);
1218: }
1219: return this ;
1220: }
1221:
1222: /**
1223: * @return a code point IF the string consists of a single one.
1224: * otherwise returns -1.
1225: * @param string to test
1226: */
1227: private static int getSingleCP(String s) {
1228: if (s.length() < 1) {
1229: throw new IllegalArgumentException(
1230: "Can't use zero-length strings in UnicodeSet");
1231: }
1232: if (s.length() > 2)
1233: return -1;
1234: if (s.length() == 1)
1235: return s.charAt(0);
1236:
1237: // at this point, len = 2
1238: int cp = UTF16.charAt(s, 0);
1239: if (cp > 0xFFFF) { // is surrogate pair
1240: return cp;
1241: }
1242: return -1;
1243: }
1244:
1245: /**
1246: * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1247: * If this set already any particular character, it has no effect on that character.
1248: * @param s the source string
1249: * @return this object, for chaining
1250: * @stable ICU 2.0
1251: */
1252: public final UnicodeSet addAll(String s) {
1253: checkFrozen();
1254: int cp;
1255: for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1256: cp = UTF16.charAt(s, i);
1257: add_unchecked(cp, cp);
1258: }
1259: return this ;
1260: }
1261:
1262: /**
1263: * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1264: * If this set already any particular character, it has no effect on that character.
1265: * @param s the source string
1266: * @return this object, for chaining
1267: * @stable ICU 2.0
1268: */
1269: public final UnicodeSet retainAll(String s) {
1270: return retainAll(fromAll(s));
1271: }
1272:
1273: /**
1274: * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1275: * If this set already any particular character, it has no effect on that character.
1276: * @param s the source string
1277: * @return this object, for chaining
1278: * @stable ICU 2.0
1279: */
1280: public final UnicodeSet complementAll(String s) {
1281: return complementAll(fromAll(s));
1282: }
1283:
1284: /**
1285: * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1286: * If this set already any particular character, it has no effect on that character.
1287: * @param s the source string
1288: * @return this object, for chaining
1289: * @stable ICU 2.0
1290: */
1291: public final UnicodeSet removeAll(String s) {
1292: return removeAll(fromAll(s));
1293: }
1294:
1295: /**
1296: * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1297: * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1298: * @param s the source string
1299: * @return a newly created set containing the given string
1300: * @stable ICU 2.0
1301: */
1302: public static UnicodeSet from(String s) {
1303: return new UnicodeSet().add(s);
1304: }
1305:
1306: /**
1307: * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1308: * @param s the source string
1309: * @return a newly created set containing the given characters
1310: * @stable ICU 2.0
1311: */
1312: public static UnicodeSet fromAll(String s) {
1313: return new UnicodeSet().addAll(s);
1314: }
1315:
1316: /**
1317: * Retain only the elements in this set that are contained in the
1318: * specified range. If <code>end > start</code> then an empty range is
1319: * retained, leaving the set empty.
1320: *
1321: * @param start first character, inclusive, of range to be retained
1322: * to this set.
1323: * @param end last character, inclusive, of range to be retained
1324: * to this set.
1325: * @stable ICU 2.0
1326: */
1327: public UnicodeSet retain(int start, int end) {
1328: checkFrozen();
1329: if (start < MIN_VALUE || start > MAX_VALUE) {
1330: throw new IllegalArgumentException("Invalid code point U+"
1331: + Utility.hex(start, 6));
1332: }
1333: if (end < MIN_VALUE || end > MAX_VALUE) {
1334: throw new IllegalArgumentException("Invalid code point U+"
1335: + Utility.hex(end, 6));
1336: }
1337: if (start <= end) {
1338: retain(range(start, end), 2, 0);
1339: } else {
1340: clear();
1341: }
1342: return this ;
1343: }
1344:
1345: /**
1346: * Retain the specified character from this set if it is present.
1347: * Upon return this set will be empty if it did not contain c, or
1348: * will only contain c if it did contain c.
1349: * @param c the character to be retained
1350: * @return this object, for chaining
1351: * @stable ICU 2.0
1352: */
1353: public final UnicodeSet retain(int c) {
1354: return retain(c, c);
1355: }
1356:
1357: /**
1358: * Retain the specified string in this set if it is present.
1359: * Upon return this set will be empty if it did not contain s, or
1360: * will only contain s if it did contain s.
1361: * @param s the string to be retained
1362: * @return this object, for chaining
1363: * @stable ICU 2.0
1364: */
1365: public final UnicodeSet retain(String s) {
1366: int cp = getSingleCP(s);
1367: if (cp < 0) {
1368: boolean isIn = strings.contains(s);
1369: if (isIn && size() == 1) {
1370: return this ;
1371: }
1372: clear();
1373: strings.add(s);
1374: pat = null;
1375: } else {
1376: retain(cp, cp);
1377: }
1378: return this ;
1379: }
1380:
1381: /**
1382: * Removes the specified range from this set if it is present.
1383: * The set will not contain the specified range once the call
1384: * returns. If <code>end > start</code> then an empty range is
1385: * removed, leaving the set unchanged.
1386: *
1387: * @param start first character, inclusive, of range to be removed
1388: * from this set.
1389: * @param end last character, inclusive, of range to be removed
1390: * from this set.
1391: * @stable ICU 2.0
1392: */
1393: public UnicodeSet remove(int start, int end) {
1394: checkFrozen();
1395: if (start < MIN_VALUE || start > MAX_VALUE) {
1396: throw new IllegalArgumentException("Invalid code point U+"
1397: + Utility.hex(start, 6));
1398: }
1399: if (end < MIN_VALUE || end > MAX_VALUE) {
1400: throw new IllegalArgumentException("Invalid code point U+"
1401: + Utility.hex(end, 6));
1402: }
1403: if (start <= end) {
1404: retain(range(start, end), 2, 2);
1405: }
1406: return this ;
1407: }
1408:
1409: /**
1410: * Removes the specified character from this set if it is present.
1411: * The set will not contain the specified character once the call
1412: * returns.
1413: * @param c the character to be removed
1414: * @return this object, for chaining
1415: * @stable ICU 2.0
1416: */
1417: public final UnicodeSet remove(int c) {
1418: return remove(c, c);
1419: }
1420:
1421: /**
1422: * Removes the specified string from this set if it is present.
1423: * The set will not contain the specified string once the call
1424: * returns.
1425: * @param s the string to be removed
1426: * @return this object, for chaining
1427: * @stable ICU 2.0
1428: */
1429: public final UnicodeSet remove(String s) {
1430: int cp = getSingleCP(s);
1431: if (cp < 0) {
1432: strings.remove(s);
1433: pat = null;
1434: } else {
1435: remove(cp, cp);
1436: }
1437: return this ;
1438: }
1439:
1440: /**
1441: * Complements the specified range in this set. Any character in
1442: * the range will be removed if it is in this set, or will be
1443: * added if it is not in this set. If <code>end > start</code>
1444: * then an empty range is complemented, leaving the set unchanged.
1445: *
1446: * @param start first character, inclusive, of range to be removed
1447: * from this set.
1448: * @param end last character, inclusive, of range to be removed
1449: * from this set.
1450: * @stable ICU 2.0
1451: */
1452: public UnicodeSet complement(int start, int end) {
1453: checkFrozen();
1454: if (start < MIN_VALUE || start > MAX_VALUE) {
1455: throw new IllegalArgumentException("Invalid code point U+"
1456: + Utility.hex(start, 6));
1457: }
1458: if (end < MIN_VALUE || end > MAX_VALUE) {
1459: throw new IllegalArgumentException("Invalid code point U+"
1460: + Utility.hex(end, 6));
1461: }
1462: if (start <= end) {
1463: xor(range(start, end), 2, 0);
1464: }
1465: pat = null;
1466: return this ;
1467: }
1468:
1469: /**
1470: * Complements the specified character in this set. The character
1471: * will be removed if it is in this set, or will be added if it is
1472: * not in this set.
1473: * @stable ICU 2.0
1474: */
1475: public final UnicodeSet complement(int c) {
1476: return complement(c, c);
1477: }
1478:
1479: /**
1480: * This is equivalent to
1481: * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1482: * @stable ICU 2.0
1483: */
1484: public UnicodeSet complement() {
1485: checkFrozen();
1486: if (list[0] == LOW) {
1487: System.arraycopy(list, 1, list, 0, len - 1);
1488: --len;
1489: } else {
1490: ensureCapacity(len + 1);
1491: System.arraycopy(list, 0, list, 1, len);
1492: list[0] = LOW;
1493: ++len;
1494: }
1495: pat = null;
1496: return this ;
1497: }
1498:
1499: /**
1500: * Complement the specified string in this set.
1501: * The set will not contain the specified string once the call
1502: * returns.
1503: * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1504: * @param s the string to complement
1505: * @return this object, for chaining
1506: * @stable ICU 2.0
1507: */
1508: public final UnicodeSet complement(String s) {
1509: checkFrozen();
1510: int cp = getSingleCP(s);
1511: if (cp < 0) {
1512: if (strings.contains(s))
1513: strings.remove(s);
1514: else
1515: strings.add(s);
1516: pat = null;
1517: } else {
1518: complement(cp, cp);
1519: }
1520: return this ;
1521: }
1522:
1523: /**
1524: * Returns true if this set contains the given character.
1525: * @param c character to be checked for containment
1526: * @return true if the test condition is met
1527: * @stable ICU 2.0
1528: */
1529: public boolean contains(int c) {
1530: if (c < MIN_VALUE || c > MAX_VALUE) {
1531: throw new IllegalArgumentException("Invalid code point U+"
1532: + Utility.hex(c, 6));
1533: }
1534:
1535: /*
1536: // Set i to the index of the start item greater than ch
1537: // We know we will terminate without length test!
1538: int i = -1;
1539: while (true) {
1540: if (c < list[++i]) break;
1541: }
1542: */
1543:
1544: int i = findCodePoint(c);
1545:
1546: return ((i & 1) != 0); // return true if odd
1547: }
1548:
1549: /**
1550: * Returns the smallest value i such that c < list[i]. Caller
1551: * must ensure that c is a legal value or this method will enter
1552: * an infinite loop. This method performs a binary search.
1553: * @param c a character in the range MIN_VALUE..MAX_VALUE
1554: * inclusive
1555: * @return the smallest integer i in the range 0..len-1,
1556: * inclusive, such that c < list[i]
1557: */
1558: private final int findCodePoint(int c) {
1559: /* Examples:
1560: findCodePoint(c)
1561: set list[] c=0 1 3 4 7 8
1562: === ============== ===========
1563: [] [110000] 0 0 0 0 0 0
1564: [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
1565: [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
1566: [:all:] [0, 110000] 1 1 1 1 1 1
1567: */
1568:
1569: // Return the smallest i such that c < list[i]. Assume
1570: // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
1571: if (c < list[0])
1572: return 0;
1573: // High runner test. c is often after the last range, so an
1574: // initial check for this condition pays off.
1575: if (len >= 2 && c >= list[len - 2])
1576: return len - 1;
1577: int lo = 0;
1578: int hi = len - 1;
1579: // invariant: c >= list[lo]
1580: // invariant: c < list[hi]
1581: for (;;) {
1582: int i = (lo + hi) >>> 1;
1583: if (i == lo)
1584: return hi;
1585: if (c < list[i]) {
1586: hi = i;
1587: } else {
1588: lo = i;
1589: }
1590: }
1591: }
1592:
1593: // //----------------------------------------------------------------
1594: // // Unrolled binary search
1595: // //----------------------------------------------------------------
1596: //
1597: // private int validLen = -1; // validated value of len
1598: // private int topOfLow;
1599: // private int topOfHigh;
1600: // private int power;
1601: // private int deltaStart;
1602: //
1603: // private void validate() {
1604: // if (len <= 1) {
1605: // throw new IllegalArgumentException("list.len==" + len + "; must be >1");
1606: // }
1607: //
1608: // // find greatest power of 2 less than or equal to len
1609: // for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
1610: //
1611: // // assert(exp2[power] <= len);
1612: //
1613: // // determine the starting points
1614: // topOfLow = exp2[power] - 1;
1615: // topOfHigh = len - 1;
1616: // deltaStart = exp2[power-1];
1617: // validLen = len;
1618: // }
1619: //
1620: // private static final int exp2[] = {
1621: // 0x1, 0x2, 0x4, 0x8,
1622: // 0x10, 0x20, 0x40, 0x80,
1623: // 0x100, 0x200, 0x400, 0x800,
1624: // 0x1000, 0x2000, 0x4000, 0x8000,
1625: // 0x10000, 0x20000, 0x40000, 0x80000,
1626: // 0x100000, 0x200000, 0x400000, 0x800000,
1627: // 0x1000000, 0x2000000, 0x4000000, 0x8000000,
1628: // 0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
1629: // };
1630: //
1631: // /**
1632: // * Unrolled lowest index GT.
1633: // */
1634: // private final int leastIndexGT(int searchValue) {
1635: //
1636: // if (len != validLen) {
1637: // if (len == 1) return 0;
1638: // validate();
1639: // }
1640: // int temp;
1641: //
1642: // // set up initial range to search. Each subrange is a power of two in length
1643: // int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
1644: //
1645: // // Completely unrolled binary search, folhighing "Programming Pearls"
1646: // // Each case deliberately falls through to the next
1647: // // Logically, list[-1] < all_search_values && list[count] > all_search_values
1648: // // although the values -1 and count are never actually touched.
1649: //
1650: // // The bounds at each point are low & high,
1651: // // where low == high - delta*2
1652: // // so high - delta is the midpoint
1653: //
1654: // // The invariant AFTER each line is that list[low] < searchValue <= list[high]
1655: //
1656: // switch (power) {
1657: // //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
1658: // case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
1659: // case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
1660: //
1661: // case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
1662: // case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
1663: // case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
1664: // case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
1665: //
1666: // case 24: if (searchValue < list[temp = high- 0x800000]) high = temp;
1667: // case 23: if (searchValue < list[temp = high- 0x400000]) high = temp;
1668: // case 22: if (searchValue < list[temp = high- 0x200000]) high = temp;
1669: // case 21: if (searchValue < list[temp = high- 0x100000]) high = temp;
1670: //
1671: // case 20: if (searchValue < list[temp = high- 0x80000]) high = temp;
1672: // case 19: if (searchValue < list[temp = high- 0x40000]) high = temp;
1673: // case 18: if (searchValue < list[temp = high- 0x20000]) high = temp;
1674: // case 17: if (searchValue < list[temp = high- 0x10000]) high = temp;
1675: //
1676: // case 16: if (searchValue < list[temp = high- 0x8000]) high = temp;
1677: // case 15: if (searchValue < list[temp = high- 0x4000]) high = temp;
1678: // case 14: if (searchValue < list[temp = high- 0x2000]) high = temp;
1679: // case 13: if (searchValue < list[temp = high- 0x1000]) high = temp;
1680: //
1681: // case 12: if (searchValue < list[temp = high- 0x800]) high = temp;
1682: // case 11: if (searchValue < list[temp = high- 0x400]) high = temp;
1683: // case 10: if (searchValue < list[temp = high- 0x200]) high = temp;
1684: // case 9: if (searchValue < list[temp = high- 0x100]) high = temp;
1685: //
1686: // case 8: if (searchValue < list[temp = high- 0x80]) high = temp;
1687: // case 7: if (searchValue < list[temp = high- 0x40]) high = temp;
1688: // case 6: if (searchValue < list[temp = high- 0x20]) high = temp;
1689: // case 5: if (searchValue < list[temp = high- 0x10]) high = temp;
1690: //
1691: // case 4: if (searchValue < list[temp = high- 0x8]) high = temp;
1692: // case 3: if (searchValue < list[temp = high- 0x4]) high = temp;
1693: // case 2: if (searchValue < list[temp = high- 0x2]) high = temp;
1694: // case 1: if (searchValue < list[temp = high- 0x1]) high = temp;
1695: // }
1696: //
1697: // return high;
1698: // }
1699: //
1700: // // For debugging only
1701: // public int len() {
1702: // return len;
1703: // }
1704: //
1705: // //----------------------------------------------------------------
1706: // //----------------------------------------------------------------
1707:
1708: /**
1709: * Returns true if this set contains every character
1710: * of the given range.
1711: * @param start first character, inclusive, of the range
1712: * @param end last character, inclusive, of the range
1713: * @return true if the test condition is met
1714: * @stable ICU 2.0
1715: */
1716: public boolean contains(int start, int end) {
1717: if (start < MIN_VALUE || start > MAX_VALUE) {
1718: throw new IllegalArgumentException("Invalid code point U+"
1719: + Utility.hex(start, 6));
1720: }
1721: if (end < MIN_VALUE || end > MAX_VALUE) {
1722: throw new IllegalArgumentException("Invalid code point U+"
1723: + Utility.hex(end, 6));
1724: }
1725: //int i = -1;
1726: //while (true) {
1727: // if (start < list[++i]) break;
1728: //}
1729: int i = findCodePoint(start);
1730: return ((i & 1) != 0 && end < list[i]);
1731: }
1732:
1733: /**
1734: * Returns <tt>true</tt> if this set contains the given
1735: * multicharacter string.
1736: * @param s string to be checked for containment
1737: * @return <tt>true</tt> if this set contains the specified string
1738: * @stable ICU 2.0
1739: */
1740: public final boolean contains(String s) {
1741:
1742: int cp = getSingleCP(s);
1743: if (cp < 0) {
1744: return strings.contains(s);
1745: } else {
1746: return contains(cp);
1747: }
1748: }
1749:
1750: /**
1751: * Returns true if this set contains all the characters and strings
1752: * of the given set.
1753: * @param c set to be checked for containment
1754: * @return true if the test condition is met
1755: * @stable ICU 2.0
1756: */
1757: public boolean containsAll(UnicodeSet c) {
1758: // The specified set is a subset if all of its pairs are contained in
1759: // this set. It's possible to code this more efficiently in terms of
1760: // direct manipulation of the inversion lists if the need arises.
1761: int n = c.getRangeCount();
1762: for (int i = 0; i < n; ++i) {
1763: if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
1764: return false;
1765: }
1766: }
1767: if (!strings.containsAll(c.strings))
1768: return false;
1769: return true;
1770: }
1771:
1772: /**
1773: * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
1774: * For example, for the Unicode set [a{bc}{cd}]<br>
1775: * containsAll is true for each of: "a", "bc", ""cdbca"<br>
1776: * containsAll is false for each of: "acb", "bcda", "bcx"<br>
1777: * @param s string containing characters to be checked for containment
1778: * @return true if the test condition is met
1779: * @stable ICU 2.0
1780: */
1781: public boolean containsAll(String s) {
1782: int cp;
1783: for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1784: cp = UTF16.charAt(s, i);
1785: if (!contains(cp)) {
1786: if (strings.size() == 0) {
1787: return false;
1788: }
1789: return containsAll(s, 0);
1790: }
1791: }
1792: return true;
1793: }
1794:
1795: /**
1796: * Recursive routine called if we fail to find a match in containsAll, and there are strings
1797: * @param s source string
1798: * @param i point to match to the end on
1799: * @return true if ok
1800: */
1801: private boolean containsAll(String s, int i) {
1802: if (i >= s.length()) {
1803: return true;
1804: }
1805: int cp = UTF16.charAt(s, i);
1806: if (contains(cp) && containsAll(s, i + UTF16.getCharCount(cp))) {
1807: return true;
1808: }
1809:
1810: Iterator it = strings.iterator();
1811: while (it.hasNext()) {
1812: String setStr = (String) it.next();
1813: if (s.startsWith(setStr, i)
1814: && containsAll(s, i + setStr.length())) {
1815: return true;
1816: }
1817: }
1818: return false;
1819:
1820: }
1821:
1822: /**
1823: * @return regex pattern equivalent to this UnicodeSet
1824: * @internal
1825: * @deprecated This API is ICU internal only.
1826: */
1827: public String getRegexEquivalent() {
1828: if (strings.size() == 0)
1829: return toString();
1830: StringBuffer result = new StringBuffer("(?:");
1831: _generatePattern(result, true, false);
1832: Iterator it = strings.iterator();
1833: while (it.hasNext()) {
1834: result.append('|');
1835: _appendToPat(result, (String) it.next(), true);
1836: }
1837: return result.append(")").toString();
1838: }
1839:
1840: /**
1841: * Returns true if this set contains none of the characters
1842: * of the given range.
1843: * @param start first character, inclusive, of the range
1844: * @param end last character, inclusive, of the range
1845: * @return true if the test condition is met
1846: * @stable ICU 2.0
1847: */
1848: public boolean containsNone(int start, int end) {
1849: if (start < MIN_VALUE || start > MAX_VALUE) {
1850: throw new IllegalArgumentException("Invalid code point U+"
1851: + Utility.hex(start, 6));
1852: }
1853: if (end < MIN_VALUE || end > MAX_VALUE) {
1854: throw new IllegalArgumentException("Invalid code point U+"
1855: + Utility.hex(end, 6));
1856: }
1857: int i = -1;
1858: while (true) {
1859: if (start < list[++i])
1860: break;
1861: }
1862: return ((i & 1) == 0 && end < list[i]);
1863: }
1864:
1865: /**
1866: * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
1867: * For example, for the Unicode set [a{bc}{cd}]<br>
1868: * containsNone is true for: "xy", "cb"<br>
1869: * containsNone is false for: "a", "bc", "bcd"<br>
1870: * @param c set to be checked for containment
1871: * @return true if the test condition is met
1872: * @stable ICU 2.0
1873: */
1874: public boolean containsNone(UnicodeSet c) {
1875: // The specified set is a subset if all of its pairs are contained in
1876: // this set. It's possible to code this more efficiently in terms of
1877: // direct manipulation of the inversion lists if the need arises.
1878: int n = c.getRangeCount();
1879: for (int i = 0; i < n; ++i) {
1880: if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
1881: return false;
1882: }
1883: }
1884: if (!SortedSetRelation.hasRelation(strings,
1885: SortedSetRelation.DISJOINT, c.strings))
1886: return false;
1887: return true;
1888: }
1889:
1890: /**
1891: * Returns true if this set contains none of the characters
1892: * of the given string.
1893: * @param s string containing characters to be checked for containment
1894: * @return true if the test condition is met
1895: * @stable ICU 2.0
1896: */
1897: public boolean containsNone(String s) {
1898: int cp;
1899: for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1900: cp = UTF16.charAt(s, i);
1901: if (contains(cp))
1902: return false;
1903: }
1904: if (strings.size() == 0)
1905: return true;
1906: // do a last check to make sure no strings are in.
1907: for (Iterator it = strings.iterator(); it.hasNext();) {
1908: String item = (String) it.next();
1909: if (s.indexOf(item) >= 0)
1910: return false;
1911: }
1912: return true;
1913: }
1914:
1915: /**
1916: * Returns true if this set contains one or more of the characters
1917: * in the given range.
1918: * @param start first character, inclusive, of the range
1919: * @param end last character, inclusive, of the range
1920: * @return true if the condition is met
1921: * @stable ICU 2.0
1922: */
1923: public final boolean containsSome(int start, int end) {
1924: return !containsNone(start, end);
1925: }
1926:
1927: /**
1928: * Returns true if this set contains one or more of the characters
1929: * and strings of the given set.
1930: * @param s set to be checked for containment
1931: * @return true if the condition is met
1932: * @stable ICU 2.0
1933: */
1934: public final boolean containsSome(UnicodeSet s) {
1935: return !containsNone(s);
1936: }
1937:
1938: /**
1939: * Returns true if this set contains one or more of the characters
1940: * of the given string.
1941: * @param s string containing characters to be checked for containment
1942: * @return true if the condition is met
1943: * @stable ICU 2.0
1944: */
1945: public final boolean containsSome(String s) {
1946: return !containsNone(s);
1947: }
1948:
1949: /**
1950: * Adds all of the elements in the specified set to this set if
1951: * they're not already present. This operation effectively
1952: * modifies this set so that its value is the <i>union</i> of the two
1953: * sets. The behavior of this operation is unspecified if the specified
1954: * collection is modified while the operation is in progress.
1955: *
1956: * @param c set whose elements are to be added to this set.
1957: * @stable ICU 2.0
1958: */
1959: public UnicodeSet addAll(UnicodeSet c) {
1960: checkFrozen();
1961: add(c.list, c.len, 0);
1962: strings.addAll(c.strings);
1963: return this ;
1964: }
1965:
1966: /**
1967: * Retains only the elements in this set that are contained in the
1968: * specified set. In other words, removes from this set all of
1969: * its elements that are not contained in the specified set. This
1970: * operation effectively modifies this set so that its value is
1971: * the <i>intersection</i> of the two sets.
1972: *
1973: * @param c set that defines which elements this set will retain.
1974: * @stable ICU 2.0
1975: */
1976: public UnicodeSet retainAll(UnicodeSet c) {
1977: checkFrozen();
1978: retain(c.list, c.len, 0);
1979: strings.retainAll(c.strings);
1980: return this ;
1981: }
1982:
1983: /**
1984: * Removes from this set all of its elements that are contained in the
1985: * specified set. This operation effectively modifies this
1986: * set so that its value is the <i>asymmetric set difference</i> of
1987: * the two sets.
1988: *
1989: * @param c set that defines which elements will be removed from
1990: * this set.
1991: * @stable ICU 2.0
1992: */
1993: public UnicodeSet removeAll(UnicodeSet c) {
1994: checkFrozen();
1995: retain(c.list, c.len, 2);
1996: strings.removeAll(c.strings);
1997: return this ;
1998: }
1999:
2000: /**
2001: * Complements in this set all elements contained in the specified
2002: * set. Any character in the other set will be removed if it is
2003: * in this set, or will be added if it is not in this set.
2004: *
2005: * @param c set that defines which elements will be complemented from
2006: * this set.
2007: * @stable ICU 2.0
2008: */
2009: public UnicodeSet complementAll(UnicodeSet c) {
2010: checkFrozen();
2011: xor(c.list, c.len, 0);
2012: SortedSetRelation.doOperation(strings,
2013: SortedSetRelation.COMPLEMENTALL, c.strings);
2014: return this ;
2015: }
2016:
2017: /**
2018: * Removes all of the elements from this set. This set will be
2019: * empty after this call returns.
2020: * @stable ICU 2.0
2021: */
2022: public UnicodeSet clear() {
2023: checkFrozen();
2024: list[0] = HIGH;
2025: len = 1;
2026: pat = null;
2027: strings.clear();
2028: return this ;
2029: }
2030:
2031: /**
2032: * Iteration method that returns the number of ranges contained in
2033: * this set.
2034: * @see #getRangeStart
2035: * @see #getRangeEnd
2036: * @stable ICU 2.0
2037: */
2038: public int getRangeCount() {
2039: return len / 2;
2040: }
2041:
2042: /**
2043: * Iteration method that returns the first character in the
2044: * specified range of this set.
2045: * @exception ArrayIndexOutOfBoundsException if index is outside
2046: * the range <code>0..getRangeCount()-1</code>
2047: * @see #getRangeCount
2048: * @see #getRangeEnd
2049: * @stable ICU 2.0
2050: */
2051: public int getRangeStart(int index) {
2052: return list[index * 2];
2053: }
2054:
2055: /**
2056: * Iteration method that returns the last character in the
2057: * specified range of this set.
2058: * @exception ArrayIndexOutOfBoundsException if index is outside
2059: * the range <code>0..getRangeCount()-1</code>
2060: * @see #getRangeStart
2061: * @see #getRangeEnd
2062: * @stable ICU 2.0
2063: */
2064: public int getRangeEnd(int index) {
2065: return (list[index * 2 + 1] - 1);
2066: }
2067:
2068: /**
2069: * Reallocate this objects internal structures to take up the least
2070: * possible space, without changing this object's value.
2071: * @stable ICU 2.0
2072: */
2073: public UnicodeSet compact() {
2074: checkFrozen();
2075: if (len != list.length) {
2076: int[] temp = new int[len];
2077: System.arraycopy(list, 0, temp, 0, len);
2078: list = temp;
2079: }
2080: rangeList = null;
2081: buffer = null;
2082: return this ;
2083: }
2084:
2085: /**
2086: * Compares the specified object with this set for equality. Returns
2087: * <tt>true</tt> if the specified object is also a set, the two sets
2088: * have the same size, and every member of the specified set is
2089: * contained in this set (or equivalently, every member of this set is
2090: * contained in the specified set).
2091: *
2092: * @param o Object to be compared for equality with this set.
2093: * @return <tt>true</tt> if the specified Object is equal to this set.
2094: * @stable ICU 2.0
2095: */
2096: public boolean equals(Object o) {
2097: try {
2098: UnicodeSet that = (UnicodeSet) o;
2099: if (len != that.len)
2100: return false;
2101: for (int i = 0; i < len; ++i) {
2102: if (list[i] != that.list[i])
2103: return false;
2104: }
2105: if (!strings.equals(that.strings))
2106: return false;
2107: } catch (Exception e) {
2108: return false;
2109: }
2110: return true;
2111: }
2112:
2113: /**
2114: * Returns the hash code value for this set.
2115: *
2116: * @return the hash code value for this set.
2117: * @see java.lang.Object#hashCode()
2118: * @stable ICU 2.0
2119: */
2120: public int hashCode() {
2121: int result = len;
2122: for (int i = 0; i < len; ++i) {
2123: result *= 1000003;
2124: result += list[i];
2125: }
2126: return result;
2127: }
2128:
2129: /**
2130: * Return a programmer-readable string representation of this object.
2131: * @stable ICU 2.0
2132: */
2133: public String toString() {
2134: return toPattern(true);
2135: }
2136:
2137: //----------------------------------------------------------------
2138: // Implementation: Pattern parsing
2139: //----------------------------------------------------------------
2140:
2141: /**
2142: * Parses the given pattern, starting at the given position. The character
2143: * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
2144: * Parsing continues until the corresponding closing ']'. If a syntax error
2145: * is encountered between the opening and closing brace, the parse fails.
2146: * Upon return from a successful parse, the ParsePosition is updated to
2147: * point to the character following the closing ']', and an inversion
2148: * list for the parsed pattern is returned. This method
2149: * calls itself recursively to parse embedded subpatterns.
2150: *
2151: * @param pattern the string containing the pattern to be parsed. The
2152: * portion of the string from pos.getIndex(), which must be a '[', to the
2153: * corresponding closing ']', is parsed.
2154: * @param pos upon entry, the position at which to being parsing. The
2155: * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return
2156: * from a successful parse, pos.getIndex() is either the character after the
2157: * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
2158: * is the last character of the pattern string.
2159: * @return an inversion list for the parsed substring
2160: * of <code>pattern</code>
2161: * @exception java.lang.IllegalArgumentException if the parse fails.
2162: */
2163: UnicodeSet applyPattern(String pattern, ParsePosition pos,
2164: SymbolTable symbols, int options) {
2165:
2166: // Need to build the pattern in a temporary string because
2167: // _applyPattern calls add() etc., which set pat to empty.
2168: boolean parsePositionWasNull = pos == null;
2169: if (parsePositionWasNull) {
2170: pos = new ParsePosition(0);
2171: }
2172:
2173: StringBuffer rebuiltPat = new StringBuffer();
2174: RuleCharacterIterator chars = new RuleCharacterIterator(
2175: pattern, symbols, pos);
2176: applyPattern(chars, symbols, rebuiltPat, options);
2177: if (chars.inVariable()) {
2178: syntaxError(chars, "Extra chars in variable value");
2179: }
2180: pat = rebuiltPat.toString();
2181: if (parsePositionWasNull) {
2182: int i = pos.getIndex();
2183:
2184: // Skip over trailing whitespace
2185: if ((options & IGNORE_SPACE) != 0) {
2186: i = Utility.skipWhitespace(pattern, i);
2187: }
2188:
2189: if (i != pattern.length()) {
2190: throw new IllegalArgumentException("Parse of \""
2191: + pattern + "\" failed at " + i);
2192: }
2193: }
2194: return this ;
2195: }
2196:
2197: /**
2198: * Parse the pattern from the given RuleCharacterIterator. The
2199: * iterator is advanced over the parsed pattern.
 * This set is cleared first and then rebuilt from the parsed items.
2200: * @param chars iterator over the pattern characters. Upon return
2201: * it will be advanced to the first character after the parsed
2202: * pattern, or the end of the iteration if all characters are
2203: * parsed.
2204: * @param symbols symbol table to use to parse and dereference
2205: * variables, or null if none.
2206: * @param rebuiltPat the pattern that was parsed, rebuilt or
2207: * copied from the input pattern, as appropriate.
2208: * @param options a bit mask of zero or more of the following:
2209: * IGNORE_SPACE, CASE.
 * @exception java.lang.IllegalArgumentException on any syntax error;
 * all error paths go through syntaxError(), which always throws.
2210: */
2211: void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
2212: StringBuffer rebuiltPat, int options) {
2213:
2214: // Syntax characters: [ ] ^ - & { }
2215:
2216: // Recognized special forms for chars, sets: c-c s-s s&s
2217:
2218: int opts = RuleCharacterIterator.PARSE_VARIABLES
2219: | RuleCharacterIterator.PARSE_ESCAPES;
2220: if ((options & IGNORE_SPACE) != 0) {
2221: opts |= RuleCharacterIterator.SKIP_WHITESPACE;
2222: }
2223:
// NOTE: the local buffer `pat` below shadows the field `pat` for the
// rest of this method; it collects the pattern text as it is parsed.
2224: StringBuffer pat = new StringBuffer(), buf = null;
// usePat: set when a nested set or the '$' anchor was parsed; it selects
// the collected text over a freshly generated pattern at the end.
2225: boolean usePat = false;
// scratch: lazily created set, reused for every inline nested set.
2226: UnicodeSet scratch = null;
2227: Object backup = null;
2228:
2229: // mode: 0=before [, 1=between [...], 2=after ]
2230: // lastItem: 0=none, 1=char, 2=set
2231: int lastItem = 0, lastChar = 0, mode = 0;
// op: pending binary operator ('-' or '&'), or 0 if none.
2232: char op = 0;
2233:
2234: boolean invert = false;
2235:
// Start from an empty set; parsed items are added below.
2236: clear();
2237:
2238: while (mode != 2 && !chars.atEnd()) {
// The block below is compiled out (constant false condition).
2239: if (false) {
2240: // Debugging assertion
2241: if (!((lastItem == 0 && op == 0)
2242: || (lastItem == 1 && (op == 0 || op == '-')) || (lastItem == 2 && (op == 0
2243: || op == '-' || op == '&')))) {
2244: throw new IllegalArgumentException();
2245: }
2246: }
2247:
2248: int c = 0;
2249: boolean literal = false;
2250: UnicodeSet nested = null;
2251:
2252: // -------- Check for property pattern
2253:
2254: // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
2255: int setMode = 0;
2256: if (resemblesPropertyPattern(chars, opts)) {
2257: setMode = 2;
2258: }
2259:
2260: // -------- Parse '[' of opening delimiter OR nested set.
2261: // If there is a nested set, use `setMode' to define how
2262: // the set should be parsed. If the '[' is part of the
2263: // opening delimiter for this pattern, parse special
2264: // strings "[", "[^", "[-", and "[^-". Check for stand-in
2265: // characters representing a nested set in the symbol
2266: // table.
2267:
2268: else {
2269: // Prepare to backup if necessary
2270: backup = chars.getPos(backup);
2271: c = chars.next(opts);
2272: literal = chars.isEscaped();
2273:
2274: if (c == '[' && !literal) {
2275: if (mode == 1) {
2276: chars.setPos(backup); // backup
2277: setMode = 1;
2278: } else {
2279: // Handle opening '[' delimiter
2280: mode = 1;
2281: pat.append('[');
2282: backup = chars.getPos(backup); // prepare to backup
2283: c = chars.next(opts);
2284: literal = chars.isEscaped();
2285: if (c == '^' && !literal) {
2286: invert = true;
2287: pat.append('^');
2288: backup = chars.getPos(backup); // prepare to backup
2289: c = chars.next(opts);
2290: literal = chars.isEscaped();
2291: }
2292: // Fall through to handle special leading '-';
2293: // otherwise restart loop for nested [], \p{}, etc.
2294: if (c == '-') {
2295: literal = true;
2296: // Fall through to handle literal '-' below
2297: } else {
2298: chars.setPos(backup); // backup
2299: continue;
2300: }
2301: }
2302: } else if (symbols != null) {
2303: UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
2304: if (m != null) {
2305: try {
2306: nested = (UnicodeSet) m;
2307: setMode = 3;
2308: } catch (ClassCastException e) {
// The matcher registered for this character is not a UnicodeSet.
2309: syntaxError(chars, "Syntax error");
2310: }
2311: }
2312: }
2313: }
2314:
2315: // -------- Handle a nested set. This either is inline in
2316: // the pattern or represented by a stand-in that has
2317: // previously been parsed and was looked up in the symbol
2318: // table.
2319:
2320: if (setMode != 0) {
2321: if (lastItem == 1) {
2322: if (op != 0) {
2323: syntaxError(chars,
2324: "Char expected after operator");
2325: }
// Flush the pending single character before handling the set.
2326: add_unchecked(lastChar, lastChar);
2327: _appendToPat(pat, lastChar, false);
2328: lastItem = op = 0;
2329: }
2330:
2331: if (op == '-' || op == '&') {
2332: pat.append(op);
2333: }
2334:
2335: if (nested == null) {
2336: if (scratch == null)
2337: scratch = new UnicodeSet();
2338: nested = scratch;
2339: }
2340: switch (setMode) {
2341: case 1:
2342: nested.applyPattern(chars, symbols, pat, options);
2343: break;
2344: case 2:
2345: chars.skipIgnored(opts);
2346: nested.applyPropertyPattern(chars, pat, symbols);
2347: break;
2348: case 3: // `nested' already parsed
2349: nested._toPattern(pat, false);
2350: break;
2351: }
2352:
2353: usePat = true;
2354:
2355: if (mode == 0) {
2356: // Entire pattern is a category; leave parse loop
2357: set(nested);
2358: mode = 2;
2359: break;
2360: }
2361:
// Combine the nested set with this set according to the pending
// operator: '-' removes, '&' intersects, none means union.
2362: switch (op) {
2363: case '-':
2364: removeAll(nested);
2365: break;
2366: case '&':
2367: retainAll(nested);
2368: break;
2369: case 0:
2370: addAll(nested);
2371: break;
2372: }
2373:
2374: op = 0;
2375: lastItem = 2;
2376:
2377: continue;
2378: }
2379:
2380: if (mode == 0) {
2381: syntaxError(chars, "Missing '['");
2382: }
2383:
2384: // -------- Parse special (syntax) characters. If the
2385: // current character is not special, or if it is escaped,
2386: // then fall through and handle it below.
2387:
// NOTE: several cases below end with a call to syntaxError() and no
// `break'; syntaxError() always throws, so control never actually
// falls through into the following case label.
2388: if (!literal) {
2389: switch (c) {
2390: case ']':
2391: if (lastItem == 1) {
2392: add_unchecked(lastChar, lastChar);
2393: _appendToPat(pat, lastChar, false);
2394: }
2395: // Treat final trailing '-' as a literal
2396: if (op == '-') {
2397: add_unchecked(op, op);
2398: pat.append(op);
2399: } else if (op == '&') {
2400: syntaxError(chars, "Trailing '&'");
2401: }
2402: pat.append(']');
2403: mode = 2;
2404: continue;
2405: case '-':
2406: if (op == 0) {
2407: if (lastItem != 0) {
2408: op = (char) c;
2409: continue;
2410: } else {
2411: // Treat final trailing '-' as a literal
2412: add_unchecked(c, c);
2413: c = chars.next(opts);
2414: literal = chars.isEscaped();
2415: if (c == ']' && !literal) {
2416: pat.append("-]");
2417: mode = 2;
2418: continue;
2419: }
2420: }
2421: }
2422: syntaxError(chars, "'-' not after char or set");
2423: case '&':
2424: if (lastItem == 2 && op == 0) {
2425: op = (char) c;
2426: continue;
2427: }
2428: syntaxError(chars, "'&' not after set");
2429: case '^':
2430: syntaxError(chars, "'^' not after '['");
2431: case '{':
2432: if (op != 0) {
2433: syntaxError(chars,
2434: "Missing operand after operator");
2435: }
2436: if (lastItem == 1) {
2437: add_unchecked(lastChar, lastChar);
2438: _appendToPat(pat, lastChar, false);
2439: }
2440: lastItem = 0;
2441: if (buf == null) {
2442: buf = new StringBuffer();
2443: } else {
2444: buf.setLength(0);
2445: }
// Collect code points up to the closing unescaped '}'.
2446: boolean ok = false;
2447: while (!chars.atEnd()) {
2448: c = chars.next(opts);
2449: literal = chars.isEscaped();
2450: if (c == '}' && !literal) {
2451: ok = true;
2452: break;
2453: }
2454: UTF16.append(buf, c);
2455: }
2456: if (buf.length() < 1 || !ok) {
2457: syntaxError(chars,
2458: "Invalid multicharacter string");
2459: }
2460: // We have new string. Add it to set and continue;
2461: // we don't need to drop through to the further
2462: // processing
2463: add(buf.toString());
2464: pat.append('{');
2465: _appendToPat(pat, buf.toString(), false);
2466: pat.append('}');
2467: continue;
2468: case SymbolTable.SYMBOL_REF:
2469: // symbols nosymbols
2470: // [a-$] error error (ambiguous)
2471: // [a$] anchor anchor
2472: // [a-$x] var "x"* literal '$'
2473: // [a-$.] error literal '$'
2474: // *We won't get here in the case of var "x"
2475: backup = chars.getPos(backup);
2476: c = chars.next(opts);
2477: literal = chars.isEscaped();
2478: boolean anchor = (c == ']' && !literal);
2479: if (symbols == null && !anchor) {
2480: c = SymbolTable.SYMBOL_REF;
2481: chars.setPos(backup);
2482: break; // literal '$'
2483: }
2484: if (anchor && op == 0) {
2485: if (lastItem == 1) {
2486: add_unchecked(lastChar, lastChar);
2487: _appendToPat(pat, lastChar, false);
2488: }
2489: add_unchecked(UnicodeMatcher.ETHER);
2490: usePat = true;
2491: pat.append(SymbolTable.SYMBOL_REF).append(']');
2492: mode = 2;
2493: continue;
2494: }
2495: syntaxError(chars, "Unquoted '$'");
2496: default:
2497: break;
2498: }
2499: }
2500:
2501: // -------- Parse literal characters. This includes both
2502: // escaped chars ("\u4E01") and non-syntax characters
2503: // ("a").
2504:
// A literal is not added immediately: it is remembered in lastChar so
// that a following '-' can turn it into the start of a range.
2505: switch (lastItem) {
2506: case 0:
2507: lastItem = 1;
2508: lastChar = c;
2509: break;
2510: case 1:
2511: if (op == '-') {
2512: if (lastChar >= c) {
2513: // Don't allow redundant (a-a) or empty (b-a) ranges;
2514: // these are most likely typos.
2515: syntaxError(chars, "Invalid range");
2516: }
2517: add_unchecked(lastChar, c);
2518: _appendToPat(pat, lastChar, false);
2519: pat.append(op);
2520: _appendToPat(pat, c, false);
2521: lastItem = op = 0;
2522: } else {
2523: add_unchecked(lastChar, lastChar);
2524: _appendToPat(pat, lastChar, false);
2525: lastChar = c;
2526: }
2527: break;
2528: case 2:
2529: if (op != 0) {
2530: syntaxError(chars, "Set expected after operator");
2531: }
2532: lastChar = c;
2533: lastItem = 1;
2534: break;
2535: }
2536: }
2537:
// The loop ended without reaching "after ]" (mode 2): input ran out.
2538: if (mode != 2) {
2539: syntaxError(chars, "Missing ']'");
2540: }
2541:
2542: chars.skipIgnored(opts);
2543:
2544: /**
2545: * Handle global flags (invert, case insensitivity). If this
2546: * pattern should be compiled case-insensitive, then we need
2547: * to close over case BEFORE COMPLEMENTING. This makes
2548: * patterns like /[^abc]/i work.
2549: */
2550: if ((options & CASE) != 0) {
2551: closeOver(CASE);
2552: }
2553: if (invert) {
2554: complement();
2555: }
2556:
2557: // Use the rebuilt pattern (pat) only if necessary. Prefer the
2558: // generated pattern.
2559: if (usePat) {
2560: rebuiltPat.append(pat.toString());
2561: } else {
2562: _generatePattern(rebuiltPat, false, true);
2563: }
2564: }
2565:
2566: private static void syntaxError(RuleCharacterIterator chars,
2567: String msg) {
2568: throw new IllegalArgumentException("Error: " + msg + " at \""
2569: + Utility.escape(chars.toString()) + '"');
2570: }
2571:
2572: /**
2573: * Add the contents of the UnicodeSet (as strings) into a collection.
2574: * @param target collection to add into
2575: * @stable ICU 2.8
2576: */
2577: public void addAllTo(Collection target) {
2578: UnicodeSetIterator it = new UnicodeSetIterator(this );
2579: while (it.next()) {
2580: target.add(it.getString());
2581: }
2582: }
2583:
2584: /**
2585: * Add the contents of the collection (as strings) into this UnicodeSet.
2586: * @param source the collection to add
2587: * @stable ICU 2.8
2588: */
2589: public void addAll(Collection source) {
2590: checkFrozen();
2591: Iterator it = source.iterator();
2592: while (it.hasNext()) {
2593: add(it.next().toString());
2594: }
2595: }
2596:
2597: //----------------------------------------------------------------
2598: // Implementation: Utility methods
2599: //----------------------------------------------------------------
2600:
2601: private void ensureCapacity(int newLen) {
2602: if (newLen <= list.length)
2603: return;
2604: int[] temp = new int[newLen + GROW_EXTRA];
2605: System.arraycopy(list, 0, temp, 0, len);
2606: list = temp;
2607: }
2608:
2609: private void ensureBufferCapacity(int newLen) {
2610: if (buffer != null && newLen <= buffer.length)
2611: return;
2612: buffer = new int[newLen + GROW_EXTRA];
2613: }
2614:
2615: /**
2616: * Assumes start <= end.
2617: */
2618: private int[] range(int start, int end) {
2619: if (rangeList == null) {
2620: rangeList = new int[] { start, end + 1, HIGH };
2621: } else {
2622: rangeList[0] = start;
2623: rangeList[1] = end + 1;
2624: }
2625: return rangeList;
2626: }
2627:
2628: //----------------------------------------------------------------
2629: // Implementation: Fundamental operations
2630: //----------------------------------------------------------------
2631:
2632: // polarity = 0, 3 is normal: x xor y
2633: // polarity = 1, 2: x xor ~y == x === y
2634:
2635: private UnicodeSet xor(int[] other, int otherLen, int polarity) {
2636: ensureBufferCapacity(len + otherLen);
2637: int i = 0, j = 0, k = 0;
2638: int a = list[i++];
2639: int b;
2640: if (polarity == 1 || polarity == 2) {
2641: b = LOW;
2642: if (other[j] == LOW) { // skip base if already LOW
2643: ++j;
2644: b = other[j];
2645: }
2646: } else {
2647: b = other[j++];
2648: }
2649: // simplest of all the routines
2650: // sort the values, discarding identicals!
2651: while (true) {
2652: if (a < b) {
2653: buffer[k++] = a;
2654: a = list[i++];
2655: } else if (b < a) {
2656: buffer[k++] = b;
2657: b = other[j++];
2658: } else if (a != HIGH) { // at this point, a == b
2659: // discard both values!
2660: a = list[i++];
2661: b = other[j++];
2662: } else { // DONE!
2663: buffer[k++] = HIGH;
2664: len = k;
2665: break;
2666: }
2667: }
2668: // swap list and buffer
2669: int[] temp = list;
2670: list = buffer;
2671: buffer = temp;
2672: pat = null;
2673: return this ;
2674: }
2675:
2676: // polarity = 0 is normal: x union y
2677: // polarity = 2: x union ~y
2678: // polarity = 1: ~x union y
2679: // polarity = 3: ~x union ~y
//
// Merges this set's inversion list (x) with the inversion list `other'
// (y, otherLen used entries). The merged list is written to the scratch
// buffer, terminated with HIGH, and then swapped with `list'; the stored
// pattern string is discarded. Returns this object.
2680:
2681: private UnicodeSet add(int[] other, int otherLen, int polarity) {
2682: ensureBufferCapacity(len + otherLen);
// i, j: read positions in list / other; k: write position in buffer.
2683: int i = 0, j = 0, k = 0;
// a, b: the current (not yet consumed) entries of list / other.
2684: int a = list[i++];
2685: int b = other[j++];
2686: // change from xor is that we have to check overlapping pairs
2687: // polarity bit 1 means a is second, bit 2 means b is.
// Each time an entry of list is consumed bit 1 of polarity is flipped,
// and likewise bit 2 for other, so polarity always tells which case
// of the switch applies to the current a and b.
2688: main: while (true) {
2689: switch (polarity) {
2690: case 0: // both first; take lower if unequal
2691: if (a < b) { // take a
2692: // Back up over overlapping ranges in buffer[]
2693: if (k > 0 && a <= buffer[k - 1]) {
2694: // Pick latter end value in buffer[] vs. list[]
2695: a = max(list[i], buffer[--k]);
2696: } else {
2697: // No overlap
2698: buffer[k++] = a;
2699: a = list[i];
2700: }
2701: i++; // Common if/else code factored out
2702: polarity ^= 1;
2703: } else if (b < a) { // take b
2704: if (k > 0 && b <= buffer[k - 1]) {
2705: b = max(other[j], buffer[--k]);
2706: } else {
2707: buffer[k++] = b;
2708: b = other[j];
2709: }
2710: j++;
2711: polarity ^= 2;
2712: } else { // a == b, take a, drop b
2713: if (a == HIGH)
2714: break main;
2715: // This is symmetrical; it doesn't matter if
2716: // we backtrack with a or b. - liu
2717: if (k > 0 && a <= buffer[k - 1]) {
2718: a = max(list[i], buffer[--k]);
2719: } else {
2720: // No overlap
2721: buffer[k++] = a;
2722: a = list[i];
2723: }
2724: i++;
2725: polarity ^= 1;
2726: b = other[j++];
2727: polarity ^= 2;
2728: }
2729: break;
2730: case 3: // both second; take higher if unequal, and drop other
2731: if (b <= a) { // take a
2732: if (a == HIGH)
2733: break main;
2734: buffer[k++] = a;
2735: } else { // take b
2736: if (b == HIGH)
2737: break main;
2738: buffer[k++] = b;
2739: }
2740: a = list[i++];
2741: polarity ^= 1; // factored common code
2742: b = other[j++];
2743: polarity ^= 2;
2744: break;
2745: case 1: // a second, b first; if b < a, overlap
2746: if (a < b) { // no overlap, take a
2747: buffer[k++] = a;
2748: a = list[i++];
2749: polarity ^= 1;
2750: } else if (b < a) { // OVERLAP, drop b
2751: b = other[j++];
2752: polarity ^= 2;
2753: } else { // a == b, drop both!
2754: if (a == HIGH)
2755: break main;
2756: a = list[i++];
2757: polarity ^= 1;
2758: b = other[j++];
2759: polarity ^= 2;
2760: }
2761: break;
2762: case 2: // a first, b second; if a < b, overlap
2763: if (b < a) { // no overlap, take b
2764: buffer[k++] = b;
2765: b = other[j++];
2766: polarity ^= 2;
2767: } else if (a < b) { // OVERLAP, drop a
2768: a = list[i++];
2769: polarity ^= 1;
2770: } else { // a == b, drop both!
2771: if (a == HIGH)
2772: break main;
2773: a = list[i++];
2774: polarity ^= 1;
2775: b = other[j++];
2776: polarity ^= 2;
2777: }
2778: break;
2779: }
2780: }
2781: buffer[k++] = HIGH; // terminate
2782: len = k;
2783: // swap list and buffer
2784: int[] temp = list;
2785: list = buffer;
2786: buffer = temp;
// The stored pattern string no longer matches the contents.
2787: pat = null;
2788: return this ;
2789: }
2790:
2791: // polarity = 0 is normal: x intersect y
2792: // polarity = 2: x intersect ~y == set-minus
2793: // polarity = 1: ~x intersect y
2794: // polarity = 3: ~x intersect ~y
//
// Intersects this set's inversion list (x) with the inversion list
// `other' (y, otherLen used entries). The result is written to the
// scratch buffer, terminated with HIGH, and then swapped with `list';
// the stored pattern string is discarded. Returns this object.
2795:
2796: private UnicodeSet retain(int[] other, int otherLen, int polarity) {
2797: ensureBufferCapacity(len + otherLen);
// i, j: read positions in list / other; k: write position in buffer.
2798: int i = 0, j = 0, k = 0;
// a, b: the current (not yet consumed) entries of list / other.
2799: int a = list[i++];
2800: int b = other[j++];
2801: // change from xor is that we have to check overlapping pairs
2802: // polarity bit 1 means a is second, bit 2 means b is.
// Each time an entry of list is consumed bit 1 of polarity is flipped,
// and likewise bit 2 for other, so polarity always tells which case
// of the switch applies to the current a and b.
2803: main: while (true) {
2804: switch (polarity) {
2805: case 0: // both first; drop the smaller
2806: if (a < b) { // drop a
2807: a = list[i++];
2808: polarity ^= 1;
2809: } else if (b < a) { // drop b
2810: b = other[j++];
2811: polarity ^= 2;
2812: } else { // a == b, take one, drop other
2813: if (a == HIGH)
2814: break main;
2815: buffer[k++] = a;
2816: a = list[i++];
2817: polarity ^= 1;
2818: b = other[j++];
2819: polarity ^= 2;
2820: }
2821: break;
2822: case 3: // both second; take lower if unequal
2823: if (a < b) { // take a
2824: buffer[k++] = a;
2825: a = list[i++];
2826: polarity ^= 1;
2827: } else if (b < a) { // take b
2828: buffer[k++] = b;
2829: b = other[j++];
2830: polarity ^= 2;
2831: } else { // a == b, take one, drop other
2832: if (a == HIGH)
2833: break main;
2834: buffer[k++] = a;
2835: a = list[i++];
2836: polarity ^= 1;
2837: b = other[j++];
2838: polarity ^= 2;
2839: }
2840: break;
2841: case 1: // a second, b first;
2842: if (a < b) { // NO OVERLAP, drop a
2843: a = list[i++];
2844: polarity ^= 1;
2845: } else if (b < a) { // OVERLAP, take b
2846: buffer[k++] = b;
2847: b = other[j++];
2848: polarity ^= 2;
2849: } else { // a == b, drop both!
2850: if (a == HIGH)
2851: break main;
2852: a = list[i++];
2853: polarity ^= 1;
2854: b = other[j++];
2855: polarity ^= 2;
2856: }
2857: break;
2858: case 2: // a first, b second; if a < b, overlap
2859: if (b < a) { // no overlap, drop b
2860: b = other[j++];
2861: polarity ^= 2;
2862: } else if (a < b) { // OVERLAP, take a
2863: buffer[k++] = a;
2864: a = list[i++];
2865: polarity ^= 1;
2866: } else { // a == b, drop both!
2867: if (a == HIGH)
2868: break main;
2869: a = list[i++];
2870: polarity ^= 1;
2871: b = other[j++];
2872: polarity ^= 2;
2873: }
2874: break;
2875: }
2876: }
2877: buffer[k++] = HIGH; // terminate
2878: len = k;
2879: // swap list and buffer
2880: int[] temp = list;
2881: list = buffer;
2882: buffer = temp;
// The stored pattern string no longer matches the contents.
2883: pat = null;
2884: return this ;
2885: }
2886:
2887: private static final int max(int a, int b) {
2888: return (a > b) ? a : b;
2889: }
2890:
2891: //----------------------------------------------------------------
2892: // Generic filter-based scanning code
2893: //----------------------------------------------------------------
2894:
2895: private static interface Filter {
2896: boolean contains(int codePoint);
2897: }
2898:
2899: private static class NumericValueFilter implements Filter {
2900: double value;
2901:
2902: NumericValueFilter(double value) {
2903: this .value = value;
2904: }
2905:
2906: public boolean contains(int ch) {
2907: return UCharacter.getUnicodeNumericValue(ch) == value;
2908: }
2909: }
2910:
2911: private static class GeneralCategoryMaskFilter implements Filter {
2912: int mask;
2913:
2914: GeneralCategoryMaskFilter(int mask) {
2915: this .mask = mask;
2916: }
2917:
2918: public boolean contains(int ch) {
2919: return ((1 << UCharacter.getType(ch)) & mask) != 0;
2920: }
2921: }
2922:
2923: private static class IntPropertyFilter implements Filter {
2924: int prop;
2925: int value;
2926:
2927: IntPropertyFilter(int prop, int value) {
2928: this .prop = prop;
2929: this .value = value;
2930: }
2931:
2932: public boolean contains(int ch) {
2933: return UCharacter.getIntPropertyValue(ch, prop) == value;
2934: }
2935: }
2936:
2937: // VersionInfo for unassigned characters
2938: static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0,
2939: 0, 0);
2940:
2941: private static class VersionFilter implements Filter {
2942: VersionInfo version;
2943:
2944: VersionFilter(VersionInfo version) {
2945: this .version = version;
2946: }
2947:
2948: public boolean contains(int ch) {
2949: VersionInfo v = UCharacter.getAge(ch);
2950: // Reference comparison ok; VersionInfo caches and reuses
2951: // unique objects.
2952: return v != NO_VERSION && v.compareTo(version) <= 0;
2953: }
2954: }
2955:
2956: private static synchronized UnicodeSet getInclusions(int src) {
2957: if (INCLUSIONS == null) {
2958: INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
2959: }
2960: if (INCLUSIONS[src] == null) {
2961: UnicodeSet incl = new UnicodeSet();
2962: switch (src) {
2963: case UCharacterProperty.SRC_CHAR:
2964: UCharacterProperty.getInstance()
2965: .addPropertyStarts(incl);
2966: break;
2967: case UCharacterProperty.SRC_PROPSVEC:
2968: UCharacterProperty.getInstance()
2969: .upropsvec_addPropertyStarts(incl);
2970: break;
2971: case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
2972: UCharacterProperty.getInstance()
2973: .addPropertyStarts(incl);
2974: UCharacterProperty.getInstance()
2975: .upropsvec_addPropertyStarts(incl);
2976: break;
2977: case UCharacterProperty.SRC_HST:
2978: UCharacterProperty.getInstance()
2979: .uhst_addPropertyStarts(incl);
2980: break;
2981: case UCharacterProperty.SRC_NORM:
2982: NormalizerImpl.addPropertyStarts(incl);
2983: break;
2984: case UCharacterProperty.SRC_CASE:
2985: try {
2986: UCaseProps.getSingleton().addPropertyStarts(incl);
2987: } catch (IOException e) {
2988: throw new MissingResourceException(e.getMessage(),
2989: "", "");
2990: }
2991: break;
2992: case UCharacterProperty.SRC_BIDI:
2993: try {
2994: UBiDiProps.getSingleton().addPropertyStarts(incl);
2995: } catch (IOException e) {
2996: throw new MissingResourceException(e.getMessage(),
2997: "", "");
2998: }
2999: break;
3000: default:
3001: throw new IllegalStateException(
3002: "UnicodeSet.getInclusions(unknown src " + src
3003: + ")");
3004: }
3005: INCLUSIONS[src] = incl;
3006: }
3007: return INCLUSIONS[src];
3008: }
3009:
3010: /**
3011: * Generic filter-based scanning code for UCD property UnicodeSets.
3012: */
3013: private UnicodeSet applyFilter(Filter filter, int src) {
3014: // Walk through all Unicode characters, noting the start
3015: // and end of each range for which filter.contain(c) is
3016: // true. Add each range to a set.
3017: //
3018: // To improve performance, use the INCLUSIONS set, which
3019: // encodes information about character ranges that are known
3020: // to have identical properties, such as the CJK Ideographs
3021: // from U+4E00 to U+9FA5. INCLUSIONS contains all characters
3022: // except the first characters of such ranges.
3023: //
3024: // TODO Where possible, instead of scanning over code points,
3025: // use internal property data to initialize UnicodeSets for
3026: // those properties. Scanning code points is slow.
3027:
3028: clear();
3029:
3030: int startHasProperty = -1;
3031: UnicodeSet inclusions = getInclusions(src);
3032: int limitRange = inclusions.getRangeCount();
3033:
3034: for (int j = 0; j < limitRange; ++j) {
3035: // get current range
3036: int start = inclusions.getRangeStart(j);
3037: int end = inclusions.getRangeEnd(j);
3038:
3039: // for all the code points in the range, process
3040: for (int ch = start; ch <= end; ++ch) {
3041: // only add to the unicodeset on inflection points --
3042: // where the hasProperty value changes to false
3043: if (filter.contains(ch)) {
3044: if (startHasProperty < 0) {
3045: startHasProperty = ch;
3046: }
3047: } else if (startHasProperty >= 0) {
3048: add_unchecked(startHasProperty, ch - 1);
3049: startHasProperty = -1;
3050: }
3051: }
3052: }
3053: if (startHasProperty >= 0) {
3054: add_unchecked(startHasProperty, 0x10FFFF);
3055: }
3056:
3057: return this ;
3058: }
3059:
3060: /**
3061: * Remove leading and trailing rule white space and compress
3062: * internal rule white space to a single space character.
3063: *
3064: * @see UCharacterProperty#isRuleWhiteSpace
3065: */
3066: private static String mungeCharName(String source) {
3067: StringBuffer buf = new StringBuffer();
3068: for (int i = 0; i < source.length();) {
3069: int ch = UTF16.charAt(source, i);
3070: i += UTF16.getCharCount(ch);
3071: if (UCharacterProperty.isRuleWhiteSpace(ch)) {
3072: if (buf.length() == 0
3073: || buf.charAt(buf.length() - 1) == ' ') {
3074: continue;
3075: }
3076: ch = ' '; // convert to ' '
3077: }
3078: UTF16.append(buf, ch);
3079: }
3080: if (buf.length() != 0 && buf.charAt(buf.length() - 1) == ' ') {
3081: buf.setLength(buf.length() - 1);
3082: }
3083: return buf.toString();
3084: }
3085:
3086: //----------------------------------------------------------------
3087: // Property set API
3088: //----------------------------------------------------------------
3089:
3090: /**
3091: * Modifies this set to contain those code points which have the
3092: * given value for the given binary or enumerated property, as
3093: * returned by UCharacter.getIntPropertyValue. Prior contents of
3094: * this set are lost.
3095: *
3096: * @param prop a property in the range
3097: * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or
3098: * UProperty.INT_START..UProperty.INT_LIMIT-1 or.
3099: * UProperty.MASK_START..UProperty.MASK_LIMIT-1.
3100: *
3101: * @param value a value in the range
3102: * UCharacter.getIntPropertyMinValue(prop)..
3103: * UCharacter.getIntPropertyMaxValue(prop), with one exception.
3104: * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be
3105: * a UCharacter.getType() result, but rather a mask value produced
3106: * by logically ORing (1 << UCharacter.getType()) values together.
3107: * This allows grouped categories such as [:L:] to be represented.
3108: *
3109: * @return a reference to this set
3110: *
3111: * @stable ICU 2.4
3112: */
3113: public UnicodeSet applyIntPropertyValue(int prop, int value) {
3114: checkFrozen();
3115: if (prop == UProperty.GENERAL_CATEGORY_MASK) {
3116: applyFilter(new GeneralCategoryMaskFilter(value),
3117: UCharacterProperty.SRC_CHAR);
3118: } else {
3119: applyFilter(new IntPropertyFilter(prop, value),
3120: UCharacterProperty.getInstance().getSource(prop));
3121: }
3122: return this ;
3123: }
3124:
3125: /**
3126: * Modifies this set to contain those code points which have the
3127: * given value for the given property. Prior contents of this
3128: * set are lost.
3129: *
3130: * @param propertyAlias a property alias, either short or long.
3131: * The name is matched loosely. See PropertyAliases.txt for names
3132: * and a description of loose matching. If the value string is
3133: * empty, then this string is interpreted as either a
3134: * General_Category value alias, a Script value alias, a binary
3135: * property alias, or a special ID. Special IDs are matched
3136: * loosely and correspond to the following sets:
3137: *
3138: * "ANY" = [\u0000-\U0010FFFF],
3139: * "ASCII" = [\u0000-\u007F].
3140: *
3141: * @param valueAlias a value alias, either short or long. The
3142: * name is matched loosely. See PropertyValueAliases.txt for
3143: * names and a description of loose matching. In addition to
3144: * aliases listed, numeric values and canonical combining classes
3145: * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
3146: * "220"). The value string may also be empty.
3147: *
3148: * @return a reference to this set
3149: *
3150: * @stable ICU 2.4
3151: */
3152: public UnicodeSet applyPropertyAlias(String propertyAlias,
3153: String valueAlias) {
3154: return applyPropertyAlias(propertyAlias, valueAlias, null);
3155: }
3156:
3157: /**
3158: * Modifies this set to contain those code points which have the
3159: * given value for the given property. Prior contents of this
3160: * set are lost.
3161: * @param propertyAlias
3162: * @param valueAlias
3163: * @param symbols if not null, then symbols are first called to see if a property
3164: * is available. If true, then everything else is skipped.
3165: * @return this set
3166: * @draft ICU 3.2
3167: * @provisional This API might change or be removed in a future release.
3168: */
3169: public UnicodeSet applyPropertyAlias(String propertyAlias,
3170: String valueAlias, SymbolTable symbols) {
3171: checkFrozen();
3172: int p;
3173: int v;
3174: boolean mustNotBeEmpty = false, invert = false;
3175:
3176: if (symbols != null
3177: && (symbols instanceof XSymbolTable)
3178: && ((XSymbolTable) symbols).applyPropertyAlias(
3179: propertyAlias, valueAlias, this )) {
3180: return this ;
3181: }
3182:
3183: if (valueAlias.length() > 0) {
3184: p = UCharacter.getPropertyEnum(propertyAlias);
3185:
3186: // Treat gc as gcm
3187: if (p == UProperty.GENERAL_CATEGORY) {
3188: p = UProperty.GENERAL_CATEGORY_MASK;
3189: }
3190:
3191: if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT)
3192: || (p >= UProperty.INT_START && p < UProperty.INT_LIMIT)
3193: || (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
3194: try {
3195: v = UCharacter.getPropertyValueEnum(p, valueAlias);
3196: } catch (IllegalArgumentException e) {
3197: // Handle numeric CCC
3198: if (p == UProperty.CANONICAL_COMBINING_CLASS
3199: || p == UProperty.LEAD_CANONICAL_COMBINING_CLASS
3200: || p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
3201: v = Integer.parseInt(Utility
3202: .deleteRuleWhiteSpace(valueAlias));
3203: // If the resultant set is empty then the numeric value
3204: // was invalid.
3205: mustNotBeEmpty = true;
3206: } else {
3207: throw e;
3208: }
3209: }
3210: }
3211:
3212: else {
3213:
3214: switch (p) {
3215: case UProperty.NUMERIC_VALUE: {
3216: double value = Double.parseDouble(Utility
3217: .deleteRuleWhiteSpace(valueAlias));
3218: applyFilter(new NumericValueFilter(value),
3219: UCharacterProperty.SRC_CHAR);
3220: return this ;
3221: }
3222: case UProperty.NAME:
3223: case UProperty.UNICODE_1_NAME: {
3224: // Must munge name, since
3225: // UCharacter.charFromName() does not do
3226: // 'loose' matching.
3227: String buf = mungeCharName(valueAlias);
3228: int ch = (p == UProperty.NAME) ? UCharacter
3229: .getCharFromExtendedName(buf) : UCharacter
3230: .getCharFromName1_0(buf);
3231: if (ch == -1) {
3232: throw new IllegalArgumentException(
3233: "Invalid character name");
3234: }
3235: clear();
3236: add_unchecked(ch);
3237: return this ;
3238: }
3239: case UProperty.AGE: {
3240: // Must munge name, since
3241: // VersionInfo.getInstance() does not do
3242: // 'loose' matching.
3243: VersionInfo version = VersionInfo
3244: .getInstance(mungeCharName(valueAlias));
3245: applyFilter(new VersionFilter(version),
3246: UCharacterProperty.SRC_PROPSVEC);
3247: return this ;
3248: }
3249: }
3250:
3251: // p is a non-binary, non-enumerated property that we
3252: // don't support (yet).
3253: throw new IllegalArgumentException(
3254: "Unsupported property");
3255: }
3256: }
3257:
3258: else {
3259: // valueAlias is empty. Interpret as General Category, Script,
3260: // Binary property, or ANY or ASCII. Upon success, p and v will
3261: // be set.
3262: try {
3263: p = UProperty.GENERAL_CATEGORY_MASK;
3264: v = UCharacter.getPropertyValueEnum(p, propertyAlias);
3265: } catch (IllegalArgumentException e) {
3266: try {
3267: p = UProperty.SCRIPT;
3268: v = UCharacter.getPropertyValueEnum(p,
3269: propertyAlias);
3270: } catch (IllegalArgumentException e2) {
3271: try {
3272: p = UCharacter.getPropertyEnum(propertyAlias);
3273: } catch (IllegalArgumentException e3) {
3274: p = -1;
3275: }
3276: if (p >= UProperty.BINARY_START
3277: && p < UProperty.BINARY_LIMIT) {
3278: v = 1;
3279: } else if (p == -1) {
3280: if (0 == UPropertyAliases.compare(ANY_ID,
3281: propertyAlias)) {
3282: set(MIN_VALUE, MAX_VALUE);
3283: return this ;
3284: } else if (0 == UPropertyAliases.compare(
3285: ASCII_ID, propertyAlias)) {
3286: set(0, 0x7F);
3287: return this ;
3288: } else if (0 == UPropertyAliases.compare(
3289: ASSIGNED, propertyAlias)) {
3290: // [:Assigned:]=[:^Cn:]
3291: p = UProperty.GENERAL_CATEGORY_MASK;
3292: v = (1 << UCharacter.UNASSIGNED);
3293: invert = true;
3294: } else {
3295: // Property name was never matched.
3296: throw new IllegalArgumentException(
3297: "Invalid property alias: "
3298: + propertyAlias + "="
3299: + valueAlias);
3300: }
3301: } else {
3302: // Valid propery name, but it isn't binary, so the value
3303: // must be supplied.
3304: throw new IllegalArgumentException(
3305: "Missing property value");
3306: }
3307: }
3308: }
3309: }
3310:
3311: applyIntPropertyValue(p, v);
3312: if (invert) {
3313: complement();
3314: }
3315:
3316: if (mustNotBeEmpty && isEmpty()) {
3317: // mustNotBeEmpty is set to true if an empty set indicates
3318: // invalid input.
3319: throw new IllegalArgumentException("Invalid property value");
3320: }
3321:
3322: return this ;
3323: }
3324:
3325: //----------------------------------------------------------------
3326: // Property set patterns
3327: //----------------------------------------------------------------
3328:
3329: /**
3330: * Return true if the given position, in the given pattern, appears
3331: * to be the start of a property set pattern.
3332: */
3333: private static boolean resemblesPropertyPattern(String pattern,
3334: int pos) {
3335: // Patterns are at least 5 characters long
3336: if ((pos + 5) > pattern.length()) {
3337: return false;
3338: }
3339:
3340: // Look for an opening [:, [:^, \p, or \P
3341: return pattern.regionMatches(pos, "[:", 0, 2)
3342: || pattern.regionMatches(true, pos, "\\p", 0, 2)
3343: || pattern.regionMatches(pos, "\\N", 0, 2);
3344: }
3345:
3346: /**
3347: * Return true if the given iterator appears to point at a
3348: * property pattern. Regardless of the result, return with the
3349: * iterator unchanged.
3350: * @param chars iterator over the pattern characters. Upon return
3351: * it will be unchanged.
3352: * @param iterOpts RuleCharacterIterator options
3353: */
3354: private static boolean resemblesPropertyPattern(
3355: RuleCharacterIterator chars, int iterOpts) {
3356: boolean result = false;
3357: iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
3358: Object pos = chars.getPos(null);
3359: int c = chars.next(iterOpts);
3360: if (c == '[' || c == '\\') {
3361: int d = chars.next(iterOpts
3362: & ~RuleCharacterIterator.SKIP_WHITESPACE);
3363: result = (c == '[') ? (d == ':')
3364: : (d == 'N' || d == 'p' || d == 'P');
3365: }
3366: chars.setPos(pos);
3367: return result;
3368: }
3369:
3370: /**
3371: * Parse the given property pattern at the given parse position.
3372: * @param symbols TODO
3373: */
3374: private UnicodeSet applyPropertyPattern(String pattern,
3375: ParsePosition ppos, SymbolTable symbols) {
3376: int pos = ppos.getIndex();
3377:
3378: // On entry, ppos should point to one of the following locations:
3379:
3380: // Minimum length is 5 characters, e.g. \p{L}
3381: if ((pos + 5) > pattern.length()) {
3382: return null;
3383: }
3384:
3385: boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
3386: boolean isName = false; // true for \N{pat}, o/w false
3387: boolean invert = false;
3388:
3389: // Look for an opening [:, [:^, \p, or \P
3390: if (pattern.regionMatches(pos, "[:", 0, 2)) {
3391: posix = true;
3392: pos = Utility.skipWhitespace(pattern, pos + 2);
3393: if (pos < pattern.length() && pattern.charAt(pos) == '^') {
3394: ++pos;
3395: invert = true;
3396: }
3397: } else if (pattern.regionMatches(true, pos, "\\p", 0, 2)
3398: || pattern.regionMatches(pos, "\\N", 0, 2)) {
3399: char c = pattern.charAt(pos + 1);
3400: invert = (c == 'P');
3401: isName = (c == 'N');
3402: pos = Utility.skipWhitespace(pattern, pos + 2);
3403: if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
3404: // Syntax error; "\p" or "\P" not followed by "{"
3405: return null;
3406: }
3407: } else {
3408: // Open delimiter not seen
3409: return null;
3410: }
3411:
3412: // Look for the matching close delimiter, either :] or }
3413: int close = pattern.indexOf(posix ? ":]" : "}", pos);
3414: if (close < 0) {
3415: // Syntax error; close delimiter missing
3416: return null;
3417: }
3418:
3419: // Look for an '=' sign. If this is present, we will parse a
3420: // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
3421: // pattern.
3422: int equals = pattern.indexOf('=', pos);
3423: String propName, valueName;
3424: if (equals >= 0 && equals < close && !isName) {
3425: // Equals seen; parse medium/long pattern
3426: propName = pattern.substring(pos, equals);
3427: valueName = pattern.substring(equals + 1, close);
3428: }
3429:
3430: else {
3431: // Handle case where no '=' is seen, and \N{}
3432: propName = pattern.substring(pos, close);
3433: valueName = "";
3434:
3435: // Handle \N{name}
3436: if (isName) {
3437: // This is a little inefficient since it means we have to
3438: // parse "na" back to UProperty.NAME even though we already
3439: // know it's UProperty.NAME. If we refactor the API to
3440: // support args of (int, String) then we can remove
3441: // "na" and make this a little more efficient.
3442: valueName = propName;
3443: propName = "na";
3444: }
3445: }
3446:
3447: applyPropertyAlias(propName, valueName, symbols);
3448:
3449: if (invert) {
3450: complement();
3451: }
3452:
3453: // Move to the limit position after the close delimiter
3454: ppos.setIndex(close + (posix ? 2 : 1));
3455:
3456: return this ;
3457: }
3458:
3459: /**
3460: * Parse a property pattern.
3461: * @param chars iterator over the pattern characters. Upon return
3462: * it will be advanced to the first character after the parsed
3463: * pattern, or the end of the iteration if all characters are
3464: * parsed.
3465: * @param rebuiltPat the pattern that was parsed, rebuilt or
3466: * copied from the input pattern, as appropriate.
3467: * @param symbols TODO
3468: */
3469: private void applyPropertyPattern(RuleCharacterIterator chars,
3470: StringBuffer rebuiltPat, SymbolTable symbols) {
3471: String pat = chars.lookahead();
3472: ParsePosition pos = new ParsePosition(0);
3473: applyPropertyPattern(pat, pos, symbols);
3474: if (pos.getIndex() == 0) {
3475: syntaxError(chars, "Invalid property pattern");
3476: }
3477: chars.jumpahead(pos.getIndex());
3478: rebuiltPat.append(pat.substring(0, pos.getIndex()));
3479: }
3480:
3481: //----------------------------------------------------------------
3482: // Case folding API
3483: //----------------------------------------------------------------
3484:
3485: /**
3486: * Bitmask for constructor and applyPattern() indicating that
3487: * white space should be ignored. If set, ignore characters for
3488: * which UCharacterProperty.isRuleWhiteSpace() returns true,
3489: * unless they are quoted or escaped. This may be ORed together
3490: * with other selectors.
3491: * @internal
3492: * @deprecated This API is ICU internal only.
3493: */
3494: public static final int IGNORE_SPACE = 1;
3495:
3496: /**
3497: * Bitmask for constructor, applyPattern(), and closeOver()
3498: * indicating letter case. This may be ORed together with other
3499: * selectors.
3500: *
3501: * Enable case insensitive matching. E.g., "[ab]" with this flag
3502: * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
3503: * match all except 'a', 'A', 'b', and 'B'. This performs a full
3504: * closure over case mappings, e.g. U+017F for s.
3505: *
3506: * The resulting set is a superset of the input for the code points but
3507: * not for the strings.
3508: * It performs a case mapping closure of the code points and adds
3509: * full case folding strings for the code points, and reduces strings of
3510: * the original set to their full case folding equivalents.
3511: *
3512: * This is designed for case-insensitive matches, for example
3513: * in regular expressions. The full code point case closure allows checking of
3514: * an input character directly against the closure set.
3515: * Strings are matched by comparing the case-folded form from the closure
3516: * set with an incremental case folding of the string in question.
3517: *
3518: * The closure set will also contain single code points if the original
3519: * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
3520: * This is not necessary (that is, redundant) for the above matching method
3521: * but results in the same closure sets regardless of whether the original
3522: * set contained the code point or a string.
3523: *
3524: * @internal
3525: * @deprecated This API is ICU internal only.
3526: */
3527: public static final int CASE = 2;
3528:
3529: /**
3530: * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
3531: * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
3532: * @see #CASE
3533: * @draft ICU 3.4
3534: * @provisional This API might change or be removed in a future release.
3535: */
3536: public static final int CASE_INSENSITIVE = 2;
3537:
3538: /**
3539: * Bitmask for constructor, applyPattern(), and closeOver()
3540: * indicating letter case. This may be ORed together with other
3541: * selectors.
3542: *
3543: * Enable case insensitive matching. E.g., "[ab]" with this flag
3544: * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
3545: * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
3546: * title-, and uppercase mappings as well as the case folding
3547: * of each existing element in the set.
3548: * @draft ICU 3.4
3549: * @provisional This API might change or be removed in a future release.
3550: */
3551: public static final int ADD_CASE_MAPPINGS = 4;
3552:
3553: // add the result of a full case mapping to the set
3554: // use str as a temporary string to avoid constructing one
3555: private static final void addCaseMapping(UnicodeSet set,
3556: int result, StringBuffer full) {
3557: if (result >= 0) {
3558: if (result > UCaseProps.MAX_STRING_LENGTH) {
3559: // add a single-code point case mapping
3560: set.add(result);
3561: } else {
3562: // add a string case mapping from full with length result
3563: set.add(full.toString());
3564: full.setLength(0);
3565: }
3566: }
3567: // result < 0: the code point mapped to itself, no need to add it
3568: // see UCaseProps
3569: }
3570:
3571: /**
3572: * Close this set over the given attribute. For the attribute
3573: * CASE, the result is to modify this set so that:
3574: *
3575: * 1. For each character or string 'a' in this set, all strings
3576: * 'b' such that foldCase(a) == foldCase(b) are added to this set.
3577: * (For most 'a' that are single characters, 'b' will have
3578: * b.length() == 1.)
3579: *
3580: * 2. For each string 'e' in the resulting set, if e !=
3581: * foldCase(e), 'e' will be removed.
3582: *
3583: * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
3584: *
3585: * (Here foldCase(x) refers to the operation
3586: * UCharacter.foldCase(x, true), and a == b actually denotes
3587: * a.equals(b), not pointer comparison.)
3588: *
3589: * @param attribute bitmask for attributes to close over.
3590: * Currently only the CASE bit is supported. Any undefined bits
3591: * are ignored.
3592: * @return a reference to this set.
3593: * @internal
3594: * @deprecated This API is ICU internal only.
3595: */
3596: public UnicodeSet closeOver(int attribute) {
3597: checkFrozen();
3598: if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
3599: UCaseProps csp;
3600: try {
3601: csp = UCaseProps.getSingleton();
3602: } catch (IOException e) {
3603: return this ;
3604: }
3605: UnicodeSet foldSet = new UnicodeSet(this );
3606: ULocale root = ULocale.ROOT;
3607:
3608: // start with input set to guarantee inclusion
3609: // CASE: remove strings because the strings will actually be reduced (folded);
3610: // therefore, start with no strings and add only those needed
3611: if ((attribute & CASE) != 0) {
3612: foldSet.strings.clear();
3613: }
3614:
3615: int n = getRangeCount();
3616: int result;
3617: StringBuffer full = new StringBuffer();
3618: int locCache[] = new int[1];
3619:
3620: for (int i = 0; i < n; ++i) {
3621: int start = getRangeStart(i);
3622: int end = getRangeEnd(i);
3623:
3624: if ((attribute & CASE) != 0) {
3625: // full case closure
3626: for (int cp = start; cp <= end; ++cp) {
3627: csp.addCaseClosure(cp, foldSet);
3628: }
3629: } else {
3630: // add case mappings
3631: // (does not add long s for regular s, or Kelvin for k, for example)
3632: for (int cp = start; cp <= end; ++cp) {
3633: result = csp.toFullLower(cp, null, full, root,
3634: locCache);
3635: addCaseMapping(foldSet, result, full);
3636:
3637: result = csp.toFullTitle(cp, null, full, root,
3638: locCache);
3639: addCaseMapping(foldSet, result, full);
3640:
3641: result = csp.toFullUpper(cp, null, full, root,
3642: locCache);
3643: addCaseMapping(foldSet, result, full);
3644:
3645: result = csp.toFullFolding(cp, full, 0);
3646: addCaseMapping(foldSet, result, full);
3647: }
3648: }
3649: }
3650: if (!strings.isEmpty()) {
3651: String str;
3652: if ((attribute & CASE) != 0) {
3653: Iterator it = strings.iterator();
3654: while (it.hasNext()) {
3655: str = UCharacter
3656: .foldCase((String) it.next(), 0);
3657: if (!csp.addStringCaseClosure(str, foldSet)) {
3658: foldSet.add(str); // does not map to code points: add the folded string itself
3659: }
3660: }
3661: } else {
3662: BreakIterator bi = BreakIterator
3663: .getWordInstance(root);
3664: Iterator it = strings.iterator();
3665: while (it.hasNext()) {
3666: str = (String) it.next();
3667: foldSet.add(UCharacter.toLowerCase(root, str));
3668: foldSet.add(UCharacter.toTitleCase(root, str,
3669: bi));
3670: foldSet.add(UCharacter.toUpperCase(root, str));
3671: foldSet.add(UCharacter.foldCase(str, 0));
3672: }
3673: }
3674: }
3675: set(foldSet);
3676: }
3677: return this ;
3678: }
3679:
3680: /**
3681: * Internal class for customizing UnicodeSet parsing of properties.
3682: * TODO: extend to allow customizing of codepoint ranges
3683: * @internal
3684: * @deprecated This API is ICU internal only.
3685: * @author medavis
3686: */
3687: abstract public static class XSymbolTable implements SymbolTable {
3688: /**
3689: * Default constructor
3690: * @internal
3691: * @deprecated This API is ICU internal only.
3692: */
3693: public XSymbolTable() {
3694: }
3695:
3696: /**
3697: * @internal
3698: * @deprecated This API is ICU internal only.
3699: */
3700: public UnicodeMatcher lookupMatcher(int i) {
3701: return null;
3702: }
3703:
3704: /**
3705: * @internal
3706: * @deprecated This API is ICU internal only.
3707: */
3708: public boolean applyPropertyAlias(String propertyName,
3709: String propertyValue, UnicodeSet result) {
3710: return false;
3711: }
3712:
3713: /**
3714: * @internal
3715: * @deprecated This API is ICU internal only.
3716: */
3717: public char[] lookup(String s) {
3718: return null;
3719: }
3720:
3721: /**
3722: * @internal
3723: * @deprecated This API is ICU internal only.
3724: */
3725: public String parseReference(String text, ParsePosition pos,
3726: int limit) {
3727: return null;
3728: }
3729: }
3730:
3731: private boolean frozen;
3732:
3733: /**
3734: * Is this frozen, according to the Freezable interface?
3735: * @return value
3736: * @internal
3737: * @deprecated This API is ICU internal only.
3738: */
3739: public boolean isFrozen() {
3740: return frozen;
3741: }
3742:
3743: /**
3744: * Freeze this class, according to the Freezable interface.
3745: * @return this
3746: * @internal
3747: * @deprecated This API is ICU internal only.
3748: */
3749: public Object freeze() {
3750: frozen = true;
3751: return this ;
3752: }
3753:
3754: /**
3755: * Clone a thawed version of this class, according to the Freezable interface.
3756: * @return this
3757: * @internal
3758: * @deprecated This API is ICU internal only.
3759: */
3760: public Object cloneAsThawed() {
3761: UnicodeSet result = (UnicodeSet) clone();
3762: result.frozen = false;
3763: return result;
3764: }
3765:
3766: // internal function
3767: private void checkFrozen() {
3768: if (frozen) {
3769: throw new UnsupportedOperationException(
3770: "Attempt to modify frozen object");
3771: }
3772: }
3773: }
3774: //eof
|