0001: //##header
0002: /**
0003: *******************************************************************************
0004: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0005: * others. All Rights Reserved. *
0006: *******************************************************************************
0007: */package com.ibm.icu.text;
0008:
0009: import com.ibm.icu.impl.UCharacterProperty;
0010: import com.ibm.icu.impl.NormalizerImpl;
0011:
0012: /**
0013: * <p>Standalone utility class providing UTF16 character conversions and
0014: * indexing conversions.</p>
0015: * <p>Code that uses strings alone rarely needs modification.
0016: * By design, UTF-16 does not allow overlap, so searching for strings is a safe
0017: * operation. Similarly, concatenation is always safe. Substringing is safe if
0018: * the start and end are both on UTF-32 boundaries. In normal code, the values
0019: * for start and end are on those boundaries, since they arose from operations
0020: * like searching. If not, the nearest UTF-32 boundaries can be determined
0021: * using <code>bounds()</code>.</p>
0022: * <strong>Examples:</strong>
0023: * <p>The following examples illustrate use of some of these methods.
0024: * <pre>
0025: * // iteration forwards: Original
0026: * for (int i = 0; i < s.length(); ++i) {
0027: * char ch = s.charAt(i);
0028: * doSomethingWith(ch);
0029: * }
0030: *
0031: * // iteration forwards: Changes for UTF-32
0032: * int ch;
0033: * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) {
0034: * ch = UTF16.charAt(s,i);
0035: * doSomethingWith(ch);
0036: * }
0037: *
0038: * // iteration backwards: Original
0039: * for (int i = s.length() -1; i >= 0; --i) {
0040: * char ch = s.charAt(i);
0041: * doSomethingWith(ch);
0042: * }
0043: *
0044: * // iteration backwards: Changes for UTF-32
0045: * int ch;
0046: * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
0047: * ch = UTF16.charAt(s,i);
0048: * doSomethingWith(ch);
0049: * }
0050: * </pre>
0051: * <strong>Notes:</strong>
0052: * <ul>
0053: * <li>
0054: * <strong>Naming:</strong> For clarity, High and Low surrogates are called
0055: * <code>Lead</code> and <code>Trail</code> in the API, which gives a better
0056: * sense of their ordering in a string. <code>offset16</code> and
0057: * <code>offset32</code> are used to distinguish offsets to UTF-16
0058: * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
0059: * used to contain UTF-32 characters, as opposed to <code>char16</code>,
0060: * which is a UTF-16 code unit.
0061: * </li>
0062: * <li>
0063: * <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
0064: * UTF-32 offset to a UTF-16 offset and back. Because of the difference in
0065: * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
0066: * back if and only if <code>bounds(string, offset16) != TRAIL</code>.
0067: * </li>
0068: * <li>
0069: * <strong>Exceptions:</strong> The error checking will throw an exception
0070: * if indices are out of bounds. Other than that, all methods will
0071: * behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
0072: * values are present. <code>UCharacter.isLegal()</code> can be used to check
0073: * for validity if desired.
0074: * </li>
0075: * <li>
0076: * <strong>Unmatched Surrogates:</strong> If the string contains unmatched
0077: * surrogates, then these are counted as one UTF-32 value. This matches
0078: * their iteration behavior, which is vital. It also matches common display
0079: * practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
0080: * </li>
0081: * <li>
0082: * <strong>Optimization:</strong> The method implementations may need
0083: * optimization if the compiler doesn't fold static final methods. Since
0084: * surrogate pairs will form an exceedingly small percentage of all the text
0085: * in the world, the singleton case should always be optimized for.
0086: * </li>
0087: * </ul>
0088: * @author Mark Davis, with help from Markus Scherer
0089: * @stable ICU 2.1
0090: */
0091:
0092: public final class UTF16 {
0093: // public variables ---------------------------------------------------
0094:
0095: /**
0096: * Value returned in <code><a href="#bounds(java.lang.String, int)">
0097: * bounds()</a></code>.
0098: * These values are chosen specifically so that it actually represents
0099: * the position of the character
0100: * [offset16 - (value >> 2), offset16 + (value & 3)]
0101: * @stable ICU 2.1
0102: */
0103: public static final int SINGLE_CHAR_BOUNDARY = 1,
0104: LEAD_SURROGATE_BOUNDARY = 2, TRAIL_SURROGATE_BOUNDARY = 5;
0105: /**
0106: * The lowest Unicode code point value.
0107: * @stable ICU 2.1
0108: */
0109: public static final int CODEPOINT_MIN_VALUE = 0;
0110: /**
0111: * The highest Unicode code point value (scalar value) according to the
0112: * Unicode Standard.
0113: * @stable ICU 2.1
0114: */
0115: public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
0116: /**
0117: * The minimum value for Supplementary code points
0118: * @stable ICU 2.1
0119: */
0120: public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
0121: /**
0122: * Lead surrogate minimum value
0123: * @stable ICU 2.1
0124: */
0125: public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
0126: /**
0127: * Trail surrogate minimum value
0128: * @stable ICU 2.1
0129: */
0130: public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
0131: /**
0132: * Lead surrogate maximum value
0133: * @stable ICU 2.1
0134: */
0135: public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
0136: /**
0137: * Trail surrogate maximum value
0138: * @stable ICU 2.1
0139: */
0140: public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
0141: /**
0142: * Surrogate minimum value
0143: * @stable ICU 2.1
0144: */
0145: public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
0146: /**
0147: * Maximum surrogate value
0148: * @stable ICU 2.1
0149: */
0150: public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
0151:
0152: // constructor --------------------------------------------------------
0153:
0154: ///CLOVER:OFF
0155: /**
0156: * Prevent instance from being created.
0157: */
private UTF16() {
    // Intentionally empty: the private constructor only exists to block
    // instantiation from outside this class.
}
0160:
0161: ///CLOVER:ON
0162: // public method ------------------------------------------------------
0163:
0164: /**
0165: * Extract a single UTF-32 value from a string.
0166: * Used when iterating forwards or backwards (with
0167: * <code>UTF16.getCharCount()</code>), as well as for random access. If a
0168: * validity check is required, use
0169: * <code><a href="../lang/UCharacter.html#isLegal(char)">
0170: * UCharacter.isLegal()</a></code> on the return value.
0171: * If the char retrieved is part of a surrogate pair, its supplementary
0172: * character will be returned. If a complete supplementary character is
0173: * not found the incomplete character will be returned
0174: * @param source array of UTF-16 chars
0175: * @param offset16 UTF-16 offset to the start of the character.
0176: * @return UTF-32 value for the UTF-32 value that contains the char at
0177: * offset16. The boundaries of that codepoint are the same as in
0178: * <code>bounds32()</code>.
0179: * @exception IndexOutOfBoundsException thrown if offset16 is out of
0180: * bounds.
0181: * @stable ICU 2.1
0182: */
0183: public static int charAt(String source, int offset16) {
0184: char single = source.charAt(offset16);
0185: if (single < LEAD_SURROGATE_MIN_VALUE) {
0186: return single;
0187: }
0188: return _charAt(source, offset16, single);
0189: }
0190:
0191: private static int _charAt(String source, int offset16, char single) {
0192: if (single > TRAIL_SURROGATE_MAX_VALUE) {
0193: return single;
0194: }
0195:
0196: // Convert the UTF-16 surrogate pair if necessary.
0197: // For simplicity in usage, and because the frequency of pairs is
0198: // low, look both directions.
0199:
0200: if (single <= LEAD_SURROGATE_MAX_VALUE) {
0201: ++offset16;
0202: if (source.length() != offset16) {
0203: char trail = source.charAt(offset16);
0204: if (trail >= TRAIL_SURROGATE_MIN_VALUE
0205: && trail <= TRAIL_SURROGATE_MAX_VALUE) {
0206: return UCharacterProperty.getRawSupplementary(
0207: single, trail);
0208: }
0209: }
0210: } else {
0211: --offset16;
0212: if (offset16 >= 0) {
0213: // single is a trail surrogate so
0214: char lead = source.charAt(offset16);
0215: if (lead >= LEAD_SURROGATE_MIN_VALUE
0216: && lead <= LEAD_SURROGATE_MAX_VALUE) {
0217: return UCharacterProperty.getRawSupplementary(lead,
0218: single);
0219: }
0220: }
0221: }
0222: return single; // return unmatched surrogate
0223: }
0224:
0225: //#ifndef FOUNDATION
0226: /**
0227: * Extract a single UTF-32 value from a string.
0228: * Used when iterating forwards or backwards (with
0229: * <code>UTF16.getCharCount()</code>), as well as for random access. If a
0230: * validity check is required, use
0231: * <code><a href="../lang/UCharacter.html#isLegal(char)">
0232: * UCharacter.isLegal()</a></code> on the return value.
0233: * If the char retrieved is part of a surrogate pair, its supplementary
0234: * character will be returned. If a complete supplementary character is
0235: * not found the incomplete character will be returned
0236: * @param source array of UTF-16 chars
0237: * @param offset16 UTF-16 offset to the start of the character.
0238: * @return UTF-32 value for the UTF-32 value that contains the char at
0239: * offset16. The boundaries of that codepoint are the same as in
0240: * <code>bounds32()</code>.
0241: * @exception IndexOutOfBoundsException thrown if offset16 is out of
0242: * bounds.
0243: * @stable ICU 2.1
0244: */
0245: public static int charAt(CharSequence source, int offset16) {
0246: char single = source.charAt(offset16);
0247: if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
0248: return single;
0249: }
0250: return _charAt(source, offset16, single);
0251: }
0252:
0253: private static int _charAt(CharSequence source, int offset16,
0254: char single) {
0255: if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
0256: return single;
0257: }
0258:
0259: // Convert the UTF-16 surrogate pair if necessary.
0260: // For simplicity in usage, and because the frequency of pairs is
0261: // low, look both directions.
0262:
0263: if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
0264: ++offset16;
0265: if (source.length() != offset16) {
0266: char trail = source.charAt(offset16);
0267: if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
0268: && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
0269: return UCharacterProperty.getRawSupplementary(
0270: single, trail);
0271: }
0272: }
0273: } else {
0274: --offset16;
0275: if (offset16 >= 0) {
0276: // single is a trail surrogate so
0277: char lead = source.charAt(offset16);
0278: if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
0279: && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
0280: return UCharacterProperty.getRawSupplementary(lead,
0281: single);
0282: }
0283: }
0284: }
0285: return single; // return unmatched surrogate
0286: }
0287:
0288: //#endif
0289:
0290: /**
0291: * Extract a single UTF-32 value from a string.
0292: * Used when iterating forwards or backwards (with
0293: * <code>UTF16.getCharCount()</code>), as well as for random access. If a
0294: * validity check is required, use
0295: * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
0296: * </a></code> on the return value.
0297: * If the char retrieved is part of a surrogate pair, its supplementary
0298: * character will be returned. If a complete supplementary character is
0299: * not found the incomplete character will be returned
0300: * @param source UTF-16 chars string buffer
0301: * @param offset16 UTF-16 offset to the start of the character.
0302: * @return UTF-32 value for the UTF-32 value that contains the char at
0303: * offset16. The boundaries of that codepoint are the same as in
0304: * <code>bounds32()</code>.
0305: * @exception IndexOutOfBoundsException thrown if offset16 is out of
0306: * bounds.
0307: * @stable ICU 2.1
0308: */
0309: public static int charAt(StringBuffer source, int offset16) {
0310: if (offset16 < 0 || offset16 >= source.length()) {
0311: throw new StringIndexOutOfBoundsException(offset16);
0312: }
0313:
0314: char single = source.charAt(offset16);
0315: if (!isSurrogate(single)) {
0316: return single;
0317: }
0318:
0319: // Convert the UTF-16 surrogate pair if necessary.
0320: // For simplicity in usage, and because the frequency of pairs is
0321: // low, look both directions.
0322:
0323: if (single <= LEAD_SURROGATE_MAX_VALUE) {
0324: ++offset16;
0325: if (source.length() != offset16) {
0326: char trail = source.charAt(offset16);
0327: if (isTrailSurrogate(trail))
0328: return UCharacterProperty.getRawSupplementary(
0329: single, trail);
0330: }
0331: } else {
0332: --offset16;
0333: if (offset16 >= 0) {
0334: // single is a trail surrogate so
0335: char lead = source.charAt(offset16);
0336: if (isLeadSurrogate(lead)) {
0337: return UCharacterProperty.getRawSupplementary(lead,
0338: single);
0339: }
0340: }
0341: }
0342: return single; // return unmatched surrogate
0343: }
0344:
0345: /**
0346: * Extract a single UTF-32 value from a substring.
0347: * Used when iterating forwards or backwards (with
0348: * <code>UTF16.getCharCount()</code>), as well as for random access. If a
0349: * validity check is required, use
0350: * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
0351: * </a></code> on the return value.
0352: * If the char retrieved is part of a surrogate pair, its supplementary
0353: * character will be returned. If a complete supplementary character is
0354: * not found the incomplete character will be returned
0355: * @param source array of UTF-16 chars
0356: * @param start offset to substring in the source array for analyzing
0357: * @param limit offset to substring in the source array for analyzing
0358: * @param offset16 UTF-16 offset relative to start
0359: * @return UTF-32 value for the UTF-32 value that contains the char at
0360: * offset16. The boundaries of that codepoint are the same as in
0361: * <code>bounds32()</code>.
0362: * @exception IndexOutOfBoundsException thrown if offset16 is not within
0363: * the range of start and limit.
0364: * @stable ICU 2.1
0365: */
0366: public static int charAt(char source[], int start, int limit,
0367: int offset16) {
0368: offset16 += start;
0369: if (offset16 < start || offset16 >= limit) {
0370: throw new ArrayIndexOutOfBoundsException(offset16);
0371: }
0372:
0373: char single = source[offset16];
0374: if (!isSurrogate(single)) {
0375: return single;
0376: }
0377:
0378: // Convert the UTF-16 surrogate pair if necessary.
0379: // For simplicity in usage, and because the frequency of pairs is
0380: // low, look both directions.
0381: if (single <= LEAD_SURROGATE_MAX_VALUE) {
0382: offset16++;
0383: if (offset16 >= limit) {
0384: return single;
0385: }
0386: char trail = source[offset16];
0387: if (isTrailSurrogate(trail)) {
0388: return UCharacterProperty.getRawSupplementary(single,
0389: trail);
0390: }
0391: } else { // isTrailSurrogate(single), so
0392: if (offset16 == start) {
0393: return single;
0394: }
0395: offset16--;
0396: char lead = source[offset16];
0397: if (isLeadSurrogate(lead))
0398: return UCharacterProperty.getRawSupplementary(lead,
0399: single);
0400: }
0401: return single; // return unmatched surrogate
0402: }
0403:
0404: /**
0405: * Extract a single UTF-32 value from a string.
0406: * Used when iterating forwards or backwards (with
0407: * <code>UTF16.getCharCount()</code>), as well as for random access. If a
0408: * validity check is required, use
0409: * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
0410: * </a></code> on the return value.
0411: * If the char retrieved is part of a surrogate pair, its supplementary
0412: * character will be returned. If a complete supplementary character is
0413: * not found the incomplete character will be returned
0414: * @param source UTF-16 chars string buffer
0415: * @param offset16 UTF-16 offset to the start of the character.
0416: * @return UTF-32 value for the UTF-32 value that contains the char at
0417: * offset16. The boundaries of that codepoint are the same as in
0418: * <code>bounds32()</code>.
0419: * @exception IndexOutOfBoundsException thrown if offset16 is out of
0420: * bounds.
0421: * @stable ICU 2.1
0422: */
0423: public static int charAt(Replaceable source, int offset16) {
0424: if (offset16 < 0 || offset16 >= source.length()) {
0425: throw new StringIndexOutOfBoundsException(offset16);
0426: }
0427:
0428: char single = source.charAt(offset16);
0429: if (!isSurrogate(single)) {
0430: return single;
0431: }
0432:
0433: // Convert the UTF-16 surrogate pair if necessary.
0434: // For simplicity in usage, and because the frequency of pairs is
0435: // low, look both directions.
0436:
0437: if (single <= LEAD_SURROGATE_MAX_VALUE) {
0438: ++offset16;
0439: if (source.length() != offset16) {
0440: char trail = source.charAt(offset16);
0441: if (isTrailSurrogate(trail))
0442: return UCharacterProperty.getRawSupplementary(
0443: single, trail);
0444: }
0445: } else {
0446: --offset16;
0447: if (offset16 >= 0) {
0448: // single is a trail surrogate so
0449: char lead = source.charAt(offset16);
0450: if (isLeadSurrogate(lead)) {
0451: return UCharacterProperty.getRawSupplementary(lead,
0452: single);
0453: }
0454: }
0455: }
0456: return single; // return unmatched surrogate
0457: }
0458:
0459: /**
0460: * Determines how many chars this char32 requires.
0461: * If a validity check is required, use <code>
0462: * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
0463: * char32 before calling.
0464: * @param char32 the input codepoint.
0465: * @return 2 if is in supplementary space, otherwise 1.
0466: * @stable ICU 2.1
0467: */
0468: public static int getCharCount(int char32) {
0469: if (char32 < SUPPLEMENTARY_MIN_VALUE) {
0470: return 1;
0471: }
0472: return 2;
0473: }
0474:
0475: /**
0476: * Returns the type of the boundaries around the char at offset16.
0477: * Used for random access.
0478: * @param source text to analyse
0479: * @param offset16 UTF-16 offset
0480: * @return <ul>
0481: * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
0482: * [offset16, offset16+1]
0483: * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at
0484: * offset16;
0485: * the bounds are
0486: * [offset16, offset16 + 2]
0487: * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at
0488: * offset16 - 1; the bounds are
0489: * [offset16 - 1, offset16 + 1]
0490: * </ul>
0491: * For bit-twiddlers, the return values for these are chosen so
0492: * that the boundaries can be gotten by:
0493: * [offset16 - (value >> 2), offset16 + (value & 3)].
0494: * @exception IndexOutOfBoundsException if offset16 is out of bounds.
0495: * @stable ICU 2.1
0496: */
0497: public static int bounds(String source, int offset16) {
0498: char ch = source.charAt(offset16);
0499: if (isSurrogate(ch)) {
0500: if (isLeadSurrogate(ch)) {
0501: if (++offset16 < source.length()
0502: && isTrailSurrogate(source.charAt(offset16))) {
0503: return LEAD_SURROGATE_BOUNDARY;
0504: }
0505: } else {
0506: // isTrailSurrogate(ch), so
0507: --offset16;
0508: if (offset16 >= 0
0509: && isLeadSurrogate(source.charAt(offset16))) {
0510: return TRAIL_SURROGATE_BOUNDARY;
0511: }
0512: }
0513: }
0514: return SINGLE_CHAR_BOUNDARY;
0515: }
0516:
0517: /**
0518: * Returns the type of the boundaries around the char at offset16. Used
0519: * for random access.
0520: * @param source string buffer to analyse
0521: * @param offset16 UTF16 offset
0522: * @return
0523: * <ul>
0524: * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
0525: * [offset16, offset16 + 1]
0526: * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at
0527: * offset16; the bounds are
0528: * [offset16, offset16 + 2]
0529: * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at
0530: * offset16 - 1; the bounds are
0531: * [offset16 - 1, offset16 + 1]
0532: * </ul>
0533: * For bit-twiddlers, the return values for these are chosen so that the
0534: * boundaries can be gotten by:
0535: * [offset16 - (value >> 2), offset16 + (value & 3)].
0536: * @exception IndexOutOfBoundsException if offset16 is out of bounds.
0537: * @stable ICU 2.1
0538: */
0539: public static int bounds(StringBuffer source, int offset16) {
0540: char ch = source.charAt(offset16);
0541: if (isSurrogate(ch)) {
0542: if (isLeadSurrogate(ch)) {
0543: if (++offset16 < source.length()
0544: && isTrailSurrogate(source.charAt(offset16))) {
0545: return LEAD_SURROGATE_BOUNDARY;
0546: }
0547: } else {
0548: // isTrailSurrogate(ch), so
0549: --offset16;
0550: if (offset16 >= 0
0551: && isLeadSurrogate(source.charAt(offset16))) {
0552: return TRAIL_SURROGATE_BOUNDARY;
0553: }
0554: }
0555: }
0556: return SINGLE_CHAR_BOUNDARY;
0557: }
0558:
0559: /**
0560: * Returns the type of the boundaries around the char at offset16. Used
0561: * for random access. Note that the boundaries are determined with respect
0562: * to the subarray, hence the char array {0xD800, 0xDC00} has the result
0563: * SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
0564: * @param source char array to analyse
0565: * @param start offset to substring in the source array for analyzing
0566: * @param limit offset to substring in the source array for analyzing
0567: * @param offset16 UTF16 offset relative to start
0568: * @return
0569: * <ul>
0570: * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
0570: * [offset16, offset16 + 1]
0571: * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at
0572: * offset16; the bounds are [offset16, offset16 + 2]
0573: * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at
0574: * offset16 - 1; the bounds are [offset16 - 1, offset16 + 1]
0575: * </ul>
0576: * For bit-twiddlers, the boundary values for these are chosen so that the
0577: * boundaries can be gotten by: [offset16 - (boundvalue >> 2), offset16
0578: * + (boundvalue & 3)].
0579: * @exception IndexOutOfBoundsException if offset16 is not within the
0580: * range of start and limit.
0581: * @stable ICU 2.1
0582: */
0583: public static int bounds(char source[], int start, int limit,
0584: int offset16) {
0585: offset16 += start;
0586: if (offset16 < start || offset16 >= limit) {
0587: throw new ArrayIndexOutOfBoundsException(offset16);
0588: }
0589: char ch = source[offset16];
0590: if (isSurrogate(ch)) {
0591: if (isLeadSurrogate(ch)) {
0592: ++offset16;
0593: if (offset16 < limit
0594: && isTrailSurrogate(source[offset16])) {
0595: return LEAD_SURROGATE_BOUNDARY;
0596: }
0597: } else { // isTrailSurrogate(ch), so
0598: --offset16;
0599: if (offset16 >= start
0600: && isLeadSurrogate(source[offset16])) {
0601: return TRAIL_SURROGATE_BOUNDARY;
0602: }
0603: }
0604: }
0605: return SINGLE_CHAR_BOUNDARY;
0606: }
0607:
0608: /**
0609: * Determines whether the code value is a surrogate.
0610: * @param char16 the input character.
0611: * @return true iff the input character is a surrogate.
0612: * @stable ICU 2.1
0613: */
0614: public static boolean isSurrogate(char char16) {
0615: return LEAD_SURROGATE_MIN_VALUE <= char16
0616: && char16 <= TRAIL_SURROGATE_MAX_VALUE;
0617: }
0618:
0619: /**
0620: * Determines whether the character is a trail surrogate.
0621: * @param char16 the input character.
0622: * @return true iff the input character is a trail surrogate.
0623: * @stable ICU 2.1
0624: */
0625: public static boolean isTrailSurrogate(char char16) {
0626: return (TRAIL_SURROGATE_MIN_VALUE <= char16 && char16 <= TRAIL_SURROGATE_MAX_VALUE);
0627: }
0628:
0629: /**
0630: * Determines whether the character is a lead surrogate.
0631: * @param char16 the input character.
0632: * @return true iff the input character is a lead surrogate
0633: * @stable ICU 2.1
0634: */
0635: public static boolean isLeadSurrogate(char char16) {
0636: return LEAD_SURROGATE_MIN_VALUE <= char16
0637: && char16 <= LEAD_SURROGATE_MAX_VALUE;
0638: }
0639:
0640: /**
0641: * Returns the lead surrogate.
0642: * If a validity check is required, use
0643: * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
0644: * on char32 before calling.
0645: * @param char32 the input character.
0646: * @return lead surrogate if the getCharCount(ch) is 2; <br>
0647: * and 0 otherwise (note: 0 is not a valid lead surrogate).
0648: * @stable ICU 2.1
0649: */
0650: public static char getLeadSurrogate(int char32) {
0651: if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
0652: return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
0653: }
0654:
0655: return 0;
0656: }
0657:
0658: /**
0659: * Returns the trail surrogate.
0660: * If a validity check is required, use
0661: * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
0662: * on char32 before calling.
0663: * @param char32 the input character.
0664: * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
0665: * the character itself
0666: * @stable ICU 2.1
0667: */
0668: public static char getTrailSurrogate(int char32) {
0669: if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
0670: return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
0671: }
0672:
0673: return (char) char32;
0674: }
0675:
0676: /**
0677: * Convenience method corresponding to String.valueOf(char). Returns a one
0678: * or two char string containing the UTF-32 value in UTF16 format. If a
0679: * validity check is required, use
0680: * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
0681: * char32 before calling.
0682: * @param char32 the input character.
0683: * @return string value of char32 in UTF16 format
0684: * @exception IllegalArgumentException thrown if char32 is a invalid
0685: * codepoint.
0686: * @stable ICU 2.1
0687: */
0688: public static String valueOf(int char32) {
0689: if (char32 < CODEPOINT_MIN_VALUE
0690: || char32 > CODEPOINT_MAX_VALUE) {
0691: throw new IllegalArgumentException("Illegal codepoint");
0692: }
0693: return toString(char32);
0694: }
0695:
0696: /**
0697: * Convenience method corresponding to String.valueOf(codepoint at
0698: * offset16).
0699: * Returns a one or two char string containing the UTF-32 value in UTF16
0700: * format. If offset16 indexes a surrogate character, the whole
0701: * supplementary codepoint will be returned.
0702: * If a validity check is required, use
0703: * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
0704: * codepoint at offset16 before calling.
0705: * The result returned will be a newly created String obtained by calling
0706: * source.substring(..) with the appropriate indexes.
0707: * @param source the input string.
0708: * @param offset16 the UTF16 index to the codepoint in source
0709: * @return string value of char32 in UTF16 format
0710: * @stable ICU 2.1
0711: */
0712: public static String valueOf(String source, int offset16) {
0713: switch (bounds(source, offset16)) {
0714: case LEAD_SURROGATE_BOUNDARY:
0715: return source.substring(offset16, offset16 + 2);
0716: case TRAIL_SURROGATE_BOUNDARY:
0717: return source.substring(offset16 - 1, offset16 + 1);
0718: default:
0719: return source.substring(offset16, offset16 + 1);
0720: }
0721: }
0722:
0723: /**
0724: * Convenience method corresponding to
0725: * StringBuffer.valueOf(codepoint at offset16).
0726: * Returns a one or two char string containing the UTF-32 value in UTF16
0727: * format. If offset16 indexes a surrogate character, the whole
0728: * supplementary codepoint will be returned.
0729: * If a validity check is required, use
0730: * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
0731: * codepoint at offset16 before calling.
0732: * The result returned will be a newly created String obtained by calling
0733: * source.substring(..) with the appropriate indexes.
0734: * @param source the input string buffer.
0735: * @param offset16 the UTF16 index to the codepoint in source
0736: * @return string value of char32 in UTF16 format
0737: * @stable ICU 2.1
0738: */
0739: public static String valueOf(StringBuffer source, int offset16) {
0740: switch (bounds(source, offset16)) {
0741: case LEAD_SURROGATE_BOUNDARY:
0742: return source.substring(offset16, offset16 + 2);
0743: case TRAIL_SURROGATE_BOUNDARY:
0744: return source.substring(offset16 - 1, offset16 + 1);
0745: default:
0746: return source.substring(offset16, offset16 + 1);
0747: }
0748: }
0749:
0750: /**
0751: * Convenience method.
0752: * Returns a one or two char string containing the UTF-32 value in UTF16
0753: * format. If offset16 indexes a surrogate character, the whole
0754: * supplementary codepoint will be returned, except when either the
0755: * leading or trailing surrogate character lies out of the specified
0756: * subarray. In the latter case, only the surrogate character within
0757: * bounds will be returned.
0758: * If a validity check is required, use
0759: * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
0760: * codepoint at offset16 before calling.
0761: * The result returned will be a newly created String containing the
0762: * relevant characters.
0763: * @param source the input char array.
0764: * @param start start index of the subarray
0765: * @param limit end index of the subarray
0766: * @param offset16 the UTF16 index to the codepoint in source relative to
0767: * start
0768: * @return string value of char32 in UTF16 format
0769: * @stable ICU 2.1
0770: */
0771: public static String valueOf(char source[], int start, int limit,
0772: int offset16) {
0773: switch (bounds(source, start, limit, offset16)) {
0774: case LEAD_SURROGATE_BOUNDARY:
0775: return new String(source, start + offset16, 2);
0776: case TRAIL_SURROGATE_BOUNDARY:
0777: return new String(source, start + offset16 - 1, 2);
0778: }
0779: return new String(source, start + offset16, 1);
0780: }
0781:
0782: /**
0783: * Returns the UTF-16 offset that corresponds to a UTF-32 offset.
0784: * Used for random access. See the <a name="_top_">class description</a>
0785: * for notes on roundtripping.
0786: * @param source the UTF-16 string
0787: * @param offset32 UTF-32 offset
0788: * @return UTF-16 offset
0789: * @exception IndexOutOfBoundsException if offset32 is out of bounds.
0790: * @stable ICU 2.1
0791: */
0792: public static int findOffsetFromCodePoint(String source,
0793: int offset32) {
0794: char ch;
0795: int size = source.length(), result = 0, count = offset32;
0796: if (offset32 < 0 || offset32 > size) {
0797: throw new StringIndexOutOfBoundsException(offset32);
0798: }
0799: while (result < size && count > 0) {
0800: ch = source.charAt(result);
0801: if (isLeadSurrogate(ch) && ((result + 1) < size)
0802: && isTrailSurrogate(source.charAt(result + 1))) {
0803: result++;
0804: }
0805:
0806: count--;
0807: result++;
0808: }
0809: if (count != 0) {
0810: throw new StringIndexOutOfBoundsException(offset32);
0811: }
0812: return result;
0813: }
0814:
0815: /**
0816: * Returns the UTF-16 offset that corresponds to a UTF-32 offset.
0817: * Used for random access. See the <a name="_top_">class description</a>
0818: * for notes on roundtripping.
0819: * @param source the UTF-16 string buffer
0820: * @param offset32 UTF-32 offset
0821: * @return UTF-16 offset
0822: * @exception IndexOutOfBoundsException if offset32 is out of bounds.
0823: * @stable ICU 2.1
0824: */
0825: public static int findOffsetFromCodePoint(StringBuffer source,
0826: int offset32) {
0827: char ch;
0828: int size = source.length(), result = 0, count = offset32;
0829: if (offset32 < 0 || offset32 > size) {
0830: throw new StringIndexOutOfBoundsException(offset32);
0831: }
0832: while (result < size && count > 0) {
0833: ch = source.charAt(result);
0834: if (isLeadSurrogate(ch) && ((result + 1) < size)
0835: && isTrailSurrogate(source.charAt(result + 1))) {
0836: result++;
0837: }
0838:
0839: count--;
0840: result++;
0841: }
0842: if (count != 0) {
0843: throw new StringIndexOutOfBoundsException(offset32);
0844: }
0845: return result;
0846: }
0847:
0848: /**
0849: * Returns the UTF-16 offset that corresponds to a UTF-32 offset.
0850: * Used for random access. See the <a name="_top_">class description</a>
0851: * for notes on roundtripping.
0852: * @param source the UTF-16 char array whose substring is to be analysed
0853: * @param start offset of the substring to be analysed
0854: * @param limit offset of the substring to be analysed
0855: * @param offset32 UTF-32 offset relative to start
0856: * @return UTF-16 offset relative to start
0857: * @exception IndexOutOfBoundsException if offset32 is out of bounds.
0858: * @stable ICU 2.1
0859: */
0860: public static int findOffsetFromCodePoint(char source[], int start,
0861: int limit, int offset32) {
0862: char ch;
0863: int result = start, count = offset32;
0864: if (offset32 > limit - start) {
0865: throw new ArrayIndexOutOfBoundsException(offset32);
0866: }
0867: while (result < limit && count > 0) {
0868: ch = source[result];
0869: if (isLeadSurrogate(ch) && ((result + 1) < limit)
0870: && isTrailSurrogate(source[result + 1])) {
0871: result++;
0872: }
0873:
0874: count--;
0875: result++;
0876: }
0877: if (count != 0) {
0878: throw new ArrayIndexOutOfBoundsException(offset32);
0879: }
0880: return result - start;
0881: }
0882:
0883: /**
0884: * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at
0885: * or after the given UTF-16 offset. Used for random access. See the
0886: * <a name="_top_">class description</a> for notes on roundtripping.<br>
0887: * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair,
0888: * then the UTF-32 offset of the <strong>lead</strong> of the pair is
0889: * returned.
0890: * </i>
0891: * <p>
0892: * To find the UTF-32 length of a string, use:
0893: * <pre>
0894: * len32 = countCodePoint(source, source.length());
0895: * </pre>
0896: * </p>
0897: * <p>
0898: * @param source text to analyse
 * @param offset16 UTF-16 offset, between 0 and the source text length inclusive.
0900: * @return UTF-32 offset
0901: * @exception IndexOutOfBoundsException if offset16 is out of bounds.
0902: * @stable ICU 2.1
0903: */
0904: public static int findCodePointOffset(String source, int offset16) {
0905: if (offset16 < 0 || offset16 > source.length()) {
0906: throw new StringIndexOutOfBoundsException(offset16);
0907: }
0908:
0909: int result = 0;
0910: char ch;
0911: boolean hadLeadSurrogate = false;
0912:
0913: for (int i = 0; i < offset16; ++i) {
0914: ch = source.charAt(i);
0915: if (hadLeadSurrogate && isTrailSurrogate(ch)) {
0916: hadLeadSurrogate = false; // count valid trail as zero
0917: } else {
0918: hadLeadSurrogate = isLeadSurrogate(ch);
0919: ++result; // count others as 1
0920: }
0921: }
0922:
0923: if (offset16 == source.length()) {
0924: return result;
0925: }
0926:
0927: // end of source being the less significant surrogate character
0928: // shift result back to the start of the supplementary character
0929: if (hadLeadSurrogate
0930: && (isTrailSurrogate(source.charAt(offset16)))) {
0931: result--;
0932: }
0933:
0934: return result;
0935: }
0936:
0937: /**
0938: * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at
0939: * the given UTF-16 offset. Used for random access. See the
0940: * <a name="_top_">class description</a> for notes on roundtripping.<br>
0941: * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair,
0942: * then the UTF-32 offset of the <strong>lead</strong> of the pair is
0943: * returned.
0944: * </i>
0945: * <p>
0946: * To find the UTF-32 length of a string, use:
0947: * <pre>
0948: * len32 = countCodePoint(source);
0949: * </pre>
0950: * </p>
0951: * <p>
0952: * @param source text to analyse
 * @param offset16 UTF-16 offset, between 0 and the source text length inclusive.
0954: * @return UTF-32 offset
0955: * @exception IndexOutOfBoundsException if offset16 is out of bounds.
0956: * @stable ICU 2.1
0957: */
0958: public static int findCodePointOffset(StringBuffer source,
0959: int offset16) {
0960: if (offset16 < 0 || offset16 > source.length()) {
0961: throw new StringIndexOutOfBoundsException(offset16);
0962: }
0963:
0964: int result = 0;
0965: char ch;
0966: boolean hadLeadSurrogate = false;
0967:
0968: for (int i = 0; i < offset16; ++i) {
0969: ch = source.charAt(i);
0970: if (hadLeadSurrogate && isTrailSurrogate(ch)) {
0971: hadLeadSurrogate = false; // count valid trail as zero
0972: } else {
0973: hadLeadSurrogate = isLeadSurrogate(ch);
0974: ++result; // count others as 1
0975: }
0976: }
0977:
0978: if (offset16 == source.length()) {
0979: return result;
0980: }
0981:
0982: // end of source being the less significant surrogate character
0983: // shift result back to the start of the supplementary character
0984: if (hadLeadSurrogate
0985: && (isTrailSurrogate(source.charAt(offset16)))) {
0986: result--;
0987: }
0988:
0989: return result;
0990: }
0991:
0992: /**
0993: * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at
0994: * the given UTF-16 offset. Used for random access. See the
0995: * <a name="_top_">class description</a> for notes on roundtripping.<br>
0996: * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair,
0997: * then the UTF-32 offset of the <strong>lead</strong> of the pair is
0998: * returned.
0999: * </i>
1000: * <p>
1001: * To find the UTF-32 length of a substring, use:
1002: * <pre>
1003: * len32 = countCodePoint(source, start, limit);
1004: * </pre>
1005: * </p>
1006: * <p>
1007: * @param source text to analyse
1008: * @param start offset of the substring
1009: * @param limit offset of the substring
1010: * @param offset16 UTF-16 relative to start
1011: * @return UTF-32 offset relative to start
1012: * @exception IndexOutOfBoundsException if offset16 is not within the
1013: * range of start and limit.
1014: * @stable ICU 2.1
1015: */
1016: public static int findCodePointOffset(char source[], int start,
1017: int limit, int offset16) {
1018: offset16 += start;
1019: if (offset16 > limit) {
1020: throw new StringIndexOutOfBoundsException(offset16);
1021: }
1022:
1023: int result = 0;
1024: char ch;
1025: boolean hadLeadSurrogate = false;
1026:
1027: for (int i = start; i < offset16; ++i) {
1028: ch = source[i];
1029: if (hadLeadSurrogate && isTrailSurrogate(ch)) {
1030: hadLeadSurrogate = false; // count valid trail as zero
1031: } else {
1032: hadLeadSurrogate = isLeadSurrogate(ch);
1033: ++result; // count others as 1
1034: }
1035: }
1036:
1037: if (offset16 == limit) {
1038: return result;
1039: }
1040:
1041: // end of source being the less significant surrogate character
1042: // shift result back to the start of the supplementary character
1043: if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1044: result--;
1045: }
1046:
1047: return result;
1048: }
1049:
1050: /**
1051: * Append a single UTF-32 value to the end of a StringBuffer.
1052: * If a validity check is required, use
 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a> on
1054: * char32 before calling.
1055: * @param target the buffer to append to
1056: * @param char32 value to append.
1057: * @return the updated StringBuffer
1058: * @exception IllegalArgumentException thrown when char32 does not lie
1059: * within the range of the Unicode codepoints
1060: * @stable ICU 2.1
1061: */
1062: public static StringBuffer append(StringBuffer target, int char32) {
1063: // Check for irregular values
1064: if (char32 < CODEPOINT_MIN_VALUE
1065: || char32 > CODEPOINT_MAX_VALUE) {
1066: throw new IllegalArgumentException("Illegal codepoint: "
1067: + Integer.toHexString(char32));
1068: }
1069:
1070: // Write the UTF-16 values
1071: if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1072: target.append(getLeadSurrogate(char32));
1073: target.append(getTrailSurrogate(char32));
1074: } else {
1075: target.append((char) char32);
1076: }
1077: return target;
1078: }
1079:
1080: /**
1081: * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer
1082: * as a convenience.
1083: *
1084: * @param target the buffer to append to
1085: * @param cp the code point to append
1086: * @return the updated StringBuffer
1087: * @throws IllegalArgumentException if cp is not a valid code point
1088: * @stable ICU 3.0
1089: */
1090: public static StringBuffer appendCodePoint(StringBuffer target,
1091: int cp) {
1092: return append(target, cp);
1093: }
1094:
1095: /**
1096: * Adds a codepoint to offset16 position of the argument char array.
 * @param target char array to which the new code point is appended
1098: * @param limit UTF16 offset which the codepoint will be appended.
1099: * @param char32 code point to be appended
1100: * @return offset after char32 in the array.
1101: * @exception IllegalArgumentException thrown if there is not enough
1102: * space for the append, or when char32 does not lie within
1103: * the range of the Unicode codepoints.
1104: * @stable ICU 2.1
1105: */
1106: public static int append(char[] target, int limit, int char32) {
1107: // Check for irregular values
1108: if (char32 < CODEPOINT_MIN_VALUE
1109: || char32 > CODEPOINT_MAX_VALUE) {
1110: throw new IllegalArgumentException("Illegal codepoint");
1111: }
1112: // Write the UTF-16 values
1113: if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1114: target[limit++] = getLeadSurrogate(char32);
1115: target[limit++] = getTrailSurrogate(char32);
1116: } else {
1117: target[limit++] = (char) char32;
1118: }
1119: return limit;
1120: }
1121:
1122: /**
1123: * Number of codepoints in a UTF16 String
1124: * @param source UTF16 string
1125: * @return number of codepoint in string
1126: * @stable ICU 2.1
1127: */
1128: public static int countCodePoint(String source) {
1129: if (source == null || source.length() == 0) {
1130: return 0;
1131: }
1132: return findCodePointOffset(source, source.length());
1133: }
1134:
1135: /**
1136: * Number of codepoints in a UTF16 String buffer
1137: * @param source UTF16 string buffer
1138: * @return number of codepoint in string
1139: * @stable ICU 2.1
1140: */
1141: public static int countCodePoint(StringBuffer source) {
1142: if (source == null || source.length() == 0) {
1143: return 0;
1144: }
1145: return findCodePointOffset(source, source.length());
1146: }
1147:
1148: /**
1149: * Number of codepoints in a UTF16 char array substring
1150: * @param source UTF16 char array
1151: * @param start offset of the substring
1152: * @param limit offset of the substring
1153: * @return number of codepoint in the substring
1154: * @exception IndexOutOfBoundsException if start and limit are not valid.
1155: * @stable ICU 2.1
1156: */
1157: public static int countCodePoint(char source[], int start, int limit) {
1158: if (source == null || source.length == 0) {
1159: return 0;
1160: }
1161: return findCodePointOffset(source, start, limit, limit - start);
1162: }
1163:
1164: /**
1165: * Set a code point into a UTF16 position.
1166: * Adjusts target according if we are replacing a non-supplementary
1167: * codepoint with a supplementary and vice versa.
1168: * @param target stringbuffer
1169: * @param offset16 UTF16 position to insert into
1170: * @param char32 code point
1171: * @stable ICU 2.1
1172: */
1173: public static void setCharAt(StringBuffer target, int offset16,
1174: int char32) {
1175: int count = 1;
1176: char single = target.charAt(offset16);
1177:
1178: if (isSurrogate(single)) {
1179: // pairs of the surrogate with offset16 at the lead char found
1180: if (isLeadSurrogate(single)
1181: && (target.length() > offset16 + 1)
1182: && isTrailSurrogate(target.charAt(offset16 + 1))) {
1183: count++;
1184: } else {
1185: // pairs of the surrogate with offset16 at the trail char
1186: // found
1187: if (isTrailSurrogate(single) && (offset16 > 0)
1188: && isLeadSurrogate(target.charAt(offset16 - 1))) {
1189: offset16--;
1190: count++;
1191: }
1192: }
1193: }
1194: target.replace(offset16, offset16 + count, valueOf(char32));
1195: }
1196:
1197: /**
1198: * Set a code point into a UTF16 position in a char array.
1199: * Adjusts target according if we are replacing a non-supplementary
1200: * codepoint with a supplementary and vice versa.
1201: * @param target char array
1202: * @param limit numbers of valid chars in target, different from
1203: * target.length. limit counts the number of chars in target
1204: * that represents a string, not the size of array target.
1205: * @param offset16 UTF16 position to insert into
1206: * @param char32 code point
1207: * @return new number of chars in target that represents a string
1208: * @exception IndexOutOfBoundsException if offset16 is out of range
1209: * @stable ICU 2.1
1210: */
1211: public static int setCharAt(char target[], int limit, int offset16,
1212: int char32) {
1213: if (offset16 >= limit) {
1214: throw new ArrayIndexOutOfBoundsException(offset16);
1215: }
1216: int count = 1;
1217: char single = target[offset16];
1218:
1219: if (isSurrogate(single)) {
1220: // pairs of the surrogate with offset16 at the lead char found
1221: if (isLeadSurrogate(single)
1222: && (target.length > offset16 + 1)
1223: && isTrailSurrogate(target[offset16 + 1])) {
1224: count++;
1225: } else {
1226: // pairs of the surrogate with offset16 at the trail char
1227: // found
1228: if (isTrailSurrogate(single) && (offset16 > 0)
1229: && isLeadSurrogate(target[offset16 - 1])) {
1230: offset16--;
1231: count++;
1232: }
1233: }
1234: }
1235:
1236: String str = valueOf(char32);
1237: int result = limit;
1238: int strlength = str.length();
1239: target[offset16] = str.charAt(0);
1240: if (count == strlength) {
1241: if (count == 2) {
1242: target[offset16 + 1] = str.charAt(1);
1243: }
1244: } else {
1245: // this is not exact match in space, we'll have to do some
1246: // shifting
1247: System.arraycopy(target, offset16 + count, target, offset16
1248: + strlength, limit - (offset16 + count));
1249: if (count < strlength) {
1250: // char32 is a supplementary character trying to squeeze into
1251: // a non-supplementary space
1252: target[offset16 + 1] = str.charAt(1);
1253: result++;
1254: if (result < target.length) {
1255: target[result] = 0;
1256: }
1257: } else {
1258: // char32 is a non-supplementary character trying to fill
1259: // into a supplementary space
1260: result--;
1261: target[result] = 0;
1262: }
1263: }
1264: return result;
1265: }
1266:
1267: /**
1268: * Shifts offset16 by the argument number of codepoints
1269: * @param source string
1270: * @param offset16 UTF16 position to shift
1271: * @param shift32 number of codepoints to shift
1272: * @return new shifted offset16
1273: * @exception IndexOutOfBoundsException if the new offset16 is out of
1274: * bounds.
1275: * @stable ICU 2.1
1276: */
1277: public static int moveCodePointOffset(String source, int offset16,
1278: int shift32) {
1279: int result = offset16;
1280: int size = source.length();
1281: int count;
1282: char ch;
1283: if (offset16 < 0 || offset16 > size) {
1284: throw new StringIndexOutOfBoundsException(offset16);
1285: }
1286: if (shift32 > 0) {
1287: if (shift32 + offset16 > size) {
1288: throw new StringIndexOutOfBoundsException(offset16);
1289: }
1290: count = shift32;
1291: while (result < size && count > 0) {
1292: ch = source.charAt(result);
1293: if (isLeadSurrogate(ch) && ((result + 1) < size)
1294: && isTrailSurrogate(source.charAt(result + 1))) {
1295: result++;
1296: }
1297: count--;
1298: result++;
1299: }
1300: } else {
1301: if (offset16 + shift32 < 0) {
1302: throw new StringIndexOutOfBoundsException(offset16);
1303: }
1304: for (count = -shift32; count > 0; count--) {
1305: result--;
1306: if (result < 0) {
1307: break;
1308: }
1309: ch = source.charAt(result);
1310: if (isTrailSurrogate(ch) && result > 0
1311: && isLeadSurrogate(source.charAt(result - 1))) {
1312: result--;
1313: }
1314: }
1315: }
1316: if (count != 0) {
1317: throw new StringIndexOutOfBoundsException(shift32);
1318: }
1319: return result;
1320: }
1321:
1322: /**
1323: * Shifts offset16 by the argument number of codepoints
1324: * @param source string buffer
1325: * @param offset16 UTF16 position to shift
1326: * @param shift32 number of codepoints to shift
1327: * @return new shifted offset16
1328: * @exception IndexOutOfBoundsException if the new offset16 is out of
1329: * bounds.
1330: * @stable ICU 2.1
1331: */
1332: public static int moveCodePointOffset(StringBuffer source,
1333: int offset16, int shift32) {
1334: int result = offset16;
1335: int size = source.length();
1336: int count;
1337: char ch;
1338: if (offset16 < 0 || offset16 > size) {
1339: throw new StringIndexOutOfBoundsException(offset16);
1340: }
1341: if (shift32 > 0) {
1342: if (shift32 + offset16 > size) {
1343: throw new StringIndexOutOfBoundsException(offset16);
1344: }
1345: count = shift32;
1346: while (result < size && count > 0) {
1347: ch = source.charAt(result);
1348: if (isLeadSurrogate(ch) && ((result + 1) < size)
1349: && isTrailSurrogate(source.charAt(result + 1))) {
1350: result++;
1351: }
1352: count--;
1353: result++;
1354: }
1355: } else {
1356: if (offset16 + shift32 < 0) {
1357: throw new StringIndexOutOfBoundsException(offset16);
1358: }
1359: for (count = -shift32; count > 0; count--) {
1360: result--;
1361: if (result < 0) {
1362: break;
1363: }
1364: ch = source.charAt(result);
1365: if (isTrailSurrogate(ch) && result > 0
1366: && isLeadSurrogate(source.charAt(result - 1))) {
1367: result--;
1368: }
1369: }
1370: }
1371: if (count != 0) {
1372: throw new StringIndexOutOfBoundsException(shift32);
1373: }
1374: return result;
1375: }
1376:
1377: /**
1378: * Shifts offset16 by the argument number of codepoints within a subarray.
1379: * @param source char array
1380: * @param start position of the subarray to be performed on
1381: * @param limit position of the subarray to be performed on
1382: * @param offset16 UTF16 position to shift relative to start
1383: * @param shift32 number of codepoints to shift
1384: * @return new shifted offset16 relative to start
1385: * @exception IndexOutOfBoundsException if the new offset16 is out of
1386: * bounds with respect to the subarray or the subarray bounds
1387: * are out of range.
1388: * @stable ICU 2.1
1389: */
1390: public static int moveCodePointOffset(char source[], int start,
1391: int limit, int offset16, int shift32) {
1392: int size = source.length;
1393: int count;
1394: char ch;
1395: int result = offset16 + start;
1396: if (start < 0 || limit < start) {
1397: throw new StringIndexOutOfBoundsException(start);
1398: }
1399: if (limit > size) {
1400: throw new StringIndexOutOfBoundsException(limit);
1401: }
1402: if (offset16 < 0 || result > limit) {
1403: throw new StringIndexOutOfBoundsException(offset16);
1404: }
1405: if (shift32 > 0) {
1406: if (shift32 + result > size) {
1407: throw new StringIndexOutOfBoundsException(result);
1408: }
1409: count = shift32;
1410: while (result < limit && count > 0) {
1411: ch = source[result];
1412: if (isLeadSurrogate(ch) && (result + 1 < limit)
1413: && isTrailSurrogate(source[result + 1])) {
1414: result++;
1415: }
1416: count--;
1417: result++;
1418: }
1419: } else {
1420: if (result + shift32 < start) {
1421: throw new StringIndexOutOfBoundsException(result);
1422: }
1423: for (count = -shift32; count > 0; count--) {
1424: result--;
1425: if (result < start) {
1426: break;
1427: }
1428: ch = source[result];
1429: if (isTrailSurrogate(ch) && result > start
1430: && isLeadSurrogate(source[result - 1])) {
1431: result--;
1432: }
1433: }
1434: }
1435: if (count != 0) {
1436: throw new StringIndexOutOfBoundsException(shift32);
1437: }
1438: result -= start;
1439: return result;
1440: }
1441:
1442: /**
1443: * Inserts char32 codepoint into target at the argument offset16.
1444: * If the offset16 is in the middle of a supplementary codepoint, char32
1445: * will be inserted after the supplementary codepoint.
1446: * The length of target increases by one if codepoint is non-supplementary,
1447: * 2 otherwise.
1448: * <p>
1449: * The overall effect is exactly as if the argument were converted to a
1450: * string by the method valueOf(char) and the characters in that string
1451: * were then inserted into target at the position indicated by offset16.
1452: * </p>
1453: * <p>
1454: * The offset argument must be greater than or equal to 0, and less than
1455: * or equal to the length of source.
1456: * @param target string buffer to insert to
1457: * @param offset16 offset which char32 will be inserted in
1458: * @param char32 codepoint to be inserted
1459: * @return a reference to target
1460: * @exception IndexOutOfBoundsException thrown if offset16 is invalid.
1461: * @stable ICU 2.1
1462: */
1463: public static StringBuffer insert(StringBuffer target,
1464: int offset16, int char32) {
1465: String str = valueOf(char32);
1466: if (offset16 != target.length()
1467: && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1468: offset16++;
1469: }
1470: target.insert(offset16, str);
1471: return target;
1472: }
1473:
1474: /**
1475: * Inserts char32 codepoint into target at the argument offset16.
1476: * If the offset16 is in the middle of a supplementary codepoint, char32
1477: * will be inserted after the supplementary codepoint.
1478: * Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1479: * <p>
1480: * The overall effect is exactly as if the argument were converted to a
1481: * string by the method valueOf(char) and the characters in that string
1482: * were then inserted into target at the position indicated by offset16.
1483: * </p>
1484: * <p>
1485: * The offset argument must be greater than or equal to 0, and less than
1486: * or equal to the limit.
1487: * @param target char array to insert to
1488: * @param limit end index of the char array, limit <= target.length
1489: * @param offset16 offset which char32 will be inserted in
1490: * @param char32 codepoint to be inserted
1491: * @return new limit size
1492: * @exception IndexOutOfBoundsException thrown if offset16 is invalid.
1493: * @stable ICU 2.1
1494: */
1495: public static int insert(char target[], int limit, int offset16,
1496: int char32) {
1497: String str = valueOf(char32);
1498: if (offset16 != limit
1499: && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1500: offset16++;
1501: }
1502: int size = str.length();
1503: if (limit + size > target.length) {
1504: throw new ArrayIndexOutOfBoundsException(offset16 + size);
1505: }
1506: System.arraycopy(target, offset16, target, offset16 + size,
1507: limit - offset16);
1508: target[offset16] = str.charAt(0);
1509: if (size == 2) {
1510: target[offset16 + 1] = str.charAt(1);
1511: }
1512: return limit + size;
1513: }
1514:
1515: /**
1516: * Removes the codepoint at the specified position in this target
1517: * (shortening target by 1 character if the codepoint is a
1518: * non-supplementary, 2 otherwise).
1519: * @param target string buffer to remove codepoint from
1520: * @param offset16 offset which the codepoint will be removed
1521: * @return a reference to target
1522: * @exception IndexOutOfBoundsException thrown if offset16 is invalid.
1523: * @stable ICU 2.1
1524: */
1525: public static StringBuffer delete(StringBuffer target, int offset16) {
1526: int count = 1;
1527: switch (bounds(target, offset16)) {
1528: case LEAD_SURROGATE_BOUNDARY:
1529: count++;
1530: break;
1531: case TRAIL_SURROGATE_BOUNDARY:
1532: count++;
1533: offset16--;
1534: break;
1535: }
1536: target.delete(offset16, offset16 + count);
1537: return target;
1538: }
1539:
1540: /**
1541: * Removes the codepoint at the specified position in this target
1542: * (shortening target by 1 character if the codepoint is a
1543: * non-supplementary, 2 otherwise).
 * @param target char array to remove codepoint from
1545: * @param limit end index of the char array, limit <= target.length
1546: * @param offset16 offset which the codepoint will be removed
1547: * @return a new limit size
1548: * @exception IndexOutOfBoundsException thrown if offset16 is invalid.
1549: * @stable ICU 2.1
1550: */
1551: public static int delete(char target[], int limit, int offset16) {
1552: int count = 1;
1553: switch (bounds(target, 0, limit, offset16)) {
1554: case LEAD_SURROGATE_BOUNDARY:
1555: count++;
1556: break;
1557: case TRAIL_SURROGATE_BOUNDARY:
1558: count++;
1559: offset16--;
1560: break;
1561: }
1562: System.arraycopy(target, offset16 + count, target, offset16,
1563: limit - (offset16 + count));
1564: target[limit - count] = 0;
1565: return limit - count;
1566: }
1567:
1568: /**
1569: * Returns the index within the argument UTF16 format Unicode string of
1570: * the first occurrence of the argument codepoint. I.e., the smallest
1571: * index <code>i</code> such that <code>UTF16.charAt(source, i) ==
1572: * char32</code> is true.
1573: * <p>If no such character occurs in this string, then -1 is returned.</p>
1574: * <p>
1575: * Examples:<br>
1576: * UTF16.indexOf("abc", 'a') returns 0<br>
1577: * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1578: * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1579: * </p>
1580: * Note this method is provided as support to jdk 1.3, which does not
1581: * support supplementary characters to its fullest.
1582: * @param source UTF16 format Unicode string that will be searched
1583: * @param char32 codepoint to search for
1584: * @return the index of the first occurrence of the codepoint in the
1585: * argument Unicode string, or -1 if the codepoint does not occur.
1586: * @stable ICU 2.6
1587: */
1588: public static int indexOf(String source, int char32) {
1589: if (char32 < CODEPOINT_MIN_VALUE
1590: || char32 > CODEPOINT_MAX_VALUE) {
1591: throw new IllegalArgumentException(
1592: "Argument char32 is not a valid codepoint");
1593: }
1594: // non-surrogate bmp
1595: if (char32 < LEAD_SURROGATE_MIN_VALUE
1596: || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1597: return source.indexOf((char) char32);
1598: }
1599: // surrogate
1600: if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1601: int result = source.indexOf((char) char32);
1602: if (result >= 0) {
1603: if (isLeadSurrogate((char) char32)
1604: && (result < source.length() - 1)
1605: && isTrailSurrogate(source.charAt(result + 1))) {
1606: return indexOf(source, char32, result + 1);
1607: }
1608: // trail surrogate
1609: if (result > 0
1610: && isLeadSurrogate(source.charAt(result - 1))) {
1611: return indexOf(source, char32, result + 1);
1612: }
1613: }
1614: return result;
1615: }
1616: // supplementary
1617: String char32str = toString(char32);
1618: return source.indexOf(char32str);
1619: }
1620:
1621: /**
1622: * Returns the index within the argument UTF16 format Unicode string of
1623: * the first occurrence of the argument string str. This method is
1624: * implemented based on codepoints, hence a "lead surrogate character +
 * trail surrogate character" is treated as one entity.
 * Hence if str starts with a trail surrogate character, an occurrence of
 * str in source that is immediately preceded by a lead surrogate character
 * is not a valid match. Vice versa for a str that ends with a lead
 * surrogate character.
1630: * See example below.
1631: * <p>If no such string str occurs in this source, then -1 is returned.
1632: * </p> <p>
1633: * Examples:<br>
1634: * UTF16.indexOf("abc", "ab") returns 0<br>
1635: * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1636: * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1637: * </p>
1638: * Note this method is provided as support to jdk 1.3, which does not
1639: * support supplementary characters to its fullest.
1640: * @param source UTF16 format Unicode string that will be searched
1641: * @param str UTF16 format Unicode string to search for
1642: * @return the index of the first occurrence of the codepoint in the
1643: * argument Unicode string, or -1 if the codepoint does not occur.
1644: * @stable ICU 2.6
1645: */
1646: public static int indexOf(String source, String str) {
1647: int strLength = str.length();
1648: // non-surrogate ends
1649: if (!isTrailSurrogate(str.charAt(0))
1650: && !isLeadSurrogate(str.charAt(strLength - 1))) {
1651: return source.indexOf(str);
1652: }
1653:
1654: int result = source.indexOf(str);
1655: int resultEnd = result + strLength;
1656: if (result >= 0) {
1657: // check last character
1658: if (isLeadSurrogate(str.charAt(strLength - 1))
1659: && (result < source.length() - 1)
1660: && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1661: return indexOf(source, str, resultEnd + 1);
1662: }
1663: // check first character which is a trail surrogate
1664: if (isTrailSurrogate(str.charAt(0)) && result > 0
1665: && isLeadSurrogate(source.charAt(result - 1))) {
1666: return indexOf(source, str, resultEnd + 1);
1667: }
1668: }
1669: return result;
1670: }
1671:
1672: /**
1673: * Returns the index within the argument UTF16 format Unicode string of
1674: * the first occurrence of the argument codepoint. I.e., the smallest
1675: * index i such that: <br>
1676: * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true.
1677: * <p>If no such character occurs in this string, then -1 is returned.</p>
1678: * <p>
1679: * Examples:<br>
1680: * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1681: * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1682: * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1683: * </p>
1684: * Note this method is provided as support to jdk 1.3, which does not
1685: * support supplementary characters to its fullest.
1686: * @param source UTF16 format Unicode string that will be searched
1687: * @param char32 codepoint to search for
1688: * @param fromIndex the index to start the search from.
1689: * @return the index of the first occurrence of the codepoint in the
1690: * argument Unicode string at or after fromIndex, or -1 if the
1691: * codepoint does not occur.
1692: * @stable ICU 2.6
1693: */
1694: public static int indexOf(String source, int char32, int fromIndex) {
1695: if (char32 < CODEPOINT_MIN_VALUE
1696: || char32 > CODEPOINT_MAX_VALUE) {
1697: throw new IllegalArgumentException(
1698: "Argument char32 is not a valid codepoint");
1699: }
1700: // non-surrogate bmp
1701: if (char32 < LEAD_SURROGATE_MIN_VALUE
1702: || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1703: return source.indexOf((char) char32, fromIndex);
1704: }
1705: // surrogate
1706: if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1707: int result = source.indexOf((char) char32, fromIndex);
1708: if (result >= 0) {
1709: if (isLeadSurrogate((char) char32)
1710: && (result < source.length() - 1)
1711: && isTrailSurrogate(source.charAt(result + 1))) {
1712: return indexOf(source, char32, result + 1);
1713: }
1714: // trail surrogate
1715: if (result > 0
1716: && isLeadSurrogate(source.charAt(result - 1))) {
1717: return indexOf(source, char32, result + 1);
1718: }
1719: }
1720: return result;
1721: }
1722: // supplementary
1723: String char32str = toString(char32);
1724: return source.indexOf(char32str, fromIndex);
1725: }
1726:
1727: /**
1728: * Returns the index within the argument UTF16 format Unicode string of
1729: * the first occurrence of the argument string str. This method is
1730: * implemented based on codepoints, hence a "lead surrogate character +
 * trail surrogate character" is treated as one entity.
 * Hence if str starts with a trail surrogate character, an occurrence of
 * str in source that is immediately preceded by a lead surrogate character
 * is not a valid match. Vice versa for a str that ends with a lead
 * surrogate character.
1736: * See example below.
1737: * <p>If no such string str occurs in this source, then -1 is returned.
1738: * </p> <p>
1739: * Examples:<br>
1740: * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1741: * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1742: * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1743: * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1744: * </p>
1745: * Note this method is provided as support to jdk 1.3, which does not
1746: * support supplementary characters to its fullest.
1747: * @param source UTF16 format Unicode string that will be searched
1748: * @param str UTF16 format Unicode string to search for
1749: * @param fromIndex the index to start the search from.
1750: * @return the index of the first occurrence of the codepoint in the
1751: * argument Unicode string, or -1 if the codepoint does not occur.
1752: * @stable ICU 2.6
1753: */
1754: public static int indexOf(String source, String str, int fromIndex) {
1755: int strLength = str.length();
1756: // non-surrogate ends
1757: if (!isTrailSurrogate(str.charAt(0))
1758: && !isLeadSurrogate(str.charAt(strLength - 1))) {
1759: return source.indexOf(str, fromIndex);
1760: }
1761:
1762: int result = source.indexOf(str, fromIndex);
1763: int resultEnd = result + strLength;
1764: if (result >= 0) {
1765: // check last character
1766: if (isLeadSurrogate(str.charAt(strLength - 1))
1767: && (result < source.length() - 1)
1768: && isTrailSurrogate(source.charAt(resultEnd))) {
1769: return indexOf(source, str, resultEnd + 1);
1770: }
1771: // check first character which is a trail surrogate
1772: if (isTrailSurrogate(str.charAt(0)) && result > 0
1773: && isLeadSurrogate(source.charAt(result - 1))) {
1774: return indexOf(source, str, resultEnd + 1);
1775: }
1776: }
1777: return result;
1778: }
1779:
1780: /**
1781: * Returns the index within the argument UTF16 format Unicode string of
1782: * the last occurrence of the argument codepoint. I.e., the index returned
1783: * is the largest value i such that: UTF16.charAt(source, i) == char32
1784: * is true.
1785: * <p>
1786: * Examples:<br>
1787: * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1788: * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1789: * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1790: * </p>
1791: * <p>source is searched backwards starting at the last character.</p>
1792: * Note this method is provided as support to jdk 1.3, which does not
1793: * support supplementary characters to its fullest.
1794: * @param source UTF16 format Unicode string that will be searched
1795: * @param char32 codepoint to search for
1796: * @return the index of the last occurrence of the codepoint in source,
1797: * or -1 if the codepoint does not occur.
1798: * @stable ICU 2.6
1799: */
1800: public static int lastIndexOf(String source, int char32) {
1801: if (char32 < CODEPOINT_MIN_VALUE
1802: || char32 > CODEPOINT_MAX_VALUE) {
1803: throw new IllegalArgumentException(
1804: "Argument char32 is not a valid codepoint");
1805: }
1806: // non-surrogate bmp
1807: if (char32 < LEAD_SURROGATE_MIN_VALUE
1808: || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1809: return source.lastIndexOf((char) char32);
1810: }
1811: // surrogate
1812: if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1813: int result = source.lastIndexOf((char) char32);
1814: if (result >= 0) {
1815: if (isLeadSurrogate((char) char32)
1816: && (result < source.length() - 1)
1817: && isTrailSurrogate(source.charAt(result + 1))) {
1818: return lastIndexOf(source, char32, result - 1);
1819: }
1820: // trail surrogate
1821: if (result > 0
1822: && isLeadSurrogate(source.charAt(result - 1))) {
1823: return lastIndexOf(source, char32, result - 1);
1824: }
1825: }
1826: return result;
1827: }
1828: // supplementary
1829: String char32str = toString(char32);
1830: return source.lastIndexOf(char32str);
1831: }
1832:
1833: /**
1834: * Returns the index within the argument UTF16 format Unicode string of
1835: * the last occurrence of the argument string str. This method is
1836: * implemented based on codepoints, hence a "lead surrogate character +
1837: * trail surrogate character" is treated as one entity.e
1838: * Hence if the str starts with trail surrogate character at index 0, a
1839: * source with a leading a surrogate character before str found at in
1840: * source will not have a valid match. Vice versa for lead surrogates
1841: * that ends str.
1842: * See example below.
1843: * <p>
1844: * Examples:<br>
1845: * UTF16.lastIndexOf("abc", "a") returns 0<br>
1846: * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1847: * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1848: * </p>
1849: * <p>source is searched backwards starting at the last character.</p>
1850: * Note this method is provided as support to jdk 1.3, which does not
1851: * support supplementary characters to its fullest.
1852: * @param source UTF16 format Unicode string that will be searched
1853: * @param str UTF16 format Unicode string to search for
1854: * @return the index of the last occurrence of the codepoint in source,
1855: * or -1 if the codepoint does not occur.
1856: * @stable ICU 2.6
1857: */
1858: public static int lastIndexOf(String source, String str) {
1859: int strLength = str.length();
1860: // non-surrogate ends
1861: if (!isTrailSurrogate(str.charAt(0))
1862: && !isLeadSurrogate(str.charAt(strLength - 1))) {
1863: return source.lastIndexOf(str);
1864: }
1865:
1866: int result = source.lastIndexOf(str);
1867: if (result >= 0) {
1868: // check last character
1869: if (isLeadSurrogate(str.charAt(strLength - 1))
1870: && (result < source.length() - 1)
1871: && isTrailSurrogate(source.charAt(result
1872: + strLength + 1))) {
1873: return lastIndexOf(source, str, result - 1);
1874: }
1875: // check first character which is a trail surrogate
1876: if (isTrailSurrogate(str.charAt(0)) && result > 0
1877: && isLeadSurrogate(source.charAt(result - 1))) {
1878: return lastIndexOf(source, str, result - 1);
1879: }
1880: }
1881: return result;
1882: }
1883:
1884: /**
1885: * <p>Returns the index within the argument UTF16 format Unicode string of
1886: * the last occurrence of the argument codepoint, where the result is less
1887: * than or equals to fromIndex.</p>
1888: * <p>This method is implemented based on codepoints, hence a single
1889: * surrogate character will not match a supplementary character.</p>
1890: * <p>source is searched backwards starting at the last character starting
1891: * at the specified index.</p>
1892: * <p>
1893: * Examples:<br>
1894: * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1895: * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1896: * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1897: * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
1898: * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1899: * </p>
1900: * Note this method is provided as support to jdk 1.3, which does not
1901: * support supplementary characters to its fullest.
1902: * @param source UTF16 format Unicode string that will be searched
1903: * @param char32 codepoint to search for
1904: * @param fromIndex the index to start the search from. There is no
1905: * restriction on the value of fromIndex. If it is
1906: * greater than or equal to the length of this string,
1907: * it has the same effect as if it were equal to one
1908: * less than the length of this string: this entire
1909: * string may be searched. If it is negative, it has
1910: * the same effect as if it were -1: -1 is returned.
1911: * @return the index of the last occurrence of the codepoint in source,
1912: * or -1 if the codepoint does not occur.
1913: * @stable ICU 2.6
1914: */
1915: public static int lastIndexOf(String source, int char32,
1916: int fromIndex) {
1917: if (char32 < CODEPOINT_MIN_VALUE
1918: || char32 > CODEPOINT_MAX_VALUE) {
1919: throw new IllegalArgumentException(
1920: "Argument char32 is not a valid codepoint");
1921: }
1922: // non-surrogate bmp
1923: if (char32 < LEAD_SURROGATE_MIN_VALUE
1924: || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1925: return source.lastIndexOf((char) char32, fromIndex);
1926: }
1927: // surrogate
1928: if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1929: int result = source.lastIndexOf((char) char32, fromIndex);
1930: if (result >= 0) {
1931: if (isLeadSurrogate((char) char32)
1932: && (result < source.length() - 1)
1933: && isTrailSurrogate(source.charAt(result + 1))) {
1934: return lastIndexOf(source, char32, result - 1);
1935: }
1936: // trail surrogate
1937: if (result > 0
1938: && isLeadSurrogate(source.charAt(result - 1))) {
1939: return lastIndexOf(source, char32, result - 1);
1940: }
1941: }
1942: return result;
1943: }
1944: // supplementary
1945: String char32str = toString(char32);
1946: return source.lastIndexOf(char32str, fromIndex);
1947: }
1948:
1949: /**
1950: * <p>Returns the index within the argument UTF16 format Unicode string of
1951: * the last occurrence of the argument string str, where the result is less
1952: * than or equals to fromIndex.</p>
1953: * <p>This method is implemented based on codepoints, hence a
1954: * "lead surrogate character + trail surrogate character" is treated as one
1955: * entity.
1956: * Hence if the str starts with trail surrogate character at index 0, a
1957: * source with a leading a surrogate character before str found at in
1958: * source will not have a valid match. Vice versa for lead surrogates
1959: * that ends str.
1960: * </p>
1961: * See example below.
1962: * <p>
1963: * Examples:<br>
1964: * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
1965: * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
1966: * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
1967: * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
1968: * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
1969: * </p>
1970: * <p>source is searched backwards starting at the last character.</p>
1971: * Note this method is provided as support to jdk 1.3, which does not
1972: * support supplementary characters to its fullest.
1973: * @param source UTF16 format Unicode string that will be searched
1974: * @param str UTF16 format Unicode string to search for
1975: * @param fromIndex the index to start the search from. There is no
1976: * restriction on the value of fromIndex. If it is
1977: * greater than or equal to the length of this string,
1978: * it has the same effect as if it were equal to one
1979: * less than the length of this string: this entire
1980: * string may be searched. If it is negative, it has
1981: * the same effect as if it were -1: -1 is returned.
1982: * @return the index of the last occurrence of the codepoint in source,
1983: * or -1 if the codepoint does not occur.
1984: * @stable ICU 2.6
1985: */
1986: public static int lastIndexOf(String source, String str,
1987: int fromIndex) {
1988: int strLength = str.length();
1989: // non-surrogate ends
1990: if (!isTrailSurrogate(str.charAt(0))
1991: && !isLeadSurrogate(str.charAt(strLength - 1))) {
1992: return source.lastIndexOf(str, fromIndex);
1993: }
1994:
1995: int result = source.lastIndexOf(str, fromIndex);
1996: if (result >= 0) {
1997: // check last character
1998: if (isLeadSurrogate(str.charAt(strLength - 1))
1999: && (result < source.length() - 1)
2000: && isTrailSurrogate(source.charAt(result
2001: + strLength))) {
2002: return lastIndexOf(source, str, result - 1);
2003: }
2004: // check first character which is a trail surrogate
2005: if (isTrailSurrogate(str.charAt(0)) && result > 0
2006: && isLeadSurrogate(source.charAt(result - 1))) {
2007: return lastIndexOf(source, str, result - 1);
2008: }
2009: }
2010: return result;
2011: }
2012:
2013: /**
2014: * Returns a new UTF16 format Unicode string resulting from replacing all
2015: * occurrences of oldChar32 in source with newChar32.
2016: * If the character oldChar32 does not occur in the UTF16 format Unicode
2017: * string source, then source will be returned. Otherwise, a new String
2018: * object is created that represents a codepoint sequence identical to the
2019: * codepoint sequence represented by source, except that every occurrence
2020: * of oldChar32 is replaced by an occurrence of newChar32.
2021: * <p>
2022: * Examples: <br>
2023: * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
2024: * returns "mosquito in your collar"<br>
2025: * UTF16.replace("JonL", 'q', 'x');<br>
2026: * returns "JonL" (no change)<br>
2027: * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!');
2028: * <br> returns "Supplementary character !"<br>
2029: * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!');
2030: * <br> returns "Supplementary character \ud800\udc00"<br>
2031: * </p>
2032: * Note this method is provided as support to jdk 1.3, which does not
2033: * support supplementary characters to its fullest.
2034: * @param source UTF16 format Unicode string which the codepoint
2035: * replacements will be based on.
2036: * @param oldChar32 non-zero old codepoint to be replaced.
2037: * @param newChar32 the new codepoint to replace oldChar32
2038: * @return new String derived from source by replacing every occurrence
2039: * of oldChar32 with newChar32, unless when no oldChar32 is found
2040: * in source then source will be returned.
2041: * @stable ICU 2.6
2042: */
2043: public static String replace(String source, int oldChar32,
2044: int newChar32) {
2045: if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
2046: throw new IllegalArgumentException(
2047: "Argument oldChar32 is not a valid codepoint");
2048: }
2049: if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
2050: throw new IllegalArgumentException(
2051: "Argument newChar32 is not a valid codepoint");
2052: }
2053:
2054: int index = indexOf(source, oldChar32);
2055: if (index == -1) {
2056: return source;
2057: }
2058: String newChar32Str = toString(newChar32);
2059: int oldChar32Size = 1;
2060: int newChar32Size = newChar32Str.length();
2061: StringBuffer result = new StringBuffer(source);
2062: int resultIndex = index;
2063:
2064: if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
2065: oldChar32Size = 2;
2066: }
2067:
2068: while (index != -1) {
2069: int endResultIndex = resultIndex + oldChar32Size;
2070: result.replace(resultIndex, endResultIndex, newChar32Str);
2071: int lastEndIndex = index + oldChar32Size;
2072: index = indexOf(source, oldChar32, lastEndIndex);
2073: resultIndex += newChar32Size + index - lastEndIndex;
2074: }
2075: return result.toString();
2076: }
2077:
2078: /**
2079: * Returns a new UTF16 format Unicode string resulting from replacing all
2080: * occurrences of oldStr in source with newStr.
2081: * If the string oldStr does not occur in the UTF16 format Unicode
2082: * string source, then source will be returned. Otherwise, a new String
2083: * object is created that represents a codepoint sequence identical to the
2084: * codepoint sequence represented by source, except that every occurrence
2085: * of oldStr is replaced by an occurrence of newStr.
2086: * <p>
2087: * Examples: <br>
2088: * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2089: * returns "mosquito in your collar"<br>
2090: * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2091: * returns "cat in your cellar"<br>
2092: * UTF16.replace("JonL", "q", "x");<br>
2093: * returns "JonL" (no change)<br>
2094: * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00",
2095: * '!');
2096: * <br> returns "Supplementary character !"<br>
2097: * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!');
2098: * <br> returns "Supplementary character \ud800\udc00"<br>
2099: * </p>
2100: * Note this method is provided as support to jdk 1.3, which does not
2101: * support supplementary characters to its fullest.
2102: * @param source UTF16 format Unicode string which the
2103: * replacements will be based on.
2104: * @param oldStr non-zero-length string to be replaced.
2105: * @param newStr the new string to replace oldStr
2106: * @return new String derived from source by replacing every occurrence
2107: * of oldStr with newStr. When no oldStr is found
2108: * in source, then source will be returned.
2109: * @stable ICU 2.6
2110: */
2111: public static String replace(String source, String oldStr,
2112: String newStr) {
2113: int index = indexOf(source, oldStr);
2114: if (index == -1) {
2115: return source;
2116: }
2117: int oldStrSize = oldStr.length();
2118: int newStrSize = newStr.length();
2119: StringBuffer result = new StringBuffer(source);
2120: int resultIndex = index;
2121:
2122: while (index != -1) {
2123: int endResultIndex = resultIndex + oldStrSize;
2124: result.replace(resultIndex, endResultIndex, newStr);
2125: int lastEndIndex = index + oldStrSize;
2126: index = indexOf(source, oldStr, lastEndIndex);
2127: resultIndex += newStrSize + index - lastEndIndex;
2128: }
2129: return result.toString();
2130: }
2131:
2132: /**
2133: * Reverses a UTF16 format Unicode string and replaces source's content
2134: * with it.
2135: * This method will reverse surrogate characters correctly, instead of
2136: * blindly reversing every character.
2137: * <p>
2138: * Examples:<br>
2139: * UTF16.reverse(new StringBuffer(
2140: * "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
2141: * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
2142: * @param source the source StringBuffer that contains UTF16 format
2143: * Unicode string to be reversed
2144: * @return a modified source with reversed UTF16 format Unicode string.
2145: * @stable ICU 2.6
2146: */
2147: public static StringBuffer reverse(StringBuffer source) {
2148: int length = source.length();
2149: StringBuffer result = new StringBuffer(length);
2150: for (int i = length; i-- > 0;) {
2151: char ch = source.charAt(i);
2152: if (isTrailSurrogate(ch) && i > 0) {
2153: char ch2 = source.charAt(i - 1);
2154: if (isLeadSurrogate(ch2)) {
2155: result.append(ch2);
2156: result.append(ch);
2157: --i;
2158: continue;
2159: }
2160: }
2161: result.append(ch);
2162: }
2163: return result;
2164: }
2165:
2166: /**
2167: * Check if the string contains more Unicode code points than a certain
2168: * number. This is more efficient than counting all code points in the
2169: * entire string and comparing that number with a threshold.
2170: * This function may not need to scan the string at all if the length is
2171: * within a certain range, and never needs to count more than 'number + 1'
2172: * code points. Logically equivalent to (countCodePoint(s) > number). A
2173: * Unicode code point may occupy either one or two code units.
2174: * @param source The input string.
2175: * @param number The number of code points in the string is compared
2176: * against the 'number' parameter.
2177: * @return boolean value for whether the string contains more Unicode code
2178: * points than 'number'.
2179: * @stable ICU 2.4
2180: */
2181: public static boolean hasMoreCodePointsThan(String source,
2182: int number) {
2183: if (number < 0) {
2184: return true;
2185: }
2186: if (source == null) {
2187: return false;
2188: }
2189: int length = source.length();
2190:
2191: // length >= 0 known
2192: // source contains at least (length + 1) / 2 code points: <= 2
2193: // chars per cp
2194: if (((length + 1) >> 1) > number) {
2195: return true;
2196: }
2197:
2198: // check if source does not even contain enough chars
2199: int maxsupplementary = length - number;
2200: if (maxsupplementary <= 0) {
2201: return false;
2202: }
2203:
2204: // there are maxsupplementary = length - number more chars than
2205: // asked-for code points
2206:
2207: // count code points until they exceed and also check that there are
2208: // no more than maxsupplementary supplementary code points (char pairs)
2209: int start = 0;
2210: while (true) {
2211: if (length == 0) {
2212: return false;
2213: }
2214: if (number == 0) {
2215: return true;
2216: }
2217: if (isLeadSurrogate(source.charAt(start++))
2218: && start != length
2219: && isTrailSurrogate(source.charAt(start))) {
2220: start++;
2221: if (--maxsupplementary <= 0) {
2222: // too many pairs - too few code points
2223: return false;
2224: }
2225: }
2226: --number;
2227: }
2228: }
2229:
2230: /**
2231: * Check if the sub-range of char array, from argument start to limit,
2232: * contains more Unicode code points than a certain
2233: * number. This is more efficient than counting all code points in the
2234: * entire char array range and comparing that number with a threshold.
2235: * This function may not need to scan the char array at all if start and
2236: * limit is within a certain range, and never needs to count more than
2237: * 'number + 1' code points.
2238: * Logically equivalent to (countCodePoint(source, start, limit) > number).
2239: * A Unicode code point may occupy either one or two code units.
2240: * @param source array of UTF-16 chars
2241: * @param start offset to substring in the source array for analyzing
2242: * @param limit offset to substring in the source array for analyzing
2243: * @param number The number of code points in the string is compared
2244: * against the 'number' parameter.
2245: * @return boolean value for whether the string contains more Unicode code
2246: * points than 'number'.
2247: * @exception IndexOutOfBoundsException thrown when limit < start
2248: * @stable ICU 2.4
2249: */
2250: public static boolean hasMoreCodePointsThan(char source[],
2251: int start, int limit, int number) {
2252: int length = limit - start;
2253: if (length < 0 || start < 0 || limit < 0) {
2254: throw new IndexOutOfBoundsException(
2255: "Start and limit indexes should be non-negative and start <= limit");
2256: }
2257: if (number < 0) {
2258: return true;
2259: }
2260: if (source == null) {
2261: return false;
2262: }
2263:
2264: // length >= 0 known
2265: // source contains at least (length + 1) / 2 code points: <= 2
2266: // chars per cp
2267: if (((length + 1) >> 1) > number) {
2268: return true;
2269: }
2270:
2271: // check if source does not even contain enough chars
2272: int maxsupplementary = length - number;
2273: if (maxsupplementary <= 0) {
2274: return false;
2275: }
2276:
2277: // there are maxsupplementary = length - number more chars than
2278: // asked-for code points
2279:
2280: // count code points until they exceed and also check that there are
2281: // no more than maxsupplementary supplementary code points (char pairs)
2282: while (true) {
2283: if (length == 0) {
2284: return false;
2285: }
2286: if (number == 0) {
2287: return true;
2288: }
2289: if (isLeadSurrogate(source[start++]) && start != limit
2290: && isTrailSurrogate(source[start])) {
2291: start++;
2292: if (--maxsupplementary <= 0) {
2293: // too many pairs - too few code points
2294: return false;
2295: }
2296: }
2297: --number;
2298: }
2299: }
2300:
2301: /**
2302: * Check if the string buffer contains more Unicode code points than a
2303: * certain number. This is more efficient than counting all code points in
2304: * the entire string buffer and comparing that number with a threshold.
2305: * This function may not need to scan the string buffer at all if the
2306: * length is within a certain range, and never needs to count more than
2307: * 'number + 1' code points. Logically equivalent to
2308: * (countCodePoint(s) > number). A Unicode code point may occupy either one
2309: * or two code units.
2310: * @param source The input string buffer.
2311: * @param number The number of code points in the string buffer is compared
2312: * against the 'number' parameter.
2313: * @return boolean value for whether the string buffer contains more
2314: * Unicode code points than 'number'.
2315: * @stable ICU 2.4
2316: */
2317: public static boolean hasMoreCodePointsThan(StringBuffer source,
2318: int number) {
2319: if (number < 0) {
2320: return true;
2321: }
2322: if (source == null) {
2323: return false;
2324: }
2325: int length = source.length();
2326:
2327: // length >= 0 known
2328: // source contains at least (length + 1) / 2 code points: <= 2
2329: // chars per cp
2330: if (((length + 1) >> 1) > number) {
2331: return true;
2332: }
2333:
2334: // check if source does not even contain enough chars
2335: int maxsupplementary = length - number;
2336: if (maxsupplementary <= 0) {
2337: return false;
2338: }
2339:
2340: // there are maxsupplementary = length - number more chars than
2341: // asked-for code points
2342:
2343: // count code points until they exceed and also check that there are
2344: // no more than maxsupplementary supplementary code points (char pairs)
2345: int start = 0;
2346: while (true) {
2347: if (length == 0) {
2348: return false;
2349: }
2350: if (number == 0) {
2351: return true;
2352: }
2353: if (isLeadSurrogate(source.charAt(start++))
2354: && start != length
2355: && isTrailSurrogate(source.charAt(start))) {
2356: start++;
2357: if (--maxsupplementary <= 0) {
2358: // too many pairs - too few code points
2359: return false;
2360: }
2361: }
2362: --number;
2363: }
2364: }
2365:
2366: /**
2367: * Cover JDK 1.5 API. Create a String from an array of codePoints.
2368: * @param codePoints the code array
2369: * @param offset the start of the text in the code point array
2370: * @param count the number of code points
2371: * @return a String representing the code points between offset and count
2372: * @throws IllegalArgumentException if an invalid code point is encountered
2373: * @throws IndexOutOfBoundsException if the offset or count are out of bounds.
2374: * @stable ICU 3.0
2375: */
2376: public static String newString(int[] codePoints, int offset,
2377: int count) {
2378: if (count < 0) {
2379: throw new IllegalArgumentException();
2380: }
2381: char[] chars = new char[count];
2382: int w = 0;
2383: for (int r = offset, e = offset + count; r < e; ++r) {
2384: int cp = codePoints[r];
2385: if (cp < 0 || cp > 0x10ffff) {
2386: throw new IllegalArgumentException();
2387: }
2388: while (true) {
2389: try {
2390: if (cp < 0x010000) {
2391: chars[w] = (char) cp;
2392: w++;
2393: } else {
2394: chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2395: chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2396: w += 2;
2397: }
2398: break;
2399: } catch (IndexOutOfBoundsException ex) {
2400: int newlen = (int) (Math
2401: .ceil((double) codePoints.length * (w + 2)
2402: / (r - offset + 1)));
2403: char[] temp = new char[newlen];
2404: System.arraycopy(chars, 0, temp, 0, w);
2405: chars = temp;
2406: }
2407: }
2408: }
2409: return new String(chars, 0, w);
2410: }
2411:
2412: /**
2413: * <p>UTF16 string comparator class.
2414: * Allows UTF16 string comparison to be done with the various modes</p>
2415: * <ul>
2416: * <li> Code point comparison or code unit comparison
2417: * <li> Case sensitive comparison, case insensitive comparison or case
2418: * insensitive comparison with special handling for character 'i'.
2419: * </ul>
2420: * <p>The code unit or code point comparison differ only when comparing
2421: * supplementary code points (\u10000..\u10ffff) to BMP code points
2422: * near the end of the BMP (i.e., \ue000..\uffff). In code unit
2423: * comparison, high BMP code points sort after supplementary code points
2424: * because they are stored as pairs of surrogates which are at
2425: * \ud800..\udfff.</p>
2426: * @see #FOLD_CASE_DEFAULT
2427: * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2428: * @stable ICU 2.1
2429: */
2430: public static final class StringComparator implements
2431: java.util.Comparator {
2432: // public constructor ------------------------------------------------
2433:
/**
 * Creates a comparator with the default settings: code unit order and
 * case sensitive comparison. The fold case option is set to
 * FOLD_CASE_DEFAULT.
 * @stable ICU 2.1
 */
public StringComparator() {
    this(false, false, StringComparator.FOLD_CASE_DEFAULT);
}
2442:
2443: /**
2444: * Constructor that does comparison based on the argument options.
2445: * @param codepointcompare flag to indicate true for code point
2446: * comparison or false for code unit comparison.
2447: * @param ignorecase false for case sensitive comparison, true for
2448: * case-insensitive comparison
2449: * @param foldcaseoption FOLD_CASE_DEFAULT or
2450: * FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only when
2451: * ignorecase is set to true. If ignorecase is false, this option
2452: * is ignored.
2453: * @see #FOLD_CASE_DEFAULT
2454: * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2455: * @throws IllegalArgumentException if foldcaseoption is out of range
2456: * @stable ICU 2.4
2457: */
2458: public StringComparator(boolean codepointcompare,
2459: boolean ignorecase, int foldcaseoption) {
2460: setCodePointCompare(codepointcompare);
2461: m_ignoreCase_ = ignorecase;
2462: if (foldcaseoption < FOLD_CASE_DEFAULT
2463: || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2464: throw new IllegalArgumentException(
2465: "Invalid fold case option");
2466: }
2467: m_foldCase_ = foldcaseoption;
2468: }
2469:
2470: // public data member ------------------------------------------------
2471:
2472: /**
2473: * <p>Option value for case folding comparison:</p>
2474: * <p>Comparison is case insensitive, strings are folded using default
2475: * mappings defined in Unicode data file CaseFolding.txt, before
2476: * comparison.
2477: * </p>
2478: * @stable ICU 2.4
2479: */
2480: public static final int FOLD_CASE_DEFAULT = 0;
2481: /**
2482: * <p>Option value for case folding comparison:</p>
2483: * <p>Comparison is case insensitive, strings are folded using modified
2484: * mappings defined in Unicode data file CaseFolding.txt, before
2485: * comparison.
2486: * </p>
2487: * <p>The modified set of mappings is provided in a Unicode data file
2488: * CaseFolding.txt to handle dotted I and dotless i appropriately for
2489: * Turkic languages (tr, az).</p>
2490: * <p>Before Unicode 3.2, CaseFolding.txt contains mappings marked with
2491: * 'I' that are to be included for default mappings and excluded for
2492: * the Turkic-specific mappings.</p>
2493: * <p>Unicode 3.2 CaseFolding.txt instead contains mappings marked with
2494: * 'T' that are to be excluded for default mappings and included for
2495: * the Turkic-specific mappings.</p>
2496: * @stable ICU 2.4
2497: */
2498: public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2499:
2500: // public methods ----------------------------------------------------
2501:
2502: // public setters ----------------------------------------------------
2503:
2504: /**
2505: * Sets the comparison mode to code point compare if flag is true.
2506: * Otherwise comparison mode is set to code unit compare
2507: * @param flag true for code point compare, false for code unit compare
2508: * @stable ICU 2.4
2509: */
2510: public void setCodePointCompare(boolean flag) {
2511: if (flag) {
2512: m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2513: } else {
2514: m_codePointCompare_ = 0;
2515: }
2516: }
2517:
2518: /**
2519: * Sets the Comparator to case-insensitive comparison mode if argument
2520: * is true, otherwise case sensitive comparison mode if set to false.
2521: * @param ignorecase true for case-insitive comparison, false for
2522: * case sensitive comparison
2523: * @param foldcaseoption FOLD_CASE_DEFAULT or
2524: * FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only when
2525: * ignorecase is set to true. If ignorecase is false, this option
2526: * is ignored.
2527: * @see #FOLD_CASE_DEFAULT
2528: * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2529: * @stable ICU 2.4
2530: */
2531: public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2532: m_ignoreCase_ = ignorecase;
2533: if (foldcaseoption < FOLD_CASE_DEFAULT
2534: || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2535: throw new IllegalArgumentException(
2536: "Invalid fold case option");
2537: }
2538: m_foldCase_ = foldcaseoption;
2539: }
2540:
2541: // public getters ----------------------------------------------------
2542:
2543: /**
2544: * Checks if the comparison mode is code point compare.
2545: * @return true for code point compare, false for code unit compare
2546: * @stable ICU 2.4
2547: */
2548: public boolean getCodePointCompare() {
2549: return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2550: }
2551:
2552: /**
2553: * Checks if Comparator is in the case insensitive mode.
2554: * @return true if Comparator performs case insensitive comparison,
2555: * false otherwise
2556: * @stable ICU 2.4
2557: */
2558: public boolean getIgnoreCase() {
2559: return m_ignoreCase_;
2560: }
2561:
2562: /**
2563: * Gets the fold case options set in Comparator to be used with case
2564: * insensitive comparison.
2565: * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2566: * @see #FOLD_CASE_DEFAULT
2567: * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2568: * @stable ICU 2.4
2569: */
2570: public int getIgnoreCaseOption() {
2571: return m_foldCase_;
2572: }
2573:
2574: // public other methods ----------------------------------------------
2575:
2576: /**
2577: * Compare two strings depending on the options selected during
2578: * construction.
2579: * @param a first source string.
2580: * @param b second source string.
2581: * @return 0 returned if a == b. If a < b, a negative value is returned.
2582: * Otherwise if a > b, a positive value is returned.
2583: * @exception ClassCastException thrown when either a or b is not a
2584: * String object
2585: * @stable ICU 2.4
2586: */
2587: public int compare(Object a, Object b) {
2588: String str1 = (String) a;
2589: String str2 = (String) b;
2590:
2591: if (str1 == str2) {
2592: return 0;
2593: }
2594: if (str1 == null) {
2595: return -1;
2596: }
2597: if (str2 == null) {
2598: return 1;
2599: }
2600:
2601: if (m_ignoreCase_) {
2602: return compareCaseInsensitive(str1, str2);
2603: }
2604: return compareCaseSensitive(str1, str2);
2605: }
2606:
2607: // private data member ----------------------------------------------
2608:
/**
 * Code point comparison option. Holds Normalizer.COMPARE_CODE_POINT_ORDER
 * if code point order comparison is required, 0 for code unit order.
 */
2613: private int m_codePointCompare_;
2614:
2615: /**
2616: * Fold case comparison option.
2617: */
2618: private int m_foldCase_;
2619:
2620: /**
2621: * Flag indicator if ignore case is to be used during comparison
2622: */
2623: private boolean m_ignoreCase_;
2624:
2625: /**
2626: * Code point order offset for surrogate characters
2627: */
2628: private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2629:
2630: // private method ---------------------------------------------------
2631:
2632: /**
2633: * Compares case insensitive. This is a direct port of ICU4C, to make
2634: * maintainence life easier.
2635: * @param s1 first string to compare
2636: * @param s2 second string to compare
2637: * @return -1 is s1 < s2, 0 if equals,
2638: */
2639: private int compareCaseInsensitive(String s1, String s2) {
2640: return NormalizerImpl.cmpEquivFold(s1, s2, m_foldCase_
2641: | m_codePointCompare_
2642: | Normalizer.COMPARE_IGNORE_CASE);
2643: }
2644:
2645: /**
2646: * Compares case sensitive. This is a direct port of ICU4C, to make
2647: * maintainence life easier.
2648: * @param s1 first string to compare
2649: * @param s2 second string to compare
2650: * @return -1 is s1 < s2, 0 if equals,
2651: */
2652: private int compareCaseSensitive(String s1, String s2) {
2653: // compare identical prefixes - they do not need to be fixed up
2654: // limit1 = start1 + min(lenght1, length2)
2655: int length1 = s1.length();
2656: int length2 = s2.length();
2657: int minlength = length1;
2658: int result = 0;
2659: if (length1 < length2) {
2660: result = -1;
2661: } else if (length1 > length2) {
2662: result = 1;
2663: minlength = length2;
2664: }
2665:
2666: char c1 = 0;
2667: char c2 = 0;
2668: int index = 0;
2669: for (; index < minlength; index++) {
2670: c1 = s1.charAt(index);
2671: c2 = s2.charAt(index);
2672: // check pseudo-limit
2673: if (c1 != c2) {
2674: break;
2675: }
2676: }
2677:
2678: if (index == minlength) {
2679: return result;
2680: }
2681:
2682: boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2683: // if both values are in or above the surrogate range, fix them up
2684: if (c1 >= LEAD_SURROGATE_MIN_VALUE
2685: && c2 >= LEAD_SURROGATE_MIN_VALUE
2686: && codepointcompare) {
2687: // subtract 0x2800 from BMP code points to make them smaller
2688: // than supplementary ones
2689: if ((c1 <= LEAD_SURROGATE_MAX_VALUE
2690: && (index + 1) != length1 && isTrailSurrogate(s1
2691: .charAt(index + 1)))
2692: || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1
2693: .charAt(index - 1)))) {
2694: // part of a surrogate pair, leave >=d800
2695: } else {
2696: // BMP code point - may be surrogate code point - make
2697: // < d800
2698: c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2699: }
2700:
2701: if ((c2 <= LEAD_SURROGATE_MAX_VALUE
2702: && (index + 1) != length2 && isTrailSurrogate(s2
2703: .charAt(index + 1)))
2704: || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2
2705: .charAt(index - 1)))) {
2706: // part of a surrogate pair, leave >=d800
2707: } else {
2708: // BMP code point - may be surrogate code point - make <d800
2709: c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2710: }
2711: }
2712:
2713: // now c1 and c2 are in UTF-32-compatible order
2714: return c1 - c2;
2715: }
2716: }
2717:
2718: // private data members -------------------------------------------------
2719:
2720: /**
2721: * Shift value for lead surrogate to form a supplementary character.
2722: */
2723: private static final int LEAD_SURROGATE_SHIFT_ = 10;
2724:
2725: /**
2726: * Mask to retrieve the significant value from a trail surrogate.
2727: */
2728: private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
2729:
/**
 * Offset for lead surrogates: the minimum lead surrogate value minus the
 * minimum supplementary code point shifted right by LEAD_SURROGATE_SHIFT_.
 */
2733: private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
2734: - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2735:
2736: // private methods ------------------------------------------------------
2737:
2738: /**
2739: * <p>Converts argument code point and returns a String object representing
2740: * the code point's value in UTF16 format.</p>
2741: * <p>This method does not check for the validity of the codepoint, the
2742: * results are not guaranteed if a invalid codepoint is passed as
2743: * argument.</p>
2744: * <p>The result is a string whose length is 1 for non-supplementary code
2745: * points, 2 otherwise.</p>
2746: * @param ch code point
2747: * @return string representation of the code point
2748: */
2749: private static String toString(int ch) {
2750: if (ch < SUPPLEMENTARY_MIN_VALUE) {
2751: return String.valueOf((char) ch);
2752: }
2753:
2754: StringBuffer result = new StringBuffer();
2755: result.append(getLeadSurrogate(ch));
2756: result.append(getTrailSurrogate(ch));
2757: return result.toString();
2758: }
2759: }
2760: //eof
|