0001: //##header
0002: /*
0003: *******************************************************************************
0004: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0005: * others. All Rights Reserved. *
0006: *******************************************************************************
0007: */
0008: package com.ibm.icu.impl;
0009:
0010: import java.util.ArrayList;
0011:
0012: import com.ibm.icu.lang.*;
0013: import com.ibm.icu.text.*;
0014: import com.ibm.icu.impl.UCharacterProperty;
0015:
0016: // This class contains utility functions so testing not needed
0017: ///CLOVER:OFF
0018: public final class Utility {
0019:
// The single-quote (apostrophe) character.
private static final char APOSTROPHE = '\'';
// The backslash character.
private static final char BACKSLASH = '\\';
// Bit pattern with only the top (sign) bit of a 32-bit int set.
// NOTE(review): no use of this constant is visible in this part of the file.
private static final int MAGIC_UNSIGNED = 0x80000000;
0023:
0024: /**
0025: * Convenience utility to compare two Object[]s.
0026: * Ought to be in System
0027: */
0028: public final static boolean arrayEquals(Object[] source,
0029: Object target) {
0030: if (source == null)
0031: return (target == null);
0032: if (!(target instanceof Object[]))
0033: return false;
0034: Object[] targ = (Object[]) target;
0035: return (source.length == targ.length && arrayRegionMatches(
0036: source, 0, targ, 0, source.length));
0037: }
0038:
0039: /**
0040: * Convenience utility to compare two int[]s
0041: * Ought to be in System
0042: */
0043: ///CLOVER:OFF
0044: public final static boolean arrayEquals(int[] source, Object target) {
0045: if (source == null)
0046: return (target == null);
0047: if (!(target instanceof int[]))
0048: return false;
0049: int[] targ = (int[]) target;
0050: return (source.length == targ.length && arrayRegionMatches(
0051: source, 0, targ, 0, source.length));
0052: }
0053:
0054: ///CLOVER:ON
0055:
0056: /**
0057: * Convenience utility to compare two double[]s
0058: * Ought to be in System
0059: */
0060: ///CLOVER:OFF
0061: public final static boolean arrayEquals(double[] source,
0062: Object target) {
0063: if (source == null)
0064: return (target == null);
0065: if (!(target instanceof double[]))
0066: return false;
0067: double[] targ = (double[]) target;
0068: return (source.length == targ.length && arrayRegionMatches(
0069: source, 0, targ, 0, source.length));
0070: }
0071:
0072: public final static boolean arrayEquals(byte[] source, Object target) {
0073: if (source == null)
0074: return (target == null);
0075: if (!(target instanceof byte[]))
0076: return false;
0077: byte[] targ = (byte[]) target;
0078: return (source.length == targ.length && arrayRegionMatches(
0079: source, 0, targ, 0, source.length));
0080: }
0081:
0082: ///CLOVER:ON
0083:
0084: /**
0085: * Convenience utility to compare two Object[]s
0086: * Ought to be in System
0087: */
0088: public final static boolean arrayEquals(Object source, Object target) {
0089: if (source == null)
0090: return (target == null);
0091: // for some reason, the correct arrayEquals is not being called
0092: // so do it by hand for now.
0093: if (source instanceof Object[])
0094: return (arrayEquals((Object[]) source, target));
0095: if (source instanceof int[])
0096: return (arrayEquals((int[]) source, target));
0097: if (source instanceof double[])
0098: return (arrayEquals((int[]) source, target));
0099: if (source instanceof byte[])
0100: return (arrayEquals((byte[]) source, target));
0101: return source.equals(target);
0102: }
0103:
0104: /**
0105: * Convenience utility to compare two Object[]s
0106: * Ought to be in System.
0107: * @param len the length to compare.
0108: * The start indices and start+len must be valid.
0109: */
0110: public final static boolean arrayRegionMatches(Object[] source,
0111: int sourceStart, Object[] target, int targetStart, int len) {
0112: int sourceEnd = sourceStart + len;
0113: int delta = targetStart - sourceStart;
0114: for (int i = sourceStart; i < sourceEnd; i++) {
0115: if (!arrayEquals(source[i], target[i + delta]))
0116: return false;
0117: }
0118: return true;
0119: }
0120:
0121: /**
0122: * Convenience utility to compare two Object[]s
0123: * Ought to be in System.
0124: * @param len the length to compare.
0125: * The start indices and start+len must be valid.
0126: */
0127: public final static boolean arrayRegionMatches(char[] source,
0128: int sourceStart, char[] target, int targetStart, int len) {
0129: int sourceEnd = sourceStart + len;
0130: int delta = targetStart - sourceStart;
0131: for (int i = sourceStart; i < sourceEnd; i++) {
0132: if (source[i] != target[i + delta])
0133: return false;
0134: }
0135: return true;
0136: }
0137:
0138: /**
0139: * Convenience utility to compare two int[]s.
0140: * @param len the length to compare.
0141: * The start indices and start+len must be valid.
0142: * Ought to be in System
0143: */
0144: ///CLOVER:OFF
0145: public final static boolean arrayRegionMatches(int[] source,
0146: int sourceStart, int[] target, int targetStart, int len) {
0147: int sourceEnd = sourceStart + len;
0148: int delta = targetStart - sourceStart;
0149: for (int i = sourceStart; i < sourceEnd; i++) {
0150: if (source[i] != target[i + delta])
0151: return false;
0152: }
0153: return true;
0154: }
0155:
0156: ///CLOVER:ON
0157:
0158: /**
0159: * Convenience utility to compare two arrays of doubles.
0160: * @param len the length to compare.
0161: * The start indices and start+len must be valid.
0162: * Ought to be in System
0163: */
0164: ///CLOVER:OFF
0165: public final static boolean arrayRegionMatches(double[] source,
0166: int sourceStart, double[] target, int targetStart, int len) {
0167: int sourceEnd = sourceStart + len;
0168: int delta = targetStart - sourceStart;
0169: for (int i = sourceStart; i < sourceEnd; i++) {
0170: if (source[i] != target[i + delta])
0171: return false;
0172: }
0173: return true;
0174: }
0175:
0176: public final static boolean arrayRegionMatches(byte[] source,
0177: int sourceStart, byte[] target, int targetStart, int len) {
0178: int sourceEnd = sourceStart + len;
0179: int delta = targetStart - sourceStart;
0180: for (int i = sourceStart; i < sourceEnd; i++) {
0181: if (source[i] != target[i + delta])
0182: return false;
0183: }
0184: return true;
0185: }
0186:
0187: ///CLOVER:ON
0188:
0189: /**
0190: * Convenience utility. Does null checks on objects, then calls equals.
0191: */
0192: public final static boolean objectEquals(Object source,
0193: Object target) {
0194: if (source == null)
0195: return (target == null);
0196: else
0197: return source.equals(target);
0198: }
0199:
/**
 * The ESCAPE character is used during run-length encoding. It signals
 * a run of identical chars. Its numeric value is 0xA5A5.
 */
private static final char ESCAPE = '\uA5A5';

/**
 * The ESCAPE_BYTE character is used during run-length encoding. It signals
 * a run of identical bytes. Because byte is signed, the stored value of
 * (byte) 0xA5 is negative (-91); comparisons against int quantities must
 * take that into account.
 */
static final byte ESCAPE_BYTE = (byte) 0xA5;
0211:
0212: /**
0213: * Construct a string representing an int array. Use run-length encoding.
0214: * A character represents itself, unless it is the ESCAPE character. Then
0215: * the following notations are possible:
0216: * ESCAPE ESCAPE ESCAPE literal
0217: * ESCAPE n c n instances of character c
0218: * Since an encoded run occupies 3 characters, we only encode runs of 4 or
0219: * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
0220: * If we encounter a run where n == ESCAPE, we represent this as:
0221: * c ESCAPE n-1 c
0222: * The ESCAPE value is chosen so as not to collide with commonly
0223: * seen values.
0224: */
0225: ///CLOVER:OFF
0226: static public final String arrayToRLEString(int[] a) {
0227: StringBuffer buffer = new StringBuffer();
0228:
0229: appendInt(buffer, a.length);
0230: int runValue = a[0];
0231: int runLength = 1;
0232: for (int i = 1; i < a.length; ++i) {
0233: int s = a[i];
0234: if (s == runValue && runLength < 0xFFFF) {
0235: ++runLength;
0236: } else {
0237: encodeRun(buffer, runValue, runLength);
0238: runValue = s;
0239: runLength = 1;
0240: }
0241: }
0242: encodeRun(buffer, runValue, runLength);
0243: return buffer.toString();
0244: }
0245:
0246: ///CLOVER:ON
0247:
0248: /**
0249: * Construct a string representing a short array. Use run-length encoding.
0250: * A character represents itself, unless it is the ESCAPE character. Then
0251: * the following notations are possible:
0252: * ESCAPE ESCAPE ESCAPE literal
0253: * ESCAPE n c n instances of character c
0254: * Since an encoded run occupies 3 characters, we only encode runs of 4 or
0255: * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
0256: * If we encounter a run where n == ESCAPE, we represent this as:
0257: * c ESCAPE n-1 c
0258: * The ESCAPE value is chosen so as not to collide with commonly
0259: * seen values.
0260: */
0261: ///CLOVER:OFF
0262: static public final String arrayToRLEString(short[] a) {
0263: StringBuffer buffer = new StringBuffer();
0264: // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]);
0265: buffer.append((char) (a.length >> 16));
0266: buffer.append((char) a.length);
0267: short runValue = a[0];
0268: int runLength = 1;
0269: for (int i = 1; i < a.length; ++i) {
0270: short s = a[i];
0271: if (s == runValue && runLength < 0xFFFF)
0272: ++runLength;
0273: else {
0274: encodeRun(buffer, runValue, runLength);
0275: runValue = s;
0276: runLength = 1;
0277: }
0278: }
0279: encodeRun(buffer, runValue, runLength);
0280: return buffer.toString();
0281: }
0282:
0283: ///CLOVER:ON
0284:
0285: /**
0286: * Construct a string representing a char array. Use run-length encoding.
0287: * A character represents itself, unless it is the ESCAPE character. Then
0288: * the following notations are possible:
0289: * ESCAPE ESCAPE ESCAPE literal
0290: * ESCAPE n c n instances of character c
0291: * Since an encoded run occupies 3 characters, we only encode runs of 4 or
0292: * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
0293: * If we encounter a run where n == ESCAPE, we represent this as:
0294: * c ESCAPE n-1 c
0295: * The ESCAPE value is chosen so as not to collide with commonly
0296: * seen values.
0297: */
0298: static public final String arrayToRLEString(char[] a) {
0299: StringBuffer buffer = new StringBuffer();
0300: buffer.append((char) (a.length >> 16));
0301: buffer.append((char) a.length);
0302: char runValue = a[0];
0303: int runLength = 1;
0304: for (int i = 1; i < a.length; ++i) {
0305: char s = a[i];
0306: if (s == runValue && runLength < 0xFFFF)
0307: ++runLength;
0308: else {
0309: encodeRun(buffer, (short) runValue, runLength);
0310: runValue = s;
0311: runLength = 1;
0312: }
0313: }
0314: encodeRun(buffer, (short) runValue, runLength);
0315: return buffer.toString();
0316: }
0317:
0318: /**
0319: * Construct a string representing a byte array. Use run-length encoding.
0320: * Two bytes are packed into a single char, with a single extra zero byte at
0321: * the end if needed. A byte represents itself, unless it is the
0322: * ESCAPE_BYTE. Then the following notations are possible:
0323: * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal
0324: * ESCAPE_BYTE n b n instances of byte b
0325: * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
0326: * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
0327: * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
0328: * b ESCAPE_BYTE n-1 b
0329: * The ESCAPE_BYTE value is chosen so as not to collide with commonly
0330: * seen values.
0331: */
0332: static public final String arrayToRLEString(byte[] a) {
0333: StringBuffer buffer = new StringBuffer();
0334: buffer.append((char) (a.length >> 16));
0335: buffer.append((char) a.length);
0336: byte runValue = a[0];
0337: int runLength = 1;
0338: byte[] state = new byte[2];
0339: for (int i = 1; i < a.length; ++i) {
0340: byte b = a[i];
0341: if (b == runValue && runLength < 0xFF)
0342: ++runLength;
0343: else {
0344: encodeRun(buffer, runValue, runLength, state);
0345: runValue = b;
0346: runLength = 1;
0347: }
0348: }
0349: encodeRun(buffer, runValue, runLength, state);
0350:
0351: // We must save the final byte, if there is one, by padding
0352: // an extra zero.
0353: if (state[0] != 0)
0354: appendEncodedByte(buffer, (byte) 0, state);
0355:
0356: return buffer.toString();
0357: }
0358:
0359: /**
0360: * Encode a run, possibly a degenerate run (of < 4 values).
0361: * @param length The length of the run; must be > 0 && <= 0xFFFF.
0362: */
0363: ///CLOVER:OFF
0364: private static final void encodeRun(StringBuffer buffer, int value,
0365: int length) {
0366: if (length < 4) {
0367: for (int j = 0; j < length; ++j) {
0368: if (value == ESCAPE) {
0369: appendInt(buffer, value);
0370: }
0371: appendInt(buffer, value);
0372: }
0373: } else {
0374: if (length == (int) ESCAPE) {
0375: if (value == (int) ESCAPE) {
0376: appendInt(buffer, ESCAPE);
0377: }
0378: appendInt(buffer, value);
0379: --length;
0380: }
0381: appendInt(buffer, ESCAPE);
0382: appendInt(buffer, length);
0383: appendInt(buffer, value); // Don't need to escape this value
0384: }
0385: }
0386:
0387: ///CLOVER:ON
0388:
0389: ///CLOVER:OFF
0390: private static final void appendInt(StringBuffer buffer, int value) {
0391: buffer.append((char) (value >>> 16));
0392: buffer.append((char) (value & 0xFFFF));
0393: }
0394:
0395: ///CLOVER:ON
0396:
0397: /**
0398: * Encode a run, possibly a degenerate run (of < 4 values).
0399: * @param length The length of the run; must be > 0 && <= 0xFFFF.
0400: */
0401: private static final void encodeRun(StringBuffer buffer,
0402: short value, int length) {
0403: if (length < 4) {
0404: for (int j = 0; j < length; ++j) {
0405: if (value == (int) ESCAPE)
0406: buffer.append(ESCAPE);
0407: buffer.append((char) value);
0408: }
0409: } else {
0410: if (length == (int) ESCAPE) {
0411: if (value == (int) ESCAPE)
0412: buffer.append(ESCAPE);
0413: buffer.append((char) value);
0414: --length;
0415: }
0416: buffer.append(ESCAPE);
0417: buffer.append((char) length);
0418: buffer.append((char) value); // Don't need to escape this value
0419: }
0420: }
0421:
0422: /**
0423: * Encode a run, possibly a degenerate run (of < 4 values).
0424: * @param length The length of the run; must be > 0 && <= 0xFF.
0425: */
0426: private static final void encodeRun(StringBuffer buffer,
0427: byte value, int length, byte[] state) {
0428: if (length < 4) {
0429: for (int j = 0; j < length; ++j) {
0430: if (value == ESCAPE_BYTE)
0431: appendEncodedByte(buffer, ESCAPE_BYTE, state);
0432: appendEncodedByte(buffer, value, state);
0433: }
0434: } else {
0435: if (length == ESCAPE_BYTE) {
0436: if (value == ESCAPE_BYTE)
0437: appendEncodedByte(buffer, ESCAPE_BYTE, state);
0438: appendEncodedByte(buffer, value, state);
0439: --length;
0440: }
0441: appendEncodedByte(buffer, ESCAPE_BYTE, state);
0442: appendEncodedByte(buffer, (byte) length, state);
0443: appendEncodedByte(buffer, value, state); // Don't need to escape this value
0444: }
0445: }
0446:
0447: /**
0448: * Append a byte to the given StringBuffer, packing two bytes into each
0449: * character. The state parameter maintains intermediary data between
0450: * calls.
0451: * @param state A two-element array, with state[0] == 0 if this is the
0452: * first byte of a pair, or state[0] != 0 if this is the second byte
0453: * of a pair, in which case state[1] is the first byte.
0454: */
0455: private static final void appendEncodedByte(StringBuffer buffer,
0456: byte value, byte[] state) {
0457: if (state[0] != 0) {
0458: char c = (char) ((state[1] << 8) | (((int) value) & 0xFF));
0459: buffer.append(c);
0460: state[0] = 0;
0461: } else {
0462: state[0] = 1;
0463: state[1] = value;
0464: }
0465: }
0466:
0467: ///CLOVER:OFF
0468: /**
0469: * Construct an array of ints from a run-length encoded string.
0470: */
0471: static public final int[] RLEStringToIntArray(String s) {
0472: int length = getInt(s, 0);
0473: int[] array = new int[length];
0474: int ai = 0, i = 1;
0475:
0476: int maxI = s.length() / 2;
0477: while (ai < length && i < maxI) {
0478: int c = getInt(s, i++);
0479:
0480: if (c == ESCAPE) {
0481: c = getInt(s, i++);
0482: if (c == ESCAPE) {
0483: array[ai++] = c;
0484: } else {
0485: int runLength = c;
0486: int runValue = getInt(s, i++);
0487: for (int j = 0; j < runLength; ++j) {
0488: array[ai++] = runValue;
0489: }
0490: }
0491: } else {
0492: array[ai++] = c;
0493: }
0494: }
0495:
0496: if (ai != length || i != maxI) {
0497: throw new IllegalStateException(
0498: "Bad run-length encoded int array");
0499: }
0500:
0501: return array;
0502: }
0503:
0504: static final int getInt(String s, int i) {
0505: return (((int) s.charAt(2 * i)) << 16)
0506: | (int) s.charAt(2 * i + 1);
0507: }
0508:
0509: ///CLOVER:ON
0510:
0511: /**
0512: * Construct an array of shorts from a run-length encoded string.
0513: */
0514: ///CLOVER:OFF
0515: static public final short[] RLEStringToShortArray(String s) {
0516: int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
0517: short[] array = new short[length];
0518: int ai = 0;
0519: for (int i = 2; i < s.length(); ++i) {
0520: char c = s.charAt(i);
0521: if (c == ESCAPE) {
0522: c = s.charAt(++i);
0523: if (c == ESCAPE) {
0524: array[ai++] = (short) c;
0525: } else {
0526: int runLength = (int) c;
0527: short runValue = (short) s.charAt(++i);
0528: for (int j = 0; j < runLength; ++j)
0529: array[ai++] = runValue;
0530: }
0531: } else {
0532: array[ai++] = (short) c;
0533: }
0534: }
0535:
0536: if (ai != length)
0537: throw new IllegalStateException(
0538: "Bad run-length encoded short array");
0539:
0540: return array;
0541: }
0542:
0543: ///CLOVER:ON
0544:
0545: /**
0546: * Construct an array of shorts from a run-length encoded string.
0547: */
0548: static public final char[] RLEStringToCharArray(String s) {
0549: int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
0550: char[] array = new char[length];
0551: int ai = 0;
0552: for (int i = 2; i < s.length(); ++i) {
0553: char c = s.charAt(i);
0554: if (c == ESCAPE) {
0555: c = s.charAt(++i);
0556: if (c == ESCAPE) {
0557: array[ai++] = c;
0558: } else {
0559: int runLength = (int) c;
0560: char runValue = s.charAt(++i);
0561: for (int j = 0; j < runLength; ++j)
0562: array[ai++] = runValue;
0563: }
0564: } else {
0565: array[ai++] = c;
0566: }
0567: }
0568:
0569: if (ai != length)
0570: throw new IllegalStateException(
0571: "Bad run-length encoded short array");
0572:
0573: return array;
0574: }
0575:
/**
 * Construct an array of bytes from a run-length encoded string. The first
 * two chars hold the array length (high 16 bits first); each following
 * char carries two bytes, high byte first.
 *
 * @param s the encoded string
 * @return the decoded array
 * @throws IllegalStateException if the array fills up in the middle of an
 *         escape sequence, or if not every char of s has been read when
 *         the array is full
 */
static public final byte[] RLEStringToByteArray(String s) {
    int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
    byte[] array = new byte[length];
    boolean nextChar = true;
    char c = 0;
    int node = 0;
    int runLength = 0;
    int i = 2; // index of the next char to read; chars 0 and 1 are the length
    // The loop runs until the output array is full, not until s is
    // exhausted; input consumption is validated after the loop.
    for (int ai = 0; ai < length;) {
        // This part of the loop places the next byte into the local
        // variable 'b' each time through the loop. It keeps the
        // current character in 'c' and uses the boolean 'nextChar'
        // to see if we've taken both bytes out of 'c' yet.
        byte b;
        if (nextChar) {
            c = s.charAt(i++);
            b = (byte) (c >> 8);
            nextChar = false;
        } else {
            b = (byte) (c & 0xFF);
            nextChar = true;
        }

        // This part of the loop is a tiny state machine which handles
        // the parsing of the run-length encoding. This would be simpler
        // if we could look ahead, but we can't, so we use 'node' to
        // move between three nodes in the state machine.
        switch (node) {
        case 0:
            // Normal idle node
            if (b == ESCAPE_BYTE) {
                node = 1;
            } else {
                array[ai++] = b;
            }
            break;
        case 1:
            // We have seen one ESCAPE_BYTE; we expect either a second
            // one, or a run length and value.
            if (b == ESCAPE_BYTE) {
                array[ai++] = ESCAPE_BYTE;
                node = 0;
            } else {
                runLength = b;
                // Interpret signed byte as unsigned
                if (runLength < 0)
                    runLength += 0x100;
                node = 2;
            }
            break;
        case 2:
            // We have seen an ESCAPE_BYTE and length byte. We interpret
            // the next byte as the value to be repeated.
            for (int j = 0; j < runLength; ++j)
                array[ai++] = b;
            node = 0;
            break;
        }
    }

    // The array filled up while an escape sequence was still open.
    if (node != 0)
        throw new IllegalStateException(
                "Bad run-length encoded byte array");

    // Some chars of s were never read.
    if (i != s.length())
        throw new IllegalStateException(
                "Excess data in RLE byte array string");

    return array;
}
0649:
/**
 * The value of the "line.separator" system property, read when this
 * field is initialized. Note that the field is public and not final.
 */
static public String LINE_SEPARATOR = System
        .getProperty("line.separator");
0652:
0653: /**
0654: * Format a String for representation in a source file. This includes
0655: * breaking it into lines and escaping characters using octal notation
0656: * when necessary (control characters and double quotes).
0657: */
0658: static public final String formatForSource(String s) {
0659: StringBuffer buffer = new StringBuffer();
0660: for (int i = 0; i < s.length();) {
0661: if (i > 0)
0662: buffer.append('+').append(LINE_SEPARATOR);
0663: buffer.append(" \"");
0664: int count = 11;
0665: while (i < s.length() && count < 80) {
0666: char c = s.charAt(i++);
0667: if (c < '\u0020' || c == '"' || c == '\\') {
0668: if (c == '\n') {
0669: buffer.append("\\n");
0670: count += 2;
0671: } else if (c == '\t') {
0672: buffer.append("\\t");
0673: count += 2;
0674: } else if (c == '\r') {
0675: buffer.append("\\r");
0676: count += 2;
0677: } else {
0678: // Represent control characters, backslash and double quote
0679: // using octal notation; otherwise the string we form
0680: // won't compile, since Unicode escape sequences are
0681: // processed before tokenization.
0682: buffer.append('\\');
0683: buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
0684: buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
0685: buffer.append(HEX_DIGIT[(c & 0007)]);
0686: count += 4;
0687: }
0688: } else if (c <= '\u007E') {
0689: buffer.append(c);
0690: count += 1;
0691: } else {
0692: buffer.append("\\u");
0693: buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
0694: buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
0695: buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
0696: buffer.append(HEX_DIGIT[(c & 0x000F)]);
0697: count += 6;
0698: }
0699: }
0700: buffer.append('"');
0701: }
0702: return buffer.toString();
0703: }
0704:
// Uppercase digit characters indexed by value 0-15. The first eight
// entries double as the octal digits.
static final char[] HEX_DIGIT = { '0', '1', '2', '3', '4', '5',
        '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
0707:
0708: /**
0709: * Format a String for representation in a source file. Like
0710: * formatForSource but does not do line breaking.
0711: */
0712: static public final String format1ForSource(String s) {
0713: StringBuffer buffer = new StringBuffer();
0714: buffer.append("\"");
0715: for (int i = 0; i < s.length();) {
0716: char c = s.charAt(i++);
0717: if (c < '\u0020' || c == '"' || c == '\\') {
0718: if (c == '\n') {
0719: buffer.append("\\n");
0720: } else if (c == '\t') {
0721: buffer.append("\\t");
0722: } else if (c == '\r') {
0723: buffer.append("\\r");
0724: } else {
0725: // Represent control characters, backslash and double quote
0726: // using octal notation; otherwise the string we form
0727: // won't compile, since Unicode escape sequences are
0728: // processed before tokenization.
0729: buffer.append('\\');
0730: buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
0731: buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
0732: buffer.append(HEX_DIGIT[(c & 0007)]);
0733: }
0734: } else if (c <= '\u007E') {
0735: buffer.append(c);
0736: } else {
0737: buffer.append("\\u");
0738: buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
0739: buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
0740: buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
0741: buffer.append(HEX_DIGIT[(c & 0x000F)]);
0742: }
0743: }
0744: buffer.append('"');
0745: return buffer.toString();
0746: }
0747:
0748: /**
0749: * Convert characters outside the range U+0020 to U+007F to
0750: * Unicode escapes, and convert backslash to a double backslash.
0751: */
0752: public static final String escape(String s) {
0753: StringBuffer buf = new StringBuffer();
0754: for (int i = 0; i < s.length();) {
0755: int c = UTF16.charAt(s, i);
0756: i += UTF16.getCharCount(c);
0757: if (c >= ' ' && c <= 0x007F) {
0758: if (c == '\\') {
0759: buf.append("\\\\"); // That is, "\\"
0760: } else {
0761: buf.append((char) c);
0762: }
0763: } else {
0764: boolean four = c <= 0xFFFF;
0765: buf.append(four ? "\\u" : "\\U");
0766: hex(c, four ? 4 : 8, buf);
0767: }
0768: }
0769: return buf.toString();
0770: }
0771:
/*
 * Lookup table for single-letter C-style escapes, stored as consecutive
 * (escape letter, resulting character) pairs.
 * This map must be in ASCENDING ORDER OF THE ESCAPE CODE: the lookup loop
 * in unescapeAt() stops as soon as it reaches an entry greater than the
 * letter it is looking for.
 */
static private final char[] UNESCAPE_MAP = {
        /*" 0x22, 0x22 */
        /*' 0x27, 0x27 */
        /*? 0x3F, 0x3F */
        /*\ 0x5C, 0x5C */
        /*a*/0x61, 0x07,
        /*b*/0x62, 0x08,
        /*e*/0x65, 0x1b,
        /*f*/0x66, 0x0c,
        /*n*/0x6E, 0x0a,
        /*r*/0x72, 0x0d,
        /*t*/0x74, 0x09,
        /*v*/0x76, 0x0b };
0786:
0787: /**
0788: * Convert an escape to a 32-bit code point value. We attempt
0789: * to parallel the icu4c unescapeAt() function.
0790: * @param offset16 an array containing offset to the character
0791: * <em>after</em> the backslash. Upon return offset16[0] will
0792: * be updated to point after the escape sequence.
0793: * @return character value from 0 to 10FFFF, or -1 on error.
0794: */
0795: public static int unescapeAt(String s, int[] offset16) {
0796: int c;
0797: int result = 0;
0798: int n = 0;
0799: int minDig = 0;
0800: int maxDig = 0;
0801: int bitsPerDigit = 4;
0802: int dig;
0803: int i;
0804: boolean braces = false;
0805:
0806: /* Check that offset is in range */
0807: int offset = offset16[0];
0808: int length = s.length();
0809: if (offset < 0 || offset >= length) {
0810: return -1;
0811: }
0812:
0813: /* Fetch first UChar after '\\' */
0814: c = UTF16.charAt(s, offset);
0815: offset += UTF16.getCharCount(c);
0816:
0817: /* Convert hexadecimal and octal escapes */
0818: switch (c) {
0819: case 'u':
0820: minDig = maxDig = 4;
0821: break;
0822: case 'U':
0823: minDig = maxDig = 8;
0824: break;
0825: case 'x':
0826: minDig = 1;
0827: if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
0828: ++offset;
0829: braces = true;
0830: maxDig = 8;
0831: } else {
0832: maxDig = 2;
0833: }
0834: break;
0835: default:
0836: dig = UCharacter.digit(c, 8);
0837: if (dig >= 0) {
0838: minDig = 1;
0839: maxDig = 3;
0840: n = 1; /* Already have first octal digit */
0841: bitsPerDigit = 3;
0842: result = dig;
0843: }
0844: break;
0845: }
0846: if (minDig != 0) {
0847: while (offset < length && n < maxDig) {
0848: c = UTF16.charAt(s, offset);
0849: dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
0850: if (dig < 0) {
0851: break;
0852: }
0853: result = (result << bitsPerDigit) | dig;
0854: offset += UTF16.getCharCount(c);
0855: ++n;
0856: }
0857: if (n < minDig) {
0858: return -1;
0859: }
0860: if (braces) {
0861: if (c != 0x7D /*}*/) {
0862: return -1;
0863: }
0864: ++offset;
0865: }
0866: if (result < 0 || result >= 0x110000) {
0867: return -1;
0868: }
0869: // If an escape sequence specifies a lead surrogate, see
0870: // if there is a trail surrogate after it, either as an
0871: // escape or as a literal. If so, join them up into a
0872: // supplementary.
0873: if (offset < length && UTF16.isLeadSurrogate((char) result)) {
0874: int ahead = offset + 1;
0875: c = s.charAt(offset); // [sic] get 16-bit code unit
0876: if (c == '\\' && ahead < length) {
0877: int o[] = new int[] { ahead };
0878: c = unescapeAt(s, o);
0879: ahead = o[0];
0880: }
0881: if (UTF16.isTrailSurrogate((char) c)) {
0882: offset = ahead;
0883: result = UCharacterProperty.getRawSupplementary(
0884: (char) result, (char) c);
0885: }
0886: }
0887: offset16[0] = offset;
0888: return result;
0889: }
0890:
0891: /* Convert C-style escapes in table */
0892: for (i = 0; i < UNESCAPE_MAP.length; i += 2) {
0893: if (c == UNESCAPE_MAP[i]) {
0894: offset16[0] = offset;
0895: return UNESCAPE_MAP[i + 1];
0896: } else if (c < UNESCAPE_MAP[i]) {
0897: break;
0898: }
0899: }
0900:
0901: /* Map \cX to control-X: X & 0x1F */
0902: if (c == 'c' && offset < length) {
0903: c = UTF16.charAt(s, offset);
0904: offset16[0] = offset + UTF16.getCharCount(c);
0905: return 0x1F & c;
0906: }
0907:
0908: /* If no special forms are recognized, then consider
0909: * the backslash to generically escape the next character. */
0910: offset16[0] = offset;
0911: return c;
0912: }
0913:
0914: /**
0915: * Convert all escapes in a given string using unescapeAt().
0916: * @exception IllegalArgumentException if an invalid escape is
0917: * seen.
0918: */
0919: public static String unescape(String s) {
0920: StringBuffer buf = new StringBuffer();
0921: int[] pos = new int[1];
0922: for (int i = 0; i < s.length();) {
0923: char c = s.charAt(i++);
0924: if (c == '\\') {
0925: pos[0] = i;
0926: int e = unescapeAt(s, pos);
0927: if (e < 0) {
0928: throw new IllegalArgumentException(
0929: "Invalid escape sequence "
0930: + s.substring(i - 1, Math.min(
0931: i + 8, s.length())));
0932: }
0933: UTF16.append(buf, e);
0934: i = pos[0];
0935: } else {
0936: buf.append(c);
0937: }
0938: }
0939: return buf.toString();
0940: }
0941:
0942: /**
0943: * Convert all escapes in a given string using unescapeAt().
0944: * Leave invalid escape sequences unchanged.
0945: */
0946: ///CLOVER:OFF
0947: public static String unescapeLeniently(String s) {
0948: StringBuffer buf = new StringBuffer();
0949: int[] pos = new int[1];
0950: for (int i = 0; i < s.length();) {
0951: char c = s.charAt(i++);
0952: if (c == '\\') {
0953: pos[0] = i;
0954: int e = unescapeAt(s, pos);
0955: if (e < 0) {
0956: buf.append(c);
0957: } else {
0958: UTF16.append(buf, e);
0959: i = pos[0];
0960: }
0961: } else {
0962: buf.append(c);
0963: }
0964: }
0965: return buf.toString();
0966: }
0967:
0968: ///CLOVER:ON
0969:
0970: /**
0971: * Convert a char to 4 hex uppercase digits. E.g., hex('a') =>
0972: * "0041".
0973: */
0974: ///CLOVER:OFF
0975: public static String hex(char ch) {
0976: StringBuffer temp = new StringBuffer();
0977: return hex(ch, temp).toString();
0978: }
0979:
0980: ///CLOVER:ON
0981:
0982: /**
0983: * Convert a string to comma-separated groups of 4 hex uppercase
0984: * digits. E.g., hex('ab') => "0041,0042".
0985: */
0986: ///CLOVER:OFF
0987: public static String hex(String s) {
0988: StringBuffer temp = new StringBuffer();
0989: return hex(s, temp).toString();
0990: }
0991:
0992: ///CLOVER:ON
0993:
0994: /**
0995: * Convert a string to comma-separated groups of 4 hex uppercase
0996: * digits. E.g., hex('ab') => "0041,0042".
0997: */
0998: ///CLOVER:OFF
0999: public static String hex(StringBuffer s) {
1000: return hex(s.toString());
1001: }
1002:
1003: ///CLOVER:ON
1004:
1005: /**
1006: * Convert a char to 4 hex uppercase digits. E.g., hex('a') =>
1007: * "0041". Append the output to the given StringBuffer.
1008: */
1009: ///CLOVER:OFF
1010: public static StringBuffer hex(char ch, StringBuffer output) {
1011: return appendNumber(output, ch, 16, 4);
1012: }
1013:
1014: ///CLOVER:ON
1015:
1016: /**
1017: * Convert a integer to size width hex uppercase digits.
1018: * E.g., hex('a', 4, str) => "0041".
1019: * Append the output to the given StringBuffer.
1020: * If width is too small to fit, nothing will be appended to output.
1021: */
1022: public static StringBuffer hex(int ch, int width,
1023: StringBuffer output) {
1024: return appendNumber(output, ch, 16, width);
1025: }
1026:
1027: /**
1028: * Convert a integer to size width (minimum) hex uppercase digits.
1029: * E.g., hex('a', 4, str) => "0041". If the integer requires more
1030: * than width digits, more will be used.
1031: */
1032: public static String hex(int ch, int width) {
1033: StringBuffer buf = new StringBuffer();
1034: return appendNumber(buf, ch, 16, width).toString();
1035: }
1036:
1037: /**
1038: * Supplies a zero-padded hex representation of an integer (without 0x)
1039: */
1040: static public String hex(long i, int places) {
1041: if (i == Long.MIN_VALUE)
1042: return "-8000000000000000";
1043: boolean negative = i < 0;
1044: if (negative) {
1045: i = -i;
1046: }
1047: String result = Long.toString(i, 16).toUpperCase();
1048: if (result.length() < places) {
1049: result = "0000000000000000".substring(result.length(),
1050: places)
1051: + result;
1052: }
1053: if (negative) {
1054: return '-' + result;
1055: }
1056: return result;
1057: }
1058:
1059: public static String hex(long ch) {
1060: return hex(ch, 4);
1061: }
1062:
1063: /**
1064: * Convert a string to comma-separated groups of 4 hex uppercase
1065: * digits. E.g., hex('ab') => "0041,0042". Append the output
1066: * to the given StringBuffer.
1067: */
1068: ///CLOVER:OFF
1069: public static StringBuffer hex(String s, StringBuffer result) {
1070: for (int i = 0; i < s.length(); ++i) {
1071: if (i != 0)
1072: result.append(',');
1073: hex(s.charAt(i), result);
1074: }
1075: return result;
1076: }
1077:
1078: ///CLOVER:ON
1079:
1080: /**
1081: * Split a string into pieces based on the given divider character
1082: * @param s the string to split
1083: * @param divider the character on which to split. Occurrences of
1084: * this character are not included in the output
1085: * @param output an array to receive the substrings between
1086: * instances of divider. It must be large enough on entry to
1087: * accomodate all output. Adjacent instances of the divider
1088: * character will place empty strings into output. Before
1089: * returning, output is padded out with empty strings.
1090: */
1091: ///CLOVER:OFF
1092: public static void split(String s, char divider, String[] output) {
1093: int last = 0;
1094: int current = 0;
1095: int i;
1096: for (i = 0; i < s.length(); ++i) {
1097: if (s.charAt(i) == divider) {
1098: output[current++] = s.substring(last, i);
1099: last = i + 1;
1100: }
1101: }
1102: output[current++] = s.substring(last, i);
1103: while (current < output.length) {
1104: output[current++] = "";
1105: }
1106: }
1107:
1108: /**
1109: * Split a string into pieces based on the given divider character
1110: * @param s the string to split
1111: * @param divider the character on which to split. Occurrences of
1112: * this character are not included in the output
1113: * @return output an array to receive the substrings between
1114: * instances of divider. Adjacent instances of the divider
1115: * character will place empty strings into output.
1116: */
1117: public static String[] split(String s, char divider) {
1118: int last = 0;
1119: int i;
1120: ArrayList output = new ArrayList();
1121: for (i = 0; i < s.length(); ++i) {
1122: if (s.charAt(i) == divider) {
1123: output.add(s.substring(last, i));
1124: last = i + 1;
1125: }
1126: }
1127: output.add(s.substring(last, i));
1128: return (String[]) output.toArray(new String[output.size()]);
1129: }
1130:
1131: ///CLOVER:ON
1132:
1133: /**
1134: * Look up a given string in a string array. Returns the index at
1135: * which the first occurrence of the string was found in the
1136: * array, or -1 if it was not found.
1137: * @param source the string to search for
1138: * @param target the array of zero or more strings in which to
1139: * look for source
1140: * @return the index of target at which source first occurs, or -1
1141: * if not found
1142: */
1143: ///CLOVER:OFF
1144: public static int lookup(String source, String[] target) {
1145: for (int i = 0; i < target.length; ++i) {
1146: if (source.equals(target[i]))
1147: return i;
1148: }
1149: return -1;
1150: }
1151:
1152: ///CLOVER:ON
1153:
1154: /**
1155: * Skip over a sequence of zero or more white space characters
1156: * at pos. Return the index of the first non-white-space character
1157: * at or after pos, or str.length(), if there is none.
1158: */
1159: public static int skipWhitespace(String str, int pos) {
1160: while (pos < str.length()) {
1161: int c = UTF16.charAt(str, pos);
1162: if (!UCharacterProperty.isRuleWhiteSpace(c)) {
1163: break;
1164: }
1165: pos += UTF16.getCharCount(c);
1166: }
1167: return pos;
1168: }
1169:
1170: /**
1171: * Skip over a sequence of zero or more white space characters
1172: * at pos[0], advancing it.
1173: */
1174: public static void skipWhitespace(String str, int[] pos) {
1175: pos[0] = skipWhitespace(str, pos[0]);
1176: }
1177:
1178: /**
1179: * Remove all rule white space from a string.
1180: */
1181: public static String deleteRuleWhiteSpace(String str) {
1182: StringBuffer buf = new StringBuffer();
1183: for (int i = 0; i < str.length();) {
1184: int ch = UTF16.charAt(str, i);
1185: i += UTF16.getCharCount(ch);
1186: if (UCharacterProperty.isRuleWhiteSpace(ch)) {
1187: continue;
1188: }
1189: UTF16.append(buf, ch);
1190: }
1191: return buf.toString();
1192: }
1193:
1194: /**
1195: * Parse a single non-whitespace character 'ch', optionally
1196: * preceded by whitespace.
1197: * @param id the string to be parsed
1198: * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
1199: * offset of the first character to be parsed. On output, pos[0]
1200: * is the index after the last parsed character. If the parse
1201: * fails, pos[0] will be unchanged.
1202: * @param ch the non-whitespace character to be parsed.
1203: * @return true if 'ch' is seen preceded by zero or more
1204: * whitespace characters.
1205: */
1206: public static boolean parseChar(String id, int[] pos, char ch) {
1207: int start = pos[0];
1208: skipWhitespace(id, pos);
1209: if (pos[0] == id.length() || id.charAt(pos[0]) != ch) {
1210: pos[0] = start;
1211: return false;
1212: }
1213: ++pos[0];
1214: return true;
1215: }
1216:
1217: /**
1218: * Parse a pattern string starting at offset pos. Keywords are
1219: * matched case-insensitively. Spaces may be skipped and may be
1220: * optional or required. Integer values may be parsed, and if
1221: * they are, they will be returned in the given array. If
1222: * successful, the offset of the next non-space character is
1223: * returned. On failure, -1 is returned.
1224: * @param pattern must only contain lowercase characters, which
1225: * will match their uppercase equivalents as well. A space
1226: * character matches one or more required spaces. A '~' character
1227: * matches zero or more optional spaces. A '#' character matches
1228: * an integer and stores it in parsedInts, which the caller must
1229: * ensure has enough capacity.
1230: * @param parsedInts array to receive parsed integers. Caller
1231: * must ensure that parsedInts.length is >= the number of '#'
1232: * signs in 'pattern'.
1233: * @return the position after the last character parsed, or -1 if
1234: * the parse failed
1235: */
1236: public static int parsePattern(String rule, int pos, int limit,
1237: String pattern, int[] parsedInts) {
1238: // TODO Update this to handle surrogates
1239: int[] p = new int[1];
1240: int intCount = 0; // number of integers parsed
1241: for (int i = 0; i < pattern.length(); ++i) {
1242: char cpat = pattern.charAt(i);
1243: char c;
1244: switch (cpat) {
1245: case ' ':
1246: if (pos >= limit) {
1247: return -1;
1248: }
1249: c = rule.charAt(pos++);
1250: if (!UCharacterProperty.isRuleWhiteSpace(c)) {
1251: return -1;
1252: }
1253: // FALL THROUGH to skipWhitespace
1254: case '~':
1255: pos = skipWhitespace(rule, pos);
1256: break;
1257: case '#':
1258: p[0] = pos;
1259: parsedInts[intCount++] = parseInteger(rule, p, limit);
1260: if (p[0] == pos) {
1261: // Syntax error; failed to parse integer
1262: return -1;
1263: }
1264: pos = p[0];
1265: break;
1266: default:
1267: if (pos >= limit) {
1268: return -1;
1269: }
1270: c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
1271: if (c != cpat) {
1272: return -1;
1273: }
1274: break;
1275: }
1276: }
1277: return pos;
1278: }
1279:
1280: /**
1281: * Parse a pattern string within the given Replaceable and a parsing
1282: * pattern. Characters are matched literally and case-sensitively
1283: * except for the following special characters:
1284: *
1285: * ~ zero or more uprv_isRuleWhiteSpace chars
1286: *
1287: * If end of pattern is reached with all matches along the way,
1288: * pos is advanced to the first unparsed index and returned.
1289: * Otherwise -1 is returned.
1290: * @param pat pattern that controls parsing
1291: * @param text text to be parsed, starting at index
1292: * @param index offset to first character to parse
1293: * @param limit offset after last character to parse
1294: * @return index after last parsed character, or -1 on parse failure.
1295: */
1296: public static int parsePattern(String pat, Replaceable text,
1297: int index, int limit) {
1298: int ipat = 0;
1299:
1300: // empty pattern matches immediately
1301: if (ipat == pat.length()) {
1302: return index;
1303: }
1304:
1305: int cpat = UTF16.charAt(pat, ipat);
1306:
1307: while (index < limit) {
1308: int c = text.char32At(index);
1309:
1310: // parse \s*
1311: if (cpat == '~') {
1312: if (UCharacterProperty.isRuleWhiteSpace(c)) {
1313: index += UTF16.getCharCount(c);
1314: continue;
1315: } else {
1316: if (++ipat == pat.length()) {
1317: return index; // success; c unparsed
1318: }
1319: // fall thru; process c again with next cpat
1320: }
1321: }
1322:
1323: // parse literal
1324: else if (c == cpat) {
1325: int n = UTF16.getCharCount(c);
1326: index += n;
1327: ipat += n;
1328: if (ipat == pat.length()) {
1329: return index; // success; c parsed
1330: }
1331: // fall thru; get next cpat
1332: }
1333:
1334: // match failure of literal
1335: else {
1336: return -1;
1337: }
1338:
1339: cpat = UTF16.charAt(pat, ipat);
1340: }
1341:
1342: return -1; // text ended before end of pat
1343: }
1344:
1345: /**
1346: * Parse an integer at pos, either of the form \d+ or of the form
1347: * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
1348: * or octal format.
1349: * @param pos INPUT-OUTPUT parameter. On input, the first
1350: * character to parse. On output, the character after the last
1351: * parsed character.
1352: */
1353: public static int parseInteger(String rule, int[] pos, int limit) {
1354: int count = 0;
1355: int value = 0;
1356: int p = pos[0];
1357: int radix = 10;
1358:
1359: if (rule.regionMatches(true, p, "0x", 0, 2)) {
1360: p += 2;
1361: radix = 16;
1362: } else if (p < limit && rule.charAt(p) == '0') {
1363: p++;
1364: count = 1;
1365: radix = 8;
1366: }
1367:
1368: while (p < limit) {
1369: int d = UCharacter.digit(rule.charAt(p++), radix);
1370: if (d < 0) {
1371: --p;
1372: break;
1373: }
1374: ++count;
1375: int v = (value * radix) + d;
1376: if (v <= value) {
1377: // If there are too many input digits, at some point
1378: // the value will go negative, e.g., if we have seen
1379: // "0x8000000" already and there is another '0', when
1380: // we parse the next 0 the value will go negative.
1381: return 0;
1382: }
1383: value = v;
1384: }
1385: if (count > 0) {
1386: pos[0] = p;
1387: }
1388: return value;
1389: }
1390:
1391: /**
1392: * Parse a Unicode identifier from the given string at the given
1393: * position. Return the identifier, or null if there is no
1394: * identifier.
1395: * @param str the string to parse
1396: * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the
1397: * first character to examine. It must be less than str.length(),
1398: * and it must not point to a whitespace character. That is, must
1399: * have pos[0] < str.length() and
1400: * !UCharacterProperty.isRuleWhiteSpace(UTF16.charAt(str, pos[0])). On
1401: * OUTPUT, the position after the last parsed character.
1402: * @return the Unicode identifier, or null if there is no valid
1403: * identifier at pos[0].
1404: */
1405: public static String parseUnicodeIdentifier(String str, int[] pos) {
1406: // assert(pos[0] < str.length());
1407: // assert(!UCharacterProperty.isRuleWhiteSpace(UTF16.charAt(str, pos[0])));
1408: StringBuffer buf = new StringBuffer();
1409: int p = pos[0];
1410: while (p < str.length()) {
1411: int ch = UTF16.charAt(str, p);
1412: if (buf.length() == 0) {
1413: if (UCharacter.isUnicodeIdentifierStart(ch)) {
1414: UTF16.append(buf, ch);
1415: } else {
1416: return null;
1417: }
1418: } else {
1419: if (UCharacter.isUnicodeIdentifierPart(ch)) {
1420: UTF16.append(buf, ch);
1421: } else {
1422: break;
1423: }
1424: }
1425: p += UTF16.getCharCount(ch);
1426: }
1427: pos[0] = p;
1428: return buf.toString();
1429: }
1430:
1431: /**
1432: * Trim whitespace from ends of a StringBuffer.
1433: */
1434: ///CLOVER:OFF
1435: public static StringBuffer trim(StringBuffer b) {
1436: // TODO update to handle surrogates
1437: int i;
1438: for (i = 0; i < b.length()
1439: && Character.isWhitespace(b.charAt(i)); ++i) {
1440: }
1441: b.delete(0, i);
1442: for (i = b.length() - 1; i >= 0
1443: && Character.isWhitespace(b.charAt(i)); --i) {
1444: }
1445: return b.delete(i + 1, b.length());
1446: }
1447:
1448: ///CLOVER:ON
1449:
/**
 * Digit characters indexed by digit value: '0'-'9' for 0 through 9
 * and 'A'-'Z' for 10 through 35.
 */
static final char DIGITS[] = { '0', '1', '2', '3', '4', '5', '6',
        '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
        'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
        'V', 'W', 'X', 'Y', 'Z' };

/**
 * Append a number to the given StringBuffer in the radix 10
 * generating at least one digit.
 * @param result the buffer to append to
 * @param n the number to append; may be negative
 * @return a reference to result
 */
///CLOVER:OFF
public static StringBuffer appendNumber(StringBuffer result, int n) {
    return appendNumber(result, n, 10, 1);
}

///CLOVER:ON

/**
 * Append the digits of a non-negative integer to the given
 * <code>StringBuffer</code> in the given radix. This is
 * done recursively since it is easiest to generate the low-
 * order digit first, but it must be appended last.
 *
 * @param result is the <code>StringBuffer</code> to append to
 * @param n is the non-negative integer. It is a long so that the
 * magnitude of Integer.MIN_VALUE (2^31) can be passed in.
 * @param radix is the radix, from 2 to 36 inclusive
 * @param minDigits is the minimum number of digits to append.
 */
private static void recursiveAppendNumber(StringBuffer result,
        long n, int radix, int minDigits) {
    int digit = (int) (n % radix);

    if (n >= radix || minDigits > 1) {
        recursiveAppendNumber(result, n / radix, radix,
                minDigits - 1);
    }

    result.append(DIGITS[digit]);
}

/**
 * Append a number to the given StringBuffer in the given radix.
 * Standard digits '0'-'9' are used and letters 'A'-'Z' for
 * radices 11 through 36.
 * @param result the digits of the number are appended here
 * @param n the number to be converted to digits; may be negative,
 * including Integer.MIN_VALUE. If negative, a '-' is prepended to
 * the digits.
 * @param radix a radix from 2 to 36 inclusive.
 * @param minDigits the minimum number of digits, not including
 * any '-', to produce. Values less than 2 have no effect. One
 * digit is always emitted regardless of this parameter.
 * @return a reference to result
 * @throws IllegalArgumentException if radix is outside 2..36;
 * nothing is appended in that case
 */
public static StringBuffer appendNumber(StringBuffer result, int n,
        int radix, int minDigits) throws IllegalArgumentException {
    if (radix < 2 || radix > 36) {
        throw new IllegalArgumentException("Illegal radix " + radix);
    }

    // Negate in 64 bits: -Integer.MIN_VALUE does not fit in an int
    // and would stay negative, producing a negative digit index.
    long magnitude = n;

    if (n < 0) {
        magnitude = -magnitude;
        result.append("-");
    }

    recursiveAppendNumber(result, magnitude, radix, minDigits);

    return result;
}
1519:
1520: /**
1521: * Parse an unsigned 31-bit integer at the given offset. Use
1522: * UCharacter.digit() to parse individual characters into digits.
1523: * @param text the text to be parsed
1524: * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the
1525: * offset within text at which to start parsing; it should point
1526: * to a valid digit. On exit, pos[0] is the offset after the last
1527: * parsed character. If the parse failed, it will be unchanged on
1528: * exit. Must be >= 0 on entry.
1529: * @param radix the radix in which to parse; must be >= 2 and <=
1530: * 36.
1531: * @return a non-negative parsed number, or -1 upon parse failure.
1532: * Parse fails if there are no digits, that is, if pos[0] does not
1533: * point to a valid digit on entry, or if the number to be parsed
1534: * does not fit into a 31-bit unsigned integer.
1535: */
1536: public static int parseNumber(String text, int[] pos, int radix) {
1537: // assert(pos[0] >= 0);
1538: // assert(radix >= 2);
1539: // assert(radix <= 36);
1540: int n = 0;
1541: int p = pos[0];
1542: while (p < text.length()) {
1543: int ch = UTF16.charAt(text, p);
1544: int d = UCharacter.digit(ch, radix);
1545: if (d < 0) {
1546: break;
1547: }
1548: n = radix * n + d;
1549: // ASSUME that when a 32-bit integer overflows it becomes
1550: // negative. E.g., 214748364 * 10 + 8 => negative value.
1551: if (n < 0) {
1552: return -1;
1553: }
1554: ++p;
1555: }
1556: if (p == pos[0]) {
1557: return -1;
1558: }
1559: pos[0] = p;
1560: return n;
1561: }
1562:
1563: /**
1564: * Return true if the character is NOT printable ASCII. The tab,
1565: * newline and linefeed characters are considered unprintable.
1566: */
1567: public static boolean isUnprintable(int c) {
1568: return !(c >= 0x20 && c <= 0x7E);
1569: }
1570:
1571: /**
1572: * Escape unprintable characters using <backslash>uxxxx notation
1573: * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
1574: * above. If the character is printable ASCII, then do nothing
1575: * and return FALSE. Otherwise, append the escaped notation and
1576: * return TRUE.
1577: */
1578: public static boolean escapeUnprintable(StringBuffer result, int c) {
1579: if (isUnprintable(c)) {
1580: result.append('\\');
1581: if ((c & ~0xFFFF) != 0) {
1582: result.append('U');
1583: result.append(DIGITS[0xF & (c >> 28)]);
1584: result.append(DIGITS[0xF & (c >> 24)]);
1585: result.append(DIGITS[0xF & (c >> 20)]);
1586: result.append(DIGITS[0xF & (c >> 16)]);
1587: } else {
1588: result.append('u');
1589: }
1590: result.append(DIGITS[0xF & (c >> 12)]);
1591: result.append(DIGITS[0xF & (c >> 8)]);
1592: result.append(DIGITS[0xF & (c >> 4)]);
1593: result.append(DIGITS[0xF & c]);
1594: return true;
1595: }
1596: return false;
1597: }
1598:
1599: /**
1600: * Returns the index of the first character in a set, ignoring quoted text.
1601: * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
1602: * found by a search for "h". Unlike String.indexOf(), this method searches
1603: * not for a single character, but for any character of the string
1604: * <code>setOfChars</code>.
1605: * @param text text to be searched
1606: * @param start the beginning index, inclusive; <code>0 <= start
1607: * <= limit</code>.
1608: * @param limit the ending index, exclusive; <code>start <= limit
1609: * <= text.length()</code>.
1610: * @param setOfChars string with one or more distinct characters
1611: * @return Offset of the first character in <code>setOfChars</code>
1612: * found, or -1 if not found.
1613: * @see String#indexOf
1614: */
1615: public static int quotedIndexOf(String text, int start, int limit,
1616: String setOfChars) {
1617: for (int i = start; i < limit; ++i) {
1618: char c = text.charAt(i);
1619: if (c == BACKSLASH) {
1620: ++i;
1621: } else if (c == APOSTROPHE) {
1622: while (++i < limit && text.charAt(i) != APOSTROPHE) {
1623: }
1624: } else if (setOfChars.indexOf(c) >= 0) {
1625: return i;
1626: }
1627: }
1628: return -1;
1629: }
1630:
1631: /**
1632: * Similar to StringBuffer.getChars, version 1.3.
1633: * Since JDK 1.2 implements StringBuffer.getChars differently, this method
1634: * is here to provide consistent results.
1635: * To be removed after JDK 1.2 ceased to be the reference platform.
1636: * @param src source string buffer
1637: * @param srcBegin offset to the start of the src to retrieve from
1638: * @param srcEnd offset to the end of the src to retrieve from
1639: * @param dst char array to store the retrieved chars
1640: * @param dstBegin offset to the start of the destination char array to
1641: * store the retrieved chars
1642: * @draft since ICU4J 2.0
1643: */
1644: public static void getChars(StringBuffer src, int srcBegin,
1645: int srcEnd, char dst[], int dstBegin) {
1646: if (srcBegin == srcEnd) {
1647: return;
1648: }
1649: src.getChars(srcBegin, srcEnd, dst, dstBegin);
1650: }
1651:
/**
 * Append a character to a rule that is being built up. To flush
 * the quoteBuf to rule, make one final call with isLiteral == true.
 * If there is no final character, pass in (int)-1 as c.
 *
 * Depending on the arguments and on whether quoteBuf is empty, the
 * character is either written straight to rule (possibly escaped),
 * written to rule behind a backslash, or accumulated in quoteBuf
 * for later emission between apostrophes.
 * @param rule the string to append the character to
 * @param c the character to append, or (int)-1 if none.
 * @param isLiteral if true, then the given character should not be
 * quoted or escaped. Usually this means it is a syntactic element
 * such as > or $
 * @param escapeUnprintable if true, then unprintable characters
 * should be escaped using escapeUnprintable(). These escapes will
 * appear outside of quotes.
 * @param quoteBuf a buffer which is used to build up quoted
 * substrings. The caller should initially supply an empty buffer,
 * and thereafter should not modify the buffer. The buffer should be
 * cleared out by, at the end, calling this method with a literal
 * character (which may be -1).
 */
public static void appendToRule(StringBuffer rule, int c,
        boolean isLiteral, boolean escapeUnprintable,
        StringBuffer quoteBuf) {
    // If we are escaping unprintables, then escape them outside
    // quotes. \\u and \\U are not recognized within quotes. The same
    // logic applies to literals, but literals are never escaped.
    if (isLiteral
            || (escapeUnprintable && Utility.isUnprintable(c))) {
        // Step 1: flush any pending quoted text to rule.
        if (quoteBuf.length() > 0) {
            // We prefer backslash APOSTROPHE to double APOSTROPHE
            // (more readable, less similar to ") so if there are
            // double APOSTROPHEs at the ends, we pull them outside
            // of the quote.

            // If the first thing in the quoteBuf is APOSTROPHE
            // (doubled) then pull it out.
            while (quoteBuf.length() >= 2
                    && quoteBuf.charAt(0) == APOSTROPHE
                    && quoteBuf.charAt(1) == APOSTROPHE) {
                rule.append(BACKSLASH).append(APOSTROPHE);
                quoteBuf.delete(0, 2);
            }
            // If the last thing in the quoteBuf is APOSTROPHE
            // (doubled) then remove and count it and add it after.
            int trailingCount = 0;
            while (quoteBuf.length() >= 2
                    && quoteBuf.charAt(quoteBuf.length() - 2) == APOSTROPHE
                    && quoteBuf.charAt(quoteBuf.length() - 1) == APOSTROPHE) {
                quoteBuf.setLength(quoteBuf.length() - 2);
                ++trailingCount;
            }
            // Whatever is left goes out between apostrophes, and the
            // buffer is emptied for the next quoted run.
            if (quoteBuf.length() > 0) {
                rule.append(APOSTROPHE);
                // jdk 1.3.1 does not have append(StringBuffer) yet
                if (ICUDebug.isJDK14OrHigher) {
                    rule.append(quoteBuf);
                } else {
                    rule.append(quoteBuf.toString());
                }
                rule.append(APOSTROPHE);
                quoteBuf.setLength(0);
            }
            // Re-emit the trailing doubled apostrophes as \' each.
            while (trailingCount-- > 0) {
                rule.append(BACKSLASH).append(APOSTROPHE);
            }
        }
        // Step 2: emit c itself, unless the call was a pure flush.
        if (c != -1) {
            /* Since spaces are ignored during parsing, they are
             * emitted only for readability. We emit one here
             * only if there isn't already one at the end of the
             * rule.
             */
            if (c == ' ') {
                int len = rule.length();
                // No space is emitted into an empty rule either.
                if (len > 0 && rule.charAt(len - 1) != ' ') {
                    rule.append(' ');
                }
            } else if (!escapeUnprintable
                    || !Utility.escapeUnprintable(rule, c)) {
                // Either escaping is off, or escapeUnprintable()
                // reported c as printable and appended nothing.
                UTF16.append(rule, c);
            }
        }
    }

    // Escape ' and '\' and don't begin a quote just for them
    else if (quoteBuf.length() == 0
            && (c == APOSTROPHE || c == BACKSLASH)) {
        rule.append(BACKSLASH).append((char) c);
    }

    // Specials (printable ascii that isn't [0-9a-zA-Z]) and
    // whitespace need quoting. Also append stuff to quotes if we are
    // building up a quoted substring already.
    else if (quoteBuf.length() > 0
            || (c >= 0x0021 && c <= 0x007E && !((c >= 0x0030/*'0'*/&& c <= 0x0039/*'9'*/)
                    || (c >= 0x0041/*'A'*/&& c <= 0x005A/*'Z'*/) || (c >= 0x0061/*'a'*/&& c <= 0x007A/*'z'*/)))
            || UCharacterProperty.isRuleWhiteSpace(c)) {
        UTF16.append(quoteBuf, c);
        // Double ' within a quote
        if (c == APOSTROPHE) {
            quoteBuf.append((char) c);
        }
    }

    // Otherwise just append
    else {
        UTF16.append(rule, c);
    }
}
1759:
1760: /**
1761: * Append the given string to the rule. Calls the single-character
1762: * version of appendToRule for each character.
1763: */
1764: public static void appendToRule(StringBuffer rule, String text,
1765: boolean isLiteral, boolean escapeUnprintable,
1766: StringBuffer quoteBuf) {
1767: for (int i = 0; i < text.length(); ++i) {
1768: // Okay to process in 16-bit code units here
1769: appendToRule(rule, text.charAt(i), isLiteral,
1770: escapeUnprintable, quoteBuf);
1771: }
1772: }
1773:
1774: /**
1775: * Given a matcher reference, which may be null, append its
1776: * pattern as a literal to the given rule.
1777: */
1778: public static void appendToRule(StringBuffer rule,
1779: UnicodeMatcher matcher, boolean escapeUnprintable,
1780: StringBuffer quoteBuf) {
1781: if (matcher != null) {
1782: appendToRule(rule, matcher.toPattern(escapeUnprintable),
1783: true, escapeUnprintable, quoteBuf);
1784: }
1785: }
1786:
1787: /**
1788: * Compares 2 unsigned integers
1789: * @param source 32 bit unsigned integer
1790: * @param target 32 bit unsigned integer
1791: * @return 0 if equals, 1 if source is greater than target and -1
1792: * otherwise
1793: */
1794: public static final int compareUnsigned(int source, int target) {
1795: source += MAGIC_UNSIGNED;
1796: target += MAGIC_UNSIGNED;
1797: if (source < target) {
1798: return -1;
1799: } else if (source > target) {
1800: return 1;
1801: }
1802: return 0;
1803: }
1804:
1805: /**
1806: * Find the highest bit in a positive integer. This is done
1807: * by doing a binary search through the bits.
1808: *
1809: * @param n is the integer
1810: *
1811: * @return the bit number of the highest bit, with 0 being
1812: * the low order bit, or -1 if <code>n</code> is not positive
1813: */
1814: public static final byte highBit(int n) {
1815: if (n <= 0) {
1816: return -1;
1817: }
1818:
1819: byte bit = 0;
1820:
1821: if (n >= 1 << 16) {
1822: n >>= 16;
1823: bit += 16;
1824: }
1825:
1826: if (n >= 1 << 8) {
1827: n >>= 8;
1828: bit += 8;
1829: }
1830:
1831: if (n >= 1 << 4) {
1832: n >>= 4;
1833: bit += 4;
1834: }
1835:
1836: if (n >= 1 << 2) {
1837: n >>= 2;
1838: bit += 2;
1839: }
1840:
1841: if (n >= 1 << 1) {
1842: n >>= 1;
1843: bit += 1;
1844: }
1845:
1846: return bit;
1847: }
1848:
1849: /**
1850: * Utility method to take a int[] containing codepoints and return
1851: * a string representation with code units.
1852: */
1853: public static String valueOf(int[] source) {
1854: // TODO: Investigate why this method is not on UTF16 class
1855: StringBuffer result = new StringBuffer(source.length);
1856: for (int i = 0; i < source.length; i++) {
1857: UTF16.append(result, source[i]);
1858: }
1859: return result.toString();
1860: }
1861:
1862: /**
1863: * Utility to duplicate a string count times
1864: * @param s
1865: * @param count
1866: */
1867: public static String repeat(String s, int count) {
1868: if (count <= 0)
1869: return "";
1870: if (count == 1)
1871: return s;
1872: StringBuffer result = new StringBuffer();
1873: for (int i = 0; i < count; ++i) {
1874: result.append(s);
1875: }
1876: return result.toString();
1877: }
1878:
/**
 * Returns the index of the first occurrence of s within buf, as
 * reported by StringBuffer.indexOf(String).
 *
 * The //# lines are conditional-compilation markers and must be
 * kept verbatim. The disabled FOUNDATION variant performs the same
 * search on buf.toString() -- presumably for class libraries
 * without StringBuffer.indexOf; confirm against the build's
 * preprocessor.
 * @param buf the buffer to search
 * @param s the string to look for
 * @return the index of the first occurrence, or -1 if none
 */
// !!! 1.3 compatibility
public static int indexOf(StringBuffer buf, String s) {
//#ifndef FOUNDATION
    return buf.indexOf(s);
//#else
//## return buf.toString().indexOf(s);
//#endif
}
1887:
/**
 * Returns the index of the last occurrence of s within buf, as
 * reported by StringBuffer.lastIndexOf(String).
 *
 * The //# lines are conditional-compilation markers and must be
 * kept verbatim. The disabled FOUNDATION variant performs the same
 * search on buf.toString() -- presumably for class libraries
 * without StringBuffer.lastIndexOf; confirm against the build's
 * preprocessor.
 * @param buf the buffer to search
 * @param s the string to look for
 * @return the index of the last occurrence, or -1 if none
 */
// !!! 1.3 compatibility
public static int lastIndexOf(StringBuffer buf, String s) {
//#ifndef FOUNDATION
    return buf.lastIndexOf(s);
//#else
//## return buf.toString().lastIndexOf(s);
//#endif
}
1896:
/**
 * Returns the index of the first occurrence of s within buf at or
 * after index i, as reported by StringBuffer.indexOf(String, int).
 *
 * The //# lines are conditional-compilation markers and must be
 * kept verbatim. The disabled FOUNDATION variant performs the same
 * search on buf.toString() -- presumably for class libraries
 * without StringBuffer.indexOf; confirm against the build's
 * preprocessor.
 * @param buf the buffer to search
 * @param s the string to look for
 * @param i the index at which the search starts
 * @return the index of the occurrence found, or -1 if none
 */
// !!! 1.3 compatibility
public static int indexOf(StringBuffer buf, String s, int i) {
//#ifndef FOUNDATION
    return buf.indexOf(s, i);
//#else
//## return buf.toString().indexOf(s, i);
//#endif
}
1905:
/**
 * Returns the index of the last occurrence of s within buf,
 * searching backward from index i, as reported by
 * StringBuffer.lastIndexOf(String, int).
 *
 * The //# lines are conditional-compilation markers and must be
 * kept verbatim. The disabled FOUNDATION variant performs the same
 * search on buf.toString() -- presumably for class libraries
 * without StringBuffer.lastIndexOf; confirm against the build's
 * preprocessor.
 * @param buf the buffer to search
 * @param s the string to look for
 * @param i the index at which the backward search starts
 * @return the index of the occurrence found, or -1 if none
 */
// !!! 1.3 compatibility
public static int lastIndexOf(StringBuffer buf, String s, int i) {
//#ifndef FOUNDATION
    return buf.lastIndexOf(s, i);
//#else
//## return buf.toString().lastIndexOf(s, i);
//#endif
}
1914:
/**
 * Replaces occurrences of target within src by replacement, by
 * delegating to String.replaceAll(target, replacement).
 *
 * The //# lines are conditional-compilation markers and must be
 * kept verbatim.
 *
 * NOTE(review): the two variants are not equivalent.
 * String.replaceAll treats target as a regular expression (and
 * interprets '$' and '\' in replacement), whereas the disabled
 * FOUNDATION variant below searches for target literally with
 * String.indexOf. Callers passing regex metacharacters would see
 * different results depending on the build -- confirm which
 * semantics callers rely on.
 * @param src the string to process
 * @param target the text to replace (a regular expression in the
 * active variant)
 * @param replacement the text to substitute for each match
 * @return the resulting string
 */
// !!! 1.3 compatibility
public static String replaceAll(String src, String target,
        String replacement) {
//#ifndef FOUNDATION
    return src.replaceAll(target, replacement);
//#else
//## int i = src.indexOf(target);
//## if (i == -1) {
//## return src;
//## }
//## StringBuffer buf = new StringBuffer();
//## int n = 0;
//## do {
//## buf.append(src.substring(n, i));
//## buf.append(replacement);
//## n = i + target.length();
//## i = src.indexOf(target, n);
//## } while (i != -1);
//## if (n < src.length()) {
//## buf.append(src.substring(n));
//## }
//## return buf.toString();
//#endif
}
1939: }
1940: ///CLOVER:ON
|