001: /*-
002: * See the file LICENSE for redistribution information.
003: *
004: * Copyright (c) 2000,2008 Oracle. All rights reserved.
005: *
006: * $Id: UtfOps.java,v 1.17.2.2 2008/01/07 15:14:21 cwl Exp $
007: */
008:
009: package com.sleepycat.util;
010:
011: /**
012: * UTF operations with more flexibility than is provided by DataInput and
013: * DataOutput.
014: *
015: * @author Mark Hayes
016: */
public class UtfOps {

    private static final byte[] EMPTY_BYTES = {};
    private static final String EMPTY_STRING = "";

    /**
     * Returns the byte length of a null terminated UTF string, not including
     * the terminator.
     *
     * @param bytes the data containing the UTF string.
     *
     * @param offset the beginning of the string to measure.
     *
     * @throws IndexOutOfBoundsException if no zero terminator is found.
     *
     * @return the number of bytes.
     */
    public static int getZeroTerminatedByteLength(byte[] bytes,
                                                  int offset)
        throws IndexOutOfBoundsException {

        int len = 0;
        /* Running off the end of the array raises the documented exception. */
        while (bytes[offset++] != 0) {
            len++;
        }
        return len;
    }

    /**
     * Returns the byte length of the UTF string that would be created by
     * converting the given characters to UTF.
     *
     * @param chars the characters that would be converted.
     *
     * @return the byte length of the equivalent UTF data.
     */
    public static int getByteLength(char[] chars) {

        return getByteLength(chars, 0, chars.length);
    }

    /**
     * Returns the byte length of the UTF string that would be created by
     * converting the given characters to UTF.
     *
     * <p>Characters 0x0001 to 0x007F take one byte, characters above 0x07FF
     * take three bytes, and everything else (including the zero character)
     * takes two bytes.</p>
     *
     * @param chars the characters that would be converted.
     *
     * @param offset the first character to be converted.
     *
     * @param length the number of characters to be converted.
     *
     * @return the byte length of the equivalent UTF data.
     */
    public static int getByteLength(char[] chars, int offset, int length) {

        int len = 0;
        int end = offset + length;
        for (int i = offset; i < end; i++) {
            int c = chars[i];
            if ((c >= 0x0001) && (c <= 0x007F)) {
                len++;
            } else if (c > 0x07FF) {
                len += 3;
            } else {
                /* 0x0000 and 0x0080..0x07FF. */
                len += 2;
            }
        }
        return len;
    }

    /**
     * Returns the number of characters represented by the given UTF string.
     *
     * @param bytes the UTF string.
     *
     * @return the number of characters.
     *
     * @throws IndexOutOfBoundsException if a UTF character sequence at the end
     * of the data is not complete.
     *
     * @throws IllegalArgumentException if an illegal UTF sequence is
     * encountered.
     */
    public static int getCharLength(byte[] bytes)
        throws IllegalArgumentException, IndexOutOfBoundsException {

        return getCharLength(bytes, 0, bytes.length);
    }

    /**
     * Returns the number of characters represented by the given UTF string.
     *
     * <p>Only the first byte of each sequence is examined to determine the
     * sequence length; continuation bytes are not validated by this
     * method.</p>
     *
     * @param bytes the data containing the UTF string.
     *
     * @param offset the first byte to be converted.
     *
     * @param length the number of bytes to be converted.
     *
     * @return the number of characters.
     *
     * @throws IndexOutOfBoundsException if a UTF character sequence at the end
     * of the data is not complete.
     *
     * @throws IllegalArgumentException if an illegal UTF sequence is
     * encountered.
     */
    public static int getCharLength(byte[] bytes, int offset, int length)
        throws IllegalArgumentException, IndexOutOfBoundsException {

        int charCount = 0;
        int end = offset + length;
        while (offset < end) {
            int lead = bytes[offset] & 0xff;
            switch (lead >> 4) {
            case 0:
            case 1:
            case 2:
            case 3:
            case 4:
            case 5:
            case 6:
            case 7:
                offset++;
                break;
            case 12:
            case 13:
                offset += 2;
                break;
            case 14:
                offset += 3;
                break;
            default:
                throw new IllegalArgumentException
                    ("Illegal UTF lead byte 0x" + Integer.toHexString(lead) +
                     " at offset " + offset);
            }
            charCount++;
        }
        /*
         * If the last sequence stepped past the end of the range, its
         * continuation bytes lie outside the data the caller asked about.
         */
        if (offset > end) {
            throw new IndexOutOfBoundsException
                ("Incomplete UTF sequence at end of data");
        }
        return charCount;
    }

    /**
     * Converts byte arrays into character arrays.
     *
     * <p>The length limit is only tested at the start of each sequence.  When
     * isByteLen is true and a multi-byte sequence begins before the limit, all
     * of its bytes are consumed even if they extend past the limit.</p>
     *
     * @param bytes the source byte data to convert
     *
     * @param byteOffset the offset into the byte array at which
     * to start the conversion
     *
     * @param chars the destination array
     *
     * @param charOffset the offset into chars at which to begin the copy
     *
     * @param len the amount of information to copy into chars
     *
     * @param isByteLen if true then len is a measure of bytes, otherwise
     * len is a measure of characters
     *
     * @return the byte offset following the last byte that was consumed.
     *
     * @throws IndexOutOfBoundsException if a UTF character sequence at the end
     * of the data is not complete.
     *
     * @throws IllegalArgumentException if an illegal UTF sequence is
     * encountered.
     */
    public static int bytesToChars(byte[] bytes, int byteOffset,
                                   char[] chars, int charOffset,
                                   int len, boolean isByteLen)
        throws IllegalArgumentException, IndexOutOfBoundsException {

        int char1, char2, char3;
        /* Turn len into an end position in whichever array it measures. */
        len += isByteLen ? byteOffset : charOffset;
        while ((isByteLen ? byteOffset : charOffset) < len) {
            char1 = bytes[byteOffset++] & 0xff;
            switch (char1 >> 4) {
            case 0:
            case 1:
            case 2:
            case 3:
            case 4:
            case 5:
            case 6:
            case 7:
                /* Single byte: 0xxxxxxx. */
                chars[charOffset++] = (char) char1;
                break;
            case 12:
            case 13:
                /* Two bytes: 110xxxxx 10xxxxxx. */
                char2 = bytes[byteOffset++];
                if ((char2 & 0xC0) != 0x80) {
                    throw new IllegalArgumentException
                        ("Illegal UTF continuation byte at offset " +
                         (byteOffset - 1));
                }
                chars[charOffset++] =
                    (char) (((char1 & 0x1F) << 6) | (char2 & 0x3F));
                break;
            case 14:
                /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
                char2 = bytes[byteOffset++];
                char3 = bytes[byteOffset++];
                if (((char2 & 0xC0) != 0x80) ||
                    ((char3 & 0xC0) != 0x80)) {
                    throw new IllegalArgumentException
                        ("Illegal UTF continuation byte in sequence ending " +
                         "at offset " + (byteOffset - 1));
                }
                chars[charOffset++] = (char) (((char1 & 0x0F) << 12) |
                                              ((char2 & 0x3F) << 6) |
                                              (char3 & 0x3F));
                break;
            default:
                throw new IllegalArgumentException
                    ("Illegal UTF lead byte 0x" + Integer.toHexString(char1) +
                     " at offset " + (byteOffset - 1));
            }
        }
        return byteOffset;
    }

    /**
     * Converts character arrays into byte arrays.
     *
     * <p>The destination must have room for the number of bytes reported by
     * {@link #getByteLength(char[], int, int)} for the same characters.  The
     * zero character is written as two bytes, so the output never contains a
     * zero byte.</p>
     *
     * @param chars the source character data to convert
     *
     * @param charOffset the offset into the character array at which
     * to start the conversion
     *
     * @param bytes the destination array
     *
     * @param byteOffset the offset into bytes at which to begin the copy
     *
     * @param charLength the length of characters to copy into bytes
     */
    public static void charsToBytes(char[] chars, int charOffset,
                                    byte[] bytes, int byteOffset,
                                    int charLength) {

        int end = charOffset + charLength;
        for (int i = charOffset; i < end; i++) {
            int c = chars[i];
            if ((c >= 0x0001) && (c <= 0x007F)) {
                bytes[byteOffset++] = (byte) c;
            } else if (c > 0x07FF) {
                bytes[byteOffset++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
                bytes[byteOffset++] = (byte) (0x80 | ((c >> 6) & 0x3F));
                bytes[byteOffset++] = (byte) (0x80 | (c & 0x3F));
            } else {
                bytes[byteOffset++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
                bytes[byteOffset++] = (byte) (0x80 | (c & 0x3F));
            }
        }
    }

    /**
     * Converts byte arrays into strings.
     *
     * @param bytes the source byte data to convert
     *
     * @param offset the offset into the byte array at which
     * to start the conversion
     *
     * @param length the number of bytes to be converted.
     *
     * @return the string.
     *
     * @throws IndexOutOfBoundsException if a UTF character sequence at the end
     * of the data is not complete.
     *
     * @throws IllegalArgumentException if an illegal UTF sequence is
     * encountered.
     */
    public static String bytesToString(byte[] bytes, int offset, int length)
        throws IllegalArgumentException, IndexOutOfBoundsException {

        if (length == 0) {
            return EMPTY_STRING;
        }
        /* Count first so the character buffer can be sized exactly. */
        int charLen = UtfOps.getCharLength(bytes, offset, length);
        char[] chars = new char[charLen];
        UtfOps.bytesToChars(bytes, offset, chars, 0, length, true);
        return new String(chars, 0, charLen);
    }

    /**
     * Converts strings to byte arrays.
     *
     * @param string the string to convert.
     *
     * @return the UTF byte array; a shared zero-length array is returned for
     * the empty string.
     */
    public static byte[] stringToBytes(String string) {

        if (string.length() == 0) {
            return EMPTY_BYTES;
        }
        char[] chars = string.toCharArray();
        byte[] bytes = new byte[UtfOps.getByteLength(chars)];
        UtfOps.charsToBytes(chars, 0, bytes, 0, chars.length);
        return bytes;
    }
}
|