001: /*
002: *******************************************************************************
003: * Copyright (C) 2003-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import com.ibm.icu.lang.UCharacter;
010:
011: /**
012: * Ported code from ICU punycode.c
013: * @author ram
014: */
015:
016: /* Package Private class */
017: final class Punycode {
018:
019: /* Punycode parameters for Bootstring */
020: private static final int BASE = 36;
021: private static final int TMIN = 1;
022: private static final int TMAX = 26;
023: private static final int SKEW = 38;
024: private static final int DAMP = 700;
025: private static final int INITIAL_BIAS = 72;
026: private static final int INITIAL_N = 0x80;
027:
028: /* "Basic" Unicode/ASCII code points */
029: private static final int HYPHEN = 0x2d;
030: private static final int DELIMITER = HYPHEN;
031:
032: private static final int ZERO = 0x30;
033: private static final int NINE = 0x39;
034:
035: private static final int SMALL_A = 0x61;
036: private static final int SMALL_Z = 0x7a;
037:
038: private static final int CAPITAL_A = 0x41;
039: private static final int CAPITAL_Z = 0x5a;
040: private static final int MAX_CP_COUNT = 200;
041: private static final int UINT_MAGIC = 0x80000000;
042: private static final long ULONG_MAGIC = 0x8000000000000000L;
043:
044: private static int adaptBias(int delta, int length,
045: boolean firstTime) {
046: if (firstTime) {
047: delta /= DAMP;
048: } else {
049: delta /= 2;
050: }
051: delta += delta / length;
052:
053: int count = 0;
054: for (; delta > ((BASE - TMIN) * TMAX) / 2; count += BASE) {
055: delta /= (BASE - TMIN);
056: }
057:
058: return count + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
059: }
060:
061: /**
062: * basicToDigit[] contains the numeric value of a basic code
063: * point (for use in representing integers) in the range 0 to
064: * BASE-1, or -1 if b is does not represent a value.
065: */
066: static final int[] basicToDigit = new int[] { -1, -1, -1, -1, -1,
067: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
068: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
069:
070: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
071: -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1,
072: -1, -1,
073:
074: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
075: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
076:
077: -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
078: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
079:
080: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
081: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
082: -1, -1,
083:
084: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
085: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
086: -1, -1,
087:
088: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
089: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
090: -1, -1,
091:
092: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
093: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
094: -1, -1 };
095:
096: ///CLOVER:OFF
097: private static char asciiCaseMap(char b, boolean uppercase) {
098: if (uppercase) {
099: if (SMALL_A <= b && b <= SMALL_Z) {
100: b -= (SMALL_A - CAPITAL_A);
101: }
102: } else {
103: if (CAPITAL_A <= b && b <= CAPITAL_Z) {
104: b += (SMALL_A - CAPITAL_A);
105: }
106: }
107: return b;
108: }
109:
110: ///CLOVER:ON
111: /**
112: * digitToBasic() returns the basic code point whose value
113: * (when used for representing integers) is d, which must be in the
114: * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
115: * nonzero, in which case the uppercase form is used.
116: */
117: private static char digitToBasic(int digit, boolean uppercase) {
118: /* 0..25 map to ASCII a..z or A..Z */
119: /* 26..35 map to ASCII 0..9 */
120: if (digit < 26) {
121: if (uppercase) {
122: return (char) (CAPITAL_A + digit);
123: } else {
124: return (char) (SMALL_A + digit);
125: }
126: } else {
127: return (char) ((ZERO - 26) + digit);
128: }
129: }
130:
131: /**
132: * Converts Unicode to Punycode.
133: * The input string must not contain single, unpaired surrogates.
134: * The output will be represented as an array of ASCII code points.
135: *
136: * @param src
137: * @param caseFlags
138: * @return
139: * @throws ParseException
140: */
141: public static StringBuffer encode(StringBuffer src,
142: boolean[] caseFlags) throws StringPrepParseException {
143:
144: int[] cpBuffer = new int[MAX_CP_COUNT];
145: int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
146: char c, c2;
147: int srcLength = src.length();
148: int destCapacity = MAX_CP_COUNT;
149: char[] dest = new char[destCapacity];
150: StringBuffer result = new StringBuffer();
151: /*
152: * Handle the basic code points and
153: * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
154: */
155: srcCPCount = destLength = 0;
156:
157: for (j = 0; j < srcLength; ++j) {
158: if (srcCPCount == MAX_CP_COUNT) {
159: /* too many input code points */
160: throw new IndexOutOfBoundsException();
161: }
162: c = src.charAt(j);
163: if (isBasic(c)) {
164: if (destLength < destCapacity) {
165: cpBuffer[srcCPCount++] = 0;
166: dest[destLength] = caseFlags != null ? asciiCaseMap(
167: (char) c, caseFlags[j])
168: : (char) c;
169: }
170: ++destLength;
171: } else {
172: n = ((caseFlags != null && caseFlags[j]) ? 1 : 0) << 31L;
173: if (!UTF16.isSurrogate(c)) {
174: n |= c;
175: } else if (UTF16.isLeadSurrogate(c)
176: && (j + 1) < srcLength
177: && UTF16.isTrailSurrogate(c2 = src
178: .charAt(j + 1))) {
179: ++j;
180:
181: n |= UCharacter.getCodePoint(c, c2);
182: } else {
183: /* error: unmatched surrogate */
184: throw new StringPrepParseException(
185: "Illegal char found",
186: StringPrepParseException.ILLEGAL_CHAR_FOUND);
187: }
188: cpBuffer[srcCPCount++] = n;
189: }
190: }
191:
192: /* Finish the basic string - if it is not empty - with a delimiter. */
193: basicLength = destLength;
194: if (basicLength > 0) {
195: if (destLength < destCapacity) {
196: dest[destLength] = DELIMITER;
197: }
198: ++destLength;
199: }
200:
201: /*
202: * handledCPCount is the number of code points that have been handled
203: * basicLength is the number of basic code points
204: * destLength is the number of chars that have been output
205: */
206:
207: /* Initialize the state: */
208: n = INITIAL_N;
209: delta = 0;
210: bias = INITIAL_BIAS;
211:
212: /* Main encoding loop: */
213: for (handledCPCount = basicLength; handledCPCount < srcCPCount; /* no op */) {
214: /*
215: * All non-basic code points < n have been handled already.
216: * Find the next larger one:
217: */
218: for (m = 0x7fffffff, j = 0; j < srcCPCount; ++j) {
219: q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
220: if (n <= q && q < m) {
221: m = q;
222: }
223: }
224:
225: /*
226: * Increase delta enough to advance the decoder's
227: * <n,i> state to <m,0>, but guard against overflow:
228: */
229: if (m - n > (0x7fffffff - MAX_CP_COUNT - delta)
230: / (handledCPCount + 1)) {
231: throw new IllegalStateException(
232: "Internal program error");
233: }
234: delta += (m - n) * (handledCPCount + 1);
235: n = m;
236:
237: /* Encode a sequence of same code points n */
238: for (j = 0; j < srcCPCount; ++j) {
239: q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
240: if (q < n) {
241: ++delta;
242: } else if (q == n) {
243: /* Represent delta as a generalized variable-length integer: */
244: for (q = delta, k = BASE; /* no condition */; k += BASE) {
245:
246: /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
247:
248: t=k-bias;
249: if(t<TMIN) {
250: t=TMIN;
251: } else if(t>TMAX) {
252: t=TMAX;
253: }
254: */
255:
256: t = k - bias;
257: if (t < TMIN) {
258: t = TMIN;
259: } else if (k >= (bias + TMAX)) {
260: t = TMAX;
261: }
262:
263: if (q < t) {
264: break;
265: }
266:
267: if (destLength < destCapacity) {
268: dest[destLength++] = digitToBasic(t
269: + (q - t) % (BASE - t), false);
270: }
271: q = (q - t) / (BASE - t);
272: }
273:
274: if (destLength < destCapacity) {
275: dest[destLength++] = digitToBasic(q,
276: (cpBuffer[j] < 0));
277: }
278: bias = adaptBias(delta, handledCPCount + 1,
279: (handledCPCount == basicLength));
280: delta = 0;
281: ++handledCPCount;
282: }
283: }
284:
285: ++delta;
286: ++n;
287: }
288:
289: return result.append(dest, 0, destLength);
290: }
291:
292: private static boolean isBasic(int ch) {
293: return (ch < INITIAL_N);
294: }
295:
296: ///CLOVER:OFF
297: private static boolean isBasicUpperCase(int ch) {
298: return (CAPITAL_A <= ch && ch >= CAPITAL_Z);
299: }
300:
301: ///CLOVER:ON
302: private static boolean isSurrogate(int ch) {
303: return (((ch) & 0xfffff800) == 0xd800);
304: }
305:
306: /**
307: * Converts Punycode to Unicode.
308: * The Unicode string will be at most as long as the Punycode string.
309: *
310: * @param src
311: * @param caseFlags
312: * @return
313: * @throws ParseException
314: */
315: public static StringBuffer decode(StringBuffer src,
316: boolean[] caseFlags) throws StringPrepParseException {
317: int srcLength = src.length();
318: StringBuffer result = new StringBuffer();
319: int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, destCPCount, firstSupplementaryIndex, cpLength;
320: char b;
321: int destCapacity = MAX_CP_COUNT;
322: char[] dest = new char[destCapacity];
323:
324: /*
325: * Handle the basic code points:
326: * Let basicLength be the number of input code points
327: * before the last delimiter, or 0 if there is none,
328: * then copy the first basicLength code points to the output.
329: *
330: * The two following loops iterate backward.
331: */
332: for (j = srcLength; j > 0;) {
333: if (src.charAt(--j) == DELIMITER) {
334: break;
335: }
336: }
337: destLength = basicLength = destCPCount = j;
338:
339: while (j > 0) {
340: b = src.charAt(--j);
341: if (!isBasic(b)) {
342: throw new StringPrepParseException(
343: "Illegal char found",
344: StringPrepParseException.INVALID_CHAR_FOUND);
345: }
346:
347: if (j < destCapacity) {
348: dest[j] = b;
349:
350: if (caseFlags != null) {
351: caseFlags[j] = isBasicUpperCase(b);
352: }
353: }
354: }
355:
356: /* Initialize the state: */
357: n = INITIAL_N;
358: i = 0;
359: bias = INITIAL_BIAS;
360: firstSupplementaryIndex = 1000000000;
361:
362: /*
363: * Main decoding loop:
364: * Start just after the last delimiter if any
365: * basic code points were copied; start at the beginning otherwise.
366: */
367: for (in = basicLength > 0 ? basicLength + 1 : 0; in < srcLength; /* no op */) {
368: /*
369: * in is the index of the next character to be consumed, and
370: * destCPCount is the number of code points in the output array.
371: *
372: * Decode a generalized variable-length integer into delta,
373: * which gets added to i. The overflow checking is easier
374: * if we increase i as we go, then subtract off its starting
375: * value at the end to obtain delta.
376: */
377: for (oldi = i, w = 1, k = BASE; /* no condition */; k += BASE) {
378: if (in >= srcLength) {
379: throw new StringPrepParseException(
380: "Illegal char found",
381: StringPrepParseException.ILLEGAL_CHAR_FOUND);
382: }
383:
384: digit = basicToDigit[src.charAt(in++) & 0xFF];
385: if (digit < 0) {
386: throw new StringPrepParseException(
387: "Invalid char found",
388: StringPrepParseException.INVALID_CHAR_FOUND);
389: }
390: if (digit > (0x7fffffff - i) / w) {
391: /* integer overflow */
392: throw new StringPrepParseException(
393: "Illegal char found",
394: StringPrepParseException.ILLEGAL_CHAR_FOUND);
395: }
396:
397: i += digit * w;
398: t = k - bias;
399: if (t < TMIN) {
400: t = TMIN;
401: } else if (k >= (bias + TMAX)) {
402: t = TMAX;
403: }
404: if (digit < t) {
405: break;
406: }
407:
408: if (w > 0x7fffffff / (BASE - t)) {
409: /* integer overflow */
410: throw new StringPrepParseException(
411: "Illegal char found",
412: StringPrepParseException.ILLEGAL_CHAR_FOUND);
413: }
414: w *= BASE - t;
415: }
416:
417: /*
418: * Modification from sample code:
419: * Increments destCPCount here,
420: * where needed instead of in for() loop tail.
421: */
422: ++destCPCount;
423: bias = adaptBias(i - oldi, destCPCount, (oldi == 0));
424:
425: /*
426: * i was supposed to wrap around from (incremented) destCPCount to 0,
427: * incrementing n each time, so we'll fix that now:
428: */
429: if (i / destCPCount > (0x7fffffff - n)) {
430: /* integer overflow */
431: throw new StringPrepParseException(
432: "Illegal char found",
433: StringPrepParseException.ILLEGAL_CHAR_FOUND);
434: }
435:
436: n += i / destCPCount;
437: i %= destCPCount;
438: /* not needed for Punycode: */
439: /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
440:
441: if (n > 0x10ffff || isSurrogate(n)) {
442: /* Unicode code point overflow */
443: throw new StringPrepParseException(
444: "Illegal char found",
445: StringPrepParseException.ILLEGAL_CHAR_FOUND);
446: }
447:
448: /* Insert n at position i of the output: */
449: cpLength = UTF16.getCharCount(n);
450: if ((destLength + cpLength) < destCapacity) {
451: int codeUnitIndex;
452:
453: /*
454: * Handle indexes when supplementary code points are present.
455: *
456: * In almost all cases, there will be only BMP code points before i
457: * and even in the entire string.
458: * This is handled with the same efficiency as with UTF-32.
459: *
460: * Only the rare cases with supplementary code points are handled
461: * more slowly - but not too bad since this is an insertion anyway.
462: */
463: if (i <= firstSupplementaryIndex) {
464: codeUnitIndex = i;
465: if (cpLength > 1) {
466: firstSupplementaryIndex = codeUnitIndex;
467: } else {
468: ++firstSupplementaryIndex;
469: }
470: } else {
471: codeUnitIndex = firstSupplementaryIndex;
472: codeUnitIndex = UTF16.moveCodePointOffset(dest, 0,
473: destLength, codeUnitIndex, i
474: - codeUnitIndex);
475: }
476:
477: /* use the UChar index codeUnitIndex instead of the code point index i */
478: if (codeUnitIndex < destLength) {
479: System.arraycopy(dest, codeUnitIndex, dest,
480: codeUnitIndex + cpLength,
481: (destLength - codeUnitIndex));
482: if (caseFlags != null) {
483: System.arraycopy(caseFlags, codeUnitIndex,
484: caseFlags, codeUnitIndex + cpLength,
485: destLength - codeUnitIndex);
486: }
487: }
488: if (cpLength == 1) {
489: /* BMP, insert one code unit */
490: dest[codeUnitIndex] = (char) n;
491: } else {
492: /* supplementary character, insert two code units */
493: dest[codeUnitIndex] = UTF16.getLeadSurrogate(n);
494: dest[codeUnitIndex + 1] = UTF16
495: .getTrailSurrogate(n);
496: }
497: if (caseFlags != null) {
498: /* Case of last character determines uppercase flag: */
499: caseFlags[codeUnitIndex] = isBasicUpperCase(src
500: .charAt(in - 1));
501: if (cpLength == 2) {
502: caseFlags[codeUnitIndex + 1] = false;
503: }
504: }
505: }
506: destLength += cpLength;
507: ++i;
508: }
509: result.append(dest, 0, destLength);
510: return result;
511: }
512: }
|