001: /*
002: *******************************************************************************
003: * Copyright (C) 2003-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import java.io.IOException;
010: import java.io.InputStream;
011: import java.util.MissingResourceException;
012:
013: import com.ibm.icu.impl.ICUData;
014: import com.ibm.icu.impl.ICUResourceBundle;
015:
016: /**
017: *
018: * IDNA API implements the IDNA protocol as defined in the <a href="http://www.ietf.org/rfc/rfc3490.txt">IDNA RFC</a>.
019: * The draft defines 2 operations: ToASCII and ToUnicode. Domain labels
020: * containing non-ASCII code points are required to be processed by
021: * ToASCII operation before passing it to resolver libraries. Domain names
022: * that are obtained from resolver libraries are required to be processed by
023: * ToUnicode operation before displaying the domain name to the user.
024: * IDNA requires that implementations process input strings with
025: * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a>,
026: * which is a profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a> ,
027: * and then with <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a>.
028: * Implementations of IDNA MUST fully implement Nameprep and Punycode;
029: * neither Nameprep nor Punycode are optional.
030: * The input and output of ToASCII and ToUnicode operations are Unicode
031: * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
032: * multiple times to an input string will yield the same result as applying the operation
033: * once.
034: * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
035: * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
036: *
037: * @author Ram Viswanadha
038: * @stable ICU 2.8
039: */
040: public final class IDNA {
041:
042: /* IDNA ACE Prefix is "xn--" */
043: private static char[] ACE_PREFIX = new char[] { 0x0078, 0x006E,
044: 0x002d, 0x002d };
045: private static final int ACE_PREFIX_LENGTH = 4;
046:
047: private static final int MAX_LABEL_LENGTH = 63;
048: private static final int HYPHEN = 0x002D;
049: private static final int CAPITAL_A = 0x0041;
050: private static final int CAPITAL_Z = 0x005A;
051: private static final int LOWER_CASE_DELTA = 0x0020;
052: private static final int FULL_STOP = 0x002E;
053:
/**
 * Default option: unassigned code points in the input are prohibited and
 * the input is not checked for conformance to the STD-3 ASCII rules.
 *
 * @see #convertToASCII
 * @see #convertToUnicode
 * @stable ICU 2.8
 */
public static final int DEFAULT = 0x0000;
/**
 * Option to allow processing of unassigned code points in the input.
 *
 * @see #convertToASCII
 * @see #convertToUnicode
 * @stable ICU 2.8
 */
public static final int ALLOW_UNASSIGNED = 0x0001;
/**
 * Option to check that the input conforms to the STD-3 ASCII rules.
 *
 * @see #convertToASCII
 * @see #convertToUnicode
 * @stable ICU 2.8
 */
public static final int USE_STD3_RULES = 0x0002;
076:
// Static final singleton instance. It is created while the class is being
// initialized, so it is guaranteed to be initialized before any static
// method of this class reads it, and its creation is thread safe.
private static final IDNA singleton = new IDNA();

// The NamePrep profile object; assigned once, in the private constructor.
private StringPrep namePrep;
084:
/*
 * Private constructor: prevents outside construction and loads the NamePrep
 * profile from the "uidna.spp" data file.
 *
 * Throws MissingResourceException (with the IOException as its cause) if the
 * profile data cannot be read.
 */
private IDNA() {
    try {
        InputStream stream = ICUData
                .getRequiredStream(ICUResourceBundle.ICU_BUNDLE
                        + "/uidna.spp");
        try {
            namePrep = new StringPrep(stream);
        } finally {
            // close the stream even when reading the profile fails
            stream.close();
        }
    } catch (IOException e) {
        MissingResourceException missing = new MissingResourceException(
                e.toString(), "", "");
        // keep the underlying I/O failure attached for diagnosis
        missing.initCause(e);
        throw missing;
    }
}
097:
098: private static boolean startsWithPrefix(StringBuffer src) {
099: boolean startsWithPrefix = true;
100:
101: if (src.length() < ACE_PREFIX_LENGTH) {
102: return false;
103: }
104: for (int i = 0; i < ACE_PREFIX_LENGTH; i++) {
105: if (toASCIILower(src.charAt(i)) != ACE_PREFIX[i]) {
106: startsWithPrefix = false;
107: }
108: }
109: return startsWithPrefix;
110: }
111:
112: private static char toASCIILower(char ch) {
113: if (CAPITAL_A <= ch && ch <= CAPITAL_Z) {
114: return (char) (ch + LOWER_CASE_DELTA);
115: }
116: return ch;
117: }
118:
119: private static StringBuffer toASCIILower(StringBuffer src) {
120: StringBuffer dest = new StringBuffer();
121: for (int i = 0; i < src.length(); i++) {
122: dest.append(toASCIILower(src.charAt(i)));
123: }
124: return dest;
125: }
126:
127: private static int compareCaseInsensitiveASCII(StringBuffer s1,
128: StringBuffer s2) {
129: char c1, c2;
130: int rc;
131: for (int i = 0;/* no condition */; i++) {
132: /* If we reach the ends of both strings then they match */
133: if (i == s1.length()) {
134: return 0;
135: }
136:
137: c1 = s1.charAt(i);
138: c2 = s2.charAt(i);
139:
140: /* Case-insensitive comparison */
141: if (c1 != c2) {
142: rc = toASCIILower(c1) - toASCIILower(c2);
143: if (rc != 0) {
144: return rc;
145: }
146: }
147: }
148: }
149:
150: private static int getSeparatorIndex(char[] src, int start,
151: int limit) {
152: for (; start < limit; start++) {
153: if (isLabelSeparator(src[start])) {
154: return start;
155: }
156: }
157: // we have not found the separator just return length
158: return start;
159: }
160:
161: /*
162: private static int getSeparatorIndex(UCharacterIterator iter){
163: int currentIndex = iter.getIndex();
164: int separatorIndex = 0;
165: int ch;
166: while((ch=iter.next())!= UCharacterIterator.DONE){
167: if(isLabelSeparator(ch)){
168: separatorIndex = iter.getIndex();
169: iter.setIndex(currentIndex);
170: return separatorIndex;
171: }
172: }
173: // reset index
174: iter.setIndex(currentIndex);
175: // we have not found the separator just return the length
176:
177: }
178: */
179:
180: private static boolean isLDHChar(int ch) {
181: // high runner case
182: if (ch > 0x007A) {
183: return false;
184: }
185: //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
186: if ((ch == 0x002D) || (0x0030 <= ch && ch <= 0x0039)
187: || (0x0041 <= ch && ch <= 0x005A)
188: || (0x0061 <= ch && ch <= 0x007A)) {
189: return true;
190: }
191: return false;
192: }
193:
194: /**
195: * Ascertain if the given code point is a label separator as
196: * defined by the IDNA RFC
197: *
198: * @param ch The code point to be ascertained
199: * @return true if the char is a label separator
200: * @stable ICU 2.8
201: */
202: private static boolean isLabelSeparator(int ch) {
203: switch (ch) {
204: case 0x002e:
205: case 0x3002:
206: case 0xFF0E:
207: case 0xFF61:
208: return true;
209: default:
210: return false;
211: }
212: }
213:
214: /**
215: * This function implements the ToASCII operation as defined in the IDNA RFC.
216: * This operation is done on <b>single labels</b> before sending it to something that expects
217: * ASCII names. A label is an individual part of a domain name. Labels are usually
218: * separated by dots; e.g." "www.example.com" is composed of 3 labels
219: * "www","example", and "com".
220: *
221: * @param src The input string to be processed
222: * @param options A bit set of options:
223: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
224: * and do not use STD3 ASCII rules
225: * If unassigned code points are found the operation fails with
226: * ParseException.
227: *
228: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
229: * If this option is set, the unassigned code points are in the input
230: * are treated as normal Unicode code points.
231: *
232: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
233: * If this option is set and the input does not satisfy STD3 rules,
234: * the operation will fail with ParseException
235: * @return StringBuffer the converted String
236: * @throws ParseException
237: * @stable ICU 2.8
238: */
239: public static StringBuffer convertToASCII(String src, int options)
240: throws StringPrepParseException {
241: UCharacterIterator iter = UCharacterIterator.getInstance(src);
242: return convertToASCII(iter, options);
243: }
244:
245: /**
246: * This function implements the ToASCII operation as defined in the IDNA RFC.
247: * This operation is done on <b>single labels</b> before sending it to something that expects
248: * ASCII names. A label is an individual part of a domain name. Labels are usually
249: * separated by dots; e.g." "www.example.com" is composed of 3 labels
250: * "www","example", and "com".
251: *
252: * @param src The input string as StringBuffer to be processed
253: * @param options A bit set of options:
254: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
255: * and do not use STD3 ASCII rules
256: * If unassigned code points are found the operation fails with
257: * ParseException.
258: *
259: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
260: * If this option is set, the unassigned code points are in the input
261: * are treated as normal Unicode code points.
262: *
263: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
264: * If this option is set and the input does not satisfy STD3 rules,
265: * the operation will fail with ParseException
266: * @return StringBuffer the converted String
267: * @throws ParseException
268: * @stable ICU 2.8
269: */
270: public static StringBuffer convertToASCII(StringBuffer src,
271: int options) throws StringPrepParseException {
272: UCharacterIterator iter = UCharacterIterator.getInstance(src);
273: return convertToASCII(iter, options);
274: }
275:
276: /**
277: * This function implements the ToASCII operation as defined in the IDNA RFC.
278: * This operation is done on <b>single labels</b> before sending it to something that expects
279: * ASCII names. A label is an individual part of a domain name. Labels are usually
280: * separated by dots; e.g." "www.example.com" is composed of 3 labels
281: * "www","example", and "com".
282: *
283: * @param src The input string as UCharacterIterator to be processed
284: * @param options A bit set of options:
285: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
286: * and do not use STD3 ASCII rules
287: * If unassigned code points are found the operation fails with
288: * ParseException.
289: *
290: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
291: * If this option is set, the unassigned code points are in the input
292: * are treated as normal Unicode code points.
293: *
294: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
295: * If this option is set and the input does not satisfy STD3 rules,
296: * the operation will fail with ParseException
297: * @return StringBuffer the converted String
298: * @throws ParseException
299: * @stable ICU 2.8
300: */
301: public static StringBuffer convertToASCII(UCharacterIterator src,
302: int options) throws StringPrepParseException {
303:
304: boolean[] caseFlags = null;
305:
306: // the source contains all ascii codepoints
307: boolean srcIsASCII = true;
308: // assume the source contains all LDH codepoints
309: boolean srcIsLDH = true;
310:
311: //get the options
312: boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
313: int ch;
314: // step 1
315: while ((ch = src.next()) != UCharacterIterator.DONE) {
316: if (ch > 0x7f) {
317: srcIsASCII = false;
318: }
319: }
320: int failPos = -1;
321: src.setToStart();
322: StringBuffer processOut = null;
323: // step 2 is performed only if the source contains non ASCII
324: if (!srcIsASCII) {
325: // step 2
326: processOut = singleton.namePrep.prepare(src, options);
327: } else {
328: processOut = new StringBuffer(src.getText());
329: }
330: int poLen = processOut.length();
331:
332: if (poLen == 0) {
333: throw new StringPrepParseException(
334: "Found zero length lable after NamePrep.",
335: StringPrepParseException.ZERO_LENGTH_LABEL);
336: }
337: StringBuffer dest = new StringBuffer();
338:
339: // reset the variable to verify if output of prepare is ASCII or not
340: srcIsASCII = true;
341:
342: // step 3 & 4
343: for (int j = 0; j < poLen; j++) {
344: ch = processOut.charAt(j);
345: if (ch > 0x7F) {
346: srcIsASCII = false;
347: } else if (isLDHChar(ch) == false) {
348: // here we do not assemble surrogates
349: // since we know that LDH code points
350: // are in the ASCII range only
351: srcIsLDH = false;
352: failPos = j;
353: }
354: }
355:
356: if (useSTD3ASCIIRules == true) {
357: // verify 3a and 3b
358: if (srcIsLDH == false /* source contains some non-LDH characters */
359: || processOut.charAt(0) == HYPHEN
360: || processOut.charAt(processOut.length() - 1) == HYPHEN) {
361:
362: /* populate the parseError struct */
363: if (srcIsLDH == false) {
364: throw new StringPrepParseException(
365: "The input does not conform to the STD 3 ASCII rules",
366: StringPrepParseException.STD3_ASCII_RULES_ERROR,
367: processOut.toString(),
368: (failPos > 0) ? (failPos - 1) : failPos);
369: } else if (processOut.charAt(0) == HYPHEN) {
370: throw new StringPrepParseException(
371: "The input does not conform to the STD 3 ASCII rules",
372: StringPrepParseException.STD3_ASCII_RULES_ERROR,
373: processOut.toString(), 0);
374:
375: } else {
376: throw new StringPrepParseException(
377: "The input does not conform to the STD 3 ASCII rules",
378: StringPrepParseException.STD3_ASCII_RULES_ERROR,
379: processOut.toString(),
380: (poLen > 0) ? poLen - 1 : poLen);
381:
382: }
383: }
384: }
385: if (srcIsASCII) {
386: dest = processOut;
387: } else {
388: // step 5 : verify the sequence does not begin with ACE prefix
389: if (!startsWithPrefix(processOut)) {
390:
391: //step 6: encode the sequence with punycode
392: caseFlags = new boolean[poLen];
393:
394: StringBuffer punyout = Punycode.encode(processOut,
395: caseFlags);
396:
397: // convert all codepoints to lower case ASCII
398: StringBuffer lowerOut = toASCIILower(punyout);
399:
400: //Step 7: prepend the ACE prefix
401: dest.append(ACE_PREFIX, 0, ACE_PREFIX_LENGTH);
402: //Step 6: copy the contents in b2 into dest
403: dest.append(lowerOut);
404: } else {
405:
406: throw new StringPrepParseException(
407: "The input does not start with the ACE Prefix.",
408: StringPrepParseException.ACE_PREFIX_ERROR,
409: processOut.toString(), 0);
410: }
411: }
412: if (dest.length() > MAX_LABEL_LENGTH) {
413: throw new StringPrepParseException(
414: "The labels in the input are too long. Length > 64.",
415: StringPrepParseException.LABEL_TOO_LONG_ERROR, dest
416: .toString(), 0);
417: }
418: return dest;
419: }
420:
421: /**
422: * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
423: * This operation is done on complete domain names, e.g: "www.example.com".
424: * It is important to note that this operation can fail. If it fails, then the input
425: * domain name cannot be used as an Internationalized Domain Name and the application
426: * should have methods defined to deal with the failure.
427: *
428: * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
429: * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
430: * and then convert. This function does not offer that level of granularity. The options once
431: * set will apply to all labels in the domain name
432: *
433: * @param src The input string as UCharacterIterator to be processed
434: * @param options A bit set of options:
435: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
436: * and do not use STD3 ASCII rules
437: * If unassigned code points are found the operation fails with
438: * ParseException.
439: *
440: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
441: * If this option is set, the unassigned code points are in the input
442: * are treated as normal Unicode code points.
443: *
444: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
445: * If this option is set and the input does not satisfy STD3 rules,
446: * the operation will fail with ParseException
447: * @return StringBuffer the converted String
448: * @throws ParseException
449: * @stable ICU 2.8
450: */
451: public static StringBuffer convertIDNToASCII(
452: UCharacterIterator src, int options)
453: throws StringPrepParseException {
454: return convertIDNToASCII(src.getText(), options);
455: }
456:
457: /**
458: * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
459: * This operation is done on complete domain names, e.g: "www.example.com".
460: * It is important to note that this operation can fail. If it fails, then the input
461: * domain name cannot be used as an Internationalized Domain Name and the application
462: * should have methods defined to deal with the failure.
463: *
464: * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
465: * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
466: * and then convert. This function does not offer that level of granularity. The options once
467: * set will apply to all labels in the domain name
468: *
469: * @param src The input string as a StringBuffer to be processed
470: * @param options A bit set of options:
471: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
472: * and do not use STD3 ASCII rules
473: * If unassigned code points are found the operation fails with
474: * ParseException.
475: *
476: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
477: * If this option is set, the unassigned code points are in the input
478: * are treated as normal Unicode code points.
479: *
480: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
481: * If this option is set and the input does not satisfy STD3 rules,
482: * the operation will fail with ParseException
483: * @return StringBuffer the converted String
484: * @throws ParseException
485: * @stable ICU 2.8
486: */
487: public static StringBuffer convertIDNToASCII(StringBuffer src,
488: int options) throws StringPrepParseException {
489: return convertIDNToASCII(src.toString(), options);
490: }
491:
492: /**
493: * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
494: * This operation is done on complete domain names, e.g: "www.example.com".
495: * It is important to note that this operation can fail. If it fails, then the input
496: * domain name cannot be used as an Internationalized Domain Name and the application
497: * should have methods defined to deal with the failure.
498: *
499: * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
500: * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
501: * and then convert. This function does not offer that level of granularity. The options once
502: * set will apply to all labels in the domain name
503: *
504: * @param src The input string to be processed
505: * @param options A bit set of options:
506: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
507: * and do not use STD3 ASCII rules
508: * If unassigned code points are found the operation fails with
509: * ParseException.
510: *
511: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
512: * If this option is set, the unassigned code points are in the input
513: * are treated as normal Unicode code points.
514: *
515: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
516: * If this option is set and the input does not satisfy STD3 rules,
517: * the operation will fail with ParseException
518: * @return StringBuffer the converted String
519: * @throws ParseException
520: * @stable ICU 2.8
521: */
522: public static StringBuffer convertIDNToASCII(String src, int options)
523: throws StringPrepParseException {
524:
525: char[] srcArr = src.toCharArray();
526: StringBuffer result = new StringBuffer();
527: int sepIndex = 0;
528: int oldSepIndex = 0;
529: for (;;) {
530: sepIndex = getSeparatorIndex(srcArr, sepIndex,
531: srcArr.length);
532: String label = new String(srcArr, oldSepIndex, sepIndex
533: - oldSepIndex);
534: //make sure this is not a root label separator.
535: if (!(label.length() == 0 && sepIndex == srcArr.length)) {
536: UCharacterIterator iter = UCharacterIterator
537: .getInstance(label);
538: result.append(convertToASCII(iter, options));
539: }
540: if (sepIndex == srcArr.length) {
541: break;
542: }
543:
544: // increment the sepIndex to skip past the separator
545: sepIndex++;
546: oldSepIndex = sepIndex;
547: result.append((char) FULL_STOP);
548: }
549: return result;
550: }
551:
552: /**
553: * This function implements the ToUnicode operation as defined in the IDNA RFC.
554: * This operation is done on <b>single labels</b> before sending it to something that expects
555: * Unicode names. A label is an individual part of a domain name. Labels are usually
556: * separated by dots; for e.g." "www.example.com" is composed of 3 labels
557: * "www","example", and "com".
558: *
559: * @param src The input string to be processed
560: * @param options A bit set of options:
561: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
562: * and do not use STD3 ASCII rules
563: * If unassigned code points are found the operation fails with
564: * ParseException.
565: *
566: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
567: * If this option is set, the unassigned code points are in the input
568: * are treated as normal Unicode code points.
569: *
570: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
571: * If this option is set and the input does not satisfy STD3 rules,
572: * the operation will fail with ParseException
573: * @return StringBuffer the converted String
574: * @throws ParseException
575: * @stable ICU 2.8
576: */
577: public static StringBuffer convertToUnicode(String src, int options)
578: throws StringPrepParseException {
579: UCharacterIterator iter = UCharacterIterator.getInstance(src);
580: return convertToUnicode(iter, options);
581: }
582:
583: /**
584: * This function implements the ToUnicode operation as defined in the IDNA RFC.
585: * This operation is done on <b>single labels</b> before sending it to something that expects
586: * Unicode names. A label is an individual part of a domain name. Labels are usually
587: * separated by dots; for e.g." "www.example.com" is composed of 3 labels
588: * "www","example", and "com".
589: *
590: * @param src The input string as StringBuffer to be processed
591: * @param options A bit set of options:
592: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
593: * and do not use STD3 ASCII rules
594: * If unassigned code points are found the operation fails with
595: * ParseException.
596: *
597: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
598: * If this option is set, the unassigned code points are in the input
599: * are treated as normal Unicode code points.
600: *
601: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
602: * If this option is set and the input does not satisfy STD3 rules,
603: * the operation will fail with ParseException
604: * @return StringBuffer the converted String
605: * @throws ParseException
606: * @stable ICU 2.8
607: */
608: public static StringBuffer convertToUnicode(StringBuffer src,
609: int options) throws StringPrepParseException {
610: UCharacterIterator iter = UCharacterIterator.getInstance(src);
611: return convertToUnicode(iter, options);
612: }
613:
614: /**
615: * This function implements the ToUnicode operation as defined in the IDNA RFC.
616: * This operation is done on <b>single labels</b> before sending it to something that expects
617: * Unicode names. A label is an individual part of a domain name. Labels are usually
618: * separated by dots; for e.g." "www.example.com" is composed of 3 labels
619: * "www","example", and "com".
620: *
621: * @param src The input string as UCharacterIterator to be processed
622: * @param options A bit set of options:
623: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
624: * and do not use STD3 ASCII rules
625: * If unassigned code points are found the operation fails with
626: * ParseException.
627: *
628: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
629: * If this option is set, the unassigned code points are in the input
630: * are treated as normal Unicode code points.
631: *
632: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
633: * If this option is set and the input does not satisfy STD3 rules,
634: * the operation will fail with ParseException
635: * @return StringBuffer the converted String
636: * @throws ParseException
637: * @stable ICU 2.8
638: */
639: public static StringBuffer convertToUnicode(UCharacterIterator src,
640: int options) throws StringPrepParseException {
641:
642: boolean[] caseFlags = null;
643:
644: // the source contains all ascii codepoints
645: boolean srcIsASCII = true;
646: // assume the source contains all LDH codepoints
647: boolean srcIsLDH = true;
648:
649: //get the options
650: boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
651:
652: int failPos = -1;
653: int ch;
654: int saveIndex = src.getIndex();
655: // step 1: find out if all the codepoints in src are ASCII
656: while ((ch = src.next()) != UCharacterIterator.DONE) {
657: if (ch > 0x7F) {
658: srcIsASCII = false;
659: } else if ((srcIsLDH = isLDHChar(ch)) == false) {
660: failPos = src.getIndex();
661: }
662: }
663: StringBuffer processOut;
664:
665: if (srcIsASCII == false) {
666: try {
667: // step 2: process the string
668: src.setIndex(saveIndex);
669: processOut = singleton.namePrep.prepare(src, options);
670: } catch (StringPrepParseException ex) {
671: return new StringBuffer(src.getText());
672: }
673:
674: } else {
675: //just point to source
676: processOut = new StringBuffer(src.getText());
677: }
678: // TODO:
679: // The RFC states that
680: // <quote>
681: // ToUnicode never fails. If any step fails, then the original input
682: // is returned immediately in that step.
683: // </quote>
684:
685: //step 3: verify ACE Prefix
686: if (startsWithPrefix(processOut)) {
687: StringBuffer decodeOut = null;
688:
689: //step 4: Remove the ACE Prefix
690: String temp = processOut.substring(ACE_PREFIX_LENGTH,
691: processOut.length());
692:
693: //step 5: Decode using punycode
694: try {
695: decodeOut = Punycode.decode(new StringBuffer(temp),
696: caseFlags);
697: } catch (StringPrepParseException e) {
698: decodeOut = null;
699: }
700:
701: //step 6:Apply toASCII
702: if (decodeOut != null) {
703: StringBuffer toASCIIOut = convertToASCII(decodeOut,
704: options);
705:
706: //step 7: verify
707: if (compareCaseInsensitiveASCII(processOut, toASCIIOut) != 0) {
708: // throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
709: // StringPrepParseException.VERIFICATION_ERROR);
710: decodeOut = null;
711: }
712: }
713:
714: //step 8: return output of step 5
715: if (decodeOut != null) {
716: return decodeOut;
717: }
718: }
719:
720: // }else{
721: // // verify that STD3 ASCII rules are satisfied
722: // if(useSTD3ASCIIRules == true){
723: // if( srcIsLDH == false /* source contains some non-LDH characters */
724: // || processOut.charAt(0) == HYPHEN
725: // || processOut.charAt(processOut.length()-1) == HYPHEN){
726: //
727: // if(srcIsLDH==false){
728: // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
729: // StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
730: // (failPos>0) ? (failPos-1) : failPos);
731: // }else if(processOut.charAt(0) == HYPHEN){
732: // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
733: // StringPrepParseException.STD3_ASCII_RULES_ERROR,
734: // processOut.toString(),0);
735: //
736: // }else{
737: // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
738: // StringPrepParseException.STD3_ASCII_RULES_ERROR,
739: // processOut.toString(),
740: // processOut.length());
741: //
742: // }
743: // }
744: // }
745: // // just return the source
746: // return new StringBuffer(src.getText());
747: // }
748:
749: return new StringBuffer(src.getText());
750: }
751:
752: /**
753: * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
754: * This operation is done on complete domain names, e.g: "www.example.com".
755: *
756: * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
757: * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
758: * and then convert. This function does not offer that level of granularity. The options once
759: * set will apply to all labels in the domain name
760: *
761: * @param src The input string as UCharacterIterator to be processed
762: * @param options A bit set of options:
763: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
764: * and do not use STD3 ASCII rules
765: * If unassigned code points are found the operation fails with
766: * ParseException.
767: *
768: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
769: * If this option is set, the unassigned code points are in the input
770: * are treated as normal Unicode code points.
771: *
772: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
773: * If this option is set and the input does not satisfy STD3 rules,
774: * the operation will fail with ParseException
775: * @return StringBuffer the converted String
776: * @throws ParseException
777: * @stable ICU 2.8
778: */
779: public static StringBuffer convertIDNToUnicode(
780: UCharacterIterator src, int options)
781: throws StringPrepParseException {
782: return convertIDNToUnicode(src.getText(), options);
783: }
784:
785: /**
786: * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
787: * This operation is done on complete domain names, e.g: "www.example.com".
788: *
789: * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
790: * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
791: * and then convert. This function does not offer that level of granularity. The options once
792: * set will apply to all labels in the domain name
793: *
794: * @param src The input string as StringBuffer to be processed
795: * @param options A bit set of options:
796: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
797: * and do not use STD3 ASCII rules
798: * If unassigned code points are found the operation fails with
799: * ParseException.
800: *
801: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
802: * If this option is set, the unassigned code points are in the input
803: * are treated as normal Unicode code points.
804: *
805: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
806: * If this option is set and the input does not satisfy STD3 rules,
807: * the operation will fail with ParseException
808: * @return StringBuffer the converted String
* @throws StringPrepParseException if the input cannot be processed under the selected options
810: * @stable ICU 2.8
811: */
812: public static StringBuffer convertIDNToUnicode(StringBuffer src,
813: int options) throws StringPrepParseException {
814: return convertIDNToUnicode(src.toString(), options);
815: }
816:
817: /**
818: * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
819: * This operation is done on complete domain names, e.g: "www.example.com".
820: *
821: * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
822: * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
823: * and then convert. This function does not offer that level of granularity. The options once
824: * set will apply to all labels in the domain name
825: *
826: * @param src The input string to be processed
827: * @param options A bit set of options:
828: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
829: * and do not use STD3 ASCII rules
830: * If unassigned code points are found the operation fails with
831: * ParseException.
832: *
833: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, unassigned code points in the input
* are treated as normal Unicode code points.
836: *
837: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
838: * If this option is set and the input does not satisfy STD3 rules,
839: * the operation will fail with ParseException
840: * @return StringBuffer the converted String
* @throws StringPrepParseException if the input cannot be processed under the selected options
842: * @stable ICU 2.8
843: */
844: public static StringBuffer convertIDNToUnicode(String src,
845: int options) throws StringPrepParseException {
846:
847: char[] srcArr = src.toCharArray();
848: StringBuffer result = new StringBuffer();
849: int sepIndex = 0;
850: int oldSepIndex = 0;
851: for (;;) {
852: sepIndex = getSeparatorIndex(srcArr, sepIndex,
853: srcArr.length);
854: String label = new String(srcArr, oldSepIndex, sepIndex
855: - oldSepIndex);
856: if (label.length() == 0 && sepIndex != srcArr.length) {
857: throw new StringPrepParseException(
858: "Found zero length lable after NamePrep.",
859: StringPrepParseException.ZERO_LENGTH_LABEL);
860: }
861: UCharacterIterator iter = UCharacterIterator
862: .getInstance(label);
863: result.append(convertToUnicode(iter, options));
864: if (sepIndex == srcArr.length) {
865: break;
866: }
867: // increment the sepIndex to skip past the separator
868: sepIndex++;
869: oldSepIndex = sepIndex;
870: result.append((char) FULL_STOP);
871: }
872: return result;
873: }
874:
875: /**
876: * Compare two IDN strings for equivalence.
877: * This function splits the domain names into labels and compares them.
878: * According to IDN RFC, whenever two labels are compared, they are
879: * considered equal if and only if their ASCII forms (obtained by
* applying toASCII) match using a case-insensitive ASCII comparison.
881: * Two domain names are considered a match if and only if all labels
882: * match regardless of whether label separators match.
883: *
884: * @param s1 First IDN string as StringBuffer
885: * @param s2 Second IDN string as StringBuffer
886: * @param options A bit set of options:
887: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
888: * and do not use STD3 ASCII rules
889: * If unassigned code points are found the operation fails with
890: * ParseException.
891: *
892: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, unassigned code points in the input
* are treated as normal Unicode code points.
895: *
896: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
897: * If this option is set and the input does not satisfy STD3 rules,
898: * the operation will fail with ParseException
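* @return 0 if the strings are equal, a positive value if s1 is greater than s2
* and a negative value if s1 is less than s2
* @throws StringPrepParseException if either input cannot be converted under the selected options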
901: * @stable ICU 2.8
902: */
903: // TODO: optimize
904: public static int compare(StringBuffer s1, StringBuffer s2,
905: int options) throws StringPrepParseException {
906: if (s1 == null || s2 == null) {
907: throw new IllegalArgumentException(
908: "One of the source buffers is null");
909: }
910: StringBuffer s1Out = convertIDNToASCII(s1.toString(), options);
911: StringBuffer s2Out = convertIDNToASCII(s2.toString(), options);
912: return compareCaseInsensitiveASCII(s1Out, s2Out);
913: }
914:
915: /**
916: * Compare two IDN strings for equivalence.
917: * This function splits the domain names into labels and compares them.
918: * According to IDN RFC, whenever two labels are compared, they are
919: * considered equal if and only if their ASCII forms (obtained by
* applying toASCII) match using a case-insensitive ASCII comparison.
921: * Two domain names are considered a match if and only if all labels
922: * match regardless of whether label separators match.
923: *
924: * @param s1 First IDN string
925: * @param s2 Second IDN string
926: * @param options A bit set of options:
927: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
928: * and do not use STD3 ASCII rules
929: * If unassigned code points are found the operation fails with
930: * ParseException.
931: *
932: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, unassigned code points in the input
* are treated as normal Unicode code points.
935: *
936: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
937: * If this option is set and the input does not satisfy STD3 rules,
938: * the operation will fail with ParseException
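* @return 0 if the strings are equal, a positive value if s1 is greater than s2
* and a negative value if s1 is less than s2
* @throws StringPrepParseException if either input cannot be converted under the selected options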
941: * @stable ICU 2.8
942: */
943: // TODO: optimize
944: public static int compare(String s1, String s2, int options)
945: throws StringPrepParseException {
946: if (s1 == null || s2 == null) {
947: throw new IllegalArgumentException(
948: "One of the source buffers is null");
949: }
950: StringBuffer s1Out = convertIDNToASCII(s1, options);
951: StringBuffer s2Out = convertIDNToASCII(s2, options);
952: return compareCaseInsensitiveASCII(s1Out, s2Out);
953: }
954:
955: /**
956: * Compare two IDN strings for equivalence.
957: * This function splits the domain names into labels and compares them.
958: * According to IDN RFC, whenever two labels are compared, they are
959: * considered equal if and only if their ASCII forms (obtained by
* applying toASCII) match using a case-insensitive ASCII comparison.
961: * Two domain names are considered a match if and only if all labels
962: * match regardless of whether label separators match.
963: *
964: * @param s1 First IDN string as UCharacterIterator
965: * @param s2 Second IDN string as UCharacterIterator
966: * @param options A bit set of options:
967: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
968: * and do not use STD3 ASCII rules
969: * If unassigned code points are found the operation fails with
970: * ParseException.
971: *
972: * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, unassigned code points in the input
* are treated as normal Unicode code points.
975: *
976: * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
977: * If this option is set and the input does not satisfy STD3 rules,
978: * the operation will fail with ParseException
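* @return 0 if the strings are equal, a positive value if s1 is greater than s2
* and a negative value if s1 is less than s2
* @throws StringPrepParseException if either input cannot be converted under the selected options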
981: * @stable ICU 2.8
982: */
983: // TODO: optimize
984: public static int compare(UCharacterIterator s1,
985: UCharacterIterator s2, int options)
986: throws StringPrepParseException {
987: if (s1 == null || s2 == null) {
988: throw new IllegalArgumentException(
989: "One of the source buffers is null");
990: }
991: StringBuffer s1Out = convertIDNToASCII(s1.getText(), options);
992: StringBuffer s2Out = convertIDNToASCII(s2.getText(), options);
993: return compareCaseInsensitiveASCII(s1Out, s2Out);
994: }
995: }
|