001 /*
002 * Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
003 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004 *
005 * This code is free software; you can redistribute it and/or modify it
006 * under the terms of the GNU General Public License version 2 only, as
007 * published by the Free Software Foundation. Sun designates this
008 * particular file as subject to the "Classpath" exception as provided
009 * by Sun in the LICENSE file that accompanied this code.
010 *
011 * This code is distributed in the hope that it will be useful, but WITHOUT
012 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014 * version 2 for more details (a copy is included in the LICENSE file that
015 * accompanied this code).
016 *
017 * You should have received a copy of the GNU General Public License version
018 * 2 along with this work; if not, write to the Free Software Foundation,
019 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020 *
021 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022 * CA 95054 USA or visit www.sun.com if you need additional information or
023 * have any questions.
024 */
025 package java.net;
026
027 import java.io.InputStream;
028 import java.io.IOException;
029 import java.security.AccessController;
030 import java.security.PrivilegedAction;
031
032 import sun.net.idn.StringPrep;
033 import sun.net.idn.Punycode;
034 import sun.text.normalizer.UCharacterIterator;
035
036 /**
037 * Provides methods to convert internationalized domain names (IDNs) between
038 * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
039 * Internationalized domain names can use characters from the entire range of
040 * Unicode, while traditional domain names are restricted to ASCII characters.
041 * ACE is an encoding of Unicode strings that uses only ASCII characters and
042 * can be used with software (such as the Domain Name System) that only
043 * understands traditional domain names.
044 *
045 * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
046 * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ
047 * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
048 * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and
049 * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
050 * domain name string back and forth.
051 *
052 * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
053 * <ul>
054 * <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
055 * can contain code points that are unassigned in Unicode 3.2, which is the
056 * Unicode version on which IDN conversion is based. If the flag is not used,
057 * the presence of such unassigned code points is treated as an error.
058 * <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
059 * It is an error if they don't meet the requirements.
060 * </ul>
061 * These flags can be logically OR'ed together.
062 *
063 * <p>The security consideration is important with respect to internationalization
064 * domain name support. For example, English domain names may be <i>homographed</i>
065 * - maliciously misspelled by substitution of non-Latin letters.
066 * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
067 * discusses security issues of IDN support as well as possible solutions.
068 * Applications are responsible for taking adequate security measures when using
069 * international domain names.
070 *
071 * @version 1.9, 07/05/05
072 * @author Edward Wang
073 * @since 1.6
074 *
075 */
076 public final class IDN {
077 /**
078 * Flag to allow processing of unassigned code points
079 */
080 public static final int ALLOW_UNASSIGNED = 0x01;
081
082 /**
083 * Flag to turn on the check against STD-3 ASCII rules
084 */
085 public static final int USE_STD3_ASCII_RULES = 0x02;
086
087 /**
088 * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
089 * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
090 *
091 * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
092 * If ToASCII operation fails, an IllegalArgumentException will be thrown.
093 * In this case, the input string should not be used in an internationalized domain name.
094 *
095 * <p> A label is an individual part of a domain name. The original ToASCII operation,
096 * as defined in RFC 3490, only operates on a single label. This method can handle
097 * both label and entire domain name, by assuming that labels in a domain name are
098 * always separated by dots. The following characters are recognized as dots:
099 * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
100 * and \uFF61 (halfwidth ideographic full stop). if dots are
101 * used as label separators, this method also changes all of them to \u002E (full stop)
102 * in output translated string.
103 *
104 * @param input the string to be processed
105 * @param flag process flag; can be 0 or any logical OR of possible flags
106 *
107 * @return the translated <tt>String</tt>
108 *
109 * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
110 */
111 public static String toASCII(String input, int flag) {
112 int p = 0, q = 0;
113 StringBuffer out = new StringBuffer();
114
115 while (p < input.length()) {
116 q = searchDots(input, p);
117 out.append(toASCIIInternal(input.substring(p, q), flag));
118 p = q + 1;
119 if (p < input.length())
120 out.append('.');
121 }
122
123 return out.toString();
124 }
125
126 /**
127 * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
128 * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
129 *
130 * <p> This convenience method works as if by invoking the
131 * two-argument counterpart as follows:
132 * <blockquote><tt>
133 * {@link #toASCII(String, int) toASCII}(input, 0);
134 * </tt></blockquote>
135 *
136 * @param input the string to be processed
137 *
138 * @return the translated <tt>String</tt>
139 *
140 * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
141 */
142 public static String toASCII(String input) {
143 return toASCII(input, 0);
144 }
145
146 /**
147 * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
148 * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
149 *
150 * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
151 *
152 * <p> A label is an individual part of a domain name. The original ToUnicode operation,
153 * as defined in RFC 3490, only operates on a single label. This method can handle
154 * both label and entire domain name, by assuming that labels in a domain name are
155 * always separated by dots. The following characters are recognized as dots:
156 * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
157 * and \uFF61 (halfwidth ideographic full stop).
158 *
159 * @param input the string to be processed
160 * @param flag process flag; can be 0 or any logical OR of possible flags
161 *
162 * @return the translated <tt>String</tt>
163 */
164 public static String toUnicode(String input, int flag) {
165 int p = 0, q = 0;
166 StringBuffer out = new StringBuffer();
167
168 while (p < input.length()) {
169 q = searchDots(input, p);
170 out.append(toUnicodeInternal(input.substring(p, q), flag));
171 p = q + 1;
172 if (p < input.length())
173 out.append('.');
174 }
175
176 return out.toString();
177 }
178
179 /**
180 * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
181 * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
182 *
183 * <p> This convenience method works as if by invoking the
184 * two-argument counterpart as follows:
185 * <blockquote><tt>
186 * {@link #toUnicode(String, int) toUnicode}(input, 0);
187 * </tt></blockquote>
188 *
189 * @param input the string to be processed
190 *
191 * @return the translated <tt>String</tt>
192 */
193 public static String toUnicode(String input) {
194 return toUnicode(input, 0);
195 }
196
197 /* ---------------- Private members -------------- */
198
199 // ACE Prefix is "xn--"
200 private static final String ACE_PREFIX = "xn--";
201 private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
202
203 private static final int MAX_LABEL_LENGTH = 63;
204
205 // single instance of nameprep
206 private static StringPrep namePrep = null;
207
208 static {
209 InputStream stream = null;
210
211 try {
212 final String IDN_PROFILE = "uidna.spp";
213 if (System.getSecurityManager() != null) {
214 stream = AccessController
215 .doPrivileged(new PrivilegedAction<InputStream>() {
216 public InputStream run() {
217 return StringPrep.class
218 .getResourceAsStream(IDN_PROFILE);
219 }
220 });
221 } else {
222 stream = StringPrep.class
223 .getResourceAsStream(IDN_PROFILE);
224 }
225
226 namePrep = new StringPrep(stream);
227 stream.close();
228 } catch (IOException e) {
229 // should never reach here
230 assert false;
231 }
232 }
233
234 /* ---------------- Private operations -------------- */
235
236 //
237 // to suppress the default zero-argument constructor
238 //
239 private IDN() {
240 }
241
242 //
243 // toASCII operation; should only apply to a single label
244 //
245 private static String toASCIIInternal(String label, int flag) {
246 // step 1
247 // Check if the string contains code points outside the ASCII range 0..0x7c.
248 boolean isASCII = isAllASCII(label);
249 StringBuffer dest;
250
251 // step 2
252 // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
253 if (!isASCII) {
254 UCharacterIterator iter = UCharacterIterator
255 .getInstance(label);
256 try {
257 dest = namePrep.prepare(iter, flag);
258 } catch (java.text.ParseException e) {
259 throw new IllegalArgumentException(e);
260 }
261 } else {
262 dest = new StringBuffer(label);
263 }
264
265 // step 3
266 // Verify the absence of non-LDH ASCII code points
267 // 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
268 // Verify the absence of leading and trailing hyphen
269 boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
270 if (useSTD3ASCIIRules) {
271 for (int i = 0; i < dest.length(); i++) {
272 int c = dest.charAt(i);
273 if (!isLDHChar(c)) {
274 throw new IllegalArgumentException(
275 "Contains non-LDH characters");
276 }
277 }
278
279 if (dest.charAt(0) == '-'
280 || dest.charAt(dest.length() - 1) == '-') {
281 throw new IllegalArgumentException(
282 "Has leading or trailing hyphen");
283 }
284 }
285
286 if (!isASCII) {
287 // step 4
288 // If all code points are inside 0..0x7f, skip to step 8
289 if (!isAllASCII(dest.toString())) {
290 // step 5
291 // verify the sequence does not begin with ACE prefix
292 if (!startsWithACEPrefix(dest)) {
293
294 // step 6
295 // encode the sequence with punycode
296 try {
297 dest = Punycode.encode(dest, null);
298 } catch (java.text.ParseException e) {
299 throw new IllegalArgumentException(e);
300 }
301
302 dest = toASCIILower(dest);
303
304 // step 7
305 // prepend the ACE prefix
306 dest.insert(0, ACE_PREFIX);
307 } else {
308 throw new IllegalArgumentException(
309 "The input starts with the ACE Prefix");
310 }
311
312 }
313 }
314
315 // step 8
316 // the length must be inside 1..63
317 if (dest.length() > MAX_LABEL_LENGTH) {
318 throw new IllegalArgumentException(
319 "The label in the input is too long");
320 }
321
322 return dest.toString();
323 }
324
325 //
326 // toUnicode operation; should only apply to a single label
327 //
328 private static String toUnicodeInternal(String label, int flag) {
329 boolean[] caseFlags = null;
330 StringBuffer dest;
331
332 // step 1
333 // find out if all the codepoints in input are ASCII
334 boolean isASCII = isAllASCII(label);
335
336 if (!isASCII) {
337 // step 2
338 // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
339 try {
340 UCharacterIterator iter = UCharacterIterator
341 .getInstance(label);
342 dest = namePrep.prepare(iter, flag);
343 } catch (Exception e) {
344 // toUnicode never fails; if any step fails, return the input string
345 return label;
346 }
347 } else {
348 dest = new StringBuffer(label);
349 }
350
351 // step 3
352 // verify ACE Prefix
353 if (startsWithACEPrefix(dest)) {
354
355 // step 4
356 // Remove the ACE Prefix
357 String temp = dest.substring(ACE_PREFIX_LENGTH, dest
358 .length());
359
360 try {
361 // step 5
362 // Decode using punycode
363 StringBuffer decodeOut = Punycode.decode(
364 new StringBuffer(temp), null);
365
366 // step 6
367 // Apply toASCII
368 String toASCIIOut = toASCII(decodeOut.toString(), flag);
369
370 // step 7
371 // verify
372 if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
373 // step 8
374 // return output of step 5
375 return decodeOut.toString();
376 }
377 } catch (Exception ignored) {
378 // no-op
379 }
380 }
381
382 // just return the input
383 return label;
384 }
385
386 //
387 // LDH stands for "letter/digit/hyphen", with characters restricted to the
388 // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
389 // <->
390 // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x56..0x60, 0x7B..0x7F
391 //
392 private static boolean isLDHChar(int ch) {
393 // high runner case
394 if (ch > 0x007A) {
395 return false;
396 }
397 //['-' '0'..'9' 'A'..'Z' 'a'..'z']
398 if ((ch == 0x002D) || (0x0030 <= ch && ch <= 0x0039)
399 || (0x0041 <= ch && ch <= 0x005A)
400 || (0x0061 <= ch && ch <= 0x007A)) {
401 return true;
402 }
403 return false;
404 }
405
406 //
407 // search dots in a string and return the index of that character;
408 // or if there is no dots, return the length of input string
409 // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
410 // and \uFF61 (halfwidth ideographic full stop).
411 //
412 private static int searchDots(String s, int start) {
413 int i;
414 for (i = start; i < s.length(); i++) {
415 char c = s.charAt(i);
416 if (c == '.' || c == '\u3002' || c == '\uFF0E'
417 || c == '\uFF61') {
418 break;
419 }
420 }
421
422 return i;
423 }
424
425 //
426 // to check if a string only contains US-ASCII code point
427 //
428 private static boolean isAllASCII(String input) {
429 boolean isASCII = true;
430 for (int i = 0; i < input.length(); i++) {
431 int c = input.charAt(i);
432 if (c > 0x7F) {
433 isASCII = false;
434 break;
435 }
436 }
437 return isASCII;
438 }
439
440 //
441 // to check if a string starts with ACE-prefix
442 //
443 private static boolean startsWithACEPrefix(StringBuffer input) {
444 boolean startsWithPrefix = true;
445
446 if (input.length() < ACE_PREFIX_LENGTH) {
447 return false;
448 }
449 for (int i = 0; i < ACE_PREFIX_LENGTH; i++) {
450 if (toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)) {
451 startsWithPrefix = false;
452 }
453 }
454 return startsWithPrefix;
455 }
456
457 private static char toASCIILower(char ch) {
458 if ('A' <= ch && ch <= 'Z') {
459 return (char) (ch + 'a' - 'A');
460 }
461 return ch;
462 }
463
464 private static StringBuffer toASCIILower(StringBuffer input) {
465 StringBuffer dest = new StringBuffer();
466 for (int i = 0; i < input.length(); i++) {
467 dest.append(toASCIILower(input.charAt(i)));
468 }
469 return dest;
470 }
471 }
|