Source Code Cross Referenced for IDNA.java in » Internationalization-Localization » icu4j » com » ibm » icu » text » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.text
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         *******************************************************************************
003:         * Copyright (C) 2003-2006, International Business Machines Corporation and    *
004:         * others. All Rights Reserved.                                                *
005:         *******************************************************************************
006:         */
007:        package com.ibm.icu.text;
008:
009:        import java.io.IOException;
010:        import java.io.InputStream;
011:        import java.util.MissingResourceException;
012:
013:        import com.ibm.icu.impl.ICUData;
014:        import com.ibm.icu.impl.ICUResourceBundle;
015:
016:        /**
017:         *
018:         * IDNA API implements the IDNA protocol as defined in the <a href="http://www.ietf.org/rfc/rfc3490.txt">IDNA RFC</a>.
019:         * The draft defines 2 operations: ToASCII and ToUnicode. Domain labels 
020:         * containing non-ASCII code points are required to be processed by
021:         * ToASCII operation before passing it to resolver libraries. Domain names
022:         * that are obtained from resolver libraries are required to be processed by
023:         * ToUnicode operation before displaying the domain name to the user.
024:         * IDNA requires that implementations process input strings with 
025:         * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a>, 
026:         * which is a profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a> , 
027:         * and then with <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a>. 
028:         * Implementations of IDNA MUST fully implement Nameprep and Punycode; 
029:         * neither Nameprep nor Punycode are optional.
030:         * The input and output of ToASCII and ToUnicode operations are Unicode 
031:         * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
032:         * multiple times to an input string will yield the same result as applying the operation
033:         * once.
034:         * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string) 
035:         * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
036:         * 
037:         * @author Ram Viswanadha
038:         * @stable ICU 2.8
039:         */
040:        public final class IDNA {
041:
042:            /* IDNA ACE Prefix is "xn--" */
043:            private static char[] ACE_PREFIX = new char[] { 0x0078, 0x006E,
044:                    0x002d, 0x002d };
045:            private static final int ACE_PREFIX_LENGTH = 4;
046:
047:            private static final int MAX_LABEL_LENGTH = 63;
048:            private static final int HYPHEN = 0x002D;
049:            private static final int CAPITAL_A = 0x0041;
050:            private static final int CAPITAL_Z = 0x005A;
051:            private static final int LOWER_CASE_DELTA = 0x0020;
052:            private static final int FULL_STOP = 0x002E;
053:
054:            /** 
055:             * Option to prohibit processing of unassigned codepoints in the input and
056:             * do not check if the input conforms to STD-3 ASCII rules.
057:             * 
058:             * @see  #convertToASCII #convertToUnicode
059:             * @stable ICU 2.8
060:             */
061:            public static final int DEFAULT = 0x0000;
062:            /** 
063:             * Option to allow processing of unassigned codepoints in the input
064:             * 
065:             * @see  #convertToASCII #convertToUnicode
066:             * @stable ICU 2.8
067:             */
068:            public static final int ALLOW_UNASSIGNED = 0x0001;
069:            /** 
070:             * Option to check if input conforms to STD-3 ASCII rules
071:             * 
072:             * @see #convertToASCII #convertToUnicode
073:             * @stable ICU 2.8
074:             */
075:            public static final int USE_STD3_RULES = 0x0002;
076:
077:            // static final singleton object that is initialized
078:            // at class initialization time, hence guaranteed to
079:            // be initialized and thread safe
080:            private static final IDNA singleton = new IDNA();
081:
082:            // The NamePrep profile object
083:            private StringPrep namePrep;
084:
085:            /* private constructor to prevent construction of the object */
086:            private IDNA() {
087:                try {
088:                    InputStream stream = ICUData
089:                            .getRequiredStream(ICUResourceBundle.ICU_BUNDLE
090:                                    + "/uidna.spp");
091:                    namePrep = new StringPrep(stream);
092:                    stream.close();
093:                } catch (IOException e) {
094:                    throw new MissingResourceException(e.toString(), "", "");
095:                }
096:            }
097:
098:            private static boolean startsWithPrefix(StringBuffer src) {
099:                boolean startsWithPrefix = true;
100:
101:                if (src.length() < ACE_PREFIX_LENGTH) {
102:                    return false;
103:                }
104:                for (int i = 0; i < ACE_PREFIX_LENGTH; i++) {
105:                    if (toASCIILower(src.charAt(i)) != ACE_PREFIX[i]) {
106:                        startsWithPrefix = false;
107:                    }
108:                }
109:                return startsWithPrefix;
110:            }
111:
112:            private static char toASCIILower(char ch) {
113:                if (CAPITAL_A <= ch && ch <= CAPITAL_Z) {
114:                    return (char) (ch + LOWER_CASE_DELTA);
115:                }
116:                return ch;
117:            }
118:
119:            private static StringBuffer toASCIILower(StringBuffer src) {
120:                StringBuffer dest = new StringBuffer();
121:                for (int i = 0; i < src.length(); i++) {
122:                    dest.append(toASCIILower(src.charAt(i)));
123:                }
124:                return dest;
125:            }
126:
127:            private static int compareCaseInsensitiveASCII(StringBuffer s1,
128:                    StringBuffer s2) {
129:                char c1, c2;
130:                int rc;
131:                for (int i = 0;/* no condition */; i++) {
132:                    /* If we reach the ends of both strings then they match */
133:                    if (i == s1.length()) {
134:                        return 0;
135:                    }
136:
137:                    c1 = s1.charAt(i);
138:                    c2 = s2.charAt(i);
139:
140:                    /* Case-insensitive comparison */
141:                    if (c1 != c2) {
142:                        rc = toASCIILower(c1) - toASCIILower(c2);
143:                        if (rc != 0) {
144:                            return rc;
145:                        }
146:                    }
147:                }
148:            }
149:
150:            private static int getSeparatorIndex(char[] src, int start,
151:                    int limit) {
152:                for (; start < limit; start++) {
153:                    if (isLabelSeparator(src[start])) {
154:                        return start;
155:                    }
156:                }
157:                // we have not found the separator just return length
158:                return start;
159:            }
160:
161:            /*
162:            private static int getSeparatorIndex(UCharacterIterator iter){
163:                int currentIndex = iter.getIndex();
164:                int separatorIndex = 0;
165:                int ch;
166:                while((ch=iter.next())!= UCharacterIterator.DONE){
167:                    if(isLabelSeparator(ch)){
168:                        separatorIndex = iter.getIndex();
169:                        iter.setIndex(currentIndex);
170:                        return separatorIndex;
171:                    }
172:                }
173:                // reset index
174:                iter.setIndex(currentIndex);
175:                // we have not found the separator just return the length
176:               
177:            }
178:             */
179:
180:            private static boolean isLDHChar(int ch) {
181:                // high runner case
182:                if (ch > 0x007A) {
183:                    return false;
184:                }
185:                //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
186:                if ((ch == 0x002D) || (0x0030 <= ch && ch <= 0x0039)
187:                        || (0x0041 <= ch && ch <= 0x005A)
188:                        || (0x0061 <= ch && ch <= 0x007A)) {
189:                    return true;
190:                }
191:                return false;
192:            }
193:
194:            /**
195:             * Ascertain if the given code point is a label separator as 
196:             * defined by the IDNA RFC
197:             * 
198:             * @param ch The code point to be ascertained
199:             * @return true if the char is a label separator
200:             * @stable ICU 2.8
201:             */
202:            private static boolean isLabelSeparator(int ch) {
203:                switch (ch) {
204:                case 0x002e:
205:                case 0x3002:
206:                case 0xFF0E:
207:                case 0xFF61:
208:                    return true;
209:                default:
210:                    return false;
211:                }
212:            }
213:
214:            /**
215:             * This function implements the ToASCII operation as defined in the IDNA RFC.
216:             * This operation is done on <b>single labels</b> before sending it to something that expects
217:             * ASCII names. A label is an individual part of a domain name. Labels are usually
218:             * separated by dots; e.g." "www.example.com" is composed of 3 labels 
219:             * "www","example", and "com".
220:             *
221:             * @param src       The input string to be processed
222:             * @param options   A bit set of options:
223:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
224:             *                              and do not use STD3 ASCII rules
225:             *                              If unassigned code points are found the operation fails with 
226:             *                              ParseException.
227:             *
228:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
229:             *                              If this option is set, the unassigned code points are in the input 
230:             *                              are treated as normal Unicode code points.
231:             *                          
232:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
233:             *                              If this option is set and the input does not satisfy STD3 rules,  
234:             *                              the operation will fail with ParseException
235:             * @return StringBuffer the converted String
236:             * @throws StringPrepParseException if the input cannot be converted
237:             * @stable ICU 2.8
238:             */
239:            public static StringBuffer convertToASCII(String src, int options)
240:                    throws StringPrepParseException {
241:                UCharacterIterator iter = UCharacterIterator.getInstance(src);
242:                return convertToASCII(iter, options);
243:            }
244:
245:            /**
246:             * This function implements the ToASCII operation as defined in the IDNA RFC.
247:             * This operation is done on <b>single labels</b> before sending it to something that expects
248:             * ASCII names. A label is an individual part of a domain name. Labels are usually
249:             * separated by dots; e.g." "www.example.com" is composed of 3 labels 
250:             * "www","example", and "com".
251:             *
252:             * @param src       The input string as StringBuffer to be processed
253:             * @param options   A bit set of options:
254:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
255:             *                              and do not use STD3 ASCII rules
256:             *                              If unassigned code points are found the operation fails with 
257:             *                              ParseException.
258:             *
259:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
260:             *                              If this option is set, the unassigned code points are in the input 
261:             *                              are treated as normal Unicode code points.
262:             *                          
263:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
264:             *                              If this option is set and the input does not satisfy STD3 rules,  
265:             *                              the operation will fail with ParseException
266:             * @return StringBuffer the converted String
267:             * @throws StringPrepParseException if the input cannot be converted
268:             * @stable ICU 2.8
269:             */
270:            public static StringBuffer convertToASCII(StringBuffer src,
271:                    int options) throws StringPrepParseException {
272:                UCharacterIterator iter = UCharacterIterator.getInstance(src);
273:                return convertToASCII(iter, options);
274:            }
275:
276:            /**
277:             * This function implements the ToASCII operation as defined in the IDNA RFC.
278:             * This operation is done on <b>single labels</b> before sending it to something that expects
279:             * ASCII names. A label is an individual part of a domain name. Labels are usually
280:             * separated by dots; e.g." "www.example.com" is composed of 3 labels 
281:             * "www","example", and "com".
282:             *
283:             * @param src       The input string as UCharacterIterator to be processed
284:             * @param options   A bit set of options:
285:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
286:             *                              and do not use STD3 ASCII rules
287:             *                              If unassigned code points are found the operation fails with 
288:             *                              ParseException.
289:             *
290:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
291:             *                              If this option is set, the unassigned code points are in the input 
292:             *                              are treated as normal Unicode code points.
293:             *                          
294:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
295:             *                              If this option is set and the input does not satisfy STD3 rules,  
296:             *                              the operation will fail with ParseException
297:             * @return StringBuffer the converted String
298:             * @throws StringPrepParseException if the input cannot be converted
299:             * @stable ICU 2.8
300:             */
301:            public static StringBuffer convertToASCII(UCharacterIterator src,
302:                    int options) throws StringPrepParseException {
303:
304:                boolean[] caseFlags = null;
305:
306:                // the source contains all ascii codepoints
307:                boolean srcIsASCII = true;
308:                // assume the source contains all LDH codepoints
309:                boolean srcIsLDH = true;
310:
311:                //get the options
312:                boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
313:                int ch;
314:                // step 1
315:                while ((ch = src.next()) != UCharacterIterator.DONE) {
316:                    if (ch > 0x7f) {
317:                        srcIsASCII = false;
318:                    }
319:                }
320:                int failPos = -1;
321:                src.setToStart();
322:                StringBuffer processOut = null;
323:                // step 2 is performed only if the source contains non ASCII
324:                if (!srcIsASCII) {
325:                    // step 2
326:                    processOut = singleton.namePrep.prepare(src, options);
327:                } else {
328:                    processOut = new StringBuffer(src.getText());
329:                }
330:                int poLen = processOut.length();
331:
332:                if (poLen == 0) {
333:                    throw new StringPrepParseException(
334:                            "Found zero length lable after NamePrep.",
335:                            StringPrepParseException.ZERO_LENGTH_LABEL);
336:                }
337:                StringBuffer dest = new StringBuffer();
338:
339:                // reset the variable to verify if output of prepare is ASCII or not
340:                srcIsASCII = true;
341:
342:                // step 3 & 4
343:                for (int j = 0; j < poLen; j++) {
344:                    ch = processOut.charAt(j);
345:                    if (ch > 0x7F) {
346:                        srcIsASCII = false;
347:                    } else if (isLDHChar(ch) == false) {
348:                        // here we do not assemble surrogates
349:                        // since we know that LDH code points
350:                        // are in the ASCII range only
351:                        srcIsLDH = false;
352:                        failPos = j;
353:                    }
354:                }
355:
356:                if (useSTD3ASCIIRules == true) {
357:                    // verify 3a and 3b
358:                    if (srcIsLDH == false /* source contains some non-LDH characters */
359:                            || processOut.charAt(0) == HYPHEN
360:                            || processOut.charAt(processOut.length() - 1) == HYPHEN) {
361:
362:                        /* populate the parseError struct */
363:                        if (srcIsLDH == false) {
364:                            throw new StringPrepParseException(
365:                                    "The input does not conform to the STD 3 ASCII rules",
366:                                    StringPrepParseException.STD3_ASCII_RULES_ERROR,
367:                                    processOut.toString(),
368:                                    (failPos > 0) ? (failPos - 1) : failPos);
369:                        } else if (processOut.charAt(0) == HYPHEN) {
370:                            throw new StringPrepParseException(
371:                                    "The input does not conform to the STD 3 ASCII rules",
372:                                    StringPrepParseException.STD3_ASCII_RULES_ERROR,
373:                                    processOut.toString(), 0);
374:
375:                        } else {
376:                            throw new StringPrepParseException(
377:                                    "The input does not conform to the STD 3 ASCII rules",
378:                                    StringPrepParseException.STD3_ASCII_RULES_ERROR,
379:                                    processOut.toString(),
380:                                    (poLen > 0) ? poLen - 1 : poLen);
381:
382:                        }
383:                    }
384:                }
385:                if (srcIsASCII) {
386:                    dest = processOut;
387:                } else {
388:                    // step 5 : verify the sequence does not begin with ACE prefix
389:                    if (!startsWithPrefix(processOut)) {
390:
391:                        //step 6: encode the sequence with punycode
392:                        caseFlags = new boolean[poLen];
393:
394:                        StringBuffer punyout = Punycode.encode(processOut,
395:                                caseFlags);
396:
397:                        // convert all codepoints to lower case ASCII
398:                        StringBuffer lowerOut = toASCIILower(punyout);
399:
400:                        //Step 7: prepend the ACE prefix
401:                        dest.append(ACE_PREFIX, 0, ACE_PREFIX_LENGTH);
402:                        //Step 6: copy the contents in b2 into dest
403:                        dest.append(lowerOut);
404:                    } else {
405:
406:                        throw new StringPrepParseException(
407:                                "The input does not start with the ACE Prefix.",
408:                                StringPrepParseException.ACE_PREFIX_ERROR,
409:                                processOut.toString(), 0);
410:                    }
411:                }
412:                if (dest.length() > MAX_LABEL_LENGTH) {
413:                    throw new StringPrepParseException(
414:                            "The labels in the input are too long. Length > 64.",
415:                            StringPrepParseException.LABEL_TOO_LONG_ERROR, dest
416:                                    .toString(), 0);
417:                }
418:                return dest;
419:            }
420:
421:            /**
422:             * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
423:             * This operation is done on complete domain names, e.g: "www.example.com". 
424:             * It is important to note that this operation can fail. If it fails, then the input 
425:             * domain name cannot be used as an Internationalized Domain Name and the application
426:             * should have methods defined to deal with the failure.
427:             * 
428:             * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
429:             * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
430:             * and then convert. This function does not offer that level of granularity. The options once  
431:             * set will apply to all labels in the domain name
432:             *
433:             * @param src       The input string as UCharacterIterator to be processed
434:             * @param options   A bit set of options:
435:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
436:             *                              and do not use STD3 ASCII rules
437:             *                              If unassigned code points are found the operation fails with 
438:             *                              ParseException.
439:             *
440:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
441:             *                              If this option is set, the unassigned code points are in the input 
442:             *                              are treated as normal Unicode code points.
443:             *                          
444:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
445:             *                              If this option is set and the input does not satisfy STD3 rules,  
446:             *                              the operation will fail with ParseException
447:             * @return StringBuffer the converted String
448:             * @throws StringPrepParseException if any label cannot be converted
449:             * @stable ICU 2.8
450:             */
451:            public static StringBuffer convertIDNToASCII(
452:                    UCharacterIterator src, int options)
453:                    throws StringPrepParseException {
454:                return convertIDNToASCII(src.getText(), options);
455:            }
456:
457:            /**
458:             * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
459:             * This operation is done on complete domain names, e.g: "www.example.com". 
460:             * It is important to note that this operation can fail. If it fails, then the input 
461:             * domain name cannot be used as an Internationalized Domain Name and the application
462:             * should have methods defined to deal with the failure.
463:             * 
464:             * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
465:             * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
466:             * and then convert. This function does not offer that level of granularity. The options once  
467:             * set will apply to all labels in the domain name
468:             *
469:             * @param src       The input string as a StringBuffer to be processed
470:             * @param options   A bit set of options:
471:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
472:             *                              and do not use STD3 ASCII rules
473:             *                              If unassigned code points are found the operation fails with 
474:             *                              ParseException.
475:             *
476:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
477:             *                              If this option is set, the unassigned code points are in the input 
478:             *                              are treated as normal Unicode code points.
479:             *                          
480:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
481:             *                              If this option is set and the input does not satisfy STD3 rules,  
482:             *                              the operation will fail with ParseException
483:             * @return StringBuffer the converted String
484:             * @throws StringPrepParseException if any label cannot be converted
485:             * @stable ICU 2.8
486:             */
487:            public static StringBuffer convertIDNToASCII(StringBuffer src,
488:                    int options) throws StringPrepParseException {
489:                return convertIDNToASCII(src.toString(), options);
490:            }
491:
492:            /**
493:             * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
494:             * This operation is done on complete domain names, e.g: "www.example.com". 
495:             * It is important to note that this operation can fail. If it fails, then the input 
496:             * domain name cannot be used as an Internationalized Domain Name and the application
497:             * should have methods defined to deal with the failure.
498:             * 
499:             * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
500:             * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
501:             * and then convert. This function does not offer that level of granularity. The options once  
502:             * set will apply to all labels in the domain name
503:             *
504:             * @param src       The input string to be processed
505:             * @param options   A bit set of options:
506:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
507:             *                              and do not use STD3 ASCII rules
508:             *                              If unassigned code points are found the operation fails with 
509:             *                              ParseException.
510:             *
511:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
512:             *                              If this option is set, the unassigned code points are in the input 
513:             *                              are treated as normal Unicode code points.
514:             *                          
515:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
516:             *                              If this option is set and the input does not satisfy STD3 rules,  
517:             *                              the operation will fail with ParseException
518:             * @return StringBuffer the converted String
519:             * @throws StringPrepParseException if any label cannot be converted
520:             * @stable ICU 2.8
521:             */
522:            public static StringBuffer convertIDNToASCII(String src, int options)
523:                    throws StringPrepParseException {
524:
525:                char[] srcArr = src.toCharArray();
526:                StringBuffer result = new StringBuffer();
527:                int sepIndex = 0;
528:                int oldSepIndex = 0;
529:                for (;;) {
530:                    sepIndex = getSeparatorIndex(srcArr, sepIndex,
531:                            srcArr.length);
532:                    String label = new String(srcArr, oldSepIndex, sepIndex
533:                            - oldSepIndex);
534:                    //make sure this is not a root label separator.
535:                    if (!(label.length() == 0 && sepIndex == srcArr.length)) {
536:                        UCharacterIterator iter = UCharacterIterator
537:                                .getInstance(label);
538:                        result.append(convertToASCII(iter, options));
539:                    }
540:                    if (sepIndex == srcArr.length) {
541:                        break;
542:                    }
543:
544:                    // increment the sepIndex to skip past the separator
545:                    sepIndex++;
546:                    oldSepIndex = sepIndex;
547:                    result.append((char) FULL_STOP);
548:                }
549:                return result;
550:            }
551:
552:            /**
553:             * This function implements the ToUnicode operation as defined in the IDNA RFC.
554:             * This operation is done on <b>single labels</b> before sending it to something that expects
555:             * Unicode names. A label is an individual part of a domain name. Labels are usually
556:             * separated by dots; for e.g." "www.example.com" is composed of 3 labels 
557:             * "www","example", and "com".
558:             * 
559:             * @param src       The input string to be processed
560:             * @param options   A bit set of options:
561:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
562:             *                              and do not use STD3 ASCII rules
563:             *                              If unassigned code points are found the operation fails with 
564:             *                              ParseException.
565:             *
566:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
567:             *                              If this option is set, the unassigned code points are in the input 
568:             *                              are treated as normal Unicode code points.
569:             *                          
570:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
571:             *                              If this option is set and the input does not satisfy STD3 rules,  
572:             *                              the operation will fail with ParseException
573:             * @return StringBuffer the converted String
574:             * @throws ParseException
575:             * @stable ICU 2.8
576:             */
577:            public static StringBuffer convertToUnicode(String src, int options)
578:                    throws StringPrepParseException {
579:                UCharacterIterator iter = UCharacterIterator.getInstance(src);
580:                return convertToUnicode(iter, options);
581:            }
582:
583:            /**
584:             * This function implements the ToUnicode operation as defined in the IDNA RFC.
585:             * This operation is done on <b>single labels</b> before sending it to something that expects
586:             * Unicode names. A label is an individual part of a domain name. Labels are usually
587:             * separated by dots; for e.g." "www.example.com" is composed of 3 labels 
588:             * "www","example", and "com".
589:             * 
590:             * @param src       The input string as StringBuffer to be processed
591:             * @param options   A bit set of options:
592:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
593:             *                              and do not use STD3 ASCII rules
594:             *                              If unassigned code points are found the operation fails with 
595:             *                              ParseException.
596:             *
597:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
598:             *                              If this option is set, the unassigned code points are in the input 
599:             *                              are treated as normal Unicode code points.
600:             *                          
601:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
602:             *                              If this option is set and the input does not satisfy STD3 rules,  
603:             *                              the operation will fail with ParseException
604:             * @return StringBuffer the converted String
605:             * @throws ParseException
606:             * @stable ICU 2.8
607:             */
608:            public static StringBuffer convertToUnicode(StringBuffer src,
609:                    int options) throws StringPrepParseException {
610:                UCharacterIterator iter = UCharacterIterator.getInstance(src);
611:                return convertToUnicode(iter, options);
612:            }
613:
614:            /**
615:             * This function implements the ToUnicode operation as defined in the IDNA RFC.
616:             * This operation is done on <b>single labels</b> before sending it to something that expects
617:             * Unicode names. A label is an individual part of a domain name. Labels are usually
618:             * separated by dots; for e.g." "www.example.com" is composed of 3 labels 
619:             * "www","example", and "com".
620:             * 
621:             * @param src       The input string as UCharacterIterator to be processed
622:             * @param options   A bit set of options:
623:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
624:             *                              and do not use STD3 ASCII rules
625:             *                              If unassigned code points are found the operation fails with 
626:             *                              ParseException.
627:             *
628:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
629:             *                              If this option is set, the unassigned code points are in the input 
630:             *                              are treated as normal Unicode code points.
631:             *                          
632:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
633:             *                              If this option is set and the input does not satisfy STD3 rules,  
634:             *                              the operation will fail with ParseException
635:             * @return StringBuffer the converted String
636:             * @throws ParseException
637:             * @stable ICU 2.8
638:             */
639:            public static StringBuffer convertToUnicode(UCharacterIterator src,
640:                    int options) throws StringPrepParseException {
641:
642:                boolean[] caseFlags = null;
643:
644:                // the source contains all ascii codepoints
645:                boolean srcIsASCII = true;
646:                // assume the source contains all LDH codepoints
647:                boolean srcIsLDH = true;
648:
649:                //get the options
650:                boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
651:
652:                int failPos = -1;
653:                int ch;
654:                int saveIndex = src.getIndex();
655:                // step 1: find out if all the codepoints in src are ASCII  
656:                while ((ch = src.next()) != UCharacterIterator.DONE) {
657:                    if (ch > 0x7F) {
658:                        srcIsASCII = false;
659:                    } else if ((srcIsLDH = isLDHChar(ch)) == false) {
660:                        failPos = src.getIndex();
661:                    }
662:                }
663:                StringBuffer processOut;
664:
665:                if (srcIsASCII == false) {
666:                    try {
667:                        // step 2: process the string
668:                        src.setIndex(saveIndex);
669:                        processOut = singleton.namePrep.prepare(src, options);
670:                    } catch (StringPrepParseException ex) {
671:                        return new StringBuffer(src.getText());
672:                    }
673:
674:                } else {
675:                    //just point to source
676:                    processOut = new StringBuffer(src.getText());
677:                }
678:                // TODO:
679:                // The RFC states that 
680:                // <quote>
681:                // ToUnicode never fails. If any step fails, then the original input
682:                // is returned immediately in that step.
683:                // </quote>
684:
685:                //step 3: verify ACE Prefix
686:                if (startsWithPrefix(processOut)) {
687:                    StringBuffer decodeOut = null;
688:
689:                    //step 4: Remove the ACE Prefix
690:                    String temp = processOut.substring(ACE_PREFIX_LENGTH,
691:                            processOut.length());
692:
693:                    //step 5: Decode using punycode
694:                    try {
695:                        decodeOut = Punycode.decode(new StringBuffer(temp),
696:                                caseFlags);
697:                    } catch (StringPrepParseException e) {
698:                        decodeOut = null;
699:                    }
700:
701:                    //step 6:Apply toASCII
702:                    if (decodeOut != null) {
703:                        StringBuffer toASCIIOut = convertToASCII(decodeOut,
704:                                options);
705:
706:                        //step 7: verify
707:                        if (compareCaseInsensitiveASCII(processOut, toASCIIOut) != 0) {
708:                            //                    throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
709:                            //                                             StringPrepParseException.VERIFICATION_ERROR); 
710:                            decodeOut = null;
711:                        }
712:                    }
713:
714:                    //step 8: return output of step 5
715:                    if (decodeOut != null) {
716:                        return decodeOut;
717:                    }
718:                }
719:
720:                //        }else{
721:                //            // verify that STD3 ASCII rules are satisfied
722:                //            if(useSTD3ASCIIRules == true){
723:                //                if( srcIsLDH == false /* source contains some non-LDH characters */
724:                //                    || processOut.charAt(0) ==  HYPHEN 
725:                //                    || processOut.charAt(processOut.length()-1) == HYPHEN){
726:                //    
727:                //                    if(srcIsLDH==false){
728:                //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
729:                //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
730:                //                                                 (failPos>0) ? (failPos-1) : failPos);
731:                //                    }else if(processOut.charAt(0) == HYPHEN){
732:                //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
733:                //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
734:                //                                                 processOut.toString(),0);
735:                //         
736:                //                    }else{
737:                //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
738:                //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
739:                //                                                 processOut.toString(),
740:                //                                                 processOut.length());
741:                //    
742:                //                    }
743:                //                }
744:                //            }
745:                //            // just return the source
746:                //            return new StringBuffer(src.getText());
747:                //        }  
748:
749:                return new StringBuffer(src.getText());
750:            }
751:
752:            /**
753:             * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
754:             * This operation is done on complete domain names, e.g: "www.example.com". 
755:             *
756:             * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
757:             * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
758:             * and then convert. This function does not offer that level of granularity. The options once  
759:             * set will apply to all labels in the domain name
760:             *
761:             * @param src       The input string as UCharacterIterator to be processed
762:             * @param options   A bit set of options:
763:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
764:             *                              and do not use STD3 ASCII rules
765:             *                              If unassigned code points are found the operation fails with 
766:             *                              ParseException.
767:             *
768:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
769:             *                              If this option is set, the unassigned code points are in the input 
770:             *                              are treated as normal Unicode code points.
771:             *                          
772:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
773:             *                              If this option is set and the input does not satisfy STD3 rules,  
774:             *                              the operation will fail with ParseException
775:             * @return StringBuffer the converted String
776:             * @throws ParseException
777:             * @stable ICU 2.8
778:             */
779:            public static StringBuffer convertIDNToUnicode(
780:                    UCharacterIterator src, int options)
781:                    throws StringPrepParseException {
782:                return convertIDNToUnicode(src.getText(), options);
783:            }
784:
785:            /**
786:             * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
787:             * This operation is done on complete domain names, e.g: "www.example.com". 
788:             *
789:             * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
790:             * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
791:             * and then convert. This function does not offer that level of granularity. The options once  
792:             * set will apply to all labels in the domain name
793:             *
794:             * @param src       The input string as StringBuffer to be processed
795:             * @param options   A bit set of options:
796:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
797:             *                              and do not use STD3 ASCII rules
798:             *                              If unassigned code points are found the operation fails with 
799:             *                              ParseException.
800:             *
801:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
802:             *                              If this option is set, the unassigned code points are in the input 
803:             *                              are treated as normal Unicode code points.
804:             *                          
805:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
806:             *                              If this option is set and the input does not satisfy STD3 rules,  
807:             *                              the operation will fail with ParseException
808:             * @return StringBuffer the converted String
809:             * @throws ParseException
810:             * @stable ICU 2.8
811:             */
812:            public static StringBuffer convertIDNToUnicode(StringBuffer src,
813:                    int options) throws StringPrepParseException {
814:                return convertIDNToUnicode(src.toString(), options);
815:            }
816:
817:            /**
818:             * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
819:             * This operation is done on complete domain names, e.g: "www.example.com". 
820:             *
821:             * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
822:             * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
823:             * and then convert. This function does not offer that level of granularity. The options once  
824:             * set will apply to all labels in the domain name
825:             *
826:             * @param src       The input string to be processed
827:             * @param options   A bit set of options:
828:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
829:             *                              and do not use STD3 ASCII rules
830:             *                              If unassigned code points are found the operation fails with 
831:             *                              ParseException.
832:             *
833:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
834:             *                              If this option is set, the unassigned code points are in the input 
835:             *                              are treated as normal Unicode code points.
836:             *                          
837:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
838:             *                              If this option is set and the input does not satisfy STD3 rules,  
839:             *                              the operation will fail with ParseException
840:             * @return StringBuffer the converted String
841:             * @throws ParseException
842:             * @stable ICU 2.8
843:             */
844:            public static StringBuffer convertIDNToUnicode(String src,
845:                    int options) throws StringPrepParseException {
846:
847:                char[] srcArr = src.toCharArray();
848:                StringBuffer result = new StringBuffer();
849:                int sepIndex = 0;
850:                int oldSepIndex = 0;
851:                for (;;) {
852:                    sepIndex = getSeparatorIndex(srcArr, sepIndex,
853:                            srcArr.length);
854:                    String label = new String(srcArr, oldSepIndex, sepIndex
855:                            - oldSepIndex);
856:                    if (label.length() == 0 && sepIndex != srcArr.length) {
857:                        throw new StringPrepParseException(
858:                                "Found zero length lable after NamePrep.",
859:                                StringPrepParseException.ZERO_LENGTH_LABEL);
860:                    }
861:                    UCharacterIterator iter = UCharacterIterator
862:                            .getInstance(label);
863:                    result.append(convertToUnicode(iter, options));
864:                    if (sepIndex == srcArr.length) {
865:                        break;
866:                    }
867:                    // increment the sepIndex to skip past the separator
868:                    sepIndex++;
869:                    oldSepIndex = sepIndex;
870:                    result.append((char) FULL_STOP);
871:                }
872:                return result;
873:            }
874:
875:            /**
876:             * Compare two IDN strings for equivalence.
877:             * This function splits the domain names into labels and compares them.
878:             * According to IDN RFC, whenever two labels are compared, they are 
879:             * considered equal if and only if their ASCII forms (obtained by 
880:             * applying toASCII) match using an case-insensitive ASCII comparison.
881:             * Two domain names are considered a match if and only if all labels 
882:             * match regardless of whether label separators match.
883:             * 
884:             * @param s1        First IDN string as StringBuffer
885:             * @param s2        Second IDN string as StringBuffer
886:             * @param options   A bit set of options:
887:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
888:             *                              and do not use STD3 ASCII rules
889:             *                              If unassigned code points are found the operation fails with 
890:             *                              ParseException.
891:             *
892:             *  - IDNA.ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
893:             *                              If this option is set, the unassigned code points are in the input 
894:             *                              are treated as normal Unicode code points.
895:             *                          
896:             *  - IDNA.USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
897:             *                              If this option is set and the input does not satisfy STD3 rules,  
898:             *                              the operation will fail with ParseException
899:             * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
900:             * @throws ParseException
901:             * @stable ICU 2.8
902:             */
903:            //  TODO: optimize
904:            public static int compare(StringBuffer s1, StringBuffer s2,
905:                    int options) throws StringPrepParseException {
906:                if (s1 == null || s2 == null) {
907:                    throw new IllegalArgumentException(
908:                            "One of the source buffers is null");
909:                }
910:                StringBuffer s1Out = convertIDNToASCII(s1.toString(), options);
911:                StringBuffer s2Out = convertIDNToASCII(s2.toString(), options);
912:                return compareCaseInsensitiveASCII(s1Out, s2Out);
913:            }
914:
915:            /**
916:             * Compare two IDN strings for equivalence.
917:             * This function splits the domain names into labels and compares them.
918:             * According to IDN RFC, whenever two labels are compared, they are 
919:             * considered equal if and only if their ASCII forms (obtained by 
920:             * applying toASCII) match using an case-insensitive ASCII comparison.
921:             * Two domain names are considered a match if and only if all labels 
922:             * match regardless of whether label separators match.
923:             * 
924:             * @param s1        First IDN string 
925:             * @param s2        Second IDN string
926:             * @param options   A bit set of options:
927:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
928:             *                              and do not use STD3 ASCII rules
929:             *                              If unassigned code points are found the operation fails with 
930:             *                              ParseException.
931:             *
932:             *  - IDNA.ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
933:             *                              If this option is set, the unassigned code points are in the input 
934:             *                              are treated as normal Unicode code points.
935:             *                          
936:             *  - IDNA.USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
937:             *                              If this option is set and the input does not satisfy STD3 rules,  
938:             *                              the operation will fail with ParseException
939:             * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
940:             * @throws ParseException
941:             * @stable ICU 2.8
942:             */
943:            //  TODO: optimize
944:            public static int compare(String s1, String s2, int options)
945:                    throws StringPrepParseException {
946:                if (s1 == null || s2 == null) {
947:                    throw new IllegalArgumentException(
948:                            "One of the source buffers is null");
949:                }
950:                StringBuffer s1Out = convertIDNToASCII(s1, options);
951:                StringBuffer s2Out = convertIDNToASCII(s2, options);
952:                return compareCaseInsensitiveASCII(s1Out, s2Out);
953:            }
954:
955:            /**
956:             * Compare two IDN strings for equivalence.
957:             * This function splits the domain names into labels and compares them.
958:             * According to IDN RFC, whenever two labels are compared, they are 
959:             * considered equal if and only if their ASCII forms (obtained by 
960:             * applying toASCII) match using an case-insensitive ASCII comparison.
961:             * Two domain names are considered a match if and only if all labels 
962:             * match regardless of whether label separators match.
963:             * 
964:             * @param s1        First IDN string as UCharacterIterator
965:             * @param s2        Second IDN string as UCharacterIterator
966:             * @param options   A bit set of options:
967:             *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
968:             *                              and do not use STD3 ASCII rules
969:             *                              If unassigned code points are found the operation fails with 
970:             *                              ParseException.
971:             *
972:             *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations
973:             *                              If this option is set, the unassigned code points are in the input 
974:             *                              are treated as normal Unicode code points.
975:             *                          
976:             *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions
977:             *                              If this option is set and the input does not satisfy STD3 rules,  
978:             *                              the operation will fail with ParseException
979:             * @return 0 if the strings are equal, > 0 if i1 > i2 and < 0 if i1 < i2
980:             * @throws ParseException
981:             * @stable ICU 2.8
982:             */
983:            //  TODO: optimize
984:            public static int compare(UCharacterIterator s1,
985:                    UCharacterIterator s2, int options)
986:                    throws StringPrepParseException {
987:                if (s1 == null || s2 == null) {
988:                    throw new IllegalArgumentException(
989:                            "One of the source buffers is null");
990:                }
991:                StringBuffer s1Out = convertIDNToASCII(s1.getText(), options);
992:                StringBuffer s2Out = convertIDNToASCII(s2.getText(), options);
993:                return compareCaseInsensitiveASCII(s1Out, s2Out);
994:            }
995:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.