Source Code Cross Referenced for CharsetRecog_mbcs.java in  » Internationalization-Localization » icu4j » com » ibm » icu » text » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Sevlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.text 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /*
002:         ****************************************************************************
003:         * Copyright (C) 2005-2006, International Business Machines Corporation and *
004:         * others. All Rights Reserved.                                             *
005:         ****************************************************************************
006:         *
007:         */
008:        package com.ibm.icu.text;
009:
010:        import java.util.Arrays;
011:
012:        /**
013:         * CharsetRecognizer implementation for Asian  - double or multi-byte - charsets.
014:         *                   Match is determined mostly by the input data adhering to the
015:         *                   encoding scheme for the charset, and, optionally,
016:         *                   frequency-of-occurrence of characters.
017:         * <p/>
018:         *                   Instances of this class are singletons, one per encoding
019:         *                   being recognized.  They are created in the main
020:         *                   CharsetDetector class and kept in the global list of available
021:         *                   encodings to be checked.  The specific encoding being recognized
022:         *                   is determined by subclass.
023:         * 
024:         * @internal                  
025:         */
026:        abstract class CharsetRecog_mbcs extends CharsetRecognizer {
027:
028:            /**
029:             * Get the IANA name of this charset.
030:             * @return the charset name.
031:             */
032:            abstract String getName();
033:
034:            /**
035:             * Test the match of this charset with the input text data
036:             *      which is obtained via the CharsetDetector object.
037:             * 
038:             * @param det  The CharsetDetector, which contains the input text
039:             *             to be checked for being in this charset.
040:             * @param commonChars  Frequently occurring character values for this charset
041:             *             (raw bytes packed into an int), or null if there are no statistics.
042:             *             It is searched with Arrays.binarySearch, so it must be sorted
043:             *             in ascending order.
044:             * @return     The match confidence, ranging from 0-100.
045:             */
046:            int match(CharsetDetector det, int[] commonChars) {
047:                int singleByteCharCount = 0; // incremented below but never read
048:                int doubleByteCharCount = 0; // error-free chars whose packed value is above 0xff
049:                int commonCharCount = 0; // multi-byte chars that were found in commonChars
050:                int badCharCount = 0; // chars that nextChar() flagged with iter.error
051:                int totalCharCount = 0; // incremented below but never read
052:                int confidence = 0;
053:                iteratedChar iter = new iteratedChar();
054:
055:                detectBlock: {
056:                    for (iter.reset(); nextChar(iter, det);) {
057:                        totalCharCount++;
058:                        if (iter.error) {
059:                            badCharCount++;
060:                        } else {
061:
062:                            if (iter.charValue <= 0xff) { // signed compare: a packed value with the top bit set also lands here
063:                                singleByteCharCount++;
064:                            } else {
065:                                doubleByteCharCount++;
066:                                if (commonChars != null) {
067:                                    if (Arrays.binarySearch(commonChars,
068:                                            iter.charValue) >= 0) {
069:                                        commonCharCount++;
070:                                    }
071:                                }
072:                            }
073:                        }
074:                        if (badCharCount >= 2
075:                                && badCharCount * 5 >= doubleByteCharCount) {
076:                            // Bail out early if the byte data is not matching the encoding scheme; confidence is still 0 here.
077:                            break detectBlock;
078:                        }
079:                    }
080:
081:                    if (doubleByteCharCount <= 10 && badCharCount == 0) {
082:                        // Not many multi-byte chars.
083:                        //   ASCII or ISO file?  It's probably not our encoding,
084:                        //   but is not incompatible with our encoding, so don't give it a zero.
085:                        confidence = 10;
086:                        break detectBlock;
087:                    }
088:
089:                    //
090:                    //  No match if there are too many characters that don't fit the encoding scheme.
091:                    //    (should we have zero tolerance for these?)
092:                    //
093:                    if (doubleByteCharCount < 20 * badCharCount) {
094:                        confidence = 0;
095:                        break detectBlock;
096:                    }
097:
098:                    if (commonChars == null) {
099:                        // We have no statistics on frequently occurring characters.
100:                        //  Assess confidence purely on having a reasonable number of
101:                        //  multi-byte characters (the more the better), capped at 100.
102:                        confidence = 30 + doubleByteCharCount - 20
103:                                * badCharCount;
104:                        if (confidence > 100) {
105:                            confidence = 100;
106:                        }
107:                    } else {
108:                        // Frequency of occurrence statistics exist.
109:                        // Scale log(commonCharCount + 1) so that it would reach 90 at
110:                        // log(doubleByteCharCount / 4), add a base of 10, and cap the result at 100.
111:                        double maxVal = Math
112:                                .log((float) doubleByteCharCount / 4);
113:                        double scaleFactor = 90.0 / maxVal;
114:                        confidence = (int) (Math.log(commonCharCount + 1)
115:                                * scaleFactor + 10);
116:                        confidence = Math.min(confidence, 100);
117:                    }
118:                } // end of detectBlock:
119:
120:                return confidence;
121:            }
122:
123:            // "Character"  iterated character class.
124:            //    Recognizers for specific mbcs encodings make their "characters" available
125:            //    by providing a nextChar() function that fills in an instance of iteratedChar
126:            //    with the next char from the input.
127:            //    The returned characters are not converted to Unicode, but remain as the raw
128:            //    bytes (concatenated into an int) from the codepage data.
129:            //
130:            //  For Asian charsets, use the raw input rather than the input that has been
131:            //   stripped of markup.  Detection only considers multi-byte chars, effectively
132:            //   stripping markup anyway, and double byte chars do occur in markup too.
133:            //
134:            static class iteratedChar { // mutable cursor plus the most recently read character
135:                int charValue = 0; // 1-4 bytes from the raw input data
136:                int index = 0; // the nextChar() implementations in this file store the start offset of the current char here
137:                int nextIndex = 0; // offset of the next unread byte in det.fRawInput
138:                boolean error = false; // set by nextChar() when the bytes read are not legal for the encoding
139:                boolean done = false; // set once nextByte() has been asked to read past the end of the input
140:
141:                void reset() { // rewind to the start of the input; index is -1 until the first nextChar() call
142:                    charValue = 0;
143:                    index = -1;
144:                    nextIndex = 0;
145:                    error = false;
146:                    done = false;
147:                }
148:
149:                int nextByte(CharsetDetector det) { // returns the next raw byte as 0..255, or -1 (and sets done) at end of input
150:                    if (nextIndex >= det.fRawLength) {
151:                        done = true;
152:                        return -1;
153:                    }
154:                    int byteValue = (int) det.fRawInput[nextIndex++] & 0x00ff; // keep only the low 8 bits, then advance
155:                    return byteValue;
156:                }
157:            }
158:
159:            /**
160:             * Get the next character (however many bytes it is) from the input data.
161:             *    Subclasses for specific charset encodings must implement this function
162:             *    to get characters according to the rules of their encoding scheme.
163:             *    The implementations in this file flag illegal byte sequences via it.error.
164:             *  This function is not a method of class iteratedChar only because
165:             *   that would require a lot of extra derived classes, which is awkward.
166:             * @param it  The iteratedChar "struct" into which the returned char is placed.
167:             * @param det The charset detector, which is needed to get at the input byte data
168:             *            being iterated over.
169:             * @return    True if a character was returned, false at end of input.
170:             */
171:            abstract boolean nextChar(iteratedChar it, CharsetDetector det);
172:
173:            /**
174:             *   Shift-JIS charset recognizer.   
175:             *   Characters are read as one or two bytes; see nextChar() for the byte ranges used.
176:             */
177:            static class CharsetRecog_sjis extends CharsetRecog_mbcs {
178:                static int[] commonChars =
179:                // TODO:  This set of data comes from the character frequency-
180:                //        of-occurrence analysis tool.  The data needs to be moved
181:                //        into a resource and loaded from there.
182:                { 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a,
183:                        0x8175, 0x8176, 0x82a0, 0x82a2, 0x82a4, 0x82a9, 0x82aa,
184:                        0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 0x82b7,
185:                        0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8,
186:                        0x82c9, 0x82cc, 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8,
187:                        0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 0x8343, 0x834e,
188:                        0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376,
189:                        0x8389, 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa,
190:                        0x95aa };
191:
192:                boolean nextChar(iteratedChar it, CharsetDetector det) { // reads one 1- or 2-byte char into it
193:                    it.index = it.nextIndex;
194:                    it.error = false;
195:                    int firstByte;
196:                    firstByte = it.charValue = it.nextByte(det);
197:                    if (firstByte < 0) { // end of input
198:                        return false;
199:                    }
200:
201:                    if (firstByte <= 0x7f
202:                            || (firstByte > 0xa0 && firstByte <= 0xdf)) { // these first bytes are complete single-byte chars
203:                        return true;
204:                    }
205:
206:                    int secondByte = it.nextByte(det);
207:                    if (secondByte < 0) { // input ended after the first byte: the partial char is dropped, not flagged
208:                        return false;
209:                    }
210:                    it.charValue = (firstByte << 8) | secondByte;
211:                    if (!((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
212:                        // Illegal second byte value.  (The two ranges together accept every value in 0x40..0xff.)
213:                        it.error = true;
214:                    }
215:                    return true;
216:                }
217:
218:                int match(CharsetDetector det) { // scores det's input using the frequency table of this class
219:                    return match(det, commonChars);
220:                }
221:
222:                String getName() { // charset name reported for this recognizer
223:                    return "Shift_JIS";
224:                }
225:
226:                public String getLanguage() { // language code reported for this recognizer
227:                    return "ja";
228:                }
229:
230:            }
231:
232:            /**
233:             *   Big5 charset recognizer.   
234:             *   Characters are read as one or two bytes; see nextChar() for the byte ranges used.
235:             */
236:            static class CharsetRecog_big5 extends CharsetRecog_mbcs {
237:                static int[] commonChars =
238:                // TODO:  This set of data comes from the character frequency-
239:                //        of-occurrence analysis tool.  The data needs to be moved
240:                //        into a resource and loaded from there.
241:                { 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175,
242:                        0xa176, 0xa440, 0xa446, 0xa447, 0xa448, 0xa451, 0xa454,
243:                        0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 0xa4a4,
244:                        0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd,
245:                        0xa540, 0xa548, 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657,
246:                        0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 0xa6b3, 0xa6b9,
247:                        0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759,
248:                        0xa7da, 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4,
249:                        0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 0xaa6b, 0xaaba, 0xaabe,
250:                        0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
251:                        0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f,
252:                        0xb44c, 0xb44e, 0xb54c, 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8,
253:                        0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 0xbba1,
254:                        0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f };
255:
256:                boolean nextChar(iteratedChar it, CharsetDetector det) { // reads one 1- or 2-byte char into it
257:                    it.index = it.nextIndex;
258:                    it.error = false;
259:                    int firstByte;
260:                    firstByte = it.charValue = it.nextByte(det);
261:                    if (firstByte < 0) { // end of input
262:                        return false;
263:                    }
264:
265:                    if (firstByte <= 0x7f || firstByte == 0xff) {
266:                        // single byte character.
267:                        return true;
268:                    }
269:
270:                    int secondByte = it.nextByte(det);
271:                    if (secondByte < 0) { // input ended after the first byte: the partial char is dropped, not flagged
272:                        return false;
273:                    }
274:                    it.charValue = (it.charValue << 8) | secondByte;
275:
276:                    if (secondByte < 0x40 || secondByte == 0x7f
277:                            || secondByte == 0xff) { // second bytes treated as illegal by this recognizer
278:                        it.error = true;
279:                    }
280:                    return true;
281:                }
282:
283:                int match(CharsetDetector det) { // scores det's input using the frequency table of this class
284:                    return match(det, commonChars);
285:                }
286:
287:                String getName() { // charset name reported for this recognizer
288:                    return "Big5";
289:                }
290:
291:                public String getLanguage() { // language code reported for this recognizer
292:                    return "zh";
293:                }
294:            }
295:
296:            /**
297:             *   EUC charset recognizers.  One abstract class that provides the common function
298:             *             for getting the next character according to the EUC encoding scheme,
299:             *             and nested derived classes for EUC_JP and EUC_KR.
300:             *
301:             */
302:            abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
303:
304:                /*
305:                 *  (non-Javadoc)
306:                 *  Get the next character value for EUC based encodings.
307:                 *  Character "value" is simply the raw bytes that make up the character
308:                 *     packed into an int.  Returns false once the input is exhausted.
309:                 */
310:                boolean nextChar(iteratedChar it, CharsetDetector det) {
311:                    it.index = it.nextIndex;
312:                    it.error = false;
313:                    int firstByte = 0;
314:                    int secondByte = 0;
315:                    int thirdByte = 0;
316:                    int fourthByte = 0; // NOTE(review): declared but never used in this method
317:
318:                    buildChar: {
319:                        firstByte = it.charValue = it.nextByte(det);
320:                        if (firstByte < 0) {
321:                            // Ran off the end of the input data
322:                            it.done = true;
323:                            break buildChar;
324:                        }
325:                        if (firstByte <= 0x8d) {
326:                            // single byte char
327:                            break buildChar;
328:                        }
329:
330:                        secondByte = it.nextByte(det); // -1 at end of input; nextByte() then sets it.done, so false is returned below
331:                        it.charValue = (it.charValue << 8) | secondByte;
332:
333:                        if (firstByte >= 0xA1 && firstByte <= 0xfe) {
334:                            // Two byte Char
335:                            if (secondByte < 0xa1) {
336:                                it.error = true;
337:                            }
338:                            break buildChar;
339:                        }
340:                        if (firstByte == 0x8e) {
341:                            // Code Set 2.
342:                            //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
343:                            //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
344:                            // We don't know which we've got.
345:                            // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
346:                            //   bytes will look like a well formed 2 byte char.  
347:                            if (secondByte < 0xa1) {
348:                                it.error = true;
349:                            }
350:                            break buildChar;
351:                        }
352:
353:                        if (firstByte == 0x8f) {
354:                            // Code set 3.
355:                            // Three byte total char size, two bytes of actual char value.
356:                            thirdByte = it.nextByte(det);
357:                            it.charValue = (it.charValue << 8) | thirdByte;
358:                            if (thirdByte < 0xa1) {
359:                                it.error = true;
360:                            }
361:                        }
362:                    } // NOTE(review): first bytes 0x90..0xa0 and 0xff match no branch and come back as two-byte chars without error
363:
364:                    return (it.done == false);
365:                }
366:
367:                /**
368:                 * The charset recognizer for EUC-JP.  A singleton instance of this class
369:                 *    is created and kept by the public CharsetDetector class
370:                 */
371:                static class CharsetRecog_euc_jp extends CharsetRecog_euc {
372:                    static int[] commonChars =
373:                    // TODO:  This set of data comes from the character frequency-
374:                    //        of-occurrence analysis tool.  The data needs to be moved
375:                    //        into a resource and loaded from there.
376:                    { 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb,
377:                            0xa1d6, 0xa1d7, 0xa4a2, 0xa4a4, 0xa4a6, 0xa4a8,
378:                            0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1,
379:                            0xa4b3, 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd,
380:                            0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 0xa4c6,
381:                            0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce,
382:                            0xa4cf, 0xa4d0, 0xa4de, 0xa4df, 0xa4e1, 0xa4e2,
383:                            0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec,
384:                            0xa4ef, 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4,
385:                            0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 0xa5b0,
386:                            0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf,
387:                            0xa5c3, 0xa5c6, 0xa5c7, 0xa5c8, 0xa5c9, 0xa5cb,
388:                            0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0,
389:                            0xa5e1, 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec,
390:                            0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 0xbbc8,
391:                            0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8,
392:                            0xcaf3, 0xcbdc, 0xcdd1 };
393:
394:                    String getName() { // charset name reported for this recognizer
395:                        return "EUC-JP";
396:                    }
397:
398:                    int match(CharsetDetector det) { // scores det's input using the frequency table of this class
399:                        return match(det, commonChars);
400:                    }
401:
402:                    public String getLanguage() { // language code reported for this recognizer
403:                        return "ja";
404:                    }
405:                }
406:
407:                /**
408:                 * The charset recognizer for EUC-KR.  A singleton instance of this class
409:                 *    is created and kept by the public CharsetDetector class
410:                 */
411:                static class CharsetRecog_euc_kr extends CharsetRecog_euc {
412:                    static int[] commonChars =
413:                    // TODO:  This set of data comes from the character frequency-
414:                    //        of-occurrence analysis tool.  The data needs to be moved
415:                    //        into a resource and loaded from there.
416:                    { 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed,
417:                            0xb0f8, 0xb0fa, 0xb0fc, 0xb1b8, 0xb1b9, 0xb1c7,
418:                            0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf,
419:                            0xb4d9, 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7,
420:                            0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 0xb8a6,
421:                            0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab,
422:                            0xb9ae, 0xb9cc, 0xb9ce, 0xb9fd, 0xbab8, 0xbace,
423:                            0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad,
424:                            0xbcba, 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3,
425:                            0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 0xbef8,
426:                            0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb,
427:                            0xbfec, 0xbff8, 0xc0a7, 0xc0af, 0xc0b8, 0xc0ba,
428:                            0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf,
429:                            0xc0d6, 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4,
430:                            0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 0xc1f8,
431:                            0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2,
432:                            0xc7d8, 0xc7e5, 0xc8ad };
433:
434:                    String getName() { // charset name reported for this recognizer
435:                        return "EUC-KR";
436:                    }
437:
438:                    int match(CharsetDetector det) { // scores det's input using the frequency table of this class
439:                        return match(det, commonChars);
440:                    }
441:
442:                    public String getLanguage() { // language code reported for this recognizer
443:                        return "ko";
444:                    }
445:                }
446:            }
447:
448:            /**
449:             * 
450:             *   GB-18030 recognizer. Uses simplified Chinese statistics.   
451:             *   Characters are read as one, two or four bytes; see nextChar().
452:             */
453:            static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
454:
455:                /*
456:                 *  (non-Javadoc)
457:                 *  Get the next character value for the GB18030 encoding.
458:                 *  Character "value" is simply the raw bytes that make up the character
459:                 *     packed into an int.  Returns false once the input is exhausted.
460:                 */
461:                boolean nextChar(iteratedChar it, CharsetDetector det) {
462:                    it.index = it.nextIndex;
463:                    it.error = false;
464:                    int firstByte = 0;
465:                    int secondByte = 0;
466:                    int thirdByte = 0;
467:                    int fourthByte = 0;
468:
469:                    buildChar: {
470:                        firstByte = it.charValue = it.nextByte(det);
471:
472:                        if (firstByte < 0) {
473:                            // Ran off the end of the input data
474:                            it.done = true;
475:                            break buildChar;
476:                        }
477:
478:                        if (firstByte <= 0x80) {
479:                            // single byte char
480:                            break buildChar;
481:                        }
482:
483:                        secondByte = it.nextByte(det); // -1 at end of input; nextByte() then sets it.done, so false is returned below
484:                        it.charValue = (it.charValue << 8) | secondByte;
485:
486:                        if (firstByte >= 0x81 && firstByte <= 0xFE) {
487:                            // Two byte Char: the trail byte is 0x40..0x7E or 0x80..0xFE (0x7F is not legal).
488:                            if ((secondByte >= 0x40 && secondByte <= 0x7E)
489:                                    || (secondByte >= 0x80 && secondByte <= 0xFE)) { // was decimal 80, which let 0x7F through
490:                                break buildChar;
491:                            }
492:
493:                            // Four byte char: digit byte (0x30..0x39), lead-range byte (0x81..0xFE), digit byte.
494:                            if (secondByte >= 0x30 && secondByte <= 0x39) {
495:                                thirdByte = it.nextByte(det);
496:
497:                                if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
498:                                    fourthByte = it.nextByte(det);
499:
500:                                    if (fourthByte >= 0x30
501:                                            && fourthByte <= 0x39) {
502:                                        it.charValue = (it.charValue << 16)
503:                                                | (thirdByte << 8) | fourthByte; // top bit is set, so the packed int is negative
504:                                        break buildChar;
505:                                    }
506:                                }
507:                            }
508:
509:                            it.error = true; // neither a legal two-byte nor a legal four-byte sequence
510:                            break buildChar;
511:                        }
512:                    } // NOTE(review): a first byte of 0xFF matches no branch and comes back as a two-byte char without error
513:
514:                    return (it.done == false);
515:                }
516:
517:                static int[] commonChars =
518:                // TODO:  This set of data comes from the character frequency-
519:                //        of-occurrence analysis tool.  The data needs to be moved
520:                //        into a resource and loaded from there.
521:                { 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1,
522:                        0xa1f3, 0xa3a1, 0xa3ac, 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be,
523:                        0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 0xb5e3,
524:                        0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6,
525:                        0xb7dd, 0xb8b4, 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8,
526:                        0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 0xbbe1, 0xbbfa,
527:                        0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4,
528:                        0xbfc6, 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7,
529:                        0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 0xc7f8, 0xc8ab, 0xc8cb,
530:                        0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
531:                        0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa,
532:                        0xcec4, 0xced2, 0xcee5, 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2,
533:                        0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 0xd2b5,
534:                        0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da,
535:                        0xd5e2, 0xd6d0 };
536:
537:                String getName() { // charset name reported for this recognizer
538:                    return "GB18030";
539:                }
540:
541:                int match(CharsetDetector det) { // scores det's input using the frequency table of this class
542:                    return match(det, commonChars);
543:                }
544:
545:                public String getLanguage() { // language code reported for this recognizer
546:                    return "zh";
547:                }
548:            }
549:
550:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.