Source Code Cross Referenced for CharsetUTF8.java in » Internationalization-Localization » icu4j » com » ibm » icu » charset » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.charset
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /**
002:         *******************************************************************************
003:         * Copyright (C) 2006, International Business Machines Corporation and    *
004:         * others. All Rights Reserved.                                                *
005:         *******************************************************************************
006:         *
007:         *******************************************************************************
008:         */package com.ibm.icu.charset;
009:
010:        import java.nio.ByteBuffer;
011:        import java.nio.CharBuffer;
012:        import java.nio.IntBuffer;
013:        import java.nio.charset.CharsetDecoder;
014:        import java.nio.charset.CharsetEncoder;
015:        import java.nio.charset.CoderResult;
016:
017:        import com.ibm.icu.lang.UCharacter;
018:        import com.ibm.icu.text.UTF16;
019:
020:        /**
021:         * @author Niti Hantaweepant
022:         */
023:        class CharsetUTF8 extends CharsetICU {
024:
025:            protected byte[] fromUSubstitution = new byte[] { (byte) 0xef,
026:                    (byte) 0xbf, (byte) 0xbd };
027:
028:            public CharsetUTF8(String icuCanonicalName,
029:                    String javaCanonicalName, String[] aliases) {
030:                super (icuCanonicalName, javaCanonicalName, aliases);
031:                maxBytesPerChar = 4;
032:                minBytesPerChar = 1;
033:                maxCharsPerByte = 1;
034:            }
035:
036:            /* UTF-8 Conversion DATA
037:             *   for more information see Unicode Strandard 2.0 , Transformation Formats Appendix A-9
038:             */
039:            private static final long OFFSETS_FROM_UTF8[] = { 0, 0x00000000L,
040:                    0x00003080L, 0x000E2080L, 0x03C82080L, 0xFA082080L,
041:                    0x82082080L };
042:
043:            private static final byte BYTES_FROM_UTF8[] = { 1, 1, 1, 1, 1, 1,
044:                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
045:                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
046:                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
047:                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
048:                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
049:                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
050:                    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
051:                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
052:                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
053:                    0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
054:                    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
055:                    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
056:                    4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
057:
058:            /*
059:             * Starting with Unicode 3.0.1:
060:             * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
061:             * byte sequences with more than 4 bytes are illegal in UTF-8,
062:             * which is tested with impossible values for them
063:             */
064:            private static final long UTF8_MIN_CHAR32[] = { 0L, 0L, 0x80L,
065:                    0x800L, 0x10000L, 0xffffffffL, 0xffffffffL };
066:
067:            class CharsetDecoderUTF8 extends CharsetDecoderICU {
068:
069:                public CharsetDecoderUTF8(CharsetICU cs) {
070:                    super (cs);
071:                }
072:
073:                protected CoderResult decodeLoop(ByteBuffer source,
074:                        CharBuffer target, IntBuffer offsets, boolean flush) {
075:                    CoderResult cr = CoderResult.UNDERFLOW;
076:
077:                    int sourceArrayIndex = source.position();
078:
079:                    // Todo: CESU8 implementation
080:                    // boolean isCESU8 = args.converter.sharedData == _CESU8Data;
081:                    boolean isCESU8 = (UConverterSharedData._CESU8Data != null);
082:                    int ch, ch2 = 0;
083:                    int i, inBytes;
084:
085:                    donefornow: {
086:                        if (toUnicodeStatus != 0 && target.hasRemaining()) {
087:                            inBytes = mode; /* restore # of bytes to consume */
088:                            i = toULength; /* restore # of bytes consumed */
089:
090:                            ch = toUnicodeStatus; /*Stores the previously calculated ch from a previous call*/
091:                            toUnicodeStatus = 0;
092:
093:                            while (i < inBytes) {
094:                                if (sourceArrayIndex < source.limit()) {
095:                                    toUBytesArray[i] = (byte) (ch2 = source
096:                                            .get(sourceArrayIndex)
097:                                            & UConverterConstants.UNSIGNED_BYTE_MASK);
098:                                    if (!isTrail((byte) ch2)) {
099:                                        break; /* i < inBytes */
100:                                    }
101:                                    ch = (ch << 6) + ch2;
102:                                    ++sourceArrayIndex;
103:                                    i++;
104:                                } else {
105:                                    /* stores a partially calculated target*/
106:                                    toUnicodeStatus = ch;
107:                                    mode = inBytes;
108:                                    toULength = (byte) i;
109:                                    break donefornow;
110:                                }
111:                            }
112:
113:                            /* Remove the accumulated high bits */
114:                            ch -= OFFSETS_FROM_UTF8[inBytes];
115:
116:                            /*
117:                             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
118:                             * - use only trail bytes after a lead byte (checked above)
119:                             * - use the right number of trail bytes for a given lead byte
120:                             * - encode a code point <= U+10ffff
121:                             * - use the fewest possible number of bytes for their code points
122:                             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
123:                             *
124:                             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
125:                             * There are no irregular sequences any more.
126:                             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
127:                             */
128:                            if (i == inBytes
129:                                    && ch <= UConverterSharedData.MAXIMUM_UTF
130:                                    && ch >= UTF8_MIN_CHAR32[i]
131:                                    && (isCESU8 ? i <= 3 : !UTF16
132:                                            .isSurrogate((char) ch))) {
133:                                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
134:                                toULength = 0;
135:                                if (ch <= UConverterSharedData.MAXIMUM_UCS2) {
136:                                    /* fits in 16 bits */
137:                                    target.put((char) ch);
138:                                } else {
139:                                    /* write out the surrogates */
140:                                    ch -= UConverterSharedData.HALF_BASE;
141:                                    target
142:                                            .put((char) ((ch >> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START));
143:                                    ch = (ch & UConverterSharedData.HALF_MASK)
144:                                            + UConverterSharedData.SURROGATE_LOW_START;
145:                                    if (target.hasRemaining()) {
146:                                        target.put((char) ch);
147:
148:                                    } else /* targetCapacity==1 */{
149:                                        charErrorBufferArray[charErrorBufferBegin + 0] = (char) ch;
150:                                        charErrorBufferLength = 1;
151:                                        cr = CoderResult.OVERFLOW;
152:
153:                                    }
154:                                }
155:                            } else {
156:                                toULength = (byte) i;
157:                                cr = CoderResult
158:                                        .malformedForLength(sourceArrayIndex);
159:                                break donefornow;
160:                            }
161:                        }
162:
163:                        while (sourceArrayIndex < source.limit()
164:                                && target.hasRemaining()) {
165:                            ch = source.get(sourceArrayIndex++)
166:                                    & UConverterConstants.UNSIGNED_BYTE_MASK;
167:                            if (ch < 0x80) /* Simple case */
168:                            {
169:                                target.put((char) ch);
170:                            } else {
171:                                /* store the first char */
172:                                toUBytesArray[0] = (byte) ch;
173:                                inBytes = BYTES_FROM_UTF8[(int) ch]; /* lookup current sequence length */
174:                                i = 1;
175:
176:                                while (i < inBytes) {
177:                                    if (sourceArrayIndex < source.limit()) {
178:                                        toUBytesArray[i] = (byte) (ch2 = source
179:                                                .get(sourceArrayIndex)
180:                                                & UConverterConstants.UNSIGNED_BYTE_MASK);
181:                                        if (!isTrail((byte) ch2)) {
182:                                            break; /* i < inBytes */
183:                                        }
184:                                        ch = (ch << 6) + ch2;
185:                                        ++sourceArrayIndex;
186:                                        i++;
187:                                    } else {
188:                                        /* stores a partially calculated target*/
189:                                        toUnicodeStatus = ch;
190:                                        mode = inBytes;
191:                                        toULength = (byte) i;
192:                                        break donefornow;
193:                                    }
194:                                }
195:
196:                                /* Remove the accumulated high bits */
197:                                ch -= OFFSETS_FROM_UTF8[inBytes];
198:
199:                                /*
200:                                 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
201:                                 * - use only trail bytes after a lead byte (checked above)
202:                                 * - use the right number of trail bytes for a given lead byte
203:                                 * - encode a code point <= U+10ffff
204:                                 * - use the fewest possible number of bytes for their code points
205:                                 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
206:                                 *
207:                                 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
208:                                 * There are no irregular sequences any more.
209:                                 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
210:                                 */
211:                                if (i == inBytes
212:                                        && ch <= UConverterSharedData.MAXIMUM_UTF
213:                                        && ch >= UTF8_MIN_CHAR32[i]
214:                                        && (isCESU8 ? i <= 3 : !UTF16
215:                                                .isSurrogate((char) ch))) {
216:                                    /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
217:                                    toULength = 0;
218:                                    if (ch <= UConverterSharedData.MAXIMUM_UCS2) {
219:                                        /* fits in 16 bits */
220:                                        target.put((char) ch);
221:                                    } else {
222:                                        /* write out the surrogates */
223:                                        ch -= UConverterSharedData.HALF_BASE;
224:                                        target
225:                                                .put((char) ((ch >>> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START));
226:                                        ch = (ch & UConverterSharedData.HALF_MASK)
227:                                                + UConverterSharedData.SURROGATE_LOW_START;
228:                                        if (target.hasRemaining()) {
229:                                            target.put((char) ch);
230:                                        } else {
231:                                            /* Put in overflow buffer (not handled here) */
232:                                            charErrorBufferArray[charErrorBufferBegin + 0] = (char) ch;
233:                                            charErrorBufferLength = 1;
234:                                            cr = CoderResult.OVERFLOW;
235:                                            break;
236:                                        }
237:                                    }
238:                                } else {
239:                                    toULength = (byte) i;
240:                                    cr = CoderResult
241:                                            .malformedForLength(sourceArrayIndex);
242:                                    break;
243:                                }
244:                            }
245:                        }
246:                    }
247:
248:                    if (sourceArrayIndex < source.limit()
249:                            && !target.hasRemaining()) {
250:                        /* End of target buffer */
251:                        cr = CoderResult.OVERFLOW;
252:                    }
253:
254:                    source.position(sourceArrayIndex);
255:
256:                    return cr;
257:                }
258:
259:            }
260:
261:            class CharsetEncoderUTF8 extends CharsetEncoderICU {
262:
263:                public CharsetEncoderUTF8(CharsetICU cs) {
264:                    super (cs, fromUSubstitution);
265:                    implReset();
266:                }
267:
268:                protected void implReset() {
269:                    super .implReset();
270:                }
271:
272:                protected CoderResult encodeLoop(CharBuffer source,
273:                        ByteBuffer target, IntBuffer offsets, boolean flush) {
274:                    CoderResult cr = CoderResult.UNDERFLOW;
275:
276:                    int sourceArrayIndex = source.position();
277:
278:                    // Todo: CESU8 implementation
279:                    // boolean isCESU8 = args.converter.sharedData == _CESU8Data;
280:                    boolean isCESU8 = (UConverterSharedData._CESU8Data != null);
281:
282:                    int ch;
283:                    short indexToWrite;
284:                    byte temp[] = new byte[4];
285:                    boolean doloop = true;
286:
287:                    if (fromUChar32 != 0 && target.hasRemaining()) {
288:                        ch = fromUChar32;
289:                        fromUChar32 = 0;
290:
291:                        if (sourceArrayIndex < source.limit()) {
292:                            /* test the following code unit */
293:                            char trail = source.get(sourceArrayIndex);
294:                            if (UTF16.isTrailSurrogate(trail)) {
295:                                ++sourceArrayIndex;
296:                                ch = UCharacter.getCodePoint((char) ch, trail);
297:                                /* convert this supplementary code point */
298:                                /* exit this condition tree */
299:                            } else {
300:                                /* this is an unmatched lead code unit (1st surrogate) */
301:                                /* callback(illegal) */
302:                                fromUChar32 = (int) ch;
303:                                cr = CoderResult
304:                                        .malformedForLength(sourceArrayIndex);
305:                                doloop = false;
306:                            }
307:                        } else {
308:                            /* no more input */
309:                            fromUChar32 = (int) ch;
310:                            doloop = false;
311:                        }
312:
313:                        if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
314:                            indexToWrite = 2;
315:                            temp[2] = (byte) ((ch >>> 12) | 0xe0);
316:                        } else {
317:                            indexToWrite = 3;
318:                            temp[3] = (byte) ((ch >>> 18) | 0xf0);
319:                            temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80);
320:                        }
321:                        temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80);
322:                        temp[0] = (byte) ((ch & 0x3f) | 0x80);
323:
324:                        for (; indexToWrite >= 0; indexToWrite--) {
325:                            if (target.hasRemaining()) {
326:                                target.put(temp[indexToWrite]);
327:                            } else {
328:                                errorBuffer[errorBufferLength++] = temp[indexToWrite];
329:                                cr = CoderResult.OVERFLOW;
330:                            }
331:                        }
332:                    }
333:
334:                    if (doloop) {
335:                        while (sourceArrayIndex < source.limit()
336:                                && target.hasRemaining()) {
337:                            ch = source.get(sourceArrayIndex++);
338:                            if (ch < 0x80) { /* Single byte */
339:                                target.put((byte) ch);
340:                            } else if (ch < 0x800) { /* Double byte */
341:                                target.put((byte) ((ch >>> 6) | 0xc0));
342:                                if (target.hasRemaining()) {
343:                                    target.put((byte) ((ch & 0x3f) | 0x80));
344:                                } else {
345:                                    errorBuffer[0] = (byte) ((ch & 0x3f) | 0x80);
346:                                    errorBufferLength = 1;
347:                                    cr = CoderResult.OVERFLOW;
348:                                    break;
349:                                }
350:                            } else { /* Check for surrogates */
351:                                if (UTF16.isSurrogate((char) ch) && !isCESU8) {
352:                                    if (UTF16.isLeadSurrogate((char) ch)) {
353:
354:                                        if (sourceArrayIndex < source.limit()) {
355:                                            /* test the following code unit */
356:                                            char trail = source
357:                                                    .get(sourceArrayIndex);
358:                                            if (UTF16.isTrailSurrogate(trail)) {
359:                                                ++sourceArrayIndex;
360:                                                ch = UCharacter.getCodePoint(
361:                                                        (char) ch, trail);
362:                                                //ch2 = 0;
363:                                                /* convert this supplementary code point */
364:                                                /* exit this condition tree */
365:                                            } else {
366:                                                /* this is an unmatched lead code unit (1st surrogate) */
367:                                                /* callback(illegal) */
368:                                                fromUChar32 = ch;
369:                                                cr = CoderResult
370:                                                        .malformedForLength(sourceArrayIndex);
371:                                                break;
372:                                            }
373:                                        } else {
374:                                            /* no more input */
375:                                            fromUChar32 = ch;
376:                                            break;
377:                                        }
378:                                    } else {
379:                                        fromUChar32 = (int) ch;
380:                                        cr = CoderResult
381:                                                .malformedForLength(sourceArrayIndex);
382:                                        break;
383:                                    }
384:                                }
385:
386:                                if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
387:                                    indexToWrite = 2;
388:                                    temp[2] = (byte) ((ch >>> 12) | 0xe0);
389:                                } else {
390:                                    indexToWrite = 3;
391:                                    temp[3] = (byte) ((ch >>> 18) | 0xf0);
392:                                    temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80);
393:                                }
394:                                temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80);
395:                                temp[0] = (byte) ((ch & 0x3f) | 0x80);
396:
397:                                for (; indexToWrite >= 0; indexToWrite--) {
398:                                    if (target.hasRemaining()) {
399:                                        target.put(temp[indexToWrite]);
400:                                    } else {
401:                                        errorBuffer[errorBufferLength++] = temp[indexToWrite];
402:                                        cr = CoderResult.OVERFLOW;
403:                                    }
404:                                }
405:                            }
406:                        }
407:                    }
408:
409:                    if (sourceArrayIndex < source.limit()
410:                            && !target.hasRemaining()) {
411:                        cr = CoderResult.OVERFLOW;
412:                    }
413:
414:                    source.position(sourceArrayIndex);
415:
416:                    return cr;
417:                }
418:            }
419:
420:            /* single-code point definitions -------------------------------------------- */
421:
422:            /*
423:             * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
424:             * @param c 8-bit code unit (byte)
425:             * @return TRUE or FALSE
426:             * @draft ICU 3.6
427:             */
428:            //static final boolean isSingle(byte c) {return (((c)&0x80)==0);}
429:            /*
430:             * Is this code unit (byte) a UTF-8 lead byte?
431:             * @param c 8-bit code unit (byte)
432:             * @return TRUE or FALSE
433:             * @draft ICU 3.6
434:             */
435:            //static final boolean isLead(byte c) {return ((((c)-0xc0) & UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);}
436:            /**
437:             * Is this code unit (byte) a UTF-8 trail byte?
438:             * @param c 8-bit code unit (byte)
439:             * @return TRUE or FALSE
440:             * @draft ICU 3.6
441:             */
442:            static final boolean isTrail(byte c) {
443:                return (((c) & 0xc0) == 0x80);
444:            }
445:
446:            /*
447:             * How many code units (bytes) are used for the UTF-8 encoding
448:             * of this Unicode code point?
449:             * @param c 32-bit code point
450:             * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
451:             * @draft ICU 3.6
452:             */
453:            /*static final int length(int c)
454:            {
455:            	long uc = c & UConverterConstants.UNSIGNED_INT_MASK;
456:            	return
457:                (uc<=0x7f ? 1 : 
458:                    (uc<=0x7ff ? 2 : 
459:                        (uc<=0xd7ff ? 3 : 
460:                            (uc<=0xdfff || uc>0x10ffff ? 0 : 
461:                                (uc<=0xffff ? 3 : 4)
462:                            ) 
463:                        ) 
464:                    ) 
465:                );
466:            }*/
467:
468:            public CharsetDecoder newDecoder() {
469:                return new CharsetDecoderUTF8(this );
470:            }
471:
472:            public CharsetEncoder newEncoder() {
473:                return new CharsetEncoderUTF8(this);
474:            }
475:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.