001: /*
002: * Java HTML Tidy - JTidy
003: * HTML parser and pretty printer
004: *
005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
006: * Institute of Technology, Institut National de Recherche en
007: * Informatique et en Automatique, Keio University). All Rights
008: * Reserved.
009: *
010: * Contributing Author(s):
011: *
012: * Dave Raggett <dsr@w3.org>
013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
014: * Gary L Peskin <garyp@firstech.com> (Java development)
015: * Sami Lempinen <sami@lempinen.net> (release management)
016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
017: *
018: * The contributing author(s) would like to thank all those who
019: * helped with testing, bug fixes, and patience. This wouldn't
020: * have been possible without all of you.
021: *
022: * COPYRIGHT NOTICE:
023: *
024: * This software and documentation is provided "as is," and
025: * the copyright holders and contributing author(s) make no
026: * representations or warranties, express or implied, including
027: * but not limited to, warranties of merchantability or fitness
028: * for any particular purpose or that the use of the software or
029: * documentation will not infringe any third party patents,
030: * copyrights, trademarks or other rights.
031: *
032: * The copyright holders and contributing author(s) will not be
033: * liable for any direct, indirect, special or consequential damages
034: * arising out of any use of the software or documentation, even if
035: * advised of the possibility of such damage.
036: *
037: * Permission is hereby granted to use, copy, modify, and distribute
038: * this source code, or portions hereof, documentation and executables,
039: * for any purpose, without fee, subject to the following restrictions:
040: *
041: * 1. The origin of this source code must not be misrepresented.
042: * 2. Altered versions must be plainly marked as such and must
043: * not be misrepresented as being the original source.
044: * 3. This Copyright notice may not be removed or altered from any
045: * source or altered source distribution.
046: *
047: * The copyright holders and contributing author(s) specifically
048: * permit, without fee, and encourage the use of this source code
049: * as a component for supporting the Hypertext Markup Language in
050: * commercial products. If you use this source code in a product,
051: * acknowledgment is not required but would be appreciated.
052: *
053: */
054: package org.w3c.tidy;
055:
056: /**
057: * @author Fabrizio Giustina
058: * @version $Revision: 1.7 $ ($Author: fgiust $)
059: */
060: public final class EncodingUtils {
061:
062: /**
063: * the big-endian (default) UNICODE BOM.
064: */
065: public static final int UNICODE_BOM_BE = 0xFEFF;
066:
067: /**
068: * the default (big-endian) UNICODE BOM.
069: */
070: public static final int UNICODE_BOM = UNICODE_BOM_BE;
071:
072: /**
073: * the little-endian UNICODE BOM.
074: */
075: public static final int UNICODE_BOM_LE = 0xFFFE;
076:
077: /**
078: * the UTF-8 UNICODE BOM.
079: */
080: public static final int UNICODE_BOM_UTF8 = 0xEFBBBF;
081:
082: /**
083: * states for ISO 2022 A document in ISO-2022 based encoding uses some ESC sequences called "designator" to switch
084: * character sets. The designators defined and used in ISO-2022-JP are: "ESC" + "(" + ? for ISO646 variants "ESC" +
085: * "$" + ? and "ESC" + "$" + "(" + ? for multibyte character sets. State ASCII.
086: */
087: public static final int FSM_ASCII = 0;
088:
089: /**
090: * state ESC.
091: */
092: public static final int FSM_ESC = 1;
093:
094: /**
095: * state ESCD.
096: */
097: public static final int FSM_ESCD = 2;
098:
099: /**
100: * state ESCDP.
101: */
102: public static final int FSM_ESCDP = 3;
103:
104: /**
105: * state ESCP.
106: */
107: public static final int FSM_ESCP = 4;
108:
109: /**
110: * state NONASCII.
111: */
112: public static final int FSM_NONASCII = 5;
113:
114: /**
115: * Max UTF-88 valid char value.
116: */
117: public static final int MAX_UTF8_FROM_UCS4 = 0x10FFFF;
118:
119: /**
120: * Max UTF-16 value.
121: */
122: public static final int MAX_UTF16_FROM_UCS4 = 0x10FFFF;
123:
124: /**
125: * utf16 low surrogate.
126: */
127: public static final int LOW_UTF16_SURROGATE = 0xD800;
128:
129: /**
130: * UTF-16 surrogates begin.
131: */
132: public static final int UTF16_SURROGATES_BEGIN = 0x10000;
133:
134: /**
135: * UTF-16 surrogate pair areas: low surrogates begin.
136: */
137: public static final int UTF16_LOW_SURROGATE_BEGIN = 0xD800;
138:
139: /**
140: * UTF-16 surrogate pair areas: low surrogates end.
141: */
142: public static final int UTF16_LOW_SURROGATE_END = 0xDBFF;
143:
144: /**
145: * UTF-16 surrogate pair areas: high surrogates begin.
146: */
147: public static final int UTF16_HIGH_SURROGATE_BEGIN = 0xDC00;
148:
149: /**
150: * UTF-16 surrogate pair areas: high surrogates end.
151: */
152: public static final int UTF16_HIGH_SURROGATE_END = 0xDFFF;
153:
154: /**
155: * UTF-16 high surrogate.
156: */
157: public static final int HIGH_UTF16_SURROGATE = 0xDFFF;
158:
159: /**
160: * UTF-8 bye swap: invalid char.
161: */
162: private static final int UTF8_BYTE_SWAP_NOT_A_CHAR = 0xFFFE;
163:
164: /**
165: * UTF-8 invalid char.
166: */
167: private static final int UTF8_NOT_A_CHAR = 0xFFFF;
168:
169: /**
170: * Mapping for Windows Western character set (128-159) to Unicode.
171: */
172: private static final int[] WIN2UNICODE = { 0x20AC, 0x0000, 0x201A,
173: 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030,
174: 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000, 0x0000,
175: 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
176: 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E,
177: 0x0178 };
178:
179: /**
180: * John Love-Jensen contributed this table for mapping MacRoman character set to Unicode.
181: */
182: private static final int[] MAC2UNICODE = { // modified to only need chars 128-255/U+0080-U+00FF Terry T 19 Aug 01
183: // x7F = DEL
184: 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC,
185: 0x00E1, 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7,
186: 0x00E9, 0x00E8, 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE,
187: 0x00EF, 0x00F1, 0x00F3, 0x00F2, 0x00F4, 0x00F6, 0x00F5,
188: 0x00FA, 0x00F9, 0x00FB, 0x00FC, 0x2020, 0x00B0, 0x00A2,
189: 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, 0x00AE, 0x00A9,
190: 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6,
191: 0x00D8,
192: 0x221E,
193: 0x00B1,
194: 0x2264,
195: 0x2265,
196: 0x00A5,
197: 0x00B5,
198: 0x2202,
199: 0x2211,
200: // =BD U+2126 OHM SIGN
201: 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6,
202: 0x00F8, 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248,
203: 0x2206, 0x00AB, 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3,
204: 0x00D5, 0x0152, 0x0153, 0x2013, 0x2014, 0x201C,
205: 0x201D,
206: 0x2018,
207: 0x2019,
208: 0x00F7,
209: 0x25CA,
210: // =DB U+00A4 CURRENCY SIGN
211: 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01,
212: 0xFB02, 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2,
213: 0x00CA, 0x00C1, 0x00CB, 0x00C8, 0x00CD, 0x00CE,
214: 0x00CF,
215: 0x00CC,
216: 0x00D3,
217: 0x00D4,
218: // xF0 = Apple Logo
219: // =F0 U+2665 BLACK HEART SUIT
220: 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6,
221: 0x02DC, 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD,
222: 0x02DB, 0x02C7 };
223:
224: /**
225: * table to map symbol font characters to Unicode; undefined characters are mapped to 0x0000 and characters without
226: * any unicode equivalent are mapped to '?'. Is this appropriate?
227: */
228: private static final int[] SYMBOL2UNICODE = { 0x0000, 0x0001,
229: 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
230: 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
231:
232: 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016,
233: 0x0017, 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D,
234: 0x001E, 0x001F,
235:
236: 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026,
237: 0x220D, 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212,
238: 0x002E, 0x002F,
239:
240: 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036,
241: 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D,
242: 0x003E, 0x003F,
243:
244: 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6,
245: 0x0393, 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C,
246: 0x039D, 0x039F,
247:
248: 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2,
249: 0x03A9, 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D,
250: 0x22A5, 0x005F,
251:
252: 0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6,
253: 0x03B3, 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC,
254: 0x03BD, 0x03BF,
255:
256: 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6,
257: 0x03C9, 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D,
258: 0x223C, 0x003F,
259:
260: 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
261: 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
262: 0x0000, 0x0000,
263:
264: 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
265: 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
266: 0x0000, 0x0000,
267:
268: 0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192,
269: 0x2663, 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191,
270: 0x2192, 0x2193,
271:
272: 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202,
273: 0x00B7, 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F,
274: 0x003F, 0x21B5,
275:
276: 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205,
277: 0x2229, 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286,
278: 0x2208, 0x2209,
279:
280: 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A,
281: 0x22C5, 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1,
282: 0x21D2, 0x21D3,
283:
284: 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F,
285: 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
286: 0x003F, 0x003F,
287:
288: 0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F,
289: 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
290: 0x003F, 0x003F };
291:
292: /**
293: * Array of valid UTF8 sequences.
294: */
295: private static final ValidUTF8Sequence[] VALID_UTF8 = {
296: new ValidUTF8Sequence(0x0000, 0x007F, 1, new char[] { 0x00,
297: 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }),
298: new ValidUTF8Sequence(0x0080, 0x07FF, 2, new char[] { 0xC2,
299: 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00 }),
300: new ValidUTF8Sequence(0x0800, 0x0FFF, 3, new char[] { 0xE0,
301: 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00 }),
302: new ValidUTF8Sequence(0x1000, 0xFFFF, 3, new char[] { 0xE1,
303: 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00 }),
304: new ValidUTF8Sequence(0x10000, 0x3FFFF, 4, new char[] {
305: 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF }),
306: new ValidUTF8Sequence(0x40000, 0xFFFFF, 4, new char[] {
307: 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF }),
308: new ValidUTF8Sequence(0x100000, 0x10FFFF, 4, new char[] {
309: 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF }) };
310:
311: /**
312: * number of valid utf8 sequances.
313: */
314: private static final int NUM_UTF8_SEQUENCES = VALID_UTF8.length;
315:
316: /**
317: * Offset for utf8 sequences.
318: */
319: private static final int[] OFFSET_UTF8_SEQUENCES = { 0, // 1 byte
320: 1, // 2 bytes
321: 2, // 3 bytes
322: 4, // 4 bytes
323: NUM_UTF8_SEQUENCES }; // must be last
324:
325: /**
326: * don't instantiate.
327: */
328: private EncodingUtils() {
329: // unused
330: }
331:
332: /**
333: * Function for conversion from Windows-1252 to Unicode.
334: * @param c char to decode
335: * @return decoded char
336: */
337: protected static int decodeWin1252(int c) {
338: return WIN2UNICODE[c - 128];
339: }
340:
341: /**
342: * Function to convert from MacRoman to Unicode.
343: * @param c char to decode
344: * @return decoded char
345: */
346: protected static int decodeMacRoman(int c) {
347: if (127 < c) {
348: c = MAC2UNICODE[c - 128];
349: }
350: return c;
351: }
352:
353: /**
354: * Function to convert from Symbol Font chars to Unicode.
355: * @param c char to decode
356: * @return decoded char
357: */
358: static int decodeSymbolFont(int c) {
359: if (c > 255) {
360: return c;
361: }
362:
363: return SYMBOL2UNICODE[c];
364: }
365:
366: /**
367: * Decodes an array of bytes to a char.
368: * @param c will contain the decoded char
369: * @param firstByte first input byte
370: * @param successorBytes array containing successor bytes (can be null if a getter is provided).
371: * @param getter callback used to get new bytes if successorBytes doesn't contain enough bytes
372: * @param count will contain the number of bytes read
373: * @param startInSuccessorBytesArray starting offset for bytes in successorBytes
374: * @return <code>true</code> if error
375: */
376: static boolean decodeUTF8BytesToChar(int[] c, int firstByte,
377: byte[] successorBytes, GetBytes getter, int[] count,
378: int startInSuccessorBytesArray) {
379: byte[] buf = new byte[10];
380:
381: int ch = 0;
382: int n = 0;
383: int i, bytes = 0;
384: boolean hasError = false;
385:
386: if (successorBytes.length != 0) {
387: buf = successorBytes;
388: }
389:
390: // special check if we have been passed an EOF char
391: if (firstByte == StreamIn.END_OF_STREAM) //uint
392: {
393: // at present
394: c[0] = firstByte;
395: count[0] = 1;
396: return false;
397: }
398:
399: ch = TidyUtils.toUnsigned(firstByte); // first byte is passed in separately
400:
401: if (ch <= 0x7F) // 0XXX XXXX one byte
402: {
403: n = ch;
404: bytes = 1;
405: } else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
406: {
407: n = ch & 31;
408: bytes = 2;
409: } else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
410: {
411: n = ch & 15;
412: bytes = 3;
413: } else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
414: {
415: n = ch & 7;
416: bytes = 4;
417: } else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
418: {
419: n = ch & 3;
420: bytes = 5;
421: hasError = true;
422: } else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
423: {
424: n = ch & 1;
425: bytes = 6;
426: hasError = true;
427: } else {
428: // not a valid first byte of a UTF-8 sequence
429: n = ch;
430: bytes = 1;
431: hasError = true;
432: }
433:
434: for (i = 1; i < bytes; ++i) {
435: int[] tempCount = new int[1]; // no. of additional bytes to get
436:
437: // successor bytes should have the form 10XX XXXX
438: if (getter != null && (bytes - i > 0)) {
439: tempCount[0] = 1; // to simplify things, get 1 byte at a time
440: int[] buftocopy = new int[] { buf[startInSuccessorBytesArray
441: + i - 1] };
442:
443: getter.doGet(buftocopy, tempCount, false);
444: //readRawBytesFromStream(buftocopy, tempCount, false);
445: if (tempCount[0] <= 0) // EOF
446: {
447: hasError = true;
448: bytes = i;
449: break;
450: }
451: }
452:
453: if ((buf[startInSuccessorBytesArray + i - 1] & 0xC0) != 0x80) {
454: // illegal successor byte value
455: hasError = true;
456: bytes = i;
457: if (getter != null) {
458: int[] buftocopy = new int[] { buf[startInSuccessorBytesArray
459: + i - 1] };
460: tempCount[0] = 1; // to simplify things, unget 1 byte at a time
461: getter.doGet(buftocopy, tempCount, true);
462: }
463: break;
464: }
465:
466: n = (n << 6)
467: | (buf[startInSuccessorBytesArray + i - 1] & 0x3F);
468: }
469:
470: if (!hasError
471: && ((n == UTF8_BYTE_SWAP_NOT_A_CHAR) || (n == UTF8_NOT_A_CHAR))) {
472: hasError = true;
473: }
474:
475: if (!hasError && (n > MAX_UTF8_FROM_UCS4)) {
476: hasError = true;
477: }
478:
479: if (!hasError && (n >= UTF16_LOW_SURROGATE_BEGIN)
480: && (n <= UTF16_HIGH_SURROGATE_END)) {
481: // unpaired surrogates not allowed
482: hasError = true;
483: }
484:
485: if (!hasError) {
486: int lo = OFFSET_UTF8_SEQUENCES[bytes - 1];
487: int hi = OFFSET_UTF8_SEQUENCES[bytes] - 1;
488:
489: // check for overlong sequences
490: if ((n < VALID_UTF8[lo].lowChar)
491: || (n > VALID_UTF8[hi].highChar)) {
492: hasError = true;
493: } else {
494: hasError = true; // assume error until proven otherwise
495:
496: for (i = lo; i <= hi; i++) {
497: int tempCount;
498: char theByte; //unsigned
499:
500: for (tempCount = 0; tempCount < bytes; tempCount++) {
501: if (!TidyUtils.toBoolean(tempCount)) {
502: theByte = (char) firstByte;
503: } else {
504: theByte = (char) buf[startInSuccessorBytesArray
505: + tempCount - 1];
506: }
507: if ((theByte >= VALID_UTF8[i].validBytes[(tempCount * 2)])
508: && (theByte <= VALID_UTF8[i].validBytes[(tempCount * 2) + 1])) {
509: hasError = false;
510: }
511: if (hasError) {
512: break;
513: }
514: }
515: }
516: }
517: }
518:
519: count[0] = bytes;
520:
521: c[0] = n;
522:
523: // n = 0xFFFD;
524: // replacement char - do this in the caller
525: return hasError;
526:
527: }
528:
529: /**
530: * Encode a char to an array of bytes.
531: * @param c char to encode
532: * @param encodebuf will contain the decoded bytes
533: * @param putter if not null it will be called to write bytes to out
534: * @param count number of bytes written
535: * @return <code>false</code>= ok, <code>true</code>= error
536: */
537: static boolean encodeCharToUTF8Bytes(int c, byte[] encodebuf,
538: PutBytes putter, int[] count) {
539: int bytes = 0;
540:
541: byte[] buf = new byte[10];
542:
543: if (encodebuf != null) {
544: buf = encodebuf;
545: }
546:
547: boolean hasError = false;
548:
549: if (c <= 0x7F) // 0XXX XXXX one byte
550: {
551: buf[0] = (byte) c;
552: bytes = 1;
553: } else if (c <= 0x7FF) // 110X XXXX two bytes
554: {
555: buf[0] = (byte) (0xC0 | (c >> 6));
556: buf[1] = (byte) (0x80 | (c & 0x3F));
557: bytes = 2;
558: } else if (c <= 0xFFFF) // 1110 XXXX three bytes
559: {
560: buf[0] = (byte) (0xE0 | (c >> 12));
561: buf[1] = (byte) (0x80 | ((c >> 6) & 0x3F));
562: buf[2] = (byte) (0x80 | (c & 0x3F));
563: bytes = 3;
564: if ((c == UTF8_BYTE_SWAP_NOT_A_CHAR)
565: || (c == UTF8_NOT_A_CHAR)) {
566: hasError = true;
567: } else if ((c >= UTF16_LOW_SURROGATE_BEGIN)
568: && (c <= UTF16_HIGH_SURROGATE_END)) {
569: // unpaired surrogates not allowed
570: hasError = true;
571: }
572: } else if (c <= 0x1FFFFF) // 1111 0XXX four bytes
573: {
574: buf[0] = (byte) (0xF0 | (c >> 18));
575: buf[1] = (byte) (0x80 | ((c >> 12) & 0x3F));
576: buf[2] = (byte) (0x80 | ((c >> 6) & 0x3F));
577: buf[3] = (byte) (0x80 | (c & 0x3F));
578: bytes = 4;
579: if (c > MAX_UTF8_FROM_UCS4) {
580: hasError = true;
581: }
582: } else if (c <= 0x3FFFFFF) // 1111 10XX five bytes
583: {
584: buf[0] = (byte) (0xF8 | (c >> 24));
585: buf[1] = (byte) (0x80 | (c >> 18));
586: buf[2] = (byte) (0x80 | ((c >> 12) & 0x3F));
587: buf[3] = (byte) (0x80 | ((c >> 6) & 0x3F));
588: buf[4] = (byte) (0x80 | (c & 0x3F));
589: bytes = 5;
590: hasError = true;
591: } else if (c <= 0x7FFFFFFF) // 1111 110X six bytes
592: {
593: buf[0] = (byte) (0xFC | (c >> 30));
594: buf[1] = (byte) (0x80 | ((c >> 24) & 0x3F));
595: buf[2] = (byte) (0x80 | ((c >> 18) & 0x3F));
596: buf[3] = (byte) (0x80 | ((c >> 12) & 0x3F));
597: buf[4] = (byte) (0x80 | ((c >> 6) & 0x3F));
598: buf[5] = (byte) (0x80 | (c & 0x3F));
599: bytes = 6;
600: hasError = true;
601: } else {
602: hasError = true;
603: }
604:
605: if (!hasError && putter != null) // don't output invalid UTF-8 byte sequence to a stream
606: {
607: int[] tempCount = new int[] { bytes };
608: putter.doPut(buf, tempCount);
609:
610: if (tempCount[0] < bytes) {
611: hasError = true;
612: }
613: }
614:
615: count[0] = bytes;
616: return hasError;
617: }
618:
619: /**
620: * Getter callback: called to retrieve 1 or more additional UTF-8 bytes. The Getter callback can also unget if
621: * necessary to re-synchronize the input stream.
622: */
623: static interface GetBytes {
624:
625: /**
626: * Get one or more byte.
627: * @param buf will contain the bytes.
628: * @param count number of bytes actually stored in "buf". <= 0 if error or EOF
629: * @param unget unget bytes?
630: */
631: void doGet(int[] buf, int[] count, boolean unget);
632: }
633:
634: /**
635: * Putter callbacks: called to store 1 or more additional UTF-8 bytes.
636: */
637: static interface PutBytes {
638:
639: /**
640: * Store one or more byte.
641: * @param buf will contain the bytes.
642: * @param count number of bytes actually stored in "buf". <= 0 if error or EOF
643: */
644: void doPut(byte[] buf, int[] count);
645: }
646: }
|