001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007:
008: package com.ibm.icu.text;
009:
010: /**
011: * A decompression engine implementing the Standard Compression Scheme
012: * for Unicode (SCSU) as outlined in <A
013: * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
014: * Report #6</A>.
015: *
016: * <P><STRONG>USAGE</STRONG></P>
017: *
018: * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a
019: * straightforward manner to decompress simple strings:</P>
020: *
021: * <PRE>
022: * byte [] compressed = ... ; // get compressed bytes from somewhere
023: * String result = UnicodeDecompressor.decompress(compressed);
024: * </PRE>
025: *
026: * <P>The static methods have a fairly large memory footprint.
027: * For finer-grained control over memory usage,
028: * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing
029: * iterative decompression:</P>
030: *
031: * <PRE>
032: * // Decompress an array "bytes" of length "len" using a buffer of 512 chars
033: * // to the Writer "out"
034: *
035: * UnicodeDecompressor myDecompressor = new UnicodeDecompressor();
036: * final static int BUFSIZE = 512;
037: * char [] charBuffer = new char [ BUFSIZE ];
038: * int charsWritten = 0;
039: * int [] bytesRead = new int [1];
040: * int totalBytesDecompressed = 0;
041: * int totalCharsWritten = 0;
042: *
043: * do {
044: * // do the decompression
045: * charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed,
046: * len, bytesRead,
047: * charBuffer, 0, BUFSIZE);
048: *
049: * // do something with the current set of chars
050: * out.write(charBuffer, 0, charsWritten);
051: *
052: * // update the no. of bytes decompressed
053: * totalBytesDecompressed += bytesRead[0];
054: *
055: * // update the no. of chars written
056: * totalCharsWritten += charsWritten;
057: *
058: * } while(totalBytesDecompressed < len);
059: *
060: * myDecompressor.reset(); // reuse decompressor
061: * </PRE>
062: *
063: * <P>Decompression is performed according to the standard set forth in
064: * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
065: * Report #6</A></P>
066: *
067: * @see UnicodeCompressor
068: *
069: * @author Stephen F. Booth
070: * @stable ICU 2.4
071: */
072: public final class UnicodeDecompressor implements SCSU {
073: //==========================
074: // Instance variables
075: //==========================
076:
077: /** Alias to current dynamic window */
078: private int fCurrentWindow = 0;
079:
080: /** Dynamic compression window offsets */
081: private int[] fOffsets = new int[NUMWINDOWS];
082:
083: /** Current compression mode */
084: private int fMode = SINGLEBYTEMODE;
085:
086: /** Size of our internal buffer */
087: private final static int BUFSIZE = 3;
088:
089: /** Internal buffer for saving state */
090: private byte[] fBuffer = new byte[BUFSIZE];
091:
092: /** Number of characters in our internal buffer */
093: private int fBufferLength = 0;
094:
095: /**
096: * Create a UnicodeDecompressor.
097: * Sets all windows to their default values.
098: * @see #reset
099: * @stable ICU 2.4
100: */
101: public UnicodeDecompressor() {
102: reset(); // initialize to defaults
103: }
104:
105: /**
106: * Decompress a byte array into a String.
107: * @param buffer The byte array to decompress.
108: * @return A String containing the decompressed characters.
109: * @see #decompress(byte [], int, int)
110: * @stable ICU 2.4
111: */
112: public static String decompress(byte[] buffer) {
113: char[] buf = decompress(buffer, 0, buffer.length);
114: return new String(buf);
115: }
116:
117: /**
118: * Decompress a byte array into a Unicode character array.
119: * @param buffer The byte array to decompress.
120: * @param start The start of the byte run to decompress.
121: * @param limit The limit of the byte run to decompress.
122: * @return A character array containing the decompressed bytes.
123: * @see #decompress(byte [])
124: * @stable ICU 2.4
125: */
126: public static char[] decompress(byte[] buffer, int start, int limit) {
127: UnicodeDecompressor comp = new UnicodeDecompressor();
128:
129: // use a buffer we know will never overflow
130: // in the worst case, each byte will decompress
131: // to a surrogate pair (buffer must be at least 2 chars)
132: int len = Math.max(2, 2 * (limit - start));
133: char[] temp = new char[len];
134:
135: int charCount = comp.decompress(buffer, start, limit, null,
136: temp, 0, len);
137:
138: char[] result = new char[charCount];
139: System.arraycopy(temp, 0, result, 0, charCount);
140: return result;
141: }
142:
143: /**
144: * Decompress a byte array into a Unicode character array.
145: *
146: * This function will either completely fill the output buffer,
147: * or consume the entire input.
148: *
149: * @param byteBuffer The byte buffer to decompress.
150: * @param byteBufferStart The start of the byte run to decompress.
151: * @param byteBufferLimit The limit of the byte run to decompress.
152: * @param bytesRead A one-element array. If not null, on return
153: * the number of bytes read from byteBuffer.
154: * @param charBuffer A buffer to receive the decompressed data.
155: * This buffer must be at minimum two characters in size.
156: * @param charBufferStart The starting offset to which to write
157: * decompressed data.
158: * @param charBufferLimit The limiting offset for writing
159: * decompressed data.
160: * @return The number of Unicode characters written to charBuffer.
161: * @stable ICU 2.4
162: */
163: public int decompress(byte[] byteBuffer, int byteBufferStart,
164: int byteBufferLimit, int[] bytesRead, char[] charBuffer,
165: int charBufferStart, int charBufferLimit) {
166: // the current position in the source byte buffer
167: int bytePos = byteBufferStart;
168:
169: // the current position in the target char buffer
170: int ucPos = charBufferStart;
171:
172: // the current byte from the source buffer
173: int aByte = 0x00;
174:
175: // charBuffer must be at least 2 chars in size
176: if (charBuffer.length < 2
177: || (charBufferLimit - charBufferStart) < 2)
178: throw new IllegalArgumentException("charBuffer.length < 2");
179:
180: // if our internal buffer isn't empty, flush its contents
181: // to the output buffer before doing any more decompression
182: if (fBufferLength > 0) {
183:
184: int newBytes = 0;
185:
186: // fill the buffer completely, to guarantee one full character
187: if (fBufferLength != BUFSIZE) {
188: newBytes = fBuffer.length - fBufferLength;
189:
190: // verify there are newBytes bytes in byteBuffer
191: if (byteBufferLimit - byteBufferStart < newBytes)
192: newBytes = byteBufferLimit - byteBufferStart;
193:
194: System.arraycopy(byteBuffer, byteBufferStart, fBuffer,
195: fBufferLength, newBytes);
196: }
197:
198: // reset buffer length to 0 before recursive call
199: fBufferLength = 0;
200:
201: // call self recursively to decompress the buffer
202: int count = decompress(fBuffer, 0, fBuffer.length, null,
203: charBuffer, charBufferStart, charBufferLimit);
204:
205: // update the positions into the arrays
206: ucPos += count;
207: bytePos += newBytes;
208: }
209:
210: // the main decompression loop
211: mainLoop: while (bytePos < byteBufferLimit
212: && ucPos < charBufferLimit) {
213: switch (fMode) {
214: case SINGLEBYTEMODE:
215: // single-byte mode decompression loop
216: singleByteModeLoop: while (bytePos < byteBufferLimit
217: && ucPos < charBufferLimit) {
218: aByte = byteBuffer[bytePos++] & 0xFF;
219: switch (aByte) {
220: // All bytes from 0x80 through 0xFF are remapped
221: // to chars or surrogate pairs according to the
222: // currently active window
223: case 0x80:
224: case 0x81:
225: case 0x82:
226: case 0x83:
227: case 0x84:
228: case 0x85:
229: case 0x86:
230: case 0x87:
231: case 0x88:
232: case 0x89:
233: case 0x8A:
234: case 0x8B:
235: case 0x8C:
236: case 0x8D:
237: case 0x8E:
238: case 0x8F:
239: case 0x90:
240: case 0x91:
241: case 0x92:
242: case 0x93:
243: case 0x94:
244: case 0x95:
245: case 0x96:
246: case 0x97:
247: case 0x98:
248: case 0x99:
249: case 0x9A:
250: case 0x9B:
251: case 0x9C:
252: case 0x9D:
253: case 0x9E:
254: case 0x9F:
255: case 0xA0:
256: case 0xA1:
257: case 0xA2:
258: case 0xA3:
259: case 0xA4:
260: case 0xA5:
261: case 0xA6:
262: case 0xA7:
263: case 0xA8:
264: case 0xA9:
265: case 0xAA:
266: case 0xAB:
267: case 0xAC:
268: case 0xAD:
269: case 0xAE:
270: case 0xAF:
271: case 0xB0:
272: case 0xB1:
273: case 0xB2:
274: case 0xB3:
275: case 0xB4:
276: case 0xB5:
277: case 0xB6:
278: case 0xB7:
279: case 0xB8:
280: case 0xB9:
281: case 0xBA:
282: case 0xBB:
283: case 0xBC:
284: case 0xBD:
285: case 0xBE:
286: case 0xBF:
287: case 0xC0:
288: case 0xC1:
289: case 0xC2:
290: case 0xC3:
291: case 0xC4:
292: case 0xC5:
293: case 0xC6:
294: case 0xC7:
295: case 0xC8:
296: case 0xC9:
297: case 0xCA:
298: case 0xCB:
299: case 0xCC:
300: case 0xCD:
301: case 0xCE:
302: case 0xCF:
303: case 0xD0:
304: case 0xD1:
305: case 0xD2:
306: case 0xD3:
307: case 0xD4:
308: case 0xD5:
309: case 0xD6:
310: case 0xD7:
311: case 0xD8:
312: case 0xD9:
313: case 0xDA:
314: case 0xDB:
315: case 0xDC:
316: case 0xDD:
317: case 0xDE:
318: case 0xDF:
319: case 0xE0:
320: case 0xE1:
321: case 0xE2:
322: case 0xE3:
323: case 0xE4:
324: case 0xE5:
325: case 0xE6:
326: case 0xE7:
327: case 0xE8:
328: case 0xE9:
329: case 0xEA:
330: case 0xEB:
331: case 0xEC:
332: case 0xED:
333: case 0xEE:
334: case 0xEF:
335: case 0xF0:
336: case 0xF1:
337: case 0xF2:
338: case 0xF3:
339: case 0xF4:
340: case 0xF5:
341: case 0xF6:
342: case 0xF7:
343: case 0xF8:
344: case 0xF9:
345: case 0xFA:
346: case 0xFB:
347: case 0xFC:
348: case 0xFD:
349: case 0xFE:
350: case 0xFF:
351: // For offsets <= 0xFFFF, convert to a single char
352: // by adding the window's offset and subtracting
353: // the generic compression offset
354: if (fOffsets[fCurrentWindow] <= 0xFFFF) {
355: charBuffer[ucPos++] = (char) (aByte
356: + fOffsets[fCurrentWindow] - COMPRESSIONOFFSET);
357: }
358: // For offsets > 0x10000, convert to a surrogate pair by
359: // normBase = window's offset - 0x10000
360: // high surr. = 0xD800 + (normBase >> 10)
361: // low surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F)
362: else {
363: // make sure there is enough room to write
364: // both characters
365: // if not, save state and break out
366: if ((ucPos + 1) >= charBufferLimit) {
367: --bytePos;
368: System.arraycopy(byteBuffer, bytePos,
369: fBuffer, 0, byteBufferLimit
370: - bytePos);
371: fBufferLength = byteBufferLimit
372: - bytePos;
373: bytePos += fBufferLength;
374: break mainLoop;
375: }
376:
377: int normalizedBase = fOffsets[fCurrentWindow] - 0x10000;
378: charBuffer[ucPos++] = (char) (0xD800 + (normalizedBase >> 10));
379: charBuffer[ucPos++] = (char) (0xDC00 + (normalizedBase & 0x3FF) + (aByte & 0x7F));
380: }
381: break;
382:
383: // bytes from 0x20 through 0x7F are treated as ASCII and
384: // are remapped to chars by padding the high byte
385: // (this is the same as quoting from static window 0)
386: // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D)
387: // are treated as ASCII as well
388: case 0x00:
389: case 0x09:
390: case 0x0A:
391: case 0x0D:
392: case 0x20:
393: case 0x21:
394: case 0x22:
395: case 0x23:
396: case 0x24:
397: case 0x25:
398: case 0x26:
399: case 0x27:
400: case 0x28:
401: case 0x29:
402: case 0x2A:
403: case 0x2B:
404: case 0x2C:
405: case 0x2D:
406: case 0x2E:
407: case 0x2F:
408: case 0x30:
409: case 0x31:
410: case 0x32:
411: case 0x33:
412: case 0x34:
413: case 0x35:
414: case 0x36:
415: case 0x37:
416: case 0x38:
417: case 0x39:
418: case 0x3A:
419: case 0x3B:
420: case 0x3C:
421: case 0x3D:
422: case 0x3E:
423: case 0x3F:
424: case 0x40:
425: case 0x41:
426: case 0x42:
427: case 0x43:
428: case 0x44:
429: case 0x45:
430: case 0x46:
431: case 0x47:
432: case 0x48:
433: case 0x49:
434: case 0x4A:
435: case 0x4B:
436: case 0x4C:
437: case 0x4D:
438: case 0x4E:
439: case 0x4F:
440: case 0x50:
441: case 0x51:
442: case 0x52:
443: case 0x53:
444: case 0x54:
445: case 0x55:
446: case 0x56:
447: case 0x57:
448: case 0x58:
449: case 0x59:
450: case 0x5A:
451: case 0x5B:
452: case 0x5C:
453: case 0x5D:
454: case 0x5E:
455: case 0x5F:
456: case 0x60:
457: case 0x61:
458: case 0x62:
459: case 0x63:
460: case 0x64:
461: case 0x65:
462: case 0x66:
463: case 0x67:
464: case 0x68:
465: case 0x69:
466: case 0x6A:
467: case 0x6B:
468: case 0x6C:
469: case 0x6D:
470: case 0x6E:
471: case 0x6F:
472: case 0x70:
473: case 0x71:
474: case 0x72:
475: case 0x73:
476: case 0x74:
477: case 0x75:
478: case 0x76:
479: case 0x77:
480: case 0x78:
481: case 0x79:
482: case 0x7A:
483: case 0x7B:
484: case 0x7C:
485: case 0x7D:
486: case 0x7E:
487: case 0x7F:
488: charBuffer[ucPos++] = (char) aByte;
489: break;
490:
491: // quote unicode
492: case SQUOTEU:
493: // verify we have two bytes following tag
494: // if not, save state and break out
495: if ((bytePos + 1) >= byteBufferLimit) {
496: --bytePos;
497: System.arraycopy(byteBuffer, bytePos,
498: fBuffer, 0, byteBufferLimit
499: - bytePos);
500: fBufferLength = byteBufferLimit - bytePos;
501: bytePos += fBufferLength;
502: break mainLoop;
503: }
504:
505: aByte = byteBuffer[bytePos++];
506: charBuffer[ucPos++] = (char) (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
507: break;
508:
509: // switch to Unicode mode
510: case SCHANGEU:
511: fMode = UNICODEMODE;
512: break singleByteModeLoop;
513: //break;
514:
515: // handle all quote tags
516: case SQUOTE0:
517: case SQUOTE1:
518: case SQUOTE2:
519: case SQUOTE3:
520: case SQUOTE4:
521: case SQUOTE5:
522: case SQUOTE6:
523: case SQUOTE7:
524: // verify there is a byte following the tag
525: // if not, save state and break out
526: if (bytePos >= byteBufferLimit) {
527: --bytePos;
528: System.arraycopy(byteBuffer, bytePos,
529: fBuffer, 0, byteBufferLimit
530: - bytePos);
531: fBufferLength = byteBufferLimit - bytePos;
532: bytePos += fBufferLength;
533: break mainLoop;
534: }
535:
536: // if the byte is in the range 0x00 - 0x7F, use
537: // static window n otherwise, use dynamic window n
538: int dByte = byteBuffer[bytePos++] & 0xFF;
539: charBuffer[ucPos++] = (char) (dByte + (dByte >= 0x00
540: && dByte < 0x80 ? sOffsets[aByte
541: - SQUOTE0]
542: : (fOffsets[aByte - SQUOTE0] - COMPRESSIONOFFSET)));
543: break;
544:
545: // handle all change tags
546: case SCHANGE0:
547: case SCHANGE1:
548: case SCHANGE2:
549: case SCHANGE3:
550: case SCHANGE4:
551: case SCHANGE5:
552: case SCHANGE6:
553: case SCHANGE7:
554: fCurrentWindow = aByte - SCHANGE0;
555: break;
556:
557: // handle all define tags
558: case SDEFINE0:
559: case SDEFINE1:
560: case SDEFINE2:
561: case SDEFINE3:
562: case SDEFINE4:
563: case SDEFINE5:
564: case SDEFINE6:
565: case SDEFINE7:
566: // verify there is a byte following the tag
567: // if not, save state and break out
568: if (bytePos >= byteBufferLimit) {
569: --bytePos;
570: System.arraycopy(byteBuffer, bytePos,
571: fBuffer, 0, byteBufferLimit
572: - bytePos);
573: fBufferLength = byteBufferLimit - bytePos;
574: bytePos += fBufferLength;
575: break mainLoop;
576: }
577:
578: fCurrentWindow = aByte - SDEFINE0;
579: fOffsets[fCurrentWindow] = sOffsetTable[byteBuffer[bytePos++] & 0xFF];
580: break;
581:
582: // handle define extended tag
583: case SDEFINEX:
584: // verify we have two bytes following tag
585: // if not, save state and break out
586: if ((bytePos + 1) >= byteBufferLimit) {
587: --bytePos;
588: System.arraycopy(byteBuffer, bytePos,
589: fBuffer, 0, byteBufferLimit
590: - bytePos);
591: fBufferLength = byteBufferLimit - bytePos;
592: bytePos += fBufferLength;
593: break mainLoop;
594: }
595:
596: aByte = byteBuffer[bytePos++] & 0xFF;
597: fCurrentWindow = (aByte & 0xE0) >> 5;
598: fOffsets[fCurrentWindow] = 0x10000 + (0x80 * (((aByte & 0x1F) << 8) | (byteBuffer[bytePos++] & 0xFF)));
599: break;
600:
601: // reserved, shouldn't happen
602: case SRESERVED:
603: break;
604:
605: } // end switch
606: } // end while
607: break;
608:
609: case UNICODEMODE:
610: // unicode mode decompression loop
611: unicodeModeLoop: while (bytePos < byteBufferLimit
612: && ucPos < charBufferLimit) {
613: aByte = byteBuffer[bytePos++] & 0xFF;
614: switch (aByte) {
615: // handle all define tags
616: case UDEFINE0:
617: case UDEFINE1:
618: case UDEFINE2:
619: case UDEFINE3:
620: case UDEFINE4:
621: case UDEFINE5:
622: case UDEFINE6:
623: case UDEFINE7:
624: // verify there is a byte following tag
625: // if not, save state and break out
626: if (bytePos >= byteBufferLimit) {
627: --bytePos;
628: System.arraycopy(byteBuffer, bytePos,
629: fBuffer, 0, byteBufferLimit
630: - bytePos);
631: fBufferLength = byteBufferLimit - bytePos;
632: bytePos += fBufferLength;
633: break mainLoop;
634: }
635:
636: fCurrentWindow = aByte - UDEFINE0;
637: fOffsets[fCurrentWindow] = sOffsetTable[byteBuffer[bytePos++] & 0xFF];
638: fMode = SINGLEBYTEMODE;
639: break unicodeModeLoop;
640: //break;
641:
642: // handle define extended tag
643: case UDEFINEX:
644: // verify we have two bytes following tag
645: // if not, save state and break out
646: if ((bytePos + 1) >= byteBufferLimit) {
647: --bytePos;
648: System.arraycopy(byteBuffer, bytePos,
649: fBuffer, 0, byteBufferLimit
650: - bytePos);
651: fBufferLength = byteBufferLimit - bytePos;
652: bytePos += fBufferLength;
653: break mainLoop;
654: }
655:
656: aByte = byteBuffer[bytePos++] & 0xFF;
657: fCurrentWindow = (aByte & 0xE0) >> 5;
658: fOffsets[fCurrentWindow] = 0x10000 + (0x80 * (((aByte & 0x1F) << 8) | (byteBuffer[bytePos++] & 0xFF)));
659: fMode = SINGLEBYTEMODE;
660: break unicodeModeLoop;
661: //break;
662:
663: // handle all change tags
664: case UCHANGE0:
665: case UCHANGE1:
666: case UCHANGE2:
667: case UCHANGE3:
668: case UCHANGE4:
669: case UCHANGE5:
670: case UCHANGE6:
671: case UCHANGE7:
672: fCurrentWindow = aByte - UCHANGE0;
673: fMode = SINGLEBYTEMODE;
674: break unicodeModeLoop;
675: //break;
676:
677: // quote unicode
678: case UQUOTEU:
679: // verify we have two bytes following tag
680: // if not, save state and break out
681: if (bytePos >= byteBufferLimit - 1) {
682: --bytePos;
683: System.arraycopy(byteBuffer, bytePos,
684: fBuffer, 0, byteBufferLimit
685: - bytePos);
686: fBufferLength = byteBufferLimit - bytePos;
687: bytePos += fBufferLength;
688: break mainLoop;
689: }
690:
691: aByte = byteBuffer[bytePos++];
692: charBuffer[ucPos++] = (char) (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
693: break;
694:
695: default:
696: // verify there is a byte following tag
697: // if not, save state and break out
698: if (bytePos >= byteBufferLimit) {
699: --bytePos;
700: System.arraycopy(byteBuffer, bytePos,
701: fBuffer, 0, byteBufferLimit
702: - bytePos);
703: fBufferLength = byteBufferLimit - bytePos;
704: bytePos += fBufferLength;
705: break mainLoop;
706: }
707:
708: charBuffer[ucPos++] = (char) (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
709: break;
710:
711: } // end switch
712: } // end while
713: break;
714:
715: } // end switch( fMode )
716: } // end while
717:
718: // fill in output parameter
719: if (bytesRead != null)
720: bytesRead[0] = (bytePos - byteBufferStart);
721:
722: // return # of chars written
723: return (ucPos - charBufferStart);
724: }
725:
726: /**
727: * Reset the decompressor to its initial state.
728: * @stable ICU 2.4
729: */
730: public void reset() {
731: // reset dynamic windows
732: fOffsets[0] = 0x0080; // Latin-1
733: fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A
734: fOffsets[2] = 0x0400; // Cyrillic
735: fOffsets[3] = 0x0600; // Arabic
736: fOffsets[4] = 0x0900; // Devanagari
737: fOffsets[5] = 0x3040; // Hiragana
738: fOffsets[6] = 0x30A0; // Katakana
739: fOffsets[7] = 0xFF00; // Fullwidth ASCII
740:
741: fCurrentWindow = 0; // Make current window Latin-1
742: fMode = SINGLEBYTEMODE; // Always start in single-byte mode
743: fBufferLength = 0; // Empty buffer
744: }
745: };
|