0001: /*
0002: *******************************************************************************
0003: * Copyright (C) 1996-2004, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */
0007: package com.ibm.icu.text;
0008:
0009: /**
0010: * A compression engine implementing the Standard Compression Scheme
0011: * for Unicode (SCSU) as outlined in <A
0012: * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
0013: * Report #6</A>.
0014: *
0015: * <P>The SCSU works by using dynamically positioned <EM>windows</EM>
0016: * consisting of 128 consecutive characters in Unicode. During compression,
0017: * characters within a window are encoded in the compressed stream as the bytes
0018: * <TT>0x7F - 0xFF</TT>. The SCSU provides transparency for the characters
0019: * (bytes) between <TT>U+0000 - U+00FF</TT>. The SCSU approximates the
0020: * storage size of traditional character sets, for example 1 byte per
0021: * character for ASCII or Latin-1 text, and 2 bytes per character for CJK
0022: * ideographs.</P>
0023: *
0024: * <P><STRONG>USAGE</STRONG></P>
0025: *
0026: * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a
0027: * straightforward manner to compress simple strings:</P>
0028: *
0029: * <PRE>
0030: * String s = ... ; // get string from somewhere
0031: * byte [] compressed = UnicodeCompressor.compress(s);
0032: * </PRE>
0033: *
0034: * <P>The static methods have a fairly large memory footprint.
0035: * For finer-grained control over memory usage,
0036: * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing
0037: * iterative compression:</P>
0038: *
0039: * <PRE>
0040: * // Compress an array "chars" of length "len" using a buffer of 512 bytes
0041: * // to the OutputStream "out"
0042: *
0043: * UnicodeCompressor myCompressor = new UnicodeCompressor();
0044: * final static int BUFSIZE = 512;
0045: * byte [] byteBuffer = new byte [ BUFSIZE ];
0046: * int bytesWritten = 0;
0047: * int [] unicharsRead = new int [1];
0048: * int totalCharsCompressed = 0;
0049: * int totalBytesWritten = 0;
0050: *
0051: * do {
0052: * // do the compression
0053: * bytesWritten = myCompressor.compress(chars, totalCharsCompressed,
0054: * len, unicharsRead,
0055: * byteBuffer, 0, BUFSIZE);
0056: *
0057: * // do something with the current set of bytes
0058: * out.write(byteBuffer, 0, bytesWritten);
0059: *
0060: * // update the no. of characters compressed
0061: * totalCharsCompressed += unicharsRead[0];
0062: *
0063: * // update the no. of bytes written
0064: * totalBytesWritten += bytesWritten;
0065: *
0066: * } while(totalCharsCompressed < len);
0067: *
0068: * myCompressor.reset(); // reuse compressor
0069: * </PRE>
0070: *
0071: * @see UnicodeDecompressor
0072: *
0073: * @author Stephen F. Booth
0074: * @stable ICU 2.4
0075: */
0076:
0077: /*
0078: *
0079: * COMPRESSION STRATEGY
0080: *
0081: * Single Byte Mode
0082: *
0083: * There are three relevant cases.
0084: * If the character is in the current window or is Latin-1 (U+0000,
0085: * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed
0086: * directly in the stream as a single byte.
0087: *
0088: * 1. Current character is in defined, inactive window.
0089: * 2. Current character is in undefined window.
0090: * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF).
0091: *
0092: * 1. Current character is in defined, inactive window
0093: * A. Look ahead two characters
0094: * B. If both following characters in same window as current character,
0095: * switch to defined window
0096: * C. If only next character is in same window as current character,
0097: * quote defined window
0098: * D. If neither of following characters is in same window as current,
0099: * quote defined window
0100: *
0101: * 2. Current character is in undefined window
0102: * A. Look ahead two characters
0103: * B. If both following characters in same window as current character,
0104: * define new window
0105: * C. If only next character in same window as current character,
0106: * switch to Unicode mode
0107: * NOTE: This costs us one extra byte. However,
0108: * since we have a limited number of windows to work with, it is
0109: * assumed the cost will pay off later in savings from a window with
0110: * more characters in it.
0111: * D. If neither of following characters in same window as current,
0112: * switch to Unicode mode. Alternative to above: just quote
0113: * Unicode (same byte cost)
0114: *
0115: * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF)
0116: * A. Look ahead one character
0117: * B. If next character in non-compressible region, switch to
0118: * Unicode mode
0119: * C. If next character not in non-compressible region, quote Unicode
0120: *
0121: *
0122: * The following chart illustrates the bytes required for encoding characters
0123: * in each possible way
0124: *
0125: *
0126: * SINGLE BYTE MODE
0127: * Characters in a row with same index
0128: * tag encountered 1 2 3 4
0129: * ---------------------------------------------------------------
0130: * none (in current window) 1 2 3 4
0131: *
0132: * quote Unicode 3 6 9 12
0133: *
0134: * window not switch to Unicode 3 5 7 9 byte
0135: * defined define window 3 4 5 6 cost
0136: *
0137: * window switch to window 2 3 4 5
0138: * defined quote window 2 4 6 8
0139: *
0140: * Unicode Mode
0141: *
0142: * There are two relevant cases.
0143: * If the character is in the non-compressible region
0144: * (U+3400 - U+DFFF), the character is simply written to the
0145: * stream as a pair of bytes.
0146: *
0147: * 1. Current character is in defined, inactive window.
0148: * 2. Current character is in undefined window.
0149: *
0150: * 1.Current character is in defined, inactive window
0151: * A. Look ahead one character
0152: * B. If next character has same index as current character,
0153: * switch to defined window (and switch to single-byte mode)
0154: * C. If not, just put bytes in stream
0155: *
0156: *
0157: * 2. Current character is in undefined window
0158: * A. Look ahead two characters
0159: * B. If both in same window as current character, define window
0160: * (and switch to single-byte mode)
0161: * C. If only next character in same window, just put bytes in stream
0162: * NOTE: This costs us one extra byte. However,
0163: * since we have a limited number of windows to work with, it is
0164: * assumed the cost will pay off later in savings from a window with
0165: * more characters in it.
0166: * D. If neither in same window, put bytes in stream
0167: *
0168: *
0169: * The following chart illustrates the bytes required for encoding characters
0170: * in each possible way
0171: *
0172: *
0173: * UNICODE MODE
0174: * Characters in a row with same index
0175: * tag encountered 1 2 3 4
0176: * ---------------------------------------------------------------
0177: * none 2 4 6 8
0178: *
0179: * quote Unicode 3 6 9 12
0180: *
0181: * window not define window 3 4 5 6 byte
0182: * defined cost
0183: * window switch to window 2 3 4 5
0184: * defined
0185: */
0186: public final class UnicodeCompressor implements SCSU {
0187: //==========================
0188: // Class variables
0189: //==========================
0190:
0191: /** For quick identification of a byte as a single-byte mode tag */
0192: private static boolean[] sSingleTagTable = {
0193: // table generated by CompressionTableGenerator
0194: false, true, true, true, true, true, true, true, true,
0195: false, false, true, true, false, true, true, true, true,
0196: true, true, true, true, true, true, true, true, true, true,
0197: true, true, true, true, false, false, false, false, false,
0198: false, false, false, false, false, false, false, false,
0199: false, false, false, false, false, false, false, false,
0200: false, false, false, false, false, false, false, false,
0201: false, false, false, false, false, false, false, false,
0202: false, false, false, false, false, false, false, false,
0203: false, false, false, false, false, false, false, false,
0204: false, false, false, false, false, false, false, false,
0205: false, false, false, false, false, false, false, false,
0206: false, false, false, false, false, false, false, false,
0207: false, false, false, false, false, false, false, false,
0208: false, false, false, false, false, false, false, false,
0209: false, false, false, false, false, false, false, false,
0210: false, false, false, false, false, false, false, false,
0211: false, false, false, false, false, false, false, false,
0212: false, false, false, false, false, false, false, false,
0213: false, false, false, false, false, false, false, false,
0214: false, false, false, false, false, false, false, false,
0215: false, false, false, false, false, false, false, false,
0216: false, false, false, false, false, false, false, false,
0217: false, false, false, false, false, false, false, false,
0218: false, false, false, false, false, false, false, false,
0219: false, false, false, false, false, false, false, false,
0220: false, false, false, false, false, false, false, false,
0221: false, false, false, false, false, false, false, false,
0222: false, false, false, false, false, false, false, false,
0223: false, false, false, false, false, false, false, false,
0224: false, false, false, false, false, false, false, false,
0225: false, false, false };
0226:
0227: /** For quick identification of a byte as a unicode mode tag */
0228: private static boolean[] sUnicodeTagTable = {
0229: // table generated by CompressionTableGenerator
0230: false, false, false, false, false, false, false, false,
0231: false, false, false, false, false, false, false, false,
0232: false, false, false, false, false, false, false, false,
0233: false, false, false, false, false, false, false, false,
0234: false, false, false, false, false, false, false, false,
0235: false, false, false, false, false, false, false, false,
0236: false, false, false, false, false, false, false, false,
0237: false, false, false, false, false, false, false, false,
0238: false, false, false, false, false, false, false, false,
0239: false, false, false, false, false, false, false, false,
0240: false, false, false, false, false, false, false, false,
0241: false, false, false, false, false, false, false, false,
0242: false, false, false, false, false, false, false, false,
0243: false, false, false, false, false, false, false, false,
0244: false, false, false, false, false, false, false, false,
0245: false, false, false, false, false, false, false, false,
0246: false, false, false, false, false, false, false, false,
0247: false, false, false, false, false, false, false, false,
0248: false, false, false, false, false, false, false, false,
0249: false, false, false, false, false, false, false, false,
0250: false, false, false, false, false, false, false, false,
0251: false, false, false, false, false, false, false, false,
0252: false, false, false, false, false, false, false, false,
0253: false, false, false, false, false, false, false, false,
0254: false, false, false, false, false, false, false, false,
0255: false, false, false, false, false, false, false, false,
0256: false, false, false, false, false, false, false, false,
0257: false, false, false, false, false, false, false, false,
0258: true, true, true, true, true, true, true, true, true, true,
0259: true, true, true, true, true, true, true, true, true,
0260: false, false, false, false, false, false, false, false,
0261: false, false, false, false, false };
0262:
0263: //==========================
0264: // Instance variables
0265: //==========================
0266:
0267: /** Alias to current dynamic window */
0268: private int fCurrentWindow = 0;
0269:
0270: /** Dynamic compression window offsets */
0271: private int[] fOffsets = new int[NUMWINDOWS];
0272:
0273: /** Current compression mode */
0274: private int fMode = SINGLEBYTEMODE;
0275:
0276: /** Keeps count of times character indices are encountered */
0277: private int[] fIndexCount = new int[MAXINDEX + 1];
0278:
0279: /** The time stamps indicate when a window was last defined */
0280: private int[] fTimeStamps = new int[NUMWINDOWS];
0281:
0282: /** The current time stamp */
0283: private int fTimeStamp = 0;
0284:
0285: /**
0286: * Create a UnicodeCompressor.
0287: * Sets all windows to their default values.
0288: * @see #reset
0289: * @stable ICU 2.4
0290: */
0291: public UnicodeCompressor() {
0292: reset(); // initialize to defaults
0293: }
0294:
0295: /**
0296: * Compress a string into a byte array.
0297: * @param buffer The string to compress.
0298: * @return A byte array containing the compressed characters.
0299: * @see #compress(char [], int, int)
0300: * @stable ICU 2.4
0301: */
0302: public static byte[] compress(String buffer) {
0303: return compress(buffer.toCharArray(), 0, buffer.length());
0304: }
0305:
0306: /**
0307: * Compress a Unicode character array into a byte array.
0308: * @param buffer The character buffer to compress.
0309: * @param start The start of the character run to compress.
0310: * @param limit The limit of the character run to compress.
0311: * @return A byte array containing the compressed characters.
0312: * @see #compress(String)
0313: * @stable ICU 2.4
0314: */
0315: public static byte[] compress(char[] buffer, int start, int limit) {
0316: UnicodeCompressor comp = new UnicodeCompressor();
0317:
0318: // use a buffer that we know will never overflow
0319: // in the worst case, each character will take 3 bytes
0320: // to encode: UQU, hibyte, lobyte. In this case, the
0321: // compressed data will look like: SCU, UQU, hibyte, lobyte, ...
0322: // buffer must be at least 4 bytes in size
0323: int len = Math.max(4, 3 * (limit - start) + 1);
0324: byte[] temp = new byte[len];
0325:
0326: int byteCount = comp.compress(buffer, start, limit, null, temp,
0327: 0, len);
0328:
0329: byte[] result = new byte[byteCount];
0330: System.arraycopy(temp, 0, result, 0, byteCount);
0331: return result;
0332: }
0333:
0334: /**
0335: * Compress a Unicode character array into a byte array.
0336: *
0337: * This function will only consume input that can be completely
0338: * output.
0339: *
0340: * @param charBuffer The character buffer to compress.
0341: * @param charBufferStart The start of the character run to compress.
0342: * @param charBufferLimit The limit of the character run to compress.
0343: * @param charsRead A one-element array. If not null, on return
0344: * the number of characters read from charBuffer.
0345: * @param byteBuffer A buffer to receive the compressed data. This
0346: * buffer must be at minimum four bytes in size.
0347: * @param byteBufferStart The starting offset to which to write
0348: * compressed data.
0349: * @param byteBufferLimit The limiting offset for writing compressed data.
0350: * @return The number of bytes written to byteBuffer.
0351: * @stable ICU 2.4
0352: */
0353: public int compress(char[] charBuffer, int charBufferStart,
0354: int charBufferLimit, int[] charsRead, byte[] byteBuffer,
0355: int byteBufferStart, int byteBufferLimit) {
0356: // the current position in the target byte buffer
0357: int bytePos = byteBufferStart;
0358:
0359: // the current position in the source unicode character buffer
0360: int ucPos = charBufferStart;
0361:
0362: // the current unicode character from the source buffer
0363: int curUC = INVALIDCHAR;
0364:
0365: // the index for the current character
0366: int curIndex = -1;
0367:
0368: // look ahead
0369: int nextUC = INVALIDCHAR;
0370: int forwardUC = INVALIDCHAR;
0371:
0372: // temporary for window searching
0373: int whichWindow = 0;
0374:
0375: // high and low bytes of the current unicode character
0376: int hiByte = 0;
0377: int loByte = 0;
0378:
0379: // byteBuffer must be at least 4 bytes in size
0380: if (byteBuffer.length < 4
0381: || (byteBufferLimit - byteBufferStart) < 4)
0382: throw new IllegalArgumentException("byteBuffer.length < 4");
0383:
0384: mainLoop: while (ucPos < charBufferLimit
0385: && bytePos < byteBufferLimit) {
0386: switch (fMode) {
0387: // main single byte mode compression loop
0388: case SINGLEBYTEMODE:
0389: singleByteModeLoop: while (ucPos < charBufferLimit
0390: && bytePos < byteBufferLimit) {
0391: // get current char
0392: curUC = charBuffer[ucPos++];
0393:
0394: // get next char
0395: if (ucPos < charBufferLimit)
0396: nextUC = charBuffer[ucPos];
0397: else
0398: nextUC = INVALIDCHAR;
0399:
0400: // chars less than 0x0080 (excluding tags) go straight
0401: // in stream
0402: if (curUC < 0x0080) {
0403: loByte = curUC & 0xFF;
0404:
0405: // we need to check and make sure we don't
0406: // accidentally write a single byte mode tag to
0407: // the stream unless it's quoted
0408: if (sSingleTagTable[loByte]) {
0409: // make sure there is enough room to
0410: // write both bytes if not, rewind the
0411: // source stream and break out
0412: if ((bytePos + 1) >= byteBufferLimit) {
0413: --ucPos;
0414: break mainLoop;
0415: }
0416:
0417: // since we know the byte is less than 0x80, SQUOTE0
0418: // will use static window 0, or ASCII
0419: byteBuffer[bytePos++] = (byte) SQUOTE0;
0420: }
0421:
0422: byteBuffer[bytePos++] = (byte) loByte;
0423: }
0424:
0425: // if the char belongs to current window, convert it
0426: // to a byte by adding the generic compression offset
0427: // and subtracting the window's offset
0428: else if (inDynamicWindow(curUC, fCurrentWindow)) {
0429: byteBuffer[bytePos++] = (byte) (curUC
0430: - fOffsets[fCurrentWindow] + COMPRESSIONOFFSET);
0431: }
0432:
0433: // if char is not in compressible range, either switch to or
0434: // quote from unicode
0435: else if (!isCompressible(curUC)) {
0436: // only check next character if it is valid
0437: if (nextUC != INVALIDCHAR
0438: && isCompressible(nextUC)) {
0439: // make sure there is enough room to
0440: // write all three bytes if not,
0441: // rewind the source stream and break
0442: // out
0443: if ((bytePos + 2) >= byteBufferLimit) {
0444: --ucPos;
0445: break mainLoop;
0446: }
0447:
0448: byteBuffer[bytePos++] = (byte) SQUOTEU;
0449: byteBuffer[bytePos++] = (byte) (curUC >>> 8);
0450: byteBuffer[bytePos++] = (byte) (curUC & 0xFF);
0451: } else {
0452: // make sure there is enough room to
0453: // write all four bytes if not, rewind
0454: // the source stream and break out
0455: if ((bytePos + 3) >= byteBufferLimit) {
0456: --ucPos;
0457: break mainLoop;
0458: }
0459:
0460: byteBuffer[bytePos++] = (byte) SCHANGEU;
0461:
0462: hiByte = curUC >>> 8;
0463: loByte = curUC & 0xFF;
0464:
0465: if (sUnicodeTagTable[hiByte])
0466: // add quote Unicode tag
0467: byteBuffer[bytePos++] = (byte) UQUOTEU;
0468:
0469: byteBuffer[bytePos++] = (byte) hiByte;
0470: byteBuffer[bytePos++] = (byte) loByte;
0471:
0472: fMode = UNICODEMODE;
0473: break singleByteModeLoop;
0474: }
0475: }
0476:
0477: // if the char is in a currently defined dynamic
0478: // window, figure out which one, and either switch to
0479: // it or quote from it
0480: else if ((whichWindow = findDynamicWindow(curUC)) != INVALIDWINDOW) {
0481: // look ahead
0482: if ((ucPos + 1) < charBufferLimit)
0483: forwardUC = charBuffer[ucPos + 1];
0484: else
0485: forwardUC = INVALIDCHAR;
0486:
0487: // all three chars in same window, switch to that
0488: // window inDynamicWindow will return false for
0489: // INVALIDCHAR
0490: if (inDynamicWindow(nextUC, whichWindow)
0491: && inDynamicWindow(forwardUC,
0492: whichWindow)) {
0493: // make sure there is enough room to
0494: // write both bytes if not, rewind the
0495: // source stream and break out
0496: if ((bytePos + 1) >= byteBufferLimit) {
0497: --ucPos;
0498: break mainLoop;
0499: }
0500:
0501: byteBuffer[bytePos++] = (byte) (SCHANGE0 + whichWindow);
0502: byteBuffer[bytePos++] = (byte) (curUC
0503: - fOffsets[whichWindow] + COMPRESSIONOFFSET);
0504: fTimeStamps[whichWindow] = ++fTimeStamp;
0505: fCurrentWindow = whichWindow;
0506: }
0507:
0508: // either only next char or neither in same
0509: // window, so quote
0510: else {
0511: // make sure there is enough room to
0512: // write both bytes if not, rewind the
0513: // source stream and break out
0514: if ((bytePos + 1) >= byteBufferLimit) {
0515: --ucPos;
0516: break mainLoop;
0517: }
0518:
0519: byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
0520: byteBuffer[bytePos++] = (byte) (curUC
0521: - fOffsets[whichWindow] + COMPRESSIONOFFSET);
0522: }
0523: }
0524:
0525: // if a static window is defined, and the following
0526: // character is not in that static window, quote from
0527: // the static window Note: to quote from a static
0528: // window, don't add 0x80
0529: else if ((whichWindow = findStaticWindow(curUC)) != INVALIDWINDOW
0530: && !inStaticWindow(nextUC, whichWindow)) {
0531: // make sure there is enough room to write both
0532: // bytes if not, rewind the source stream and
0533: // break out
0534: if ((bytePos + 1) >= byteBufferLimit) {
0535: --ucPos;
0536: break mainLoop;
0537: }
0538:
0539: byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
0540: byteBuffer[bytePos++] = (byte) (curUC - sOffsets[whichWindow]);
0541: }
0542:
0543: // if a window is not defined, decide if we want to
0544: // define a new one or switch to unicode mode
0545: else {
0546: // determine index for current char (char is compressible)
0547: curIndex = makeIndex(curUC);
0548: fIndexCount[curIndex]++;
0549:
0550: // look ahead
0551: if ((ucPos + 1) < charBufferLimit)
0552: forwardUC = charBuffer[ucPos + 1];
0553: else
0554: forwardUC = INVALIDCHAR;
0555:
0556: // if we have encountered this index at least once
0557: // before, define a new window
0558: // OR
0559: // three chars in a row with same index, define a
0560: // new window (makeIndex will return RESERVEDINDEX
0561: // for INVALIDCHAR)
0562: if ((fIndexCount[curIndex] > 1)
0563: || (curIndex == makeIndex(nextUC) && curIndex == makeIndex(forwardUC))) {
0564: // make sure there is enough room to write all
0565: // three bytes if not, rewind the source
0566: // stream and break out
0567: if ((bytePos + 2) >= byteBufferLimit) {
0568: --ucPos;
0569: break mainLoop;
0570: }
0571:
0572: // get least recently defined window
0573: whichWindow = getLRDefinedWindow();
0574:
0575: byteBuffer[bytePos++] = (byte) (SDEFINE0 + whichWindow);
0576: byteBuffer[bytePos++] = (byte) curIndex;
0577: byteBuffer[bytePos++] = (byte) (curUC
0578: - sOffsetTable[curIndex] + COMPRESSIONOFFSET);
0579:
0580: fOffsets[whichWindow] = sOffsetTable[curIndex];
0581: fCurrentWindow = whichWindow;
0582: fTimeStamps[whichWindow] = ++fTimeStamp;
0583: }
0584:
0585: // only two chars in a row with same index, so
0586: // switch to unicode mode (makeIndex will return
0587: // RESERVEDINDEX for INVALIDCHAR)
0588: // OR
0589: // three chars have different indices, so switch
0590: // to unicode mode
0591: else {
0592: // make sure there is enough room to write all
0593: // four bytes if not, rewind the source stream
0594: // and break out
0595: if ((bytePos + 3) >= byteBufferLimit) {
0596: --ucPos;
0597: break mainLoop;
0598: }
0599:
0600: byteBuffer[bytePos++] = (byte) SCHANGEU;
0601:
0602: hiByte = curUC >>> 8;
0603: loByte = curUC & 0xFF;
0604:
0605: if (sUnicodeTagTable[hiByte])
0606: // add quote Unicode tag
0607: byteBuffer[bytePos++] = (byte) UQUOTEU;
0608:
0609: byteBuffer[bytePos++] = (byte) hiByte;
0610: byteBuffer[bytePos++] = (byte) loByte;
0611:
0612: fMode = UNICODEMODE;
0613: break singleByteModeLoop;
0614: }
0615: }
0616: }
0617: break;
0618:
0619: case UNICODEMODE:
0620: // main unicode mode compression loop
0621: unicodeModeLoop: while (ucPos < charBufferLimit
0622: && bytePos < byteBufferLimit) {
0623: // get current char
0624: curUC = charBuffer[ucPos++];
0625:
0626: // get next char
0627: if (ucPos < charBufferLimit)
0628: nextUC = charBuffer[ucPos];
0629: else
0630: nextUC = INVALIDCHAR;
0631:
0632: // if we have two uncompressible chars in a row,
0633: // put the current char's bytes in the stream
0634: if (!isCompressible(curUC)
0635: || (nextUC != INVALIDCHAR && !isCompressible(nextUC))) {
0636: // make sure there is enough room to write all three bytes
0637: // if not, rewind the source stream and break out
0638: if ((bytePos + 2) >= byteBufferLimit) {
0639: --ucPos;
0640: break mainLoop;
0641: }
0642:
0643: hiByte = curUC >>> 8;
0644: loByte = curUC & 0xFF;
0645:
0646: if (sUnicodeTagTable[hiByte])
0647: // add quote Unicode tag
0648: byteBuffer[bytePos++] = (byte) UQUOTEU;
0649:
0650: byteBuffer[bytePos++] = (byte) hiByte;
0651: byteBuffer[bytePos++] = (byte) loByte;
0652: }
0653:
0654: // bytes less than 0x80 can go straight in the stream,
0655: // but in single-byte mode
0656: else if (curUC < 0x0080) {
0657: loByte = curUC & 0xFF;
0658:
0659: // if two chars in a row below 0x80 and the
0660: // current char is not a single-byte mode tag,
0661: // switch to single-byte mode
0662: if (nextUC != INVALIDCHAR && nextUC < 0x0080
0663: && !sSingleTagTable[loByte]) {
0664: // make sure there is enough room to
0665: // write both bytes if not, rewind the
0666: // source stream and break out
0667: if ((bytePos + 1) >= byteBufferLimit) {
0668: --ucPos;
0669: break mainLoop;
0670: }
0671:
0672: // use the last-active window
0673: whichWindow = fCurrentWindow;
0674: byteBuffer[bytePos++] = (byte) (UCHANGE0 + whichWindow);
0675: byteBuffer[bytePos++] = (byte) loByte;
0676:
0677: //fCurrentWindow = 0;
0678: fTimeStamps[whichWindow] = ++fTimeStamp;
0679: fMode = SINGLEBYTEMODE;
0680: break unicodeModeLoop;
0681: }
0682:
0683: // otherwise, just write the bytes to the stream
0684: // (this will cover the case of only 1 char less than 0x80
0685: // and single-byte mode tags)
0686: else {
0687: // make sure there is enough room to
0688: // write both bytes if not, rewind the
0689: // source stream and break out
0690: if ((bytePos + 1) >= byteBufferLimit) {
0691: --ucPos;
0692: break mainLoop;
0693: }
0694:
0695: // since the character is less than 0x80, the
0696: // high byte is always 0x00 - no need for
0697: // (curUC >>> 8)
0698: byteBuffer[bytePos++] = (byte) 0x00;
0699: byteBuffer[bytePos++] = (byte) loByte;
0700: }
0701: }
0702:
0703: // figure out if the current char is in a defined window
0704: else if ((whichWindow = findDynamicWindow(curUC)) != INVALIDWINDOW) {
0705: // if two chars in a row in the same window,
0706: // switch to that window and go to single-byte mode
0707: // inDynamicWindow will return false for INVALIDCHAR
0708: if (inDynamicWindow(nextUC, whichWindow)) {
0709: // make sure there is enough room to
0710: // write both bytes if not, rewind the
0711: // source stream and break out
0712: if ((bytePos + 1) >= byteBufferLimit) {
0713: --ucPos;
0714: break mainLoop;
0715: }
0716:
0717: byteBuffer[bytePos++] = (byte) (UCHANGE0 + whichWindow);
0718: byteBuffer[bytePos++] = (byte) (curUC
0719: - fOffsets[whichWindow] + COMPRESSIONOFFSET);
0720:
0721: fTimeStamps[whichWindow] = ++fTimeStamp;
0722: fCurrentWindow = whichWindow;
0723: fMode = SINGLEBYTEMODE;
0724: break unicodeModeLoop;
0725: }
0726:
0727: // otherwise, just quote the unicode for the char
0728: else {
0729: // make sure there is enough room to
0730: // write all three bytes if not,
0731: // rewind the source stream and break
0732: // out
0733: if ((bytePos + 2) >= byteBufferLimit) {
0734: --ucPos;
0735: break mainLoop;
0736: }
0737:
0738: hiByte = curUC >>> 8;
0739: loByte = curUC & 0xFF;
0740:
0741: if (sUnicodeTagTable[hiByte])
0742: // add quote Unicode tag
0743: byteBuffer[bytePos++] = (byte) UQUOTEU;
0744:
0745: byteBuffer[bytePos++] = (byte) hiByte;
0746: byteBuffer[bytePos++] = (byte) loByte;
0747: }
0748: }
0749:
0750: // char is not in a defined window
0751: else {
0752: // determine index for current char (char is compressible)
0753: curIndex = makeIndex(curUC);
0754: fIndexCount[curIndex]++;
0755:
0756: // look ahead
0757: if ((ucPos + 1) < charBufferLimit)
0758: forwardUC = charBuffer[ucPos + 1];
0759: else
0760: forwardUC = INVALIDCHAR;
0761:
0762: // if we have encountered this index at least once
0763: // before, define a new window for it that hasn't
0764: // previously been redefined
0765: // OR
0766: // if three chars in a row with the same index,
0767: // define a new window (makeIndex will return
0768: // RESERVEDINDEX for INVALIDCHAR)
0769: if ((fIndexCount[curIndex] > 1)
0770: || (curIndex == makeIndex(nextUC) && curIndex == makeIndex(forwardUC))) {
0771: // make sure there is enough room to
0772: // write all three bytes if not,
0773: // rewind the source stream and break
0774: // out
0775: if ((bytePos + 2) >= byteBufferLimit) {
0776: --ucPos;
0777: break mainLoop;
0778: }
0779:
0780: // get least recently defined window
0781: whichWindow = getLRDefinedWindow();
0782:
0783: byteBuffer[bytePos++] = (byte) (UDEFINE0 + whichWindow);
0784: byteBuffer[bytePos++] = (byte) curIndex;
0785: byteBuffer[bytePos++] = (byte) (curUC
0786: - sOffsetTable[curIndex] + COMPRESSIONOFFSET);
0787:
0788: fOffsets[whichWindow] = sOffsetTable[curIndex];
0789: fCurrentWindow = whichWindow;
0790: fTimeStamps[whichWindow] = ++fTimeStamp;
0791: fMode = SINGLEBYTEMODE;
0792: break unicodeModeLoop;
0793: }
0794:
0795: // otherwise just quote the unicode, and save our
0796: // windows for longer runs
0797: else {
0798: // make sure there is enough room to
0799: // write all three bytes if not,
0800: // rewind the source stream and break
0801: // out
0802: if ((bytePos + 2) >= byteBufferLimit) {
0803: --ucPos;
0804: break mainLoop;
0805: }
0806:
0807: hiByte = curUC >>> 8;
0808: loByte = curUC & 0xFF;
0809:
0810: if (sUnicodeTagTable[hiByte])
0811: // add quote Unicode tag
0812: byteBuffer[bytePos++] = (byte) UQUOTEU;
0813:
0814: byteBuffer[bytePos++] = (byte) hiByte;
0815: byteBuffer[bytePos++] = (byte) loByte;
0816: }
0817: }
0818: }
0819: } // end switch
0820: }
0821:
0822: // fill in output parameter
0823: if (charsRead != null)
0824: charsRead[0] = (ucPos - charBufferStart);
0825:
0826: // return # of bytes written
0827: return (bytePos - byteBufferStart);
0828: }
0829:
0830: /**
0831: * Reset the compressor to its initial state.
0832: * @stable ICU 2.4
0833: */
0834: public void reset() {
0835: int i;
0836:
0837: // reset dynamic windows
0838: fOffsets[0] = 0x0080; // Latin-1
0839: fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A
0840: fOffsets[2] = 0x0400; // Cyrillic
0841: fOffsets[3] = 0x0600; // Arabic
0842: fOffsets[4] = 0x0900; // Devanagari
0843: fOffsets[5] = 0x3040; // Hiragana
0844: fOffsets[6] = 0x30A0; // Katakana
0845: fOffsets[7] = 0xFF00; // Fullwidth ASCII
0846:
0847: // reset time stamps
0848: for (i = 0; i < NUMWINDOWS; i++) {
0849: fTimeStamps[i] = 0;
0850: }
0851:
0852: // reset count of seen indices
0853: for (i = 0; i <= MAXINDEX; i++) {
0854: fIndexCount[i] = 0;
0855: }
0856:
0857: fTimeStamp = 0; // Reset current time stamp
0858: fCurrentWindow = 0; // Make current window Latin-1
0859: fMode = SINGLEBYTEMODE; // Always start in single-byte mode
0860: }
0861:
0862: //==========================
0863: // Determine the index for a character
0864: //==========================
0865:
0866: /**
0867: * Create the index value for a character.
0868: * For more information on this function, refer to table X-3
0869: * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
0870: * @param c The character in question.
0871: * @return An index for c
0872: */
0873: private static int makeIndex(int c) {
0874: // check the predefined indices
0875: if (c >= 0x00C0 && c < 0x0140)
0876: return LATININDEX;
0877: else if (c >= 0x0250 && c < 0x02D0)
0878: return IPAEXTENSIONINDEX;
0879: else if (c >= 0x0370 && c < 0x03F0)
0880: return GREEKINDEX;
0881: else if (c >= 0x0530 && c < 0x0590)
0882: return ARMENIANINDEX;
0883: else if (c >= 0x3040 && c < 0x30A0)
0884: return HIRAGANAINDEX;
0885: else if (c >= 0x30A0 && c < 0x3120)
0886: return KATAKANAINDEX;
0887: else if (c >= 0xFF60 && c < 0xFF9F)
0888: return HALFWIDTHKATAKANAINDEX;
0889:
0890: // calculate index
0891: else if (c >= 0x0080 && c < 0x3400)
0892: return (c / 0x80) & 0xFF;
0893: else if (c >= 0xE000 && c <= 0xFFFF)
0894: return ((c - 0xAC00) / 0x80) & 0xFF;
0895:
0896: // should never happen
0897: else {
0898: return RESERVEDINDEX;
0899: }
0900: }
0901:
0902: //==========================
0903: // Check if a given character fits in a window
0904: //==========================
0905:
0906: /**
0907: * Determine if a character is in a dynamic window.
0908: * @param c The character to test
0909: * @param whichWindow The dynamic window the test
0910: * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
0911: * false otherwise.
0912: */
0913: private boolean inDynamicWindow(int c, int whichWindow) {
0914: return (c >= fOffsets[whichWindow] && c < (fOffsets[whichWindow] + 0x80));
0915: }
0916:
0917: /**
0918: * Determine if a character is in a static window.
0919: * @param c The character to test
0920: * @param whichWindow The static window the test
0921: * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
0922: * false otherwise.
0923: */
0924: private static boolean inStaticWindow(int c, int whichWindow) {
0925: return (c >= sOffsets[whichWindow] && c < (sOffsets[whichWindow] + 0x80));
0926: }
0927:
0928: //==========================
0929: // Check if a given character is compressible
0930: //==========================
0931:
0932: /**
0933: * Determine if a character is compressible.
0934: * @param c The character to test.
0935: * @return true if the <TT>c</TT> is compressible, false otherwise.
0936: */
0937: private static boolean isCompressible(int c) {
0938: return (c < 0x3400 || c >= 0xE000);
0939: }
0940:
0941: //==========================
0942: // Check if a window is defined for a given character
0943: //==========================
0944:
0945: /**
0946: * Determine if a dynamic window for a certain character is defined
0947: * @param c The character in question
0948: * @return The dynamic window containing <TT>c</TT>, or
0949: * INVALIDWINDOW if not defined.
0950: */
0951: private int findDynamicWindow(int c) {
0952: // supposedly faster to count down
0953: //for(int i = 0; i < NUMWINDOWS; i++) {
0954: for (int i = NUMWINDOWS - 1; i >= 0; --i) {
0955: if (inDynamicWindow(c, i)) {
0956: ++fTimeStamps[i];
0957: return i;
0958: }
0959: }
0960:
0961: return INVALIDWINDOW;
0962: }
0963:
0964: /**
0965: * Determine if a static window for a certain character is defined
0966: * @param c The character in question
0967: * @return The static window containing <TT>c</TT>, or
0968: * INVALIDWINDOW if not defined.
0969: */
0970: private static int findStaticWindow(int c) {
0971: // supposedly faster to count down
0972: //for(int i = 0; i < NUMSTATICWINDOWS; i++) {
0973: for (int i = NUMSTATICWINDOWS - 1; i >= 0; --i) {
0974: if (inStaticWindow(c, i)) {
0975: return i;
0976: }
0977: }
0978:
0979: return INVALIDWINDOW;
0980: }
0981:
0982: //==========================
0983: // Find the least-recently used window
0984: //==========================
0985:
0986: /** Find the least-recently defined window */
0987: private int getLRDefinedWindow() {
0988: int leastRU = Integer.MAX_VALUE;
0989: int whichWindow = INVALIDWINDOW;
0990:
0991: // find least recently used window
0992: // supposedly faster to count down
0993: //for( int i = 0; i < NUMWINDOWS; i++ ) {
0994: for (int i = NUMWINDOWS - 1; i >= 0; --i) {
0995: if (fTimeStamps[i] < leastRU) {
0996: leastRU = fTimeStamps[i];
0997: whichWindow = i;
0998: }
0999: }
1000:
1001: return whichWindow;
1002: }
1003:
1004: };
|