001: package com.healthmarketscience.jackcess.scsu;
002:
003: /*
004: * This sample software accompanies Unicode Technical Report #6 and
005: * distributed as is by Unicode, Inc., subject to the following:
006: *
007: * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
008: *
009: * Permission to use, copy, modify, and distribute this software
010: * without fee is hereby granted provided that this copyright notice
011: * appears in all copies.
012: *
013: * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
014: * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
015: * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
016: * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
017: * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
018: * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
019: * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
020: * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
021: *
022: * @author Asmus Freytag
023: *
024: * @version 001 Dec 25 1996
025: * @version 002 Jun 25 1997
026: * @version 003 Jul 25 1997
027: * @version 004 Aug 25 1997
028: * @version 005 Sep 30 1998
029: *
030: * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
031: * and are registered in some jurisdictions.
032: **/
033:
034: /**
035: Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
036:
037: <H2>Notes on the Java implementation</H2>
038:
039: A limitation of Java is the exclusive use of a signed byte data type.
040: The following work arounds are required:
041:
042: Copying a byte to an integer variable and adding 256 for 'negative'
043: bytes gives an integer in the range 0-255.
044:
045: Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
046: char values is unsigned.
047:
048: Extended characters require an int to store them. The sign is not an
049: issue because only 1024*1024 + 65536 extended characters exist.
050:
051: **/
052: public class Expand extends SCSU {
053: /** (re-)define (and select) a dynamic window
054: A sliding window position cannot start at any Unicode value,
055: so rather than providing an absolute offset, this function takes
056: an index value which selects among the possible starting values.
057:
058: Most scripts in Unicode start on or near a half-block boundary
059: so the default behaviour is to multiply the index by 0x80. Han,
060: Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
061: show very poor locality--therefore no sliding window can be set
062: there. A jumpOffset is added to the index value to skip that region,
063: and only 167 index values total are required to select all eligible
064: half-blocks.
065:
066: Finally, a few scripts straddle half block boundaries. For them, a
067: table of fixed offsets is used, and the index values from 0xF9 to
068: 0xFF are used to select these special offsets.
069:
070: After (re-)defining a windows location it is selected so it is ready
071: for use.
072:
073: Recall that all Windows are of the same length (128 code positions).
074:
075: @param iWindow - index of the window to be (re-)defined
076: @param bOffset - index for the new offset value
077: **/
078: // @005 protected <-- private here and elsewhere
079: protected void defineWindow(int iWindow, byte bOffset)
080: throws IllegalInputException {
081: int iOffset = (bOffset < 0 ? bOffset + 256 : bOffset);
082:
083: // 0 is a reserved value
084: if (iOffset == 0) {
085: throw new IllegalInputException();
086: } else if (iOffset < gapThreshold) {
087: dynamicOffset[iWindow] = iOffset << 7;
088: } else if (iOffset < reservedStart) {
089: dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
090: } else if (iOffset < fixedThreshold) {
091: // more reserved values
092: throw new IllegalInputException("iOffset == " + iOffset);
093: } else {
094: dynamicOffset[iWindow] = fixedOffset[iOffset
095: - fixedThreshold];
096: }
097:
098: // make the redefined window the active one
099: selectWindow(iWindow);
100: }
101:
102: /** (re-)define (and select) a window as an extended dynamic window
103: The surrogate area in Unicode allows access to 2**20 codes beyond the
104: first 64K codes by combining one of 1024 characters from the High
105: Surrogate Area with one of 1024 characters from the Low Surrogate
106: Area (see Unicode 2.0 for the details).
107:
108: The tags SDX and UDX set the window such that each subsequent byte in
109: the range 80 to FF represents a surrogate pair. The following diagram
110: shows how the bits in the two bytes following the SDX or UDX, and a
111: subsequent data byte, map onto the bits in the resulting surrogate pair.
112:
113: hbyte lbyte data
114: nnnwwwww zzzzzyyy 1xxxxxxx
115:
116: high-surrogate low-surrogate
117: 110110wwwwwzzzzz 110111yyyxxxxxxx
118:
119: @param chOffset - Since the three top bits of chOffset are not needed to
120: set the location of the extended Window, they are used instead
121: to select the window, thereby reducing the number of needed command codes.
122: The bottom 13 bits of chOffset are used to calculate the offset relative to
123: a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
124: **/
125: protected void defineExtendedWindow(char chOffset) {
126: // The top 3 bits of iOffsetHi are the window index
127: int iWindow = chOffset >>> 13;
128:
129: // Calculate the new offset
130: dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);
131:
132: // make the redefined window the active one
133: selectWindow(iWindow);
134: }
135:
136: /** string buffer length used by the following functions */
137: protected int iOut = 0;
138:
139: /** input cursor used by the following functions */
140: protected int iIn = 0;
141:
142: /** expand input that is in Unicode mode
143: @param in input byte array to be expanded
144: @param iCur starting index
145: @param sb string buffer to which to append expanded input
146: @return the index for the lastc byte processed
147: **/
148: protected int expandUnicode(byte[] in, int iCur, StringBuffer sb)
149: throws IllegalInputException, EndOfInputException {
150: for (; iCur < in.length - 1; iCur += 2) // step by 2:
151: {
152: byte b = in[iCur];
153:
154: if (b >= UC0 && b <= UC7) {
155: Debug.out("SelectWindow: ", b);
156: selectWindow(b - UC0);
157: return iCur;
158: } else if (b >= UD0 && b <= UD7) {
159: defineWindow(b - UD0, in[iCur + 1]);
160: return iCur + 1;
161: } else if (b == UDX) {
162: if (iCur >= in.length - 2) {
163: break; // buffer error
164: }
165: defineExtendedWindow(charFromTwoBytes(in[iCur + 1],
166: in[iCur + 2]));
167: return iCur + 2;
168: } else if (b == UQU) {
169: if (iCur >= in.length - 2) {
170: break; // error
171: }
172: // Skip command byte and output Unicode character
173: iCur++;
174: }
175:
176: // output a Unicode character
177: char ch = charFromTwoBytes(in[iCur], in[iCur + 1]);
178: sb.append((char) ch);
179: iOut++;
180: }
181:
182: if (iCur == in.length) {
183: return iCur;
184: }
185:
186: // Error condition
187: throw new EndOfInputException();
188: }
189:
190: /** assemble a char from two bytes
191: In Java bytes are signed quantities, while chars are unsigned
192: @return the character
193: @param hi most significant byte
194: @param lo least significant byte
195: */
196: public static char charFromTwoBytes(byte hi, byte lo) {
197: char ch = (char) (lo >= 0 ? lo : 256 + lo);
198: return (char) (ch + (char) ((hi >= 0 ? hi : 256 + hi) << 8));
199: }
200:
201: /** expand portion of the input that is in single byte mode **/
202: @SuppressWarnings("fallthrough")
203: protected String expandSingleByte(byte[] in)
204: throws IllegalInputException, EndOfInputException {
205:
206: /* Allocate the output buffer. Because of control codes, generally
207: each byte of input results in fewer than one character of
208: output. Using in.length as an intial allocation length should avoid
209: the need to reallocate in mid-stream. The exception to this rule are
210: surrogates. */
211: StringBuffer sb = new StringBuffer(in.length);
212: iOut = 0;
213:
214: // Loop until all input is exhausted or an error occurred
215: int iCur;
216: Loop: for (iCur = 0; iCur < in.length; iCur++) {
217: // DEBUG Debug.out("Expanding: ", iCur);
218:
219: // Default behaviour is that ASCII characters are passed through
220: // (staticOffset[0] == 0) and characters with the high bit on are
221: // offset by the current dynamic (or sliding) window (this.iWindow)
222: int iStaticWindow = 0;
223: int iDynamicWindow = getCurrentWindow();
224:
225: switch (in[iCur]) {
226: // Quote from a static Window
227: case SQ0:
228: case SQ1:
229: case SQ2:
230: case SQ3:
231: case SQ4:
232: case SQ5:
233: case SQ6:
234: case SQ7:
235: Debug.out("SQn:", iStaticWindow);
236: // skip the command byte and check for length
237: if (iCur >= in.length - 1) {
238: Debug.out("SQn missing argument: ", in, iCur);
239: break Loop; // buffer length error
240: }
241: // Select window pair to quote from
242: iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
243: iCur++;
244:
245: // FALL THROUGH
246:
247: default:
248: // output as character
249: if (in[iCur] >= 0) {
250: // use static window
251: int ch = in[iCur] + staticOffset[iStaticWindow];
252: sb.append((char) ch);
253: iOut++;
254: } else {
255: // use dynamic window
256: int ch = (in[iCur] + 256); // adjust for signed bytes
257: ch -= 0x80; // reduce to range 00..7F
258: ch += dynamicOffset[iDynamicWindow];
259:
260: //DEBUG
261: Debug.out("Dynamic: ", (char) ch);
262:
263: if (ch < 1 << 16) {
264: // in Unicode range, output directly
265: sb.append((char) ch);
266: iOut++;
267: } else {
268: // this is an extension character
269: Debug.out("Extension character: ", ch);
270:
271: // compute and append the two surrogates:
272: // translate from 10000..10FFFF to 0..FFFFF
273: ch -= 0x10000;
274:
275: // high surrogate = top 10 bits added to D800
276: sb.append((char) (0xD800 + (ch >> 10)));
277: iOut++;
278:
279: // low surrogate = bottom 10 bits added to DC00
280: sb.append((char) (0xDC00 + (ch & ~0xFC00)));
281: iOut++;
282: }
283: }
284: break;
285:
286: // define a dynamic window as extended
287: case SDX:
288: iCur += 2;
289: if (iCur >= in.length) {
290: Debug.out("SDn missing argument: ", in, iCur - 1);
291: break Loop; // buffer length error
292: }
293: defineExtendedWindow(charFromTwoBytes(in[iCur - 1],
294: in[iCur]));
295: break;
296:
297: // Position a dynamic Window
298: case SD0:
299: case SD1:
300: case SD2:
301: case SD3:
302: case SD4:
303: case SD5:
304: case SD6:
305: case SD7:
306: iCur++;
307: if (iCur >= in.length) {
308: Debug.out("SDn missing argument: ", in, iCur - 1);
309: break Loop; // buffer length error
310: }
311: defineWindow(in[iCur - 1] - SD0, in[iCur]);
312: break;
313:
314: // Select a new dynamic Window
315: case SC0:
316: case SC1:
317: case SC2:
318: case SC3:
319: case SC4:
320: case SC5:
321: case SC6:
322: case SC7:
323: selectWindow(in[iCur] - SC0);
324: break;
325: case SCU:
326: // switch to Unicode mode and continue parsing
327: iCur = expandUnicode(in, iCur + 1, sb);
328: // DEBUG Debug.out("Expanded Unicode range until: ", iCur);
329: break;
330:
331: case SQU:
332: // directly extract one Unicode character
333: iCur += 2;
334: if (iCur >= in.length) {
335: Debug.out("SQU missing argument: ", in, iCur - 2);
336: break Loop; // buffer length error
337: } else {
338: char ch = charFromTwoBytes(in[iCur - 1], in[iCur]);
339:
340: Debug.out("Quoted: ", ch);
341: sb.append((char) ch);
342: iOut++;
343: }
344: break;
345:
346: case Srs:
347: throw new IllegalInputException();
348: // break;
349: }
350: }
351:
352: if (iCur >= in.length) {
353: //SUCCESS: all input used up
354: sb.setLength(iOut);
355: iIn = iCur;
356: return sb.toString();
357: }
358:
359: Debug.out("Length ==" + in.length + " iCur =", iCur);
360: //ERROR: premature end of input
361: throw new EndOfInputException();
362: }
363:
364: /** expand a byte array containing compressed Unicode */
365: public String expand(byte[] in) throws IllegalInputException,
366: EndOfInputException {
367: String str = expandSingleByte(in);
368: Debug.out("expand output: ", str.toCharArray());
369: return str;
370: }
371:
372: /** reset is called to start with new input, w/o creating a new
373: instance */
374: public void reset() {
375: iOut = 0;
376: iIn = 0;
377: super .reset();
378: }
379:
380: public int charsWritten() {
381: return iOut;
382: }
383:
384: public int bytesRead() {
385: return iIn;
386: }
387: }
|