001: package com.healthmarketscience.jackcess.scsu;
002:
003: /*
004: * This sample software accompanies Unicode Technical Report #6 and
005: * distributed as is by Unicode, Inc., subject to the following:
006: *
007: * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
008: *
009: * Permission to use, copy, modify, and distribute this software
010: * without fee is hereby granted provided that this copyright notice
011: * appears in all copies.
012: *
013: * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
014: * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
015: * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
016: * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
017: * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
018: * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
019: * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
020: * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
021: *
022: * @author Asmus Freytag
023: *
024: * @version 001 Dec 25 1996
025: * @version 002 Jun 25 1997
026: * @version 003 Jul 25 1997
027: * @version 004 Aug 25 1997
028: * @version 005 Sep 30 1998
029: *
030: * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
031: * and are registered in some jurisdictions.
032: **/
033:
034: /**
035: Encoding text data in Unicode often requires more storage than using
036: an existing 8-bit character set that is limited to the subset of characters
037: actually found in the text. The Unicode Compression Algorithm reduces
038: the necessary storage while retaining the universality of Unicode.
039: A full description of the algorithm can be found in document
040: http://www.unicode.org/unicode/reports/tr6.html
041:
042: Summary
043:
044: The goal of the Unicode Compression Algorithm is the ability to
045: * Express all code points in Unicode
046: * Approximate storage size for traditional character sets
047: * Work well for short strings
048: * Provide transparency for Latin-1 data
049: * Support very simple decoders
050: * Support simple as well as sophisticated encoders
051:
052: If needed, further compression can be achieved by layering standard
053: file or disk-block based compression algorithms on top.
054:
055: <H2>Features</H2>
056:
057: Languages using small alphabets would contain runs of characters that
058: are coded close together in Unicode. These runs are interrupted only
059: by punctuation characters, which are themselves coded in proximity to
060: each other in Unicode (usually in the ASCII range).
061:
062: Two basic mechanisms in the compression algorithm account for these two
063: cases, sliding windows and static windows. A window is an area of 128
064: consecutive characters in Unicode. In the compressed data stream, each
065: character from a sliding window would be represented as a byte between
066: 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
067: TAB) would always mean an ASCII character (or control).
068:
069: <H2>Notes on the Java implementation</H2>
070:
071: A limitation of Java is the exclusive use of a signed byte data type.
072: The following workarounds are required:
073:
074: Copying a byte to an integer variable and adding 256 for 'negative'
075: bytes gives an integer in the range 0-255.
076:
077: Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
078: char values is unsigned.
079:
080: Extended characters require an int to store them. The sign is not an
081: issue because only 1024*1024 + 65536 extended characters exist.
082:
083: **/
public abstract class SCSU {

    // ------------------------------------------------------------------
    // Single Byte mode command values
    // ------------------------------------------------------------------

    /*
     * SQn: Quote from Window n.
     * If the following byte is less than 0x80, quote from static window n,
     * else quote from dynamic window n.
     */
    static final byte SQ0 = 0x01; // Quote from window pair 0
    static final byte SQ1 = 0x02; // Quote from window pair 1
    static final byte SQ2 = 0x03; // Quote from window pair 2
    static final byte SQ3 = 0x04; // Quote from window pair 3
    static final byte SQ4 = 0x05; // Quote from window pair 4
    static final byte SQ5 = 0x06; // Quote from window pair 5
    static final byte SQ6 = 0x07; // Quote from window pair 6
    static final byte SQ7 = 0x08; // Quote from window pair 7

    static final byte SDX = 0x0B; // Define a window as extended
    static final byte Srs = 0x0C; // reserved

    static final byte SQU = 0x0E; // Quote a single Unicode character
    static final byte SCU = 0x0F; // Change to Unicode mode

    /*
     * SCn: Change to Window n.
     * If the following bytes are less than 0x80, interpret them as command
     * bytes or pass them through, else add the offset for dynamic window n.
     */
    static final byte SC0 = 0x10; // Select window 0
    static final byte SC1 = 0x11; // Select window 1
    static final byte SC2 = 0x12; // Select window 2
    static final byte SC3 = 0x13; // Select window 3
    static final byte SC4 = 0x14; // Select window 4
    static final byte SC5 = 0x15; // Select window 5
    static final byte SC6 = 0x16; // Select window 6
    static final byte SC7 = 0x17; // Select window 7
    static final byte SD0 = 0x18; // Define and select window 0
    static final byte SD1 = 0x19; // Define and select window 1
    static final byte SD2 = 0x1A; // Define and select window 2
    static final byte SD3 = 0x1B; // Define and select window 3
    static final byte SD4 = 0x1C; // Define and select window 4
    static final byte SD5 = 0x1D; // Define and select window 5
    static final byte SD6 = 0x1E; // Define and select window 6
    static final byte SD7 = 0x1F; // Define and select window 7

    // ------------------------------------------------------------------
    // U-prefixed command values (0xE0 and above). The casts are needed
    // because these literals do not fit into Java's signed byte.
    // ------------------------------------------------------------------

    static final byte UC0 = (byte) 0xE0; // Select window 0
    static final byte UC1 = (byte) 0xE1; // Select window 1
    static final byte UC2 = (byte) 0xE2; // Select window 2
    static final byte UC3 = (byte) 0xE3; // Select window 3
    static final byte UC4 = (byte) 0xE4; // Select window 4
    static final byte UC5 = (byte) 0xE5; // Select window 5
    static final byte UC6 = (byte) 0xE6; // Select window 6
    static final byte UC7 = (byte) 0xE7; // Select window 7
    static final byte UD0 = (byte) 0xE8; // Define and select window 0
    static final byte UD1 = (byte) 0xE9; // Define and select window 1
    static final byte UD2 = (byte) 0xEA; // Define and select window 2
    static final byte UD3 = (byte) 0xEB; // Define and select window 3
    static final byte UD4 = (byte) 0xEC; // Define and select window 4
    static final byte UD5 = (byte) 0xED; // Define and select window 5
    static final byte UD6 = (byte) 0xEE; // Define and select window 6
    static final byte UD7 = (byte) 0xEF; // Define and select window 7

    static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
    static final byte UDX = (byte) 0xF1; // Define a Window as extended
    static final byte Urs = (byte) 0xF2; // reserved

    /** Constant offsets for the 8 static windows. */
    static final int[] staticOffset = {
        0x0000, // ASCII for quoted tags
        0x0080, // Latin - 1 Supplement (for access to punctuation)
        0x0100, // Latin Extended-A
        0x0300, // Combining Diacritical Marks
        0x2000, // General Punctuation
        0x2080, // Currency Symbols
        0x2100, // Letterlike Symbols and Number Forms
        0x3000  // CJK Symbols and punctuation
    };

    /** Initial offsets for the 8 dynamic (sliding) windows. */
    static final int[] initialDynamicOffset = {
        0x0080, // Latin-1
        0x00C0, // Latin Extended A //@005 fixed from 0x0100
        0x0400, // Cyrillic
        0x0600, // Arabic
        0x0900, // Devanagari
        0x3040, // Hiragana
        0x30A0, // Katakana
        0xFF00  // Fullwidth ASCII
    };

    /**
     * Dynamic window offsets of this instance, initialized to the default
     * values. This is a private copy of {@link #initialDynamicOffset}, so
     * writing to its elements never touches the shared defaults.
     */
    int[] dynamicOffset = initialDynamicOffset.clone();

    // The following members are common to encoder and decoder

    /** Index of the currently active dynamic window. */
    private int iWindow = 0;

    /**
     * Selects the active dynamic window.
     *
     * @param iWindow index of the window to make active; stored as given,
     *                no range check is performed here
     */
    protected void selectWindow(int iWindow) {
        this.iWindow = iWindow;
    }

    /**
     * Returns the index of the currently active dynamic window.
     *
     * @return the value last passed to {@link #selectWindow(int)}, or 0 if
     *         no window has been selected since construction or the last
     *         {@link #reset()}
     */
    protected int getCurrentWindow() {
        return this.iWindow;
    }

    // ------------------------------------------------------------------
    // These values are used in defineWindow
    // ------------------------------------------------------------------

    /**
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore add {@link #gapOffset} to all values from
     * gapThreshold.
     */
    static final int gapThreshold = 0x68;

    /** Amount added to values from {@link #gapThreshold} onwards. */
    static final int gapOffset = 0xAC00;

    /** Values between reservedStart and {@link #fixedThreshold} are reserved. */
    static final int reservedStart = 0xA8;

    /** Use the table of predefined fixed offsets for values from fixedThreshold. */
    static final int fixedThreshold = 0xF9;

    /** Table of fixed predefined offsets, and the byte values that index into it. */
    static final int[] fixedOffset = {
        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
        /* 0xFA */ 0x0250, // IPA extensions
        /* 0xFB */ 0x0370, // Greek
        /* 0xFC */ 0x0530, // Armenian
        /* 0xFD */ 0x3040, // Hiragana
        /* 0xFE */ 0x30A0, // Katakana
        /* 0xFF */ 0xFF60  // Halfwidth Katakana
    };

    /**
     * Tells whether a character is compressible.
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch} is below 0x3400 or at/above 0xE000,
     *         {@code false} for the range 0x3400..0xDFFF
     */
    public static boolean isCompressible(char ch) {
        return (ch < 0x3400 || ch >= 0xE000);
    }

    /**
     * Restores the initial state: all dynamic window offsets go back to
     * {@link #initialDynamicOffset} and window 0 becomes the active window.
     * Reset is only needed to bail out after an exception and restart with
     * new input.
     */
    public void reset() {
        // reset the dynamic windows
        System.arraycopy(initialDynamicOffset, 0,
                         dynamicOffset, 0, dynamicOffset.length);
        this.iWindow = 0;
    }
}
|