001: /**
002: *******************************************************************************
003: * Copyright (C) 1996-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.impl;
007:
008: import com.ibm.icu.text.UCharacterIterator;
009:
010: /**
011: * <p>Binary Ordered Compression for Unicode</p>
012: *
013: * <p>Users are strongly encouraged to read the ICU paper on
014: * <a href="http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html">
015: * BOCU</a> before attempting to use this class.</p>
016: *
017: * <p>BOCU is used to compress unicode text into a stream of unsigned
018: * bytes. For many kinds of text the compression compares favorably
019: * to UTF-8, and for some kinds of text (such as CJK) it does better.
020: * The resulting bytes will compare in the same order as the original
021: * code points. The byte stream does not contain the values 0, 1, or
022: * 2.</p>
023: *
024: * <p>One example of a use of BOCU is in {@link
025: * com.ibm.icu.text.Collator#getCollationKey(String)} for a RuleBasedCollator object with
026: * collation strength IDENTICAL. The result CollationKey will consist of the
027: * collation order of the source string followed by the BOCU result of the
028: * source string.
029: * </p>
030: *
031: * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for
032: * random access.</p>
033: *
034: * <p>Method: Slope Detection<br> Remember the previous code point
035: * (initial 0). For each code point in the string, encode the
036: * difference with the previous one. Similar to a UTF, the length of
037: * the byte sequence is encoded in the lead bytes. Unlike a UTF, the
038: * trail byte values may overlap with lead/single byte values. The
039: * signedness of the difference must be encoded as the most
040: * significant part.</p>
041: *
042: * <p>We encode differences with few bytes if their absolute values
043: * are small. For correct ordering, we must treat the entire value
044: * range -10ffff..+10ffff in ascending order, which forbids encoding
045: * the sign and the absolute value separately. Instead, we split the
046: * lead byte range in the middle and encode non-negative values going
047: * up and negative values going down.</p>
048: *
049: * <p>For very small absolute values, the difference is added to a
050: * middle byte value for single-byte encoded differences. For
051: * somewhat larger absolute values, the difference is divided by the
052: * number of byte values available, the modulo is used for one trail
053: * byte, and the remainder is added to a lead byte avoiding the
054: * single-byte range. For large absolute values, the difference is
055: * similarly encoded in three bytes. (Syn Wee, I need examples
056: * here.)</p>
057: *
058: * <p>BOCU does not use byte values 0, 1, or 2, but uses all other
059: * byte values for lead and single bytes, so that the middle range of
060: * single bytes is as large as possible.</p>
061: *
062: * <p>Note that the lead byte ranges overlap some, but that the
063: * sequences as a whole are well ordered. I.e., even if the lead byte
064: * is the same for sequences of different lengths, the trail bytes
065: * establish correct order. It would be possible to encode slightly
066: * larger ranges for each length (>1) by subtracting the lower bound
067: * of the range. However, that would also slow down the calculation.
068: * (Syn Wee, need an example).</p>
069: *
070: * <p>For the actual string encoding, an optimization moves the
071: * previous code point value to the middle of its Unicode script block
072: * to minimize the differences in same-script text runs. (Syn Wee,
073: * need an example.)</p>
074: *
075: * @author Syn Wee Quek
076: * @since release 2.2, May 3rd 2002
077: * @draft 2.2
078: */
079: public class BOCU {
080: // public constructors --------------------------------------------------
081:
082: // public methods -------------------------------------------------------
083:
084: /**
085: * <p>Encode the code points of a string as a sequence of bytes,
086: * preserving lexical order.</p>
087: * <p>The minimum size of buffer required for the compression can be
088: * preflighted by getCompressionLength(String).</p>
089: * @param source text source
090: * @param buffer output buffer
091: * @param offset to start writing to
092: * @return end offset where the writing stopped
093: * @see #getCompressionLength(String)
094: * @exception ArrayIndexOutOfBoundsException thrown if size of buffer is
095: * too small for the output.
096: */
097: public static int compress(String source, byte buffer[], int offset) {
098: int prev = 0;
099: UCharacterIterator iterator = UCharacterIterator
100: .getInstance(source);
101: int codepoint = iterator.nextCodePoint();
102: while (codepoint != UCharacterIterator.DONE) {
103: if (prev < 0x4e00 || prev >= 0xa000) {
104: prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
105: } else {
106: // Unihan U+4e00..U+9fa5:
107: // double-bytes down from the upper end
108: prev = 0x9fff - SLOPE_REACH_POS_2_;
109: }
110:
111: offset = writeDiff(codepoint - prev, buffer, offset);
112: prev = codepoint;
113: codepoint = iterator.nextCodePoint();
114: }
115: return offset;
116: }
117:
118: /**
119: * Return the number of bytes that compress() would write.
120: * @param source text source string
121: * @return the length of the BOCU result
122: * @see #compress(String, byte[], int)
123: */
124: public static int getCompressionLength(String source) {
125: int prev = 0;
126: int result = 0;
127: UCharacterIterator iterator = UCharacterIterator
128: .getInstance(source);
129: int codepoint = iterator.nextCodePoint();
130: while (codepoint != UCharacterIterator.DONE) {
131: if (prev < 0x4e00 || prev >= 0xa000) {
132: prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
133: } else {
134: // Unihan U+4e00..U+9fa5:
135: // double-bytes down from the upper end
136: prev = 0x9fff - SLOPE_REACH_POS_2_;
137: }
138:
139: codepoint = iterator.nextCodePoint();
140: result += lengthOfDiff(codepoint - prev);
141: prev = codepoint;
142: }
143: return result;
144: }
145:
146: // public setter methods -------------------------------------------------
147:
148: // public getter methods ------------------------------------------------
149:
150: // public other methods -------------------------------------------------
151:
152: // protected constructor ------------------------------------------------
153:
154: // protected data members ------------------------------------------------
155:
156: // protected methods -----------------------------------------------------
157:
158: // private data members --------------------------------------------------
159:
160: /**
161: * Do not use byte values 0, 1, 2 because they are separators in sort keys.
162: */
163: private static final int SLOPE_MIN_ = 3;
164: private static final int SLOPE_MAX_ = 0xff;
165: private static final int SLOPE_MIDDLE_ = 0x81;
166: private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_
167: - SLOPE_MIN_ + 1;
168: private static final int SLOPE_MAX_BYTES_ = 4;
169:
170: /**
171: * Number of lead bytes:
172: * 1 middle byte for 0
173: * 2*80=160 single bytes for !=0
174: * 2*42=84 for double-byte values
175: * 2*3=6 for 3-byte values
176: * 2*1=2 for 4-byte values
177: *
178: * The sum must be <=SLOPE_TAIL_COUNT.
179: *
180: * Why these numbers?
181: * - There should be >=128 single-byte values to cover 128-blocks
182: * with small scripts.
183: * - There should be >=20902 single/double-byte values to cover Unihan.
184: * - It helps CJK Extension B some if there are 3-byte values that cover
185: * the distance between them and Unihan.
186: * This also helps to jump among distant places in the BMP.
187: * - Four-byte values are necessary to cover the rest of Unicode.
188: *
189: * Symmetrical lead byte counts are for convenience.
190: * With an equal distribution of even and odd differences there is also
191: * no advantage to asymmetrical lead byte counts.
192: */
193: private static final int SLOPE_SINGLE_ = 80;
194: private static final int SLOPE_LEAD_2_ = 42;
195: private static final int SLOPE_LEAD_3_ = 3;
196: private static final int SLOPE_LEAD_4_ = 1;
197:
198: /**
199: * The difference value range for single-byters.
200: */
201: private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;
202: private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);
203:
204: /**
205: * The difference value range for double-byters.
206: */
207: private static final int SLOPE_REACH_POS_2_ = SLOPE_LEAD_2_
208: * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;
209: private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);
210:
211: /**
212: * The difference value range for 3-byters.
213: */
214: private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_
215: * SLOPE_TAIL_COUNT_ * SLOPE_TAIL_COUNT_
216: + (SLOPE_LEAD_3_ - 1) * SLOPE_TAIL_COUNT_
217: + (SLOPE_TAIL_COUNT_ - 1);
218: private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);
219:
220: /**
221: * The lead byte start values.
222: */
223: private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_
224: + SLOPE_SINGLE_ + 1;
225: private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_
226: + SLOPE_LEAD_2_;
227: private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_
228: + SLOPE_REACH_NEG_1_;
229: private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_
230: - SLOPE_LEAD_2_;
231:
232: // private constructor ---------------------------------------------------
233:
234: /**
235: * Constructor private to prevent initialization
236: */
237: ///CLOVER:OFF
238: private BOCU() {
239: }
240:
241: ///CLOVER:ON
242:
243: // private methods -------------------------------------------------------
244:
245: /**
246: * Integer division and modulo with negative numerators
247: * yields negative modulo results and quotients that are one more than
248: * what we need here.
249: * @param number which operations are to be performed on
250: * @param factor the factor to use for division
251: * @return (result of division) << 32 | modulo
252: */
253: private static final long getNegDivMod(int number, int factor) {
254: int modulo = number % factor;
255: long result = number / factor;
256: if (modulo < 0) {
257: --result;
258: modulo += factor;
259: }
260: return (result << 32) | modulo;
261: }
262:
263: /**
264: * Encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes,
265: * preserving lexical order
266: * @param diff
267: * @param buffer byte buffer to append to
268: * @param offset to the byte buffer to start appending
269: * @return end offset where the appending stops
270: */
271: private static final int writeDiff(int diff, byte buffer[],
272: int offset) {
273: if (diff >= SLOPE_REACH_NEG_1_) {
274: if (diff <= SLOPE_REACH_POS_1_) {
275: buffer[offset++] = (byte) (SLOPE_MIDDLE_ + diff);
276: } else if (diff <= SLOPE_REACH_POS_2_) {
277: buffer[offset++] = (byte) (SLOPE_START_POS_2_ + (diff / SLOPE_TAIL_COUNT_));
278: buffer[offset++] = (byte) (SLOPE_MIN_ + (diff % SLOPE_TAIL_COUNT_));
279: } else if (diff <= SLOPE_REACH_POS_3_) {
280: buffer[offset + 2] = (byte) (SLOPE_MIN_ + (diff % SLOPE_TAIL_COUNT_));
281: diff /= SLOPE_TAIL_COUNT_;
282: buffer[offset + 1] = (byte) (SLOPE_MIN_ + (diff % SLOPE_TAIL_COUNT_));
283: buffer[offset] = (byte) (SLOPE_START_POS_3_ + (diff / SLOPE_TAIL_COUNT_));
284: offset += 3;
285: } else {
286: buffer[offset + 3] = (byte) (SLOPE_MIN_ + diff
287: % SLOPE_TAIL_COUNT_);
288: diff /= SLOPE_TAIL_COUNT_;
289: buffer[offset] = (byte) (SLOPE_MIN_ + diff
290: % SLOPE_TAIL_COUNT_);
291: diff /= SLOPE_TAIL_COUNT_;
292: buffer[offset + 1] = (byte) (SLOPE_MIN_ + diff
293: % SLOPE_TAIL_COUNT_);
294: buffer[offset] = (byte) SLOPE_MAX_;
295: offset += 4;
296: }
297: } else {
298: long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
299: int modulo = (int) division;
300: if (diff >= SLOPE_REACH_NEG_2_) {
301: diff = (int) (division >> 32);
302: buffer[offset++] = (byte) (SLOPE_START_NEG_2_ + diff);
303: buffer[offset++] = (byte) (SLOPE_MIN_ + modulo);
304: } else if (diff >= SLOPE_REACH_NEG_3_) {
305: buffer[offset + 2] = (byte) (SLOPE_MIN_ + modulo);
306: diff = (int) (division >> 32);
307: division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
308: modulo = (int) division;
309: diff = (int) (division >> 32);
310: buffer[offset + 1] = (byte) (SLOPE_MIN_ + modulo);
311: buffer[offset] = (byte) (SLOPE_START_NEG_3_ + diff);
312: offset += 3;
313: } else {
314: buffer[offset + 3] = (byte) (SLOPE_MIN_ + modulo);
315: diff = (int) (division >> 32);
316: division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
317: modulo = (int) division;
318: diff = (int) (division >> 32);
319: buffer[offset + 2] = (byte) (SLOPE_MIN_ + modulo);
320: division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
321: modulo = (int) division;
322: buffer[offset + 1] = (byte) (SLOPE_MIN_ + modulo);
323: buffer[offset] = SLOPE_MIN_;
324: offset += 4;
325: }
326: }
327: return offset;
328: }
329:
330: /**
331: * How many bytes would writeDiff() write?
332: * @param diff
333: */
334: private static final int lengthOfDiff(int diff) {
335: if (diff >= SLOPE_REACH_NEG_1_) {
336: if (diff <= SLOPE_REACH_POS_1_) {
337: return 1;
338: } else if (diff <= SLOPE_REACH_POS_2_) {
339: return 2;
340: } else if (diff <= SLOPE_REACH_POS_3_) {
341: return 3;
342: } else {
343: return 4;
344: }
345: } else {
346: if (diff >= SLOPE_REACH_NEG_2_) {
347: return 2;
348: } else if (diff >= SLOPE_REACH_NEG_3_) {
349: return 3;
350: } else {
351: return 4;
352: }
353: }
354: }
355: }
|