001: /**
002: *******************************************************************************
003: * Copyright (C) 2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.impl;
007:
/**
 * For generation of Implicit CEs.
 *
 * <p>Cleaned up so that changes can be made more easily.
 * Old values:
 * <pre>
 * # First Implicit: E26A792D
 * # Last Implicit: E3DC70C0
 * # First CJK: E0030300
 * # Last CJK: E0A9DD00
 * # First CJK_A: E0A9DF00
 * # Last CJK_A: E0DE3100
 * </pre>
 *
 * @author Mark Davis
 * @internal
 */
022: public class ImplicitCEGenerator {
023:
024: /**
025: * constants
026: */
027: static final boolean DEBUG = false;
028:
029: static final long topByte = 0xFF000000L;
030: static final long bottomByte = 0xFFL;
031: static final long fourBytes = 0xFFFFFFFFL;
032:
033: static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2
034:
035: public static final int CJK_BASE = 0x4E00, CJK_LIMIT = 0x9FFF + 1,
036: CJK_COMPAT_USED_BASE = 0xFA0E,
037: CJK_COMPAT_USED_LIMIT = 0xFA2F + 1, CJK_A_BASE = 0x3400,
038: CJK_A_LIMIT = 0x4DBF + 1, CJK_B_BASE = 0x20000,
039: CJK_B_LIMIT = 0x2A6DF + 1;
040:
041: private void throwError(String title, int cp) {
042: throw new IllegalArgumentException(title + "\t"
043: + Utility.hex(cp, 6) + "\t"
044: + Utility.hex(getImplicitFromRaw(cp) & fourBytes));
045: }
046:
047: private void throwError(String title, long ce) {
048: throw new IllegalArgumentException(title + "\t"
049: + Utility.hex(ce & fourBytes));
050: }
051:
052: private void show(int i) {
053: if (i >= 0 && i <= MAX_INPUT) {
054: System.out.println(Utility.hex(i) + "\t"
055: + Utility.hex(getImplicitFromRaw(i) & fourBytes));
056: }
057: }
058:
059: /**
060: * Precomputed by constructor
061: */
062: int final3Multiplier;
063: int final4Multiplier;
064: int final3Count;
065: int final4Count;
066: int medialCount;
067: int min3Primary;
068: int min4Primary;
069: int max4Primary;
070: int minTrail;
071: int maxTrail;
072: int max3Trail;
073: int max4Trail;
074: int min4Boundary;
075:
076: public int getGap4() {
077: return final4Multiplier - 1;
078: }
079:
080: public int getGap3() {
081: return final3Multiplier - 1;
082: }
083:
084: // old comment
085: // we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
086: // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
087: // we shift so that HAN all has the same first primary, for compression.
088: // for the 4 byte case, we make the gap as large as we can fit.
089:
090: /**
091: * Supply parameters for generating implicit CEs
092: */
093: public ImplicitCEGenerator(int minPrimary, int maxPrimary) {
094: // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
095: this (minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
096: }
097:
098: /**
099: * Set up to generate implicits.
100: * @param minPrimary
101: * @param maxPrimary
102: * @param minTrail final byte
103: * @param maxTrail final byte
104: * @param gap3 the gap we leave for tailoring for 3-byte forms
105: * @param primaries3count number of 3-byte primarys we can use (normally 1)
106: */
107: public ImplicitCEGenerator(int minPrimary, int maxPrimary,
108: int minTrail, int maxTrail, int gap3, int primaries3count) {
109: // some simple parameter checks
110: if (minPrimary < 0 || minPrimary >= maxPrimary
111: || maxPrimary > 0xFF) {
112: throw new IllegalArgumentException("bad lead bytes");
113: }
114: if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) {
115: throw new IllegalArgumentException("bad trail bytes");
116: }
117: if (primaries3count < 1) {
118: throw new IllegalArgumentException(
119: "bad three-byte primaries");
120: }
121:
122: this .minTrail = minTrail;
123: this .maxTrail = maxTrail;
124:
125: min3Primary = minPrimary;
126: max4Primary = maxPrimary;
127: // compute constants for use later.
128: // number of values we can use in trailing bytes
129: // leave room for empty values between AND above, e.g. if gap = 2
130: // range 3..7 => +3 -4 -5 -6 -7: so 1 value
131: // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
132: // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
133: final3Multiplier = gap3 + 1;
134: final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
135: max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
136:
137: // medials can use full range
138: medialCount = (maxTrail - minTrail + 1);
139: // find out how many values fit in each form
140: int threeByteCount = medialCount * final3Count;
141: // now determine where the 3/4 boundary is.
142: // we use 3 bytes below the boundary, and 4 above
143: int primariesAvailable = maxPrimary - minPrimary + 1;
144: int primaries4count = primariesAvailable - primaries3count;
145:
146: int min3ByteCoverage = primaries3count * threeByteCount;
147: min4Primary = minPrimary + primaries3count;
148: min4Boundary = min3ByteCoverage;
149: // Now expand out the multiplier for the 4 bytes, and redo.
150:
151: int totalNeeded = MAX_INPUT - min4Boundary;
152: int neededPerPrimaryByte = divideAndRoundUp(totalNeeded,
153: primaries4count);
154: if (DEBUG)
155: System.out.println("neededPerPrimaryByte: "
156: + neededPerPrimaryByte);
157:
158: int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte,
159: medialCount * medialCount);
160: if (DEBUG)
161: System.out.println("neededPerFinalByte: "
162: + neededPerFinalByte);
163:
164: int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
165: if (DEBUG)
166: System.out.println("expandedGap: " + gap4);
167: if (gap4 < 1)
168: throw new IllegalArgumentException("must have larger gap4s");
169:
170: final4Multiplier = gap4 + 1;
171: final4Count = neededPerFinalByte;
172: max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
173:
174: if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
175: throw new IllegalArgumentException("internal error");
176: }
177: if (DEBUG) {
178: System.out.println("final4Count: " + final4Count);
179: for (int counter = 0; counter < final4Count; ++counter) {
180: int value = minTrail + (1 + counter) * final4Multiplier;
181: System.out.println(counter + "\t" + value + "\t"
182: + Utility.hex(value));
183: }
184: }
185: }
186:
187: static public int divideAndRoundUp(int a, int b) {
188: return 1 + (a - 1) / b;
189: }
190:
191: /**
192: * Converts implicit CE into raw integer
193: * @param implicit
194: * @return -1 if illegal format
195: */
196: public int getRawFromImplicit(int implicit) {
197: int result;
198: int b3 = implicit & 0xFF;
199: implicit >>= 8;
200: int b2 = implicit & 0xFF;
201: implicit >>= 8;
202: int b1 = implicit & 0xFF;
203: implicit >>= 8;
204: int b0 = implicit & 0xFF;
205:
206: // simple parameter checks
207: if (b0 < min3Primary || b0 > max4Primary || b1 < minTrail
208: || b1 > maxTrail)
209: return -1;
210: // normal offsets
211: b1 -= minTrail;
212:
213: // take care of the final values, and compose
214: if (b0 < min4Primary) {
215: if (b2 < minTrail || b2 > max3Trail || b3 != 0)
216: return -1;
217: b2 -= minTrail;
218: int remainder = b2 % final3Multiplier;
219: if (remainder != 0)
220: return -1;
221: b0 -= min3Primary;
222: b2 /= final3Multiplier;
223: result = ((b0 * medialCount) + b1) * final3Count + b2;
224: } else {
225: if (b2 < minTrail || b2 > maxTrail || b3 < minTrail
226: || b3 > max4Trail)
227: return -1;
228: b2 -= minTrail;
229: b3 -= minTrail;
230: int remainder = b3 % final4Multiplier;
231: if (remainder != 0)
232: return -1;
233: b3 /= final4Multiplier;
234: b0 -= min4Primary;
235: result = (((b0 * medialCount) + b1) * medialCount + b2)
236: * final4Count + b3 + min4Boundary;
237: }
238: // final check
239: if (result < 0 || result > MAX_INPUT)
240: return -1;
241: return result;
242: }
243:
244: /**
245: * Generate the implicit CE, from raw integer.
246: * Left shifted to put the first byte at the top of an int.
247: * @param cp code point
248: * @return Primary implicit weight
249: */
250: public int getImplicitFromRaw(int cp) {
251: if (cp < 0 || cp > MAX_INPUT) {
252: throw new IllegalArgumentException(
253: "Code point out of range " + Utility.hex(cp));
254: }
255: int last0 = cp - min4Boundary;
256: if (last0 < 0) {
257: int last1 = cp / final3Count;
258: last0 = cp % final3Count;
259:
260: int last2 = last1 / medialCount;
261: last1 %= medialCount;
262:
263: last0 = minTrail + last0 * final3Multiplier; // spread out, leaving gap at start
264: last1 = minTrail + last1; // offset
265: last2 = min3Primary + last2; // offset
266:
267: if (last2 >= min4Primary) {
268: throw new IllegalArgumentException(
269: "4-byte out of range: " + Utility.hex(cp)
270: + ", " + Utility.hex(last2));
271: }
272:
273: return (last2 << 24) + (last1 << 16) + (last0 << 8);
274: } else {
275: int last1 = last0 / final4Count;
276: last0 %= final4Count;
277:
278: int last2 = last1 / medialCount;
279: last1 %= medialCount;
280:
281: int last3 = last2 / medialCount;
282: last2 %= medialCount;
283:
284: last0 = minTrail + last0 * final4Multiplier; // spread out, leaving gap at start
285: last1 = minTrail + last1; // offset
286: last2 = minTrail + last2; // offset
287: last3 = min4Primary + last3; // offset
288:
289: if (last3 > max4Primary) {
290: throw new IllegalArgumentException(
291: "4-byte out of range: " + Utility.hex(cp)
292: + ", " + Utility.hex(last3));
293: }
294:
295: return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
296: }
297: }
298:
299: /**
300: * Gets an Implicit from a code point. Internally,
301: * swaps (which produces a raw value 0..220000,
302: * then converts raw to implicit.
303: * @param cp
304: * @return Primary implicit weight
305: */
306: public int getImplicitFromCodePoint(int cp) {
307: if (DEBUG)
308: System.out.println("Incoming: " + Utility.hex(cp));
309:
310: // Produce Raw value
311: // note, we add 1 so that the first value is always empty!!
312: cp = ImplicitCEGenerator.swapCJK(cp) + 1;
313: // we now have a range of numbers from 0 to 220000.
314:
315: if (DEBUG)
316: System.out.println("CJK swapped: " + Utility.hex(cp));
317:
318: return getImplicitFromRaw(cp);
319: }
320:
321: /**
322: * Function used to:
323: * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
324: * b) bump any non-CJK characters by 10FFFF.
325: * The relevant blocks are:
326: * A: 4E00..9FFF; CJK Unified Ideographs
327: * F900..FAFF; CJK Compatibility Ideographs
328: * B: 3400..4DBF; CJK Unified Ideographs Extension A
329: * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
330: * As long as
331: * no new B characters are allocated between 4E00 and FAFF, and
332: * no new A characters are outside of this range,
333: * (very high probability) this simple code will work.
334: * The reordered blocks are:
335: * Block1 is CJK
336: * Block2 is CJK_COMPAT_USED
337: * Block3 is CJK_A
338: * (all contiguous)
339: * Any other CJK gets its normal code point
340: * Any non-CJK gets +10FFFF
341: * When we reorder Block1, we make sure that it is at the very start,
342: * so that it will use a 3-byte form.
343: * Warning: the we only pick up the compatibility characters that are
344: * NOT decomposed, so that block is smaller!
345: */
346:
347: static int NON_CJK_OFFSET = 0x110000;
348:
349: static int swapCJK(int i) {
350:
351: if (i >= CJK_BASE) {
352: if (i < CJK_LIMIT)
353: return i - CJK_BASE;
354:
355: if (i < CJK_COMPAT_USED_BASE)
356: return i + NON_CJK_OFFSET;
357:
358: if (i < CJK_COMPAT_USED_LIMIT)
359: return i - CJK_COMPAT_USED_BASE
360: + (CJK_LIMIT - CJK_BASE);
361: if (i < CJK_B_BASE)
362: return i + NON_CJK_OFFSET;
363:
364: if (i < CJK_B_LIMIT)
365: return i; // non-BMP-CJK
366:
367: return i + NON_CJK_OFFSET; // non-CJK
368: }
369: if (i < CJK_A_BASE)
370: return i + NON_CJK_OFFSET;
371:
372: if (i < CJK_A_LIMIT)
373: return i - CJK_A_BASE + (CJK_LIMIT - CJK_BASE)
374: + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
375: return i + NON_CJK_OFFSET; // non-CJK
376: }
377:
378: /**
379: * @return Minimal trail value
380: */
381: public int getMinTrail() {
382: return minTrail;
383: }
384:
385: /**
386: * @return Maximal trail value
387: */
388: public int getMaxTrail() {
389: return maxTrail;
390: }
391:
392: public int getCodePointFromRaw(int i) {
393: i--;
394: int result = 0;
395: if (i >= NON_CJK_OFFSET) {
396: result = i - NON_CJK_OFFSET;
397: } else if (i >= CJK_B_BASE) {
398: result = i;
399: } else if (i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE)
400: + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
401: // rest of CJKs, compacted
402: if (i < CJK_LIMIT - CJK_BASE) {
403: result = i + CJK_BASE;
404: } else if (i < (CJK_LIMIT - CJK_BASE)
405: + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
406: result = i + CJK_COMPAT_USED_BASE
407: - (CJK_LIMIT - CJK_BASE);
408: } else {
409: result = i
410: + CJK_A_BASE
411: - (CJK_LIMIT - CJK_BASE)
412: - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
413: }
414: } else {
415: result = -1;
416: }
417: return result;
418: }
419:
420: public int getRawFromCodePoint(int i) {
421: return swapCJK(i) + 1;
422: }
423: }
|