001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.solr.util;
017:
018: /**
019: * @author yonik
020: * @version $Id: BCDUtils.java 472574 2006-11-08 18:25:52Z yonik $
021: */
022: public class BCDUtils {
023: // idiv is expensive...
024: // use fixed point math to multiply by 1/10
025: // http://www.cs.uiowa.edu/~jones/bcd/divide.html
026: private static int div10(int a) {
027: return (a * 0xcccd) >>> 19;
028: }
029:
030: private static int mul10(int a) {
031: return (a * 10);
032: }
033:
034: // private static int mul10(int a) { return ((a<<3)+(a<<1)); }
035: // private static int mul10(int a) { return (a+(a<<2))<<1; } // attempt to use LEA instr
036: // (imul32 on AMD64 only has a 3 cycle latency in any case)
037:
038: // something that won't clash with other base100int
039: // chars (something >= 100)
040: private static final char NEG_CHAR = (char) 126;
041: // The zero exponent.
042: // NOTE: for smaller integer representations, this current implementation
043: // combines sign and exponent into the first char. sign is negative if
044: // exponent is less than the zero point (no negative exponents themselves)
045: private static final int ZERO_EXPONENT = 'a'; // 97
046:
047: // WARNING: assumption is that this is a legal int...
048: // no validation is done. [+-]?digit*
049: //
050: // Normalization of zeros *is* done...
051: // 0004, 004, 04, 4 will all end up being equal
052: // 0,-0 are normalized to '' (zero length)
053: //
054: // The value is written to the output buffer
055: // from the end to the start. The return value
056: // is the start of the Base100 int in the output buffer.
057: //
058: // As the output will be smaller than the input, arr and
059: // out may refer to the same array if desired.
060: //
061: public static int base10toBase100(char[] arr, int start, int end,
062: char[] out, int outend) {
063: int wpos = outend; // write position
064: boolean neg = false;
065:
066: while (--end >= start) {
067: int val = arr[end];
068: if (val == '+') {
069: break;
070: } else if (val == '-') {
071: neg = !neg;
072: break;
073: } else {
074: val = val - '0';
075: if (end > start) {
076: int val2 = arr[end - 1];
077: if (val2 == '+') {
078: out[--wpos] = (char) val;
079: break;
080: }
081: if (val2 == '-') {
082: out[--wpos] = (char) val;
083: neg = !neg;
084: break;
085: }
086: end--;
087: val = val + (val2 - '0') * 10;
088: }
089: out[--wpos] = (char) val;
090: }
091: }
092:
093: // remove leading base100 zeros
094: while (wpos < outend && out[wpos] == 0)
095: wpos++;
096:
097: // check for a zero value
098: if (wpos == outend) {
099: // if zero, don't add negative sign
100: } else if (neg) {
101: out[--wpos] = NEG_CHAR;
102: }
103:
104: return wpos; // the start of the base100 int
105: }
106:
107: // Converts a base100 number to base10 character form
108: // returns number of chars written.
109: // At least 1 char is always written.
110: public static int base100toBase10(char[] arr, int start, int end,
111: char[] out, int offset) {
112: int wpos = offset; // write position
113: boolean firstDigit = true;
114: for (int i = start; i < end; i++) {
115: int val = arr[i];
116: if (val == NEG_CHAR) {
117: out[wpos++] = '-';
118: continue;
119: }
120: char tens = (char) (val / 10 + '0');
121: if (!firstDigit || tens != '0') { // skip leading 0
122: out[wpos++] = (char) (val / 10 + '0'); // tens position
123: }
124: out[wpos++] = (char) (val % 10 + '0'); // ones position
125: firstDigit = false;
126: }
127: if (firstDigit)
128: out[wpos++] = '0';
129: return wpos - offset;
130: }
131:
132: public static String base10toBase100SortableInt(String val) {
133: char[] arr = new char[val.length() + 1];
134: val.getChars(0, val.length(), arr, 0);
135: int len = base10toBase100SortableInt(arr, 0, val.length(), arr,
136: arr.length);
137: return new String(arr, arr.length - len, len);
138: }
139:
140: public static String base100SortableIntToBase10(String val) {
141: int slen = val.length();
142: char[] arr = new char[slen << 2];
143: val.getChars(0, slen, arr, 0);
144: int len = base100SortableIntToBase10(arr, 0, slen, arr, slen);
145: return new String(arr, slen, len);
146: }
147:
148: public static String base10toBase10kSortableInt(String val) {
149: char[] arr = new char[val.length() + 1];
150: val.getChars(0, val.length(), arr, 0);
151: int len = base10toBase10kSortableInt(arr, 0, val.length(), arr,
152: arr.length);
153: return new String(arr, arr.length - len, len);
154: }
155:
156: public static String base10kSortableIntToBase10(String val) {
157: int slen = val.length();
158: char[] arr = new char[slen * 5]; // +1 time for orig, +4 for new
159: val.getChars(0, slen, arr, 0);
160: int len = base10kSortableIntToBase10(arr, 0, slen, arr, slen);
161: return new String(arr, slen, len);
162: }
163:
164: /********* FUTURE
165: // the zero exponent... exponents above this point are positive
166: // and below are negative.
167: // It is desirable to make ordinary numbers have a single byte
168: // exponent when converted to UTF-8
169: // For integers, the exponent will always be >=0, but this format
170: // is meant to be valid for floating point numbers as well...
171: private static final int ZERO_EXPONENT='a'; // 97
172:
173: // if exponent is larger than what can be represented
174: // in a single byte (char), then this is the multibyte
175: // escape char.
176: // UCS-2 surrogates start at 0xD800
177: private static final int POSITIVE_EXPONENT_ESCAPE=0x3fff;
178:
179: // if exponent is smaller than what can be represented in
180: // a single byte, then this is the multibyte escape
181: private static final int NEGATIVE_EXPONENT_ESCAPE=1;
182:
183: // if number is negative, it starts with this optional value
184: // this should not overlap with any exponent values
185: private static final int NEGATIVE_SIGN=0;
186: **********/
187:
188: // WARNING: assumption is that this is a legal int...
189: // no validation is done. [+-]?digit*
190: //
191: // Normalization of zeros *is* done...
192: // 0004, 004, 04, 4 will all end up being equal
193: // 0,-0 are normalized to '' (zero length)
194: //
195: // The value is written to the output buffer
196: // from the end to the start. The return value
197: // is the start of the Base100 int in the output buffer.
198: //
199: // As the output will be smaller than the input, arr and
200: // out may refer to the same array if desired.
201: //
202: public static int base10toBase100SortableInt(char[] arr, int start,
203: int end, char[] out, int outend) {
204: int wpos = outend; // write position
205: boolean neg = false;
206: --end; // position end pointer *on* the last char
207:
208: // read signs and leading zeros
209: while (start <= end) {
210: char val = arr[start];
211: if (val == '-')
212: neg = !neg;
213: else if (val >= '1' && val <= '9')
214: break;
215: start++;
216: }
217:
218: // eat whitespace on RHS?
219: outer: while (start <= end) {
220: switch (arr[end]) {
221: case ' ':
222: case '\t':
223: case '\n':
224: case '\r':
225: end--;
226: break;
227: default:
228: break outer;
229: }
230: }
231:
232: int hundreds = 0;
233: /******************************************************
234: * remove RHS zero normalization since it only helps 1 in 100
235: * numbers and complicates both encoding and decoding.
236:
237: // remove pairs of zeros on the RHS and keep track of
238: // the count.
239: while (start <= end) {
240: char val = arr[end];
241:
242: if (val=='0' && start <= end) {
243: val=arr[end-1];
244: if (val=='0') {
245: hundreds++;
246: end-=2;
247: continue;
248: }
249: }
250:
251: break;
252: }
253: *************************************************************/
254:
255: // now start at the end and work our way forward
256: // encoding two base 10 digits into 1 base 100 digit
257: while (start <= end) {
258: int val = arr[end--];
259: val = val - '0';
260: if (start <= end) {
261: int val2 = arr[end--];
262: val = val + (val2 - '0') * 10;
263: }
264: out[--wpos] = neg ? (char) (99 - val) : (char) val;
265: }
266:
267: /****** FUTURE: not needed for this implementation of exponent combined with sign
268: // normalize all zeros to positive values
269: if (wpos==outend) neg=false;
270: ******/
271:
272: // adjust exponent by the number of base 100 chars written
273: hundreds += outend - wpos;
274:
275: // write the exponent and sign combined
276: out[--wpos] = neg ? (char) (ZERO_EXPONENT - hundreds)
277: : (char) (ZERO_EXPONENT + hundreds);
278:
279: return outend - wpos; // the length of the base100 int
280: }
281:
282: // Converts a base100 sortable number to base10 character form
283: // returns number of chars written.
284: // At least 1 char is always written.
285: public static int base100SortableIntToBase10(char[] arr, int start,
286: int end, char[] out, int offset) {
287: // Take care of "0" case first. It's the only number that is represented
288: // in one char.
289: if (end - start == 1) {
290: out[offset] = '0';
291: return 1;
292: }
293:
294: int wpos = offset; // write position
295: boolean neg = false;
296: int exp = arr[start++];
297: if (exp < ZERO_EXPONENT) {
298: neg = true;
299: exp = ZERO_EXPONENT - exp;
300: out[wpos++] = '-';
301: }
302:
303: boolean firstDigit = true;
304: while (start < end) {
305: int val = arr[start++];
306: if (neg)
307: val = 99 - val;
308: // opt - if we ever want a faster version we can avoid one integer
309: // divide by using fixed point math to multiply by 1/10
310: // http://www.cs.uiowa.edu/~jones/bcd/divide.html
311: // TIP: write a small function in gcc or cl and see what
312: // the optimized assemply output looks like (and which is fastest).
313: // In C you can specify "unsigned" which gives the compiler more
314: // info than the Java compiler has.
315: char tens = (char) (val / 10 + '0');
316: if (!firstDigit || tens != '0') { // skip leading 0
317: out[wpos++] = tens; // write tens position
318: }
319: out[wpos++] = (char) (val % 10 + '0'); // write ones position
320: firstDigit = false;
321: }
322:
323: // OPTIONAL: if trailing zeros were truncated, then this is where
324: // we would restore them (compare number of chars read vs exponent)
325:
326: return wpos - offset;
327: }
328:
329: public static int base10toBase10kSortableInt(char[] arr, int start,
330: int end, char[] out, int outend) {
331: int wpos = outend; // write position
332: boolean neg = false;
333: --end; // position end pointer *on* the last char
334:
335: // read signs and leading zeros
336: while (start <= end) {
337: char val = arr[start];
338: if (val == '-')
339: neg = !neg;
340: else if (val >= '1' && val <= '9')
341: break;
342: start++;
343: }
344:
345: // eat whitespace on RHS?
346: outer: while (start <= end) {
347: switch (arr[end]) {
348: case ' ': // fallthrough
349: case '\t': // fallthrough
350: case '\n': // fallthrough
351: case '\r':
352: end--;
353: break;
354: default:
355: break outer;
356: }
357: }
358:
359: int exp = 0;
360:
361: /******************************************************
362: * remove RHS zero normalization since it only helps 1 in 100
363: * numbers and complicates both encoding and decoding.
364:
365: // remove pairs of zeros on the RHS and keep track of
366: // the count.
367: while (start <= end) {
368: char val = arr[end];
369:
370: if (val=='0' && start <= end) {
371: val=arr[end-1];
372: if (val=='0') {
373: hundreds++;
374: end-=2;
375: continue;
376: }
377: }
378:
379: break;
380: }
381: *************************************************************/
382:
383: // now start at the end and work our way forward
384: // encoding two base 10 digits into 1 base 100 digit
385: while (start <= end) {
386: int val = arr[end--] - '0'; // ones
387: if (start <= end) {
388: val += (arr[end--] - '0') * 10; // tens
389: if (start <= end) {
390: val += (arr[end--] - '0') * 100; // hundreds
391: if (start <= end) {
392: val += (arr[end--] - '0') * 1000; // thousands
393: }
394: }
395: }
396: out[--wpos] = neg ? (char) (9999 - val) : (char) val;
397: }
398:
399: /****** FUTURE: not needed for this implementation of exponent combined with sign
400: // normalize all zeros to positive values
401: if (wpos==outend) neg=false;
402: ******/
403:
404: // adjust exponent by the number of base 100 chars written
405: exp += outend - wpos;
406:
407: // write the exponent and sign combined
408: out[--wpos] = neg ? (char) (ZERO_EXPONENT - exp)
409: : (char) (ZERO_EXPONENT + exp);
410:
411: return outend - wpos; // the length of the base100 int
412: }
413:
414: // Converts a base100 sortable number to base10 character form
415: // returns number of chars written.
416: // At least 1 char is always written.
417: public static int base10kSortableIntToBase10(char[] arr, int start,
418: int end, char[] out, int offset) {
419: // Take care of "0" case first. It's the only number that is represented
420: // in one char since we don't chop trailing zeros.
421: if (end - start == 1) {
422: out[offset] = '0';
423: return 1;
424: }
425:
426: int wpos = offset; // write position
427: boolean neg;
428: int exp = arr[start++];
429: if (exp < ZERO_EXPONENT) {
430: neg = true;
431: // We don't currently use exp on decoding...
432: // exp = ZERO_EXPONENT - exp;
433: out[wpos++] = '-';
434: } else {
435: neg = false;
436: }
437:
438: // since so many values will fall in one char, pull it
439: // out of the loop (esp since the first value must
440: // be special-cased to not print leading zeros.
441: // integer division is still expensive, so it's best to check
442: // if you actually need to do it.
443: //
444: // TIP: write a small function in gcc or cl and see what
445: // the optimized assemply output looks like (and which is fastest).
446: // In C you can specify "unsigned" which gives the compiler more
447: // info than the Java compiler has.
448: int val = arr[start++];
449: if (neg)
450: val = 9999 - val;
451:
452: /***
453: if (val < 10) {
454: out[wpos++] = (char)(val + '0');
455: } else if (val < 100) {
456: out[wpos++] = (char)(val/10 + '0');
457: out[wpos++] = (char)(val%10 + '0');
458: } else if (val < 1000) {
459: out[wpos++] = (char)(val/100 + '0');
460: out[wpos++] = (char)((val/10)%10 + '0');
461: out[wpos++] = (char)(val%10 + '0');
462: } else {
463: out[wpos++] = (char)(val/1000 + '0');
464: out[wpos++] = (char)((val/100)%10 + '0');
465: out[wpos++] = (char)((val/10)%10 + '0');
466: out[wpos++] = (char)(val % 10 + '0');
467: }
468: ***/
469:
470: if (val < 10) {
471: out[wpos++] = (char) (val + '0');
472: } else if (val < 100) {
473: int div = div10(val);
474: int ones = val - mul10(div); // mod 10
475: out[wpos++] = (char) (div + '0');
476: out[wpos++] = (char) (ones + '0');
477: } else if (val < 1000) {
478: int div = div10(val);
479: int ones = val - mul10(div); // mod 10
480: val = div;
481: div = div10(val);
482: int tens = val - mul10(div); // mod 10
483: out[wpos++] = (char) (div + '0');
484: out[wpos++] = (char) (tens + '0');
485: out[wpos++] = (char) (ones + '0');
486: } else {
487: int div = div10(val);
488: int ones = val - mul10(div); // mod 10
489: val = div;
490: div = div10(val);
491: int tens = val - mul10(div); // mod 10
492: val = div;
493: div = div10(val);
494: int hundreds = val - mul10(div); // mod 10
495:
496: out[wpos++] = (char) (div + '0');
497: out[wpos++] = (char) (hundreds + '0');
498: out[wpos++] = (char) (tens + '0');
499: out[wpos++] = (char) (ones + '0');
500: }
501:
502: while (start < end) {
503: val = arr[start++];
504: if (neg)
505: val = 9999 - val;
506:
507: int div = div10(val);
508: int ones = val - mul10(div); // mod 10
509: val = div;
510: div = div10(val);
511: int tens = val - mul10(div); // mod 10
512: val = div;
513: div = div10(val);
514: int hundreds = val - mul10(div); // mod 10
515:
516: /***
517: int ones = val % 10;
518: val /= 10;
519: int tens = val!=0 ? val % 10 : 0;
520: val /= 10;
521: int hundreds = val!=0 ? val % 10 : 0;
522: val /= 10;
523: int thousands = val!=0 ? val % 10 : 0;
524: ***/
525:
526: /***
527: int thousands = val>=1000 ? val/1000 : 0;
528: int hundreds = val>=100 ? (val/100)%10 : 0;
529: int tens = val>=10 ? (val/10)%10 : 0;
530: int ones = val % 10;
531: ***/
532:
533: /***
534: int thousands = val/1000;
535: int hundreds = (val/100)%10;
536: int tens = (val/10)%10;
537: int ones = val % 10;
538: ***/
539:
540: out[wpos++] = (char) (div + '0');
541: out[wpos++] = (char) (hundreds + '0');
542: out[wpos++] = (char) (tens + '0');
543: out[wpos++] = (char) (ones + '0');
544: }
545:
546: // OPTIONAL: if trailing zeros were truncated, then this is where
547: // we would restore them (compare number of chars read vs exponent)
548:
549: return wpos - offset;
550: }
551:
552: }
|