001: // Copyright 1998 Finn Bock.
002:
003: package org.python.modules;
004:
005: import java.io.*;
006: import org.python.core.*;
007:
008: public class ucnhash implements ucnhashAPI {
009:
010: // Parameters for the word hash.
011: private static int n;
012: private static int m;
013: private static int minchar;
014: private static int maxchar;
015: private static int alphasz;
016: private static int maxlen;
017: private static int maxidx;
018: private static int maxklen;
019:
020: private static short[] G;
021: private static short[] T0;
022: private static short[] T1;
023: private static short[] T2;
024:
025: // Map the hashed values into the text (as bytes).
026: private static byte[] worddata;
027: private static short[] wordoffs;
028:
029: // wordindex greate then cutoff is stored as into two bytes.
030: private static short wordstart;
031: private static short wordcutoff;
032:
033: // The raw data and indexes into start of each name
034: // The rawindex is sorted based on the wordindexes.
035: private static byte[] rawdata;
036: private static char[] rawindex;
037:
038: // The mapping from raw data index to unicode code points.
039: private static char[] codepoint;
040:
041: public static String[] __depends__ = new String[] { "/org/python/modules/ucnhash.dat", };
042:
043: public static void loadTables() throws Exception {
044: InputStream instream = ucnhash.class
045: .getResourceAsStream("ucnhash.dat");
046: if (instream == null)
047: throw new IOException("Unicode name database not found: "
048: + "ucnhash.dat");
049:
050: DataInputStream in = new DataInputStream(
051: new BufferedInputStream(instream));
052:
053: n = in.readShort();
054: m = in.readShort();
055: minchar = in.readShort();
056: maxchar = in.readShort();
057: alphasz = in.readShort();
058: maxlen = in.readShort();
059: maxidx = maxlen * alphasz - minchar;
060:
061: G = readShortTable(in);
062: if (in.readShort() != 3)
063: throw new IOException("UnicodeNameMap file corrupt, "
064: + "unknown dimension");
065:
066: T0 = readShortTable(in);
067: T1 = readShortTable(in);
068: T2 = readShortTable(in);
069:
070: wordoffs = readShortTable(in);
071: worddata = readByteTable(in);
072:
073: wordstart = in.readShort();
074: wordcutoff = in.readShort();
075: maxklen = in.readShort();
076:
077: rawdata = readByteTable(in);
078: rawindex = readCharTable(in);
079: codepoint = readCharTable(in);
080: }
081:
082: private static short[] readShortTable(DataInputStream in)
083: throws IOException {
084: if (in.read() != 't')
085: throw new IOException(
086: "UnicodeNameMap file corrupt, shorttable");
087:
088: int n = in.readUnsignedShort() / 2;
089: short[] table = new short[n];
090: for (int i = 0; i < n; i++) {
091: table[i] = in.readShort();
092: }
093: return table;
094: }
095:
096: private static char[] readCharTable(DataInputStream in)
097: throws IOException {
098: if (in.read() != 't')
099: throw new IOException(
100: "UnicodeNameMap file corrupt, chartable");
101:
102: int n = in.readUnsignedShort() / 2;
103: char[] table = new char[n];
104: for (int i = 0; i < n; i++) {
105: table[i] = in.readChar();
106: }
107: return table;
108: }
109:
110: private static byte[] readByteTable(DataInputStream in)
111: throws IOException {
112: if (in.read() != 't')
113: throw new IOException(
114: "UnicodeNameMap file corrupt, byte table");
115: int n = in.readUnsignedShort();
116: byte[] table = new byte[n];
117: in.readFully(table);
118: return table;
119: }
120:
121: public static int hash(String key) {
122: return hash(key, 0, key.length());
123: }
124:
125: public static int hash(String key, int start, int end) {
126: int i, j;
127: int f0, f1, f2;
128:
129: for (j = start, i = -minchar, f0 = f1 = f2 = 0; j < end; j++) {
130: char ch = key.charAt(j);
131: if (ch >= 'a' && ch <= 'z')
132: ch = (char) (ch - 'a' + 'A');
133: f0 += T0[i + ch];
134: f1 += T1[i + ch];
135: f2 += T2[i + ch];
136: i += alphasz;
137: if (i >= maxidx)
138: i = -minchar;
139: }
140:
141: f0 %= n;
142: f1 %= n;
143: f2 %= n;
144:
145: return (G[f0] + G[f1] + G[f2]) % m;
146: }
147:
148: private static final char[] charmap = " ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-"
149: .toCharArray();
150:
151: private static String getWord(int idx) {
152: int offset = wordoffs[idx];
153: int end = worddata.length;
154: if (idx < wordoffs.length - 1)
155: end = wordoffs[idx + 1];
156: StringBuffer buf = new StringBuffer();
157: for (int i = offset; i < end; i++)
158: buf.append(charmap[worddata[i]]);
159: return buf.toString();
160: }
161:
162: private static boolean match(int idx, byte[] raw, int begin, int end) {
163: int woff = wordoffs[idx];
164: int wend = worddata.length;
165: if (idx < wordoffs.length - 1)
166: wend = wordoffs[idx + 1];
167:
168: if (end - begin != wend - woff)
169: return false;
170: int l = end - begin;
171: for (int i = 0; i < l; i++) {
172: if (worddata[woff + i] != raw[begin + i])
173: return false;
174: }
175: return true;
176: }
177:
178: private static int compare(byte[] a1, int off1, int len1,
179: byte[] a2, int off2, int len2) {
180: for (int i = 0; i < len1 && i < len2; i++) {
181: int d = (a1[off1 + i] & 0xFF) - (a2[off2 + i] & 0xFF);
182: if (d != 0)
183: return d;
184: }
185: return len1 - len2;
186: }
187:
188: private static int binarysearch(byte[] rawlist, int start, int end) {
189: int floor = 0;
190: int ceiling = (rawindex.length) / 5;
191:
192: while (floor < ceiling - 1) {
193: int middle = (floor + ceiling) / 2;
194: if (debug)
195: System.out.println("floor:" + floor + " ceiling:"
196: + ceiling + " => " + middle);
197:
198: int off = rawindex[middle * 5];
199: int len = rawindex[middle * 5 + 4] & 0x1F;
200: int d = compare(rawlist, start, end - start, rawdata, off,
201: len);
202: if (d < 0)
203: ceiling = middle;
204: else if (d > 0)
205: floor = middle;
206: else
207: return middle * 12;
208: }
209:
210: int tmp = floor * 5;
211:
212: int off = rawindex[tmp++];
213: long lengths = ((long) rawindex[tmp++] << 48)
214: | ((long) rawindex[tmp++] << 32)
215: | ((long) rawindex[tmp++] << 16)
216: | ((long) rawindex[tmp++]);
217:
218: floor *= 12;
219: for (int i = 0; i < 12; i++) {
220: int len = (int) (lengths >> (i * 5)) & 0x1F;
221: if (compare(rawlist, start, end, rawdata, off, len) == 0)
222: return floor;
223: off += len;
224: floor++;
225: }
226: return -1;
227: }
228:
229: public static int lookup(String name) {
230: return lookup(name, 0, name.length());
231: }
232:
233: private static int lookup(String name, int start, int end) {
234:
235: byte[] rawlist = new byte[32];
236: int ridx = 0;
237: int rbegin = 0;
238: int rstart = 0;
239:
240: int i;
241: while (true) {
242: rbegin = ridx;
243: int begin = start;
244: for (i = start; i < end; i++) {
245: char ch = name.charAt(i);
246: if (ch == ' ') {
247: start = i + 1;
248: break;
249: }
250: int v;
251: if (ch >= 'a' && ch <= 'z')
252: ch = (char) (ch - 'a' + 'A');
253: if (ch >= 'A' && ch <= 'Z')
254: v = ch - 'A' + 1;
255: else if (ch >= '0' && ch <= '9')
256: v = ch - '0' + 27;
257: else if (ch == '-')
258: v = 37;
259: else
260: return -1;
261:
262: rawlist[ridx++] = (byte) v;
263: if (ch == '-' && start != i) {
264: start = ++i;
265: break;
266: }
267: }
268:
269: int hash = hash(name, begin, i);
270:
271: if (debug)
272: System.out.println(name.substring(begin, i) + " "
273: + hash);
274:
275: boolean isWord = hash >= 0 && ridx - rbegin > 1
276: && match(hash, rawlist, rbegin, ridx);
277:
278: if (isWord) {
279: if (debug)
280: System.out.println("match " + getWord(hash));
281: hash += wordstart;
282: ridx = rstart;
283: if (hash > wordcutoff) {
284: rawlist[ridx++] = (byte) ((hash >> 8) + wordcutoff);
285: rawlist[ridx++] = (byte) (hash & 0xFF);
286: } else
287: rawlist[ridx++] = (byte) hash;
288: }
289: rstart = ridx;
290:
291: if (i >= end)
292: break;
293:
294: if (!isWord) {
295: rawlist[ridx++] = 0;
296: }
297:
298: }
299:
300: if (debug) {
301: System.out.print("rawdata: ");
302: for (int k = 0; k < ridx; k++)
303: System.out.print((rawlist[k] & 0xFF) + " ");
304: System.out.println();
305: }
306:
307: int idx = binarysearch(rawlist, 0, ridx);
308: if (idx < 0)
309: return idx;
310: if (debug) {
311: System.out.println("idx:" + idx);
312: System.out.println("codepoint:" + codepoint[idx] + " "
313: + Integer.toHexString((int) codepoint[idx]));
314: }
315: return codepoint[idx];
316: }
317:
318: // From the ucnhashAPI interface
319: public int getCchMax() {
320: if (!initialized())
321: return -1;
322: return maxklen;
323: }
324:
325: private static String cjkPrefix = "CJK COMPATIBILITY IDEOGRAPH-";
326: private static int cjkPrefixLen = cjkPrefix.length();
327:
328: // From the ucnhashAPI interface
329: public int getValue(String s, int start, int end) {
330: if (!initialized())
331: return -1;
332:
333: if (s.regionMatches(start, cjkPrefix, 0, cjkPrefixLen)) {
334: try {
335: String hex = s.substring(start + cjkPrefixLen, end);
336: int v = Integer.parseInt(hex, 16);
337: return v;
338: } catch (NumberFormatException exc) {
339: return -1; // Maybe fallthrough to the main algorithme.
340: }
341: }
342:
343: return lookup(s, start, end);
344: }
345:
346: private static boolean initialized = false;
347: private static boolean loaded = false;
348:
349: private synchronized boolean initialized() {
350: if (initialized && loaded)
351: return true;
352: if (initialized)
353: return false;
354: try {
355: loadTables();
356: loaded = true;
357: } catch (Exception exc) {
358: return false;
359: }
360: initialized = true;
361: return true;
362: }
363:
364: private static boolean debug = false;
365:
366: public static void main(String[] args) throws Exception {
367: loadTables();
368:
369: debug = true;
370:
371: /*
372: System.out.println(getWord(hash("ARABIC")));
373: System.out.println(getWord(hash("SMALL")));
374: System.out.println(getWord(hash("YI")));
375: System.out.println(getWord(hash("SYLLABLE")));
376: System.out.println(getWord(hash("WITH")));
377: System.out.println(getWord(hash("LETTER")));
378:
379: System.out.println(lookup("NULL"));
380: System.out.println(lookup("LATIN CAPITAL LETTER AFRICAN D"));
381: System.out.println(lookup("GURMUKHI TIPPI"));
382: System.out.println(lookup("TIBETAN MARK GTER YIG MGO -UM " +
383: "RNAM BCAD MA"));
384: System.out.println(lookup("HANGUL CHOSEONG PIEUP"));
385: System.out.println(lookup("SINGLE LOW-9 QUOTATION MARK"));
386: */
387:
388: System.out.println(lookup("BACKSPACE"));
389: // System.out.println(lookup("ACTIVATE SYMMETRIC SWAPPING"));
390:
391: /*
392: System.out.println(lookup("LATIN CAPITAL LETTER A"));
393: System.out.println(lookup("GREATER-THAN SIGN"));
394: System.out.println(lookup("EURO-CURRENCY SIGN"));
395: */
396: }
397: }
|