001: /*
002: *******************************************************************************
003: * Copyright (C) 2002-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007:
008: package com.ibm.icu.dev.tool.localeconverter;
009:
010: import java.io.*;
011: import java.util.*;
012: import com.ibm.icu.lang.*;
013: import com.ibm.icu.text.*;
014:
015: public class PosixCharMap {
016: private Hashtable table = new Hashtable();
017: private Hashtable backTable = null;
018: private PosixCharMap parentMap;
019: private String encoding;
020:
021: public PosixCharMap() {
022: }
023:
024: public PosixCharMap(PosixCharMap parent) {
025: parentMap = parent;
026: }
027:
028: public PosixCharMap(String fileName) throws IOException {
029: this (new FileReader(fileName));
030: }
031:
032: public PosixCharMap(String pathName, String fileName)
033: throws IOException {
034: this (new FileReader(new File(pathName, fileName)));
035: }
036:
037: public PosixCharMap(Reader inputReader) throws IOException {
038: load(new BufferedReader(inputReader));
039: }
040:
041: public PosixCharMap getParent() {
042: return parentMap;
043: }
044:
045: public void setParent(PosixCharMap parent) {
046: parentMap = parent;
047: }
048:
049: public void load(String pathName, String fileName)
050: throws IOException {
051: load(new File(pathName, fileName), "");
052: }
053:
054: public void load(String pathName, String fileName, String enc)
055: throws IOException {
056: load(new File(pathName, fileName), enc);
057: }
058:
059: public void load(File file, String enc) throws IOException {
060: encoding = enc;
061: load(new BufferedReader(new FileReader(file)));
062: }
063:
064: /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
065: static private final char[] UNESCAPE_MAP = {
066: /*" 0x22, 0x22 */
067: /*' 0x27, 0x27 */
068: /*? 0x3F, 0x3F */
069: /*\ 0x5C, 0x5C */
070: /*a*/0x61, 0x07,
071: /*b*/0x62, 0x08,
072: /*f*/0x66, 0x0c,
073: /*n*/0x6E, 0x0a,
074: /*r*/0x72, 0x0d,
075: /*t*/0x74, 0x09,
076: /*v*/0x76, 0x0b };
077:
078: /**
079: * Convert an escape to a 32-bit code point value. We attempt
080: * to parallel the icu4c unesacpeAt() function.
081: * @param offset16 an array containing offset to the character
082: * <em>after</em> the backslash. Upon return offset16[0] will
083: * be updated to point after the escape sequence.
084: * @return character value from 0 to 10FFFF, or -1 on error.
085: */
086: public static int unescapeAt(String s, int[] offset16) {
087: int c;
088: int result = 0;
089: int n = 0;
090: int minDig = 0;
091: int maxDig = 0;
092: int bitsPerDigit = 4;
093: int dig;
094: int i;
095:
096: /* Check that offset is in range */
097: int offset = offset16[0];
098: int length = s.length();
099: if (offset < 0 || offset >= length) {
100: return -1;
101: }
102:
103: /* Fetch first UChar after '\\' */
104: c = UTF16.charAt(s, offset);
105: offset += UTF16.getCharCount(c);
106:
107: /* Convert hexadecimal and octal escapes */
108: switch (c) {
109: case 'u':
110: minDig = maxDig = 4;
111: break;
112: case 'U':
113: minDig = maxDig = 8;
114: break;
115: case 'x':
116: minDig = 1;
117: maxDig = 2;
118: break;
119: default:
120: dig = UCharacter.digit(c, 8);
121: if (dig >= 0) {
122: minDig = 1;
123: maxDig = 3;
124: n = 1; /* Already have first octal digit */
125: bitsPerDigit = 3;
126: result = dig;
127: }
128: break;
129: }
130: if (minDig != 0) {
131: while (offset < length && n < maxDig) {
132: // TEMPORARY
133: // TODO: Restore the char32-based code when UCharacter.digit
134: // is working (Bug 66).
135:
136: //c = UTF16.charAt(s, offset);
137: //dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
138: c = s.charAt(offset);
139: dig = Character.digit((char) c, (bitsPerDigit == 3) ? 8
140: : 16);
141: if (dig < 0) {
142: break;
143: }
144: result = (result << bitsPerDigit) | dig;
145: //offset += UTF16.getCharCount(c);
146: ++offset;
147: ++n;
148: }
149: if (n < minDig) {
150: return -1;
151: }
152: offset16[0] = offset;
153: return result;
154: }
155:
156: /* Convert C-style escapes in table */
157: for (i = 0; i < UNESCAPE_MAP.length; i += 2) {
158: if (c == UNESCAPE_MAP[i]) {
159: offset16[0] = offset;
160: return UNESCAPE_MAP[i + 1];
161: } else if (c < UNESCAPE_MAP[i]) {
162: break;
163: }
164: }
165:
166: /* If no special forms are recognized, then consider
167: * the backslash to generically escape the next character. */
168: offset16[0] = offset;
169: return c;
170: }
171:
172: /**
173: * Convert all escapes in a given string using unescapeAt().
174: * @exception IllegalArgumentException if an invalid escape is
175: * seen.
176: */
177: public static String unescape(String s) {
178: StringBuffer buf = new StringBuffer();
179: int[] pos = new int[1];
180: for (int i = 0; i < s.length();) {
181: char c = s.charAt(i++);
182: if (c == '\\') {
183: pos[0] = i;
184: int e = unescapeAt(s, pos);
185: if (e < 0) {
186: throw new IllegalArgumentException(
187: "Invalid escape sequence "
188: + s.substring(i - 1, Math.min(
189: i + 8, s.length())));
190: }
191: UTF16.append(buf, e);
192: i = pos[0];
193: } else {
194: buf.append(c);
195: }
196: }
197: return buf.toString();
198: }
199:
200: public void load(Reader inputReader) throws IOException {
201: PosixCharMap oldMap = SymbolTransition.getCharMap();
202: SymbolTransition.setCharMap(null);
203: try {
204: final int TOKEN = 1;
205: final int EOF = 2;
206: final int EOL = 3;
207: final int RANGE = 4;
208: final Lex.Transition[][] states1 = { { //state 0: start
209: new SpaceTransition(0),
210: new EOLTransition(EOL),
211: new Lex.EOFTransition(EOF),
212: new Lex.DefaultTransition(
213: Lex.ACCUMULATE_CONSUME, -1) }, { //grab first word
214: new Lex.StringTransition(
215: SpaceTransition.SPACE_CHARS,
216: Lex.IGNORE_CONSUME, TOKEN),
217: new Lex.StringTransition(
218: EOLTransition.EOL_CHARS,
219: Lex.IGNORE_CONSUME, TOKEN),
220: new Lex.EOFTransition(TOKEN),
221: new Lex.DefaultTransition(
222: Lex.ACCUMULATE_CONSUME, -1) } };
223:
224: final Lex.Transition[][] states2 = { { //These states only return <symbols>. All
225: //other text is ignored.
226: new Lex.EOFTransition(EOF),
227: new EOLTransition(EOL),
228: new SymbolTransition(TOKEN),
229: new SpaceTransition(0),
230: new RangeTransition(RANGE),
231: new Lex.DefaultTransition(Lex.ACCUMULATE_CONSUME, 0) }, };
232:
233: PushbackReader input = new PushbackReader(inputReader);
234: Lex p = new Lex(states1, input);
235: int state;
236: do {
237: state = p.nextToken();
238: } while ((state != EOF) && !p.dataEquals("CHARMAP"));
239: p.accept(EOL);
240: if (state != EOF) {
241: p = new Lex(states2, input);
242: state = p.nextToken();
243: while (state != EOF) {
244:
245: String key = p.getData();
246: if (p.dataEquals("ENDCHARMAP")) {
247: break;
248: }
249: state = p.nextToken();
250: while (state == EOL) {
251: if (p.dataEquals("ENDCHARMAP")) {
252: break;
253: }
254: String data = unescape(p.getData());
255: data.trim();
256: if (data.startsWith("<U")
257: || data.startsWith("#U")) {
258: String numData = data.substring(2, data
259: .length() - 1);
260: int digit = Integer.parseInt(numData, 16);
261: defineMapping(key, "" + (char) digit);
262: } else if (data.startsWith("\\x")) {
263: byte[] encData = new byte[100];
264: int num = hexToByte(data, encData);
265: String tData = new String(encData, 0, num,
266: encoding);
267: defineMapping(key, tData);
268: } else {
269: defineMapping(key, byteToChar(data,
270: encoding));
271: }
272: state = p.nextToken();
273: key = p.getData();
274: }
275: // we come here only if there is a range transition
276: if (state == RANGE) {
277:
278: String begin = key;
279:
280: state = p.nextToken();
281: String end = p.getData();
282:
283: state = p.nextToken();
284: String data = p.getData();
285: data.trim();
286: byte[] encData = new byte[6];
287: int num = hexToByte(data, encData);
288: String tData = new String(encData, 0, num,
289: encoding);
290: String stringVal;
291: int[] val = getInt(begin);
292: int beginRange = 0;
293: int endRange = 0;
294: if (val == null) {
295: val = getInt((String) table.get(begin));
296: if (val != null) {
297: beginRange = val[1];
298: }
299: }
300: val = getInt(end);
301: if (val == null) {
302: val = getInt((String) table.get(end));
303: if (val != null) {
304: endRange = val[1];
305: }
306: }
307: stringVal = key.substring(0, val[0]);
308: int digit = (int) (char) tData.charAt(0);
309: while (beginRange <= endRange) {
310: defineMapping(
311: (stringVal + beginRange + ">"), ""
312: + (char) digit++);
313: beginRange++;
314: }
315:
316: state = p.nextToken();
317: key = p.getData();
318: }
319:
320: //state = p.nextToken();
321: }
322: }
323: } catch (EOFException e) {
324: } finally {
325: SymbolTransition.setCharMap(oldMap);
326: }
327: }
328:
329: public int[] getInt(String data) {
330: if (data == null) {
331: return null;
332: }
333: int i = 0;
334: int[] retVal = new int[2];
335: int len = data.length();
336: while (i < len) {
337: if ((data.charAt(i)) - 0x30 < (0x39 - 0x30)) {
338: break;
339: }
340: i++;
341: }
342: if (i < len) {
343: String sub = data.substring(i, len - 1);
344: retVal[0] = i;
345: retVal[1] = Integer.parseInt(sub, 10);
346: return retVal;
347: }
348: return null;
349: }
350:
351: public int hexToByte(String data, byte[] retval) {
352: String tData = data;
353: int i = 0;
354: for (i = 0; i < data.length() / 4; i++) {
355: if (tData.charAt(0) == '\\' && tData.charAt(1) == 'x') {
356: String numData = tData.substring(2, 4);
357: retval[i] = (byte) Integer.parseInt(numData, 16);
358: tData = tData.substring(4, tData.length());
359: }
360: }
361: return i;
362: }
363:
364: public String byteToChar(String data, String encoding)
365: throws UnsupportedEncodingException {
366:
367: byte[] bytes = new byte[data.length()];
368: for (int i = 0; i < data.length(); i++) {
369: char ch = data.charAt(i);
370: if (ch > 0xFF) {
371: throw new RuntimeException(
372: "Bytes in the string are greater than 0xFF");
373: }
374: bytes[i] = (byte) ch;
375: }
376: return new String(bytes, encoding);
377: }
378:
379: public void defineMapping(String from, String to) {
380: table.put(from, to);
381: backTable = null;
382: }
383:
384: public void undefineMapping(String from) {
385: table.remove(from);
386: backTable = null;
387: }
388:
389: public void swap() {
390: Hashtable newTable = new Hashtable();
391: Enumeration enumer = table.keys();
392: while (enumer.hasMoreElements()) {
393: String key = (String) enumer.nextElement();
394: String code = (String) table.get(key);
395:
396: String newKey = toSymbol(code);
397: String newCode = toLiteral(key);
398: String prevCode = (String) newTable.get(newKey);
399: if (prevCode == null || prevCode.compareTo(newCode) > 0) {
400: newTable.put(newKey, newCode);
401: }
402: }
403: table = newTable;
404: }
405:
406: private String toLiteral(String code) {
407: String data = code.substring(2, code.length() - 1);
408: int digit = Integer.parseInt(data, 16);
409: return "" + (char) digit;
410: }
411:
412: private String toSymbol(String code) {
413: StringBuffer escapeBuffer = new StringBuffer();
414: escapeBuffer.append(">");
415: for (int i = 0; i < code.length(); i++) {
416: int value = ((int) code.charAt(i)) & 0xFFFF;
417: while ((value > 0) || (escapeBuffer.length() < 5)) {
418: char digit = Character.forDigit(value % 16, 16);
419: escapeBuffer.append(digit);
420: value >>= 4;
421: }
422: }
423: escapeBuffer.append("U<");
424: escapeBuffer.reverse();
425: return escapeBuffer.toString();
426: }
427:
428: public void dump(PrintStream out) {
429: StringBuffer escapeBuffer = new StringBuffer();
430: Enumeration enumer = table.keys();
431: while (enumer.hasMoreElements()) {
432: String key = (String) enumer.nextElement();
433: String code = (String) table.get(key);
434: out.print(key);
435: out.print(" <U");
436: for (int i = 0; i < code.length(); i++) {
437: int value = ((int) code.charAt(i)) & 0xFFFF;
438: escapeBuffer.setLength(0);
439: while ((value > 0) || (escapeBuffer.length() < 4)) {
440: char digit = Character.forDigit(value % 16, 16);
441: escapeBuffer.append(digit);
442: value >>= 4;
443: }
444: escapeBuffer.reverse();
445: out.print(escapeBuffer.toString());
446: }
447: out.println(">");
448: }
449: }
450:
451: public String mapKey(final String key) {
452: String result = (String) table.get(key);
453: if (result == null) {
454: if (parentMap != null) {
455: result = parentMap.mapKey(key);
456: } else {
457: result = key;
458: }
459: }
460: return result;
461: }
462:
463: public String backmapValue(final String value) {
464: if (backTable == null) {
465: backTable = new Hashtable();
466: Enumeration enumer = table.keys();
467: while (enumer.hasMoreElements()) {
468: String key = (String) enumer.nextElement();
469: String val = (String) table.get(key);
470: backTable.put(val, key);
471: }
472: }
473: String result = (String) backTable.get(value);
474: if (result == null) {
475: if (parentMap != null) {
476: result = parentMap.backmapValue(value);
477: } else {
478: result = value;
479: }
480: }
481: return result;
482: }
483:
484: public Enumeration keys() {
485: return table.keys();
486: }
487:
488: public Enumeration elements() {
489: return table.elements();
490: }
491:
492: public static void main(String args[]) {
493: try {
494: PosixCharMap map1 = new PosixCharMap(
495: "C:\\projects\\com\\taligent\\localeconverter\\CharMaps",
496: "IBM-1129.UPMAP100.txt");
497: map1.swap();
498: map1.dump(System.out);
499:
500: SymbolTransition.setCharMap(map1);
501: System.out.println();
502: System.out.println();
503:
504: //PosixCharMap map = new PosixCharMap("C:\\projects\\data\\ISO-8859-1.html");
505: PosixCharMap map = new PosixCharMap(
506: "C:\\projects\\com\\taligent\\localeconverter\\CharMaps",
507: "ibm1129.txt");
508: map.dump(System.out);
509: System.out.println();
510: } catch (Exception e) {
511: System.out.println(e);
512: }
513: }
514: }
|