001: /*
002: *******************************************************************************
003: * Copyright (C) 2002-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.dev.tool.localeconverter;
008:
009: import java.io.*;
010: import java.util.*;
011:
012: public class POSIXLocaleReader {
013: private final String localeDataPath;
014: private final Locale locale;
015:
016: public static final int TAG_TOKEN = 1;
017: public static final int SEPARATOR_TOKEN = 2;
018: public static final int EOL_TOKEN = 3;
019: public static final int EOF_TOKEN = 4;
020:
021: //these states are used to parse the bulk of the
022: //input file. They translate escaped characters
023: //and symolic character references inline.
024: static final Lex.Transition[][] dataStates = { { //state 0: start
025: new SpaceTransition(0),
026: new Lex.CharTransition(';', Lex.IGNORE_CONSUME,
027: SEPARATOR_TOKEN),
028: new Lex.CharTransition(',', Lex.IGNORE_CONSUME,
029: SEPARATOR_TOKEN), new EOLTransition(EOL_TOKEN),
030: new TokenTransition(TAG_TOKEN),
031: new Lex.EOFTransition(EOF_TOKEN),
032: new Lex.ParseExceptionTransition("unexpected characters") } };
033:
034: static final Lex.Transition[][] LCStates = { { //state 0: start
035: new SpaceTransition(0),
036: new EOLTransition(EOL_TOKEN),
037: new Lex.EOFTransition(EOF_TOKEN),
038: new Lex.DefaultTransition(Lex.ACCUMULATE_CONSUME,
039: -1) }, { //grab first word
040: new Lex.StringTransition(
041: SpaceTransition.SPACE_CHARS,
042: Lex.IGNORE_PUTBACK, TAG_TOKEN),
043: new Lex.StringTransition(EOLTransition.EOL_CHARS,
044: Lex.IGNORE_PUTBACK, TAG_TOKEN),
045: new Lex.EOFTransition(TAG_TOKEN),
046: new Lex.DefaultTransition(Lex.ACCUMULATE_CONSUME,
047: -1) } };
048:
/**
 * Creates a reader for POSIX locale source files.
 *
 * @param localeDataPath parent path against which file names are resolved
 * @param locale         locale passed on to the converter used by {@code parse}
 */
public POSIXLocaleReader(final String localeDataPath, final Locale locale) {
    this.locale = locale;
    this.localeDataPath = localeDataPath;
    //{{INIT_CONTROLS
    //}}
}
056:
057: public Hashtable parse(String fileName, byte flags)
058: throws IOException {
059: try {
060: Hashtable table = parseNative(fileName);
061: Hashtable result = new PosixToNeutralConverter(flags,
062: locale, fileName).convert(table);
063: return result;
064: } catch (LocaleConverter.ConversionError e) {
065: System.err.println("Internal error converting locale data");
066: return null;
067: }
068: }
069:
070: public Hashtable parseNative(String fileName) throws IOException {
071: char oldEscapeChar = EscapeTransition.setDefaultEscapeChar();
072: char oldCommentChar = EOLTransition.setDefaultCommentChar();
073: Hashtable table = new Hashtable();
074: try {
075:
076: LineCharNumberReader lines = new LineCharNumberReader(
077: new BufferedReader(new FileReader(new File(
078: localeDataPath, fileName))));
079: PushbackReader reader = new PushbackReader(lines);
080:
081: //Shove a newline at the start of the file. This has the affect of allowing
082: //the file to start with a comment, since the parser only allows comments as
083: //part of an EOL
084: reader.unread('\n');
085:
086: String sectionTag = seekLC(reader);
087: while (sectionTag != null) {
088: try {
089: parseSection(table, reader, sectionTag);
090: } catch (Lex.ParseException e) {
091: System.err.println("ERROR parsing: " + e.reason);
092: System.err.println(" Line: "
093: + lines.getLineNumber());
094: System.err.println(" char: "
095: + lines.getCharNumber());
096: seekEND(reader);
097: System.err.println("Skipped to line: "
098: + (lines.getLineNumber() + 1));
099: }
100: sectionTag = seekLC(reader);
101: }
102: } finally {
103: EscapeTransition.setEscapeChar(oldEscapeChar);
104: EOLTransition.setCommentChar(oldCommentChar);
105: }
106: return table;
107: }
108:
109: private void parseSection(Hashtable table, PushbackReader reader,
110: String sectionTag) throws IOException {
111: if (sectionTag.equals("LC_CTYPE")) {
112: parseCTYPE(table, reader);
113: } else if (sectionTag.equals("LC_COLLATE")) {
114: parseCOLLATE(table, reader);
115: } else if (sectionTag.equals("LC_MONETARY")) {
116: parseLC(table, reader, sectionTag);
117: } else if (sectionTag.equals("LC_NUMERIC")) {
118: parseLC(table, reader, sectionTag);
119: } else if (sectionTag.equals("LC_TIME")) {
120: parseLC(table, reader, sectionTag);
121: } else if (sectionTag.equals("LC_MESSAGES")) {
122: parseLC(table, reader, sectionTag);
123: } else if (sectionTag.equals("LC_MEASUREMENT")) {
124: parseLC(table, reader, sectionTag);
125: } else if (sectionTag.equals("LC_ADDRESS")) {
126: parseLC(table, reader, sectionTag);
127: } else if (sectionTag.equals("LC_PAPER")) {
128: parseLC(table, reader, sectionTag);
129: } else if (sectionTag.equals("LC_NAME")) {
130: parseLC(table, reader, sectionTag);
131: } else if (sectionTag.equals("LC_IDENTIFICATION")) {
132: parseLC(table, reader, sectionTag);
133: } else if (sectionTag.equals("LC_TELEPHONE")) {
134: parseLC(table, reader, sectionTag);
135:
136: } else {
137: System.out.println("Unrecognised section:" + sectionTag);
138: System.out.println("Default parsing applied.");
139: parseLC(table, reader, sectionTag);
140: }
141: }
142:
143: private PushbackReader createParserInput(String localeName)
144: throws IOException {
145: PushbackReader reader = new PushbackReader(new BufferedReader(
146: new FileReader(new File(localeDataPath, localeName))));
147: //Shove a newline at the start of the file. This has the affect of allowing
148: //the file to start with a comment, since the parser only allows comments as
149: //part of an EOL
150: reader.unread('\n');
151: return reader;
152: }
153:
154: private String seekLC(PushbackReader reader) throws IOException {
155: Lex p = new Lex(LCStates, reader);
156: final String LC = "LC_";
157: int s = p.nextToken();
158: while ((s != EOF_TOKEN)) {
159: if (s == TAG_TOKEN) {
160: if (p.dataStartsWith(LC)) {
161: String tag = p.getData();
162: do {
163: s = p.nextToken();
164: } while (s != EOL_TOKEN && s != EOF_TOKEN);
165: return tag;
166: } else if (p.dataEquals("escape_char")) {
167: s = p.nextToken();
168: if (s == TAG_TOKEN || p.getData().length() != 1) {
169: String escape_char = p.getData();
170: EscapeTransition.setEscapeChar(escape_char
171: .charAt(0));
172: } else {
173: System.out
174: .println("Error in escape_char directive. Directive ignored.");
175: }
176: } else if (p.dataEquals("comment_char")) {
177: s = p.nextToken();
178: if (s == TAG_TOKEN || p.getData().length() != 1) {
179: String comment_char = p.getData();
180: if (comment_char.length() > 0) {
181: EOLTransition.setCommentChar(comment_char
182: .charAt(0));
183: }
184: } else {
185: System.out
186: .println("Error in escape_char directive. Directive ignored.");
187: }
188: }
189: }
190: s = p.nextToken();
191: }
192: return null;
193: }
194:
195: private boolean seekEND(PushbackReader reader) throws IOException {
196: Lex p = new Lex(LCStates, reader);
197: final String END = "END";
198: int s = p.nextToken();
199: while ((s != EOF_TOKEN)) {
200: if (s == TAG_TOKEN) {
201: if (p.dataStartsWith(END)) {
202: do {
203: s = p.nextToken();
204: } while (s != EOL_TOKEN && s != EOF_TOKEN);
205: return true;
206: }
207: }
208: s = p.nextToken();
209: }
210: return false;
211: }
212:
213: private void parseCTYPE(Hashtable table, PushbackReader reader)
214: throws IOException {
215: Lex p = new Lex(dataStates, reader);
216: StringBuffer temp = new StringBuffer();
217: int s = p.nextToken();
218: if ((s == TAG_TOKEN) && p.dataEquals("copy")) {
219: p.accept(TAG_TOKEN);
220: parseCopy("LC_CTYPE", p.getData(), table);
221: p.accept(EOL_TOKEN);
222: p.accept(TAG_TOKEN, "END");
223: p.accept(TAG_TOKEN, "LC_CTYPE");
224: } else {
225: while ((s == TAG_TOKEN) && !p.dataEquals("END")) {
226: //IGNORE the CTYPE definition ... we dont need it
227:
228: String key = p.getData();
229: temp.setLength(0);
230: p.accept(TAG_TOKEN);
231: p.appendDataTo(temp);
232: s = p.nextToken();
233: while (s == SEPARATOR_TOKEN) {
234: p.accept(TAG_TOKEN);
235: p.appendDataTo(temp);
236: s = p.nextToken();
237: }
238: if (s != EOL_TOKEN) {
239: System.err
240: .println("WARNING: Could not parse the Unexpected token: Expecting EOL got "
241: + s);
242: } else {
243: table.put(key, temp.toString());
244: }
245:
246: s = p.nextToken();
247:
248: }
249: p.accept(TAG_TOKEN, "LC_CTYPE");
250: }
251: }
252:
253: private void parseCopy(String section, String toCopy, Hashtable t)
254: throws IOException {
255: char oldEscapeChar = EscapeTransition.setDefaultEscapeChar();
256: char oldCommentChar = EOLTransition.setDefaultCommentChar();
257: try {
258: PushbackReader reader = createParserInput(toCopy);
259: String tag = seekLC(reader);
260: while (tag != null && !section.equals(tag)) {
261: tag = seekLC(reader);
262: }
263: if (tag != null) {
264: parseSection(t, reader, section);
265: } else {
266: //hey {jf} - is this an error?
267: }
268: } finally {
269: EscapeTransition.setEscapeChar(oldEscapeChar);
270: EOLTransition.setCommentChar(oldCommentChar);
271: }
272: }
273:
274: private void parseLC(Hashtable t, PushbackReader reader,
275: String sectionTag) throws IOException {
276: Lex input = new Lex(dataStates, reader);
277: input.accept(TAG_TOKEN);
278: if (input.dataEquals("copy")) {
279: input.accept(TAG_TOKEN);
280: parseCopy(sectionTag, input.getData(), t);
281: } else {
282: while ((input.getState() == TAG_TOKEN)
283: && !input.dataEquals("END")) {
284: String label = input.getData();
285: Vector values = new Vector();
286: input.accept(TAG_TOKEN);
287: String temp = input.getData();
288: values.addElement(temp);
289: while (input.nextToken() == SEPARATOR_TOKEN) {
290: input.accept(TAG_TOKEN);
291: String value = input.getData();
292: values.addElement(value);
293: }
294: if (values.size() > 1) {
295: String[] data = new String[values.size()];
296: values.copyInto(data);
297: t.put(label, data);
298: } else {
299: t.put(label, values.elementAt(0));
300: }
301: if (input.getState() != EOL_TOKEN) {
302: System.out.println("Extraneous text after label: "
303: + label);
304: throw new IOException();
305: }
306: input.nextToken();
307: }
308: }
309: input.accept(TAG_TOKEN, sectionTag);
310: }
311:
312: private void parseCOLLATE(Hashtable table, PushbackReader reader)
313: throws IOException {
314: PosixCharMap map = new PosixCharMap(SymbolTransition
315: .getCharMap());
316: SymbolTransition.setCharMap(map);
317: try {
318: Lex input = new Lex(dataStates, reader);
319: PosixCollationBuilder builder = new PosixCollationBuilder(
320: map);
321:
322: int s = input.nextToken();
323: while (s == EOL_TOKEN)
324: s = input.nextToken();
325: while (s == TAG_TOKEN) {
326: if (input.dataEquals("END")) {
327: break;
328: } else if (input.dataEquals("UNDEFINED")) {
329: System.err
330: .println("WARNING: Undefined characters will sort last.");
331: s = input.nextToken();
332: while (s != EOF_TOKEN && s != EOL_TOKEN) {
333: s = input.nextToken();
334: }
335: } else if (input.dataEquals("copy")) {
336: //copy collation rules from another locale
337: input.accept(TAG_TOKEN);
338: String toCopy = input.getData();
339: input.accept(EOL_TOKEN);
340: parseCopy("LC_COLLATE", toCopy, table);
341: System.err.println("Copying collation rules from "
342: + toCopy + "...");
343: } else if (input.dataEquals("...")) {
344: //fill the space between the last element and the next element
345: System.err
346: .println("ERROR: Ellipsis not supported in collation rules.");
347: System.err.println(" Line ignored");
348: } else if (input.dataEquals("replace-after")) {
349: System.err
350: .println("ERROR: Replace-after not supported in collation rules.");
351: System.err
352: .println(" Skipping until next replace-end.");
353: s = input.nextToken();
354: while (s != EOF_TOKEN) {
355: if (s == TAG_TOKEN
356: && input.dataEquals("replace-end")) {
357: input.accept(EOL_TOKEN);
358: break;
359: }
360: }
361: } else if (input.dataEquals("collating-element")) {
362: //Several characters should sort as a single element.
363: input.accept(TAG_TOKEN); //get the symbol
364: String key = input.getData();
365: input.accept(TAG_TOKEN, "from");
366: input.accept(TAG_TOKEN); //get the expansion
367: String value = input.getData();
368: builder.defineContraction(key, value);
369: input.accept(EOL_TOKEN);
370: } else if (input.dataEquals("collating-symbol")) {
371: //define a weight symbol. This symbol does not represent a character.
372: //It's only used for comparison purposes. We define the character
373: //value for this character to be in the private area since our
374: //collation stuff doesn't sort that area.
375: input.accept(TAG_TOKEN);
376: builder.defineWeightSymbol(input.getData());
377: input.accept(EOL_TOKEN);
378: } else if (input.dataEquals("order_start")) {
379: Vector tempVector = new Vector();
380: //start reading collation ordering rules.
381: input.accept(TAG_TOKEN);
382: tempVector.addElement(input.getData());
383: s = input.nextToken();
384: while (s == SEPARATOR_TOKEN) {
385: input.accept(TAG_TOKEN);
386: tempVector.addElement(input.getData());
387: s = input.nextToken();
388: }
389: String[] order_start = new String[tempVector.size()];
390: tempVector.copyInto(order_start);
391: table.put("sort_order", order_start);
392: } else if (input.dataEquals("order_end")) {
393: //build a list of ordered collation elements
394: input.accept(EOL_TOKEN);
395: SortedVector order = builder.getSortOrder();
396: PosixCollationBuilder.CollationRule[] ruleSource = new PosixCollationBuilder.CollationRule[order
397: .size()];
398: order.copyInto(ruleSource); //copy into an array so we can add it to the output table
399: //this is only for information purposes so we can retrieve the source of the
400: //collationItems with the weights if we want them later
401: table.put("posix_sort_rules", ruleSource);
402: } else {
403: //add a collation item to the list
404: builder.addRule(input.getData());
405: s = input.nextToken();
406: while (s == TAG_TOKEN) {
407: //we're expecting weights here
408: builder.addWeight(input.getData());
409: s = input.nextToken();
410: if (s == SEPARATOR_TOKEN) {
411: s = input.nextToken();
412: }
413: }
414: }
415: s = input.nextToken();
416: }
417: input.accept(TAG_TOKEN, "LC_COLLATE");
418: } finally {
419: SymbolTransition.setCharMap(map.getParent());
420: }
421: }
422: //{{DECLARE_CONTROLS
423: //}}
424: }
|