001:package org.skunk.spi;
002:
003:import java.io.BufferedReader;
004:import java.io.FileReader;
005:import java.io.IOException;
006:import java.io.Reader;
007:import java.io.StreamTokenizer;
008:import java.io.StringReader;
009:import java.util.HashMap;
010:import java.util.Iterator;
011:import java.util.Map;
012:import org.skunk.assert.Assertion;
013:import org.skunk.trace.Debug;
014:
015:/**
016: * parses a python dictionary with the following characteristics:
017: * a. all keys are strings.
018: * b. some values are dictionaries are dictionaries whose keys are
019: * strings and whose values are also strings.
020: */
021:public class DictParser
022:{
023: public static final boolean ENCODE=true;
024:
025: public static String toString(Map hm)
026: {
027: StringBuffer sb=new StringBuffer("{");
028: for (Iterator it=hm.keySet().iterator();it.hasNext();)
029: {
030: Object key=it.next();
031: Object value=hm.get(key);
032: String valStr =(value instanceof Map)
033: ? toString((Map)value)
034: : quote(value);
035: sb.append(quote(key))
036: .append(": ")
037: .append(valStr);
038: if (it.hasNext()) sb.append(",\n ");
039: }
040: sb.append("}");
041: return sb.toString();
042: }
043:
044: private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
045:
046: /**
047: * the following is modified from Jython source code:
048: * @see org.python.core.PyString.unicodeescape()
049: */
050: private static String quote(Object s)
051: {
052: String str=s.toString();
053: int size = str.length();
054: StringBuffer v = new StringBuffer(size);
055: boolean unicode = false;
056: char quote = str.indexOf('\'') >= 0 &&
057: str.indexOf('"') == -1 ? '"' : '\'';
058: v.append(quote);
059:
060: for (int i = 0; size-- > 0; )
061: {
062: int ch = str.charAt(i++);
063: /* Map 16-bit characters to '\\uxxxx' */
064: if (ENCODE && ch >= 256)
065: {
066: if (!unicode)
067: {
068: v.insert(0, 'u');
069: unicode = true;
070: }
071: v.append('\\');
072: v.append('u');
073: v.append(hexdigit[(ch >> 12) & 0xf]);
074: v.append(hexdigit[(ch >> 8) & 0xf]);
075: v.append(hexdigit[(ch >> 4) & 0xf]);
076: v.append(hexdigit[ch & 15]);
077: }
078: if (ENCODE && ch < ' ' || ch >= 128)
079: {
080: v.append('\\');
081: v.append(hexdigit[(ch >> 6) & 7]);
082: v.append(hexdigit[(ch >> 3) & 7]);
083: v.append(hexdigit[ch & 7]);
084: }
085: else if (ch == '\n') v.append("\\n");
086: else if (ch == '\t') v.append("\\t");
087: else if (ch == '\b') v.append("\\b");
088: else if (ch == '\f') v.append("\\f");
089: else if (ch == '\r') v.append("\\r");
090: else if (ch=='\\')
091: {
092: if (ENCODE && testEscape(str, i))
093: v.append('\\');
094: else
095: v.append("\\\\");
096: }
097: /* Copy everything else as-is */
098: else
099: v.append((char) ch);
100: }
101: v.append(quote);
102: return v.toString();
103: }
104:
105: private static boolean testEscape(String s, int index)
106: {
107: if (s.length()>(index+1))
108: {
109: char c=s.charAt(index+1);
110: if (c=='n'
111: || c=='t'
112: || c=='b'
113: || c=='f'
114: || c=='r'
115: )
116: return true;
117: }
118: if (s.length()>=(index+3))
119: {
120: boolean b=true;
121: for (int i=0;i<3;i++)
122: {
123: char c=s.charAt(i+index);
124: b=b && (Character.isDigit(c) );
125:// || (i==0 && c=='x'));
126: if (!b) break;
127: }
128: return b;
129: }
130: return false;
131: }
132:
133: /**
134: * parses a string representation of a Python dictionary and returns a HashMap
135: */
136: public static HashMap fromString(String s)
137: {
138: return fromReader(new StringReader(s));
139: }
140:
141: public static HashMap fromReader(Reader reader)
142: {
143: DictLexer lexer=new DictLexer(reader);
144: try
145: {
146: return lexer.dictionary();
147: }
148: catch (IOException oyVeh)
149: {
150: Debug.trace(DictParser.class, Debug.DP1, oyVeh);
151: return null;
152: }
153: }
154:
155: static class DictLexer
156: {
157: private StreamTokenizer st;
158:
159: static final int SINGLE_QUOTE='\'';
160: static final int DOUBLE_QUOTE='"';
161: static final int BACKSLASH='\\';
162: static final int COMMA=',';
163: static final int BRACELEFT='{';
164: static final int BRACERIGHT='}';
165: static final int WORD=-56;
166: static final int COLON=':';
167: static final int UNKNOWN=-99;
168:
169: DictLexer(Reader reader)
170: {
171: if (reader instanceof BufferedReader)
172: st=new StreamTokenizer(reader);
173: else st=new StreamTokenizer(new BufferedReader(reader));
174: resetSyntax();
175: }
176:
177: int next() throws IOException
178: {
179:// Debug.trace(DictLexer.class, Debug.DP4, "in next()");
180: int tok=st.nextToken();
181: int retTok=UNKNOWN;
182: switch (tok)
183: {
184: case ',':
185: retTok = COMMA;
186:// Debug.trace(DictLexer.class, Debug.DP4, "found comma");
187: break;
188: case '{':
189: retTok = BRACELEFT;
190:// Debug.trace(DictLexer.class, Debug.DP4, "found braceleft");
191: break;
192: case '}':
193: retTok = BRACERIGHT;
194:// Debug.trace(DictLexer.class, Debug.DP4, "found braceright");
195: break;
196: case '\'':
197: retTok = SINGLE_QUOTE;
198:// Debug.trace(DictLexer.class, Debug.DP4, "found single quote");
199: break;
200: case '"':
201: retTok = DOUBLE_QUOTE;
202:// Debug.trace(DictLexer.class, Debug.DP4, "found double quote");
203: break;
204: case '\\':
205: retTok = BACKSLASH;
206:// Debug.trace(DictLexer.class, Debug.DP4, "found backslash");
207: break;
208: case ':':
209: retTok=COLON;
210:// Debug.trace(DictLexer.class, Debug.DP4, "found colon");
211: break;
212: case StreamTokenizer.TT_WORD:
213: retTok = WORD;
214:// Debug.trace(DictLexer.class, Debug.DP4, "found word: "+st.sval);
215: break;
216: case StreamTokenizer.TT_EOL:
217: retTok = UNKNOWN;
218:// Debug.trace(DictLexer.class, Debug.DP4, "found unknown!");
219: break;
220: default:
221:// Debug.trace(DictLexer.class, Debug.DP4, "dropped through to default!");
222: retTok = next();
223: }
224: return retTok;
225: }
226:
227: int peek() throws IOException
228: {
229:// Debug.trace(DictLexer.class, Debug.DP4, "in peek()");
230: int i=st.nextToken();
231: st.pushBack();
232: return i;
233: }
234:
235: HashMap dictionary() throws IOException
236: {
237:// Debug.trace(DictLexer.class, Debug.DP4, "in dictionary()");
238: HashMap hm=new HashMap();
239:// Debug.trace(DictLexer.class, Debug.DP4, "about to consume a braceleft...");
240: expect(BRACELEFT, next());
241:// Debug.trace(DictLexer.class, Debug.DP4, "...braceleft consumed");
242: while (true)
243: {
244: if (peek()==BRACERIGHT)
245: break;
246: Object key, value;
247: key=string();
248:// Debug.trace(DictLexer.class, Debug.DP4, "key found: "+key);
249:// Debug.trace(DictLexer.class, Debug.DP4, "about to consume a colon...");
250: expect(COLON, next());
251:// Debug.trace(DictLexer.class, Debug.DP4, "...colon consumed");
252: value=(peek()==BRACELEFT) ? (Object) dictionary() : (Object) string();
253:// Debug.trace(DictLexer.class, Debug.DP4, "value found: "+value);
254: hm.put(key, value);
255: if (peek()==COMMA)
256: {
257:// Debug.trace(DictLexer.class, Debug.DP4, "found comma in the right place, skipping");
258: next();
259: }
260: }
261:// Debug.trace(DictLexer.class, Debug.DP4, "about to consume a braceright...");
262: expect(BRACERIGHT, next());
263:// Debug.trace(DictLexer.class, Debug.DP4, "...braceright consumed");
264: return hm;
265: }
266:
267: String string() throws IOException
268: {
269:// Debug.trace(DictLexer.class, Debug.DP4, "in string()");
270: int tok=next();
271: Assertion.assert((tok==SINGLE_QUOTE || tok==DOUBLE_QUOTE), "token is a quote: "+(char)tok);
272: String s=readUntil((char)tok);
273: expect(tok, next());
274:// Debug.trace(this, Debug.DP4, "returning string: "+s);
275: return s;
276: }
277:
278: String readUntil(char terminator) throws IOException
279: {
280: StringBuffer sb=new StringBuffer();
281: boolean found=false;
282: boolean escape=false;
283: st.resetSyntax();
284: st.eolIsSignificant(false);
285: while (st.nextToken()!=st.TT_EOF)
286: {
287: if (st.ttype==st.TT_WORD)
288: sb.append(st.sval);
289: else
290: {
291: escape= (st.ttype=='\\');
292: sb.append((char)st.ttype);
293: }
294: found=(!escape) && (sb.charAt(sb.length()-1)==terminator);
295: if (escape)
296: {
297: if (peek()=='\\')
298: {
299: next();
300: }
301: escape=false;
302: }
303:
304: if (found)
305: {
306: Debug.trace(DictLexer.class, Debug.DP4, "found terminator: " + terminator);
307: st.pushBack();
308: break;
309: }
310: }
311: resetSyntax();
312: Assertion.assert(found, "terminator found: "+terminator);
313: sb.setLength(sb.length()-1);
314: String s=unescape(sb.toString());
315:// Debug.trace(DictLexer.class, Debug.DP4, "returning from readUntil(): "+s);
316: return s;
317: }
318:
319: String unescape(String inputString)
320: {
321: int len=inputString.length();
322: StringBuffer sb=new StringBuffer(len);
323: int offset=0;
324: int slash;
325: while((slash=inputString.indexOf("\\", offset))>=0)
326: {
327: sb.append(inputString.substring(offset, slash));
328: offset=slash+1;
329: if (len>=slash+4)
330: {
331: String possibleOctal=inputString.substring(slash+1, slash+4);
332: try
333: {
334: char c=(char) Integer.parseInt(possibleOctal, 8);
335: sb.append(c);
336: offset=offset+3;
337: }
338: catch (NumberFormatException nafta)
339: {
340: sb.append("\\");
341: sb.append(possibleOctal);
342: offset=offset+3;
343: }
344: }
345: else sb.append("\\");
346: }
347: if (offset<=len-1)
348: {
349: sb.append(inputString.substring(offset));
350: }
351: return sb.toString();
352: }
353:
354: void expect(int expected, int received)
355: {
356: Assertion.assert((expected==received), expected + " equals " + received);
357: }
358:
359: private void resetSyntax()
360: {
361: st.resetSyntax();
362: st.eolIsSignificant(false);
363: st.wordChars('a', 'z');
364: st.wordChars('A', 'Z');
365: st.wordChars('0', '9');
366: st.whitespaceChars(0, 32);
367: }
368: }
369:
370: /**
371: * test method.
372: */
373: public static void main(String[] args) throws IOException
374: {
375: String filename=args[0];
376: FileReader fr=new FileReader(filename);
377: HashMap hm=fromReader(fr);
378: System.out.println(hm);
379: }
380:}
381:
382:/* $Log: DictParser.java,v $
383:/* Revision 1.7 2001/02/20 23:55:43 smulloni
384:/* fencepost error in DictParser fixed
385:/*
386:/* Revision 1.6 2000/12/21 18:53:13 smulloni
387:/* cosmetic improvements.
388:/*
389:/* Revision 1.5 2000/12/13 22:25:33 smulloni
390:/* now save catalogs explicitly in 8859_1 (ISO Latin-1) encoding
391:/*
392:/* Revision 1.4 2000/12/13 15:28:24 smulloni
393:/* removing improper check for hex escape
394:/*
395:/* Revision 1.3 2000/12/11 22:36:13 smulloni
396:/* improved support for input of UTF.
397:/*
398:/* Revision 1.2 2000/12/08 05:50:30 smulloni
399:/* fixed MessageCatalogEditor. The spi features are now a special build option,
400:/* and editors are loaded through reflection.
401:/* */
|