001: /*
002: * Copyright 2000 Finn Bock
003: *
004: * This program contains material copyrighted by:
005: * Copyright (c) Corporation for National Research Initiatives.
006: * Originally written by Marc-Andre Lemburg (mal@lemburg.com).
007: */
008:
009: package org.python.core;
010:
011: /**
012: * Contains the implementation of the builtin codecs.
013: * @since Jython 2.0
014: */
015:
016: public class codecs {
017: private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
018:
019: private static PyList searchPath = new PyList();
020: private static PyStringMap searchCache = new PyStringMap();
021:
022: private static String default_encoding = "ascii";
023:
024: public static String getDefaultEncoding() {
025: return default_encoding;
026: }
027:
028: public static void setDefaultEncoding(String encoding) {
029: lookup(encoding);
030: default_encoding = encoding;
031: }
032:
033: public static void register(PyObject search_function) {
034: if (!search_function.isCallable()) {
035: throw Py.TypeError("argument must be callable");
036: }
037: searchPath.append(search_function);
038: }
039:
040: public static PyTuple lookup(String encoding) {
041: import_encodings();
042: PyString v = new PyString(normalizestring(encoding));
043: PyObject result = searchCache.__finditem__(v);
044: if (result != null) {
045: return (PyTuple) result;
046: }
047:
048: if (searchPath.__len__() == 0) {
049: throw new PyException(Py.LookupError,
050: "no codec search functions registered: "
051: + "can't find encoding");
052: }
053:
054: PyObject iter = searchPath.__iter__();
055: PyObject func = null;
056: while ((func = iter.__iternext__()) != null) {
057: result = func.__call__(v);
058: if (result == Py.None) {
059: continue;
060: }
061: if (!(result instanceof PyTuple) || result.__len__() != 4) {
062: throw Py.TypeError("codec search functions must "
063: + "return 4-tuples");
064: }
065: break;
066: }
067: if (func == null) {
068: throw new PyException(Py.LookupError, "unknown encoding "
069: + encoding);
070: }
071: searchCache.__setitem__(v, result);
072: return (PyTuple) result;
073: }
074:
075: private static String normalizestring(String string) {
076: return string.toLowerCase().replace(' ', '-');
077: }
078:
079: private static boolean import_encodings_called = false;
080:
081: private static void import_encodings() {
082: if (!import_encodings_called) {
083: import_encodings_called = true;
084: try {
085: __builtin__.__import__("encodings");
086: } catch (PyException exc) {
087: if (exc.type != Py.ImportError) {
088: throw exc;
089: }
090: }
091: }
092: }
093:
094: public static String decode(PyString v, String encoding,
095: String errors) {
096: if (encoding == null) {
097: encoding = getDefaultEncoding();
098: } else {
099: encoding = normalizestring(encoding);
100: }
101:
102: if (errors != null) {
103: errors = errors.intern();
104: }
105:
106: /* Shortcuts for common default encodings */
107: /*
108: if (encoding.equals("utf-8"))
109: return utf_8_decode(v, errors).__getitem__(0).__str__();
110: else if (encoding.equals("latin-1"))
111: ; //return PyUnicode_DecodeLatin1(s, size, errors);
112: else if (encoding.equals("ascii"))
113: ; //return PyUnicode_DecodeASCII(s, size, errors);
114: */
115: if (encoding.equals("ascii")) {
116: return PyUnicode_DecodeASCII(v.toString(), v.__len__(),
117: errors);
118: }
119:
120: /* Decode via the codec registry */
121: PyObject decoder = getDecoder(encoding);
122: PyObject result = null;
123: if (errors != null) {
124: result = decoder.__call__(v, new PyString(errors));
125: } else {
126: result = decoder.__call__(v);
127: }
128:
129: if (!(result instanceof PyTuple) || result.__len__() != 2) {
130: throw Py.TypeError("decoder must return a tuple "
131: + "(object,integer)");
132: }
133: return result.__getitem__(0).toString();
134: }
135:
136: private static PyObject getDecoder(String encoding) {
137: PyObject codecs = lookup(encoding);
138: return codecs.__getitem__(1);
139: }
140:
141: public static String encode(PyString v, String encoding,
142: String errors) {
143: if (encoding == null) {
144: encoding = getDefaultEncoding();
145: } else {
146: encoding = normalizestring(encoding);
147: }
148:
149: if (errors != null) {
150: errors = errors.intern();
151: }
152:
153: /* Shortcuts for common default encodings */
154: /*
155: if (encoding.equals("utf-8"))
156: return PyUnicode_DecodeUTF8(v.toString(), v.__len__(), errors);
157: else if (encoding.equals("latin-1"))
158: return PyUnicode_DecodeLatin1(v.toString(), v.__len__(), errors);
159: else
160: */
161:
162: if (encoding.equals("ascii")) {
163: return PyUnicode_EncodeASCII(v.toString(), v.__len__(),
164: errors);
165: }
166:
167: /* Decode via the codec registry */
168: PyObject encoder = getEncoder(encoding);
169: PyObject result = null;
170: if (errors != null) {
171: result = encoder.__call__(v, new PyString(errors));
172: } else {
173: result = encoder.__call__(v);
174: }
175:
176: if (!(result instanceof PyTuple) || result.__len__() != 2) {
177: throw Py.TypeError("encoder must return a tuple "
178: + "(object,integer)");
179: }
180: return result.__getitem__(0).toString();
181: }
182:
183: private static PyObject getEncoder(String encoding) {
184: PyObject codecs = lookup(encoding);
185: return codecs.__getitem__(0);
186: }
187:
188: /* --- UTF-8 Codec ---------------------------------------------------- */
189: private static byte utf8_code_length[] = {
190: /* Map UTF-8 encoded prefix byte to sequence length. zero means
191: illegal prefix. see RFC 2279 for details */
192: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
193: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
194: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
195: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
196: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
197: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
198: 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
202: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
203: 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4,
204: 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
205:
206: public static String PyUnicode_DecodeUTF8(String str, String errors) {
207: int size = str.length();
208: StringBuffer unicode = new StringBuffer(size);
209:
210: /* Unpack UTF-8 encoded data */
211: for (int i = 0; i < size;) {
212: int ch = str.charAt(i);
213: if (ch > 0xFF) {
214: codecs.decoding_error("utf-8", unicode, errors,
215: "ordinal not in range(255)");
216: i++;
217: continue;
218: }
219:
220: if (ch < 0x80) {
221: unicode.append((char) ch);
222: i++;
223: continue;
224: }
225:
226: int n = utf8_code_length[ch];
227:
228: if (i + n > size) {
229: codecs.decoding_error("utf-8", unicode, errors,
230: "unexpected end of data");
231: i++;
232: continue;
233: }
234:
235: switch (n) {
236: case 0:
237: codecs.decoding_error("utf-8", unicode, errors,
238: "unexpected code byte");
239: i++;
240: continue;
241: case 1:
242: codecs.decoding_error("utf-8", unicode, errors,
243: "internal error");
244: i++;
245: continue;
246: case 2:
247: char ch1 = str.charAt(i + 1);
248: if ((ch1 & 0xc0) != 0x80) {
249: codecs.decoding_error("utf-8", unicode, errors,
250: "invalid data");
251: i++;
252: continue;
253: }
254: ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
255: if (ch < 0x80) {
256: codecs.decoding_error("utf-8", unicode, errors,
257: "illegal encoding");
258: i++;
259: continue;
260: } else
261: unicode.append((char) ch);
262: break;
263:
264: case 3:
265: ch1 = str.charAt(i + 1);
266: char ch2 = str.charAt(i + 2);
267: if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
268: codecs.decoding_error("utf-8", unicode, errors,
269: "invalid data");
270: i++;
271: continue;
272: }
273: ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6)
274: + (ch2 & 0x3f);
275: if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
276: codecs.decoding_error("utf-8", unicode, errors,
277: "illegal encoding");
278: i++;
279: continue;
280: } else
281: unicode.append((char) ch);
282: break;
283:
284: case 4:
285: ch1 = str.charAt(i + 1);
286: ch2 = str.charAt(i + 2);
287: char ch3 = str.charAt(i + 3);
288: if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80
289: || (ch3 & 0xc0) != 0x80) {
290: codecs.decoding_error("utf-8", unicode, errors,
291: "invalid data");
292: i++;
293: continue;
294: }
295: ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12)
296: + ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
297: /* validate and convert to UTF-16 */
298: if ((ch < 0x10000) || /* minimum value allowed for 4
299: byte encoding */
300: (ch > 0x10ffff)) { /* maximum value allowed for
301: UTF-16 */
302: codecs.decoding_error("utf-8", unicode, errors,
303: "illegal encoding");
304: i++;
305: continue;
306: }
307: /* compute and append the two surrogates: */
308:
309: /* translate from 10000..10FFFF to 0..FFFF */
310: ch -= 0x10000;
311:
312: /* high surrogate = top 10 bits added to D800 */
313: unicode.append((char) (0xD800 + (ch >> 10)));
314:
315: /* low surrogate = bottom 10 bits added to DC00 */
316: unicode.append((char) (0xDC00 + (ch & ~0xFC00)));
317: break;
318:
319: default:
320: /* Other sizes are only needed for UCS-4 */
321: codecs.decoding_error("utf-8", unicode, errors,
322: "unsupported Unicode code range");
323: i++;
324: }
325: i += n;
326: }
327:
328: return unicode.toString();
329: }
330:
331: public static String PyUnicode_EncodeUTF8(String str, String errors) {
332: int size = str.length();
333: StringBuffer v = new StringBuffer(size * 3);
334:
335: for (int i = 0; i < size;) {
336: int ch = str.charAt(i++);
337: if (ch < 0x80) {
338: v.append((char) ch);
339: } else if (ch < 0x0800) {
340: v.append((char) (0xc0 | (ch >> 6)));
341: v.append((char) (0x80 | (ch & 0x3f)));
342: } else {
343: if (0xD800 <= ch && ch <= 0xDFFF) {
344: if (i != size) {
345: int ch2 = str.charAt(i);
346: if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
347: /* combine the two values */
348: ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
349:
350: v.append((char) ((ch >> 18) | 0xf0));
351: v
352: .append((char) (0x80 | ((ch >> 12) & 0x3f)));
353: i++;
354: }
355: }
356: } else {
357: v.append((char) (0xe0 | (ch >> 12)));
358: }
359: v.append((char) (0x80 | ((ch >> 6) & 0x3f)));
360: v.append((char) (0x80 | (ch & 0x3f)));
361: }
362: }
363: return v.toString();
364: }
365:
366: /* --- 7-bit ASCII Codec -------------------------------------------- */
367:
368: public static String PyUnicode_DecodeASCII(String str, int size,
369: String errors) {
370: StringBuffer v = new StringBuffer(size);
371:
372: for (int i = 0; i < size; i++) {
373: char ch = str.charAt(i);
374: if (ch < 128) {
375: v.append(ch);
376: } else {
377: decoding_error("ascii", v, errors,
378: "ordinal not in range(128)");
379: continue;
380: }
381: }
382:
383: return v.toString();
384: }
385:
386: public static String PyUnicode_EncodeASCII(String str, int size,
387: String errors) {
388: StringBuffer v = new StringBuffer(size);
389:
390: for (int i = 0; i < size; i++) {
391: char ch = str.charAt(i);
392: if (ch >= 128) {
393: encoding_error("ascii", v, errors,
394: "ordinal not in range(128)");
395: } else {
396: v.append(ch);
397: }
398: }
399: return v.toString();
400: }
401:
402: /* --- RawUnicodeEscape Codec ---------------------------------------- */
403:
404: private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
405:
406: // The modified flag is used by cPickle.
407: public static String PyUnicode_EncodeRawUnicodeEscape(String str,
408: String errors, boolean modifed) {
409:
410: int size = str.length();
411: StringBuffer v = new StringBuffer(str.length());
412:
413: for (int i = 0; i < size; i++) {
414: char ch = str.charAt(i);
415: if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) {
416: v.append("\\u");
417: v.append(hexdigit[(ch >>> 12) & 0xF]);
418: v.append(hexdigit[(ch >>> 8) & 0xF]);
419: v.append(hexdigit[(ch >>> 4) & 0xF]);
420: v.append(hexdigit[ch & 0xF]);
421: } else {
422: v.append(ch);
423: }
424: }
425:
426: return v.toString();
427: }
428:
429: public static String PyUnicode_DecodeRawUnicodeEscape(String str,
430: String errors) {
431: int size = str.length();
432: StringBuffer v = new StringBuffer(size);
433:
434: for (int i = 0; i < size;) {
435: char ch = str.charAt(i);
436:
437: /* Non-escape characters are interpreted as Unicode ordinals */
438: if (ch != '\\') {
439: v.append(ch);
440: i++;
441: continue;
442: }
443:
444: /* \\u-escapes are only interpreted iff the number of leading
445: backslashes is odd */
446: int bs = i;
447: while (i < size) {
448: ch = str.charAt(i);
449: if (ch != '\\')
450: break;
451: v.append(ch);
452: i++;
453: }
454: if (((i - bs) & 1) == 0 || i >= size || ch != 'u') {
455: continue;
456: }
457: v.setLength(v.length() - 1);
458: i++;
459:
460: /* \\uXXXX with 4 hex digits */
461: int x = 0;
462: for (int j = 0; j < 4; j++) {
463: ch = str.charAt(i + j);
464: int d = Character.digit(ch, 16);
465: if (d == -1) {
466: codecs.decoding_error("unicode escape", v, errors,
467: "truncated \\uXXXX");
468: break;
469: }
470: x = ((x << 4) & ~0xF) + d;
471: }
472: i += 4;
473: v.append((char) x);
474: }
475: return v.toString();
476: }
477:
478: /* --- Utility methods -------------------------------------------- */
479:
480: public static void encoding_error(String type, StringBuffer dest,
481: String errors, String details) {
482: if (errors == null || errors == "strict") {
483: throw Py.UnicodeError(type + " encoding error: " + details);
484: } else if (errors == "ignore") {
485: //ignore
486: } else if (errors == "replace") {
487: dest.append('?');
488: } else {
489: throw Py.ValueError(type + " encoding error; "
490: + "unknown error handling code: " + errors);
491: }
492: }
493:
494: public static void decoding_error(String type, StringBuffer dest,
495: String errors, String details) {
496: if (errors == null || errors == "strict") {
497: throw Py.UnicodeError(type + " decoding error: " + details);
498: } else if (errors == "ignore") {
499: //ignore
500: } else if (errors == "replace") {
501: if (dest != null) {
502: dest.append(Py_UNICODE_REPLACEMENT_CHARACTER);
503: }
504: } else {
505: throw Py.ValueError(type + " decoding error; "
506: + "unknown error handling code: " + errors);
507: }
508: }
509: }
|