001: /*
002: * Copyright 2000 Finn Bock
003: *
004: * This program contains material copyrighted by:
005: * Copyright (c) Corporation for National Research Initiatives.
006: * Originally written by Marc-Andre Lemburg (mal@lemburg.com).
007: */
008:
009: package org.python.modules;
010:
011: import org.python.core.Py;
012: import org.python.core.PyInteger;
013: import org.python.core.PyObject;
014: import org.python.core.PyString;
015: import org.python.core.PyTuple;
016: import org.python.core.PyUnicode;
017: import org.python.core.codecs;
018:
019: public class _codecs {
020:
021: public static void register(PyObject search_function) {
022: codecs.register(search_function);
023: }
024:
025: public static PyTuple lookup(String encoding) {
026: return codecs.lookup(encoding);
027: }
028:
029: private static PyTuple decode_tuple(String s, int len) {
030: return new PyTuple(new PyObject[] { new PyUnicode(s),
031: Py.newInteger(len) });
032: }
033:
034: private static PyTuple encode_tuple(String s, int len) {
035: return new PyTuple(new PyObject[] { Py.java2py(s),
036: Py.newInteger(len) });
037: }
038:
039: /* --- UTF-8 Codec --------------------------------------------------- */
040:
041: public static PyTuple utf_8_decode(String str) {
042: return utf_8_decode(str, null);
043: }
044:
045: public static PyTuple utf_8_decode(String str, String errors) {
046: int size = str.length();
047: return decode_tuple(codecs.PyUnicode_DecodeUTF8(str, errors),
048: size);
049: }
050:
051: public static PyTuple utf_8_encode(String str) {
052: return utf_8_encode(str, null);
053: }
054:
055: public static PyTuple utf_8_encode(String str, String errors) {
056: int size = str.length();
057: return encode_tuple(codecs.PyUnicode_EncodeUTF8(str, errors),
058: size);
059: }
060:
061: /* --- Character Mapping Codec --------------------------------------- */
062:
063: public static PyTuple charmap_decode(String str, String errors,
064: PyObject mapping) {
065: int size = str.length();
066: StringBuffer v = new StringBuffer(size);
067:
068: for (int i = 0; i < size; i++) {
069: char ch = str.charAt(i);
070: if (ch > 0xFF) {
071: codecs.decoding_error("charmap", v, errors,
072: "ordinal not in range(255)");
073: i++;
074: continue;
075: }
076:
077: PyObject w = Py.newInteger(ch);
078: PyObject x = mapping.__finditem__(w);
079: if (x == null) {
080: /* No mapping found: default to Latin-1 mapping if possible */
081: v.append(ch);
082: continue;
083: }
084:
085: /* Apply mapping */
086: if (x instanceof PyInteger) {
087: int value = ((PyInteger) x).getValue();
088: if (value < 0 || value > 65535)
089: throw Py
090: .TypeError("character mapping must be in range(65535)");
091: v.append((char) value);
092: } else if (x == Py.None) {
093: codecs.decoding_error("charmap", v, errors,
094: "character maps to <undefined>");
095: } else if (x instanceof PyString) {
096: v.append(x.toString());
097: } else {
098: /* wrong return value */
099: throw Py
100: .TypeError("character mapping must return integer, "
101: + "None or unicode");
102: }
103: }
104: return decode_tuple(v.toString(), size);
105: }
106:
107: public static PyTuple charmap_encode(String str, String errors,
108: PyObject mapping) {
109: int size = str.length();
110: StringBuffer v = new StringBuffer(size);
111:
112: for (int i = 0; i < size; i++) {
113: char ch = str.charAt(i);
114: PyObject w = Py.newInteger(ch);
115: PyObject x = mapping.__finditem__(w);
116: if (x == null) {
117: /* No mapping found: default to Latin-1 mapping if possible */
118: if (ch < 256)
119: v.append(ch);
120: else
121: codecs.encoding_error("charmap", v, errors,
122: "missing character mapping");
123: continue;
124: }
125: if (x instanceof PyInteger) {
126: int value = ((PyInteger) x).getValue();
127: if (value < 0 || value > 255)
128: throw Py
129: .TypeError("character mapping must be in range(256)");
130: v.append((char) value);
131: } else if (x == Py.None) {
132: codecs.encoding_error("charmap", v, errors,
133: "character maps to <undefined>");
134: } else if (x instanceof PyString) {
135: v.append(x.toString());
136: } else {
137: /* wrong return value */
138: throw Py.TypeError("character mapping must return "
139: + "integer, None or unicode");
140: }
141: }
142: return encode_tuple(v.toString(), size);
143: }
144:
145: /* --- 7-bit ASCII Codec -------------------------------------------- */
146:
147: public static PyTuple ascii_decode(String str) {
148: return ascii_decode(str, null);
149: }
150:
151: public static PyTuple ascii_decode(String str, String errors) {
152: int size = str.length();
153: return decode_tuple(codecs.PyUnicode_DecodeASCII(str, size,
154: errors), size);
155: }
156:
157: public static PyTuple ascii_encode(String str) {
158: return ascii_encode(str, null);
159: }
160:
161: public static PyTuple ascii_encode(String str, String errors) {
162: int size = str.length();
163: return encode_tuple(codecs.PyUnicode_EncodeASCII(str, size,
164: errors), size);
165: }
166:
167: /* --- Latin-1 Codec -------------------------------------------- */
168:
169: public static PyTuple latin_1_decode(String str) {
170: return latin_1_decode(str, null);
171: }
172:
173: public static PyTuple latin_1_decode(String str, String errors) {
174: int size = str.length();
175: StringBuffer v = new StringBuffer(size);
176:
177: for (int i = 0; i < size; i++) {
178: char ch = str.charAt(i);
179: if (ch < 256) {
180: v.append(ch);
181: } else {
182: codecs.decoding_error("latin-1", v, errors,
183: "ordinal not in range(256)");
184: i++;
185: continue;
186: }
187: }
188:
189: return decode_tuple(v.toString(), size);
190: }
191:
192: public static PyTuple latin_1_encode(String str) {
193: return latin_1_encode(str, null);
194: }
195:
196: public static PyTuple latin_1_encode(String str, String errors) {
197: int size = str.length();
198: StringBuffer v = new StringBuffer(size);
199:
200: for (int i = 0; i < size; i++) {
201: char ch = str.charAt(i);
202: if (ch >= 256) {
203: codecs.encoding_error("latin-1", v, errors,
204: "ordinal not in range(256)");
205: } else
206: v.append(ch);
207: }
208: return encode_tuple(v.toString(), size);
209: }
210:
211: /* --- UTF16 Codec -------------------------------------------- */
212:
213: public static PyTuple utf_16_encode(String str) {
214: return utf_16_encode(str, null);
215: }
216:
217: public static PyTuple utf_16_encode(String str, String errors) {
218: return encode_tuple(encode_UTF16(str, errors, 0), str.length());
219: }
220:
221: public static PyTuple utf_16_encode(String str, String errors,
222: int byteorder) {
223: return encode_tuple(encode_UTF16(str, errors, byteorder), str
224: .length());
225: }
226:
227: public static PyTuple utf_16_le_encode(String str) {
228: return utf_16_le_encode(str, null);
229: }
230:
231: public static PyTuple utf_16_le_encode(String str, String errors) {
232: return encode_tuple(encode_UTF16(str, errors, -1), str.length());
233: }
234:
235: public static PyTuple utf_16_be_encode(String str) {
236: return utf_16_be_encode(str, null);
237: }
238:
239: public static PyTuple utf_16_be_encode(String str, String errors) {
240: return encode_tuple(encode_UTF16(str, errors, 1), str.length());
241: }
242:
243: private static String encode_UTF16(String str, String errors,
244: int byteorder) {
245: int size = str.length();
246: StringBuffer v = new StringBuffer((size + (byteorder == 0 ? 1
247: : 0)) * 2);
248:
249: if (byteorder == 0) {
250: v.append((char) 0xFE);
251: v.append((char) 0xFF);
252: }
253:
254: if (byteorder == 0 || byteorder == 1)
255: for (int i = 0; i < size; i++) {
256: char ch = str.charAt(i);
257: v.append((char) ((ch >>> 8) & 0xFF));
258: v.append((char) (ch & 0xFF));
259: }
260: else {
261: for (int i = 0; i < size; i++) {
262: char ch = str.charAt(i);
263: v.append((char) (ch & 0xFF));
264: v.append((char) ((ch >>> 8) & 0xFF));
265: }
266: }
267:
268: return v.toString();
269: }
270:
271: public static PyTuple utf_16_decode(String str) {
272: return utf_16_decode(str, null);
273: }
274:
275: public static PyTuple utf_16_decode(String str, String errors) {
276: int[] bo = new int[] { 0 };
277: return decode_tuple(decode_UTF16(str, errors, bo), str.length());
278: }
279:
280: public static PyTuple utf_16_decode(String str, String errors,
281: int byteorder) {
282: int[] bo = new int[] { byteorder };
283: return decode_tuple(decode_UTF16(str, errors, bo), str.length());
284: }
285:
286: public static PyTuple utf_16_le_decode(String str) {
287: return utf_16_le_decode(str, null);
288: }
289:
290: public static PyTuple utf_16_le_decode(String str, String errors) {
291: int[] bo = new int[] { -1 };
292: return decode_tuple(decode_UTF16(str, errors, bo), str.length());
293: }
294:
295: public static PyTuple utf_16_be_decode(String str) {
296: return utf_16_be_decode(str, null);
297: }
298:
299: public static PyTuple utf_16_be_decode(String str, String errors) {
300: int[] bo = new int[] { 1 };
301: return decode_tuple(decode_UTF16(str, errors, bo), str.length());
302: }
303:
304: public static PyTuple utf_16_ex_decode(String str) {
305: return utf_16_ex_decode(str, null);
306: }
307:
308: public static PyTuple utf_16_ex_decode(String str, String errors) {
309: return utf_16_ex_decode(str, errors, 0);
310: }
311:
312: public static PyTuple utf_16_ex_decode(String str, String errors,
313: int byteorder) {
314: int[] bo = new int[] { 0 };
315: String s = decode_UTF16(str, errors, bo);
316: return new PyTuple(new PyObject[] { Py.newString(s),
317: Py.newInteger(str.length()), Py.newInteger(bo[0]) });
318: }
319:
320: private static String decode_UTF16(String str, String errors,
321: int[] byteorder) {
322: int bo = 0;
323: if (byteorder != null)
324: bo = byteorder[0];
325:
326: int size = str.length();
327:
328: if (size % 2 != 0)
329: codecs.decoding_error("UTF16", null, errors,
330: "truncated data");
331:
332: StringBuffer v = new StringBuffer(size / 2);
333:
334: for (int i = 0; i < size; i += 2) {
335: char ch1 = str.charAt(i);
336: char ch2 = str.charAt(i + 1);
337: if (ch1 == 0xFE && ch2 == 0xFF) {
338: bo = 1;
339: continue;
340: } else if (ch1 == 0xFF && ch2 == 0xFE) {
341: bo = -1;
342: continue;
343: }
344:
345: char ch;
346: if (bo == -1)
347: ch = (char) (ch2 << 8 | ch1);
348: else
349: ch = (char) (ch1 << 8 | ch2);
350:
351: if (ch < 0xD800 || ch > 0xDFFF) {
352: v.append(ch);
353: continue;
354: }
355:
356: /* UTF-16 code pair: */
357: if (i == size - 1) {
358: codecs.decoding_error("UTF-16", v, errors,
359: "unexpected end of data");
360: continue;
361: }
362:
363: ch = str.charAt(++i);
364: if (0xDC00 <= ch && ch <= 0xDFFF) {
365: ch = str.charAt(++i);
366: if (0xD800 <= ch && ch <= 0xDBFF)
367: /* This is valid data (a UTF-16 surrogate pair), but
368: we are not able to store this information since our
369: Py_UNICODE type only has 16 bits... this might
370: change someday, even though it's unlikely. */
371: codecs.decoding_error("UTF-16", v, errors,
372: "code pairs are not supported");
373: continue;
374: }
375: codecs.decoding_error("UTF-16", v, errors,
376: "illegal encoding");
377: }
378:
379: if (byteorder != null)
380: byteorder[0] = bo;
381:
382: return v.toString();
383: }
384:
385: /* --- RawUnicodeEscape Codec ----------------------------------------- */
386:
387: public static PyTuple raw_unicode_escape_encode(String str) {
388: return raw_unicode_escape_encode(str, null);
389: }
390:
391: public static PyTuple raw_unicode_escape_encode(String str,
392: String errors) {
393: return encode_tuple(codecs.PyUnicode_EncodeRawUnicodeEscape(
394: str, errors, false), str.length());
395: }
396:
397: public static PyTuple raw_unicode_escape_decode(String str) {
398: return raw_unicode_escape_decode(str, null);
399: }
400:
401: public static PyTuple raw_unicode_escape_decode(String str,
402: String errors) {
403: return decode_tuple(codecs.PyUnicode_DecodeRawUnicodeEscape(
404: str, errors), str.length());
405: }
406:
407: /* --- UnicodeEscape Codec -------------------------------------------- */
408:
409: public static PyTuple unicode_escape_encode(String str) {
410: return unicode_escape_encode(str, null);
411: }
412:
413: public static PyTuple unicode_escape_encode(String str,
414: String errors) {
415: return encode_tuple(PyString.encode_UnicodeEscape(str, false),
416: str.length());
417: }
418:
419: public static PyTuple unicode_escape_decode(String str) {
420: return unicode_escape_decode(str, null);
421: }
422:
423: public static PyTuple unicode_escape_decode(String str,
424: String errors) {
425: int n = str.length();
426: return decode_tuple(PyString.decode_UnicodeEscape(str, 0, n,
427: errors, true), n);
428: }
429:
430: /* --- UnicodeInternal Codec ------------------------------------------ */
431:
432: public static PyTuple unicode_internal_encode(String str) {
433: return unicode_internal_encode(str, null);
434: }
435:
436: public static PyTuple unicode_internal_encode(String str,
437: String errors) {
438: return encode_tuple(str, str.length());
439: }
440:
441: public static PyTuple unicode_internal_decode(String str) {
442: return unicode_internal_decode(str, null);
443: }
444:
445: public static PyTuple unicode_internal_decode(String str,
446: String errors) {
447: return decode_tuple(str, str.length());
448: }
449:
450: }
|