001: /*
002: * Copyright (c) 1998-2008 Caucho Technology -- all rights reserved
003: *
004: * This file is part of Resin(R) Open Source
005: *
006: * Each copy or derived work must preserve the copyright notice and this
007: * notice unmodified.
008: *
009: * Resin Open Source is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU General Public License as published by
011: * the Free Software Foundation; either version 2 of the License, or
012: * (at your option) any later version.
013: *
014: * Resin Open Source is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
017: * of NON-INFRINGEMENT. See the GNU General Public License for more
018: * details.
019: *
020: * You should have received a copy of the GNU General Public License
021: * along with Resin Open Source; if not, write to the
022: *
023: * Free Software Foundation, Inc.
024: * 59 Temple Place, Suite 330
025: * Boston, MA 02111-1307 USA
026: *
027: * @author Scott Ferguson
028: */
029:
030: package com.caucho.vfs;
031:
032: import com.caucho.util.CharBuffer;
033: import com.caucho.vfs.i18n.EncodingReader;
034: import com.caucho.vfs.i18n.EncodingWriter;
035: import com.caucho.vfs.i18n.ISO8859_1Writer;
036: import com.caucho.vfs.i18n.JDKReader;
037: import com.caucho.vfs.i18n.JDKWriter;
038:
039: import java.io.InputStream;
040: import java.io.Reader;
041: import java.io.UnsupportedEncodingException;
042: import java.util.HashMap;
043: import java.util.Hashtable;
044: import java.util.Locale;
045:
046: /**
047: * Converts between the mime encoding names and Java encoding names.
048: */
049: public class Encoding {
050: static HashMap<String, String> _javaName;
051: static Hashtable<String, String> _mimeName;
052: static HashMap<String, String> _localeName;
053:
054: // map from an encoding name to its EncodingReader factory.
055: static final HashMap<String, EncodingReader> _readEncodingFactories = new HashMap<String, EncodingReader>();
056:
057: // map from an encoding name to its EncodingWriter factory.
058: static final HashMap<String, EncodingWriter> _writeEncodingFactories = new HashMap<String, EncodingWriter>();
059:
060: static final EncodingWriter _latin1Writer = new ISO8859_1Writer();
061:
/**
 * Private so the class cannot be instantiated; every member is static.
 */
private Encoding() {
  // intentionally empty
}
067:
068: /**
069: * Returns the canonical mime name for the given character encoding.
070: *
071: * @param encoding character encoding name, possibly an alias
072: *
073: * @return canonical mime name for the encoding.
074: */
075: public static String getMimeName(String encoding) {
076: if (encoding == null)
077: return null;
078:
079: String value = _mimeName.get(encoding);
080: if (value != null)
081: return value;
082:
083: String upper = normalize(encoding);
084:
085: String lookup = _mimeName.get(upper);
086:
087: value = lookup == null ? upper : lookup;
088:
089: _mimeName.put(encoding, value);
090:
091: return value;
092: }
093:
094: /**
095: * Returns the canonical mime name for the given locale.
096: *
097: * @param locale locale to use.
098: *
099: * @return canonical mime name for the encoding.
100: */
101: public static String getMimeName(Locale locale) {
102: if (locale == null)
103: return "ISO-8859-1";
104:
105: String mimeName = _localeName.get(locale.toString());
106: if (mimeName == null)
107: mimeName = _localeName.get(locale.getLanguage());
108:
109: if (mimeName == null)
110: return "ISO-8859-1";
111: else
112: return mimeName;
113: }
114:
115: /**
116: * Returns a Reader to translate bytes to characters. If a specialized
117: * reader exists in com.caucho.vfs.i18n, use it.
118: *
119: * @param is the input stream.
120: * @param encoding the encoding name.
121: *
122: * @return a reader for the translation
123: */
124: public static Reader getReadEncoding(InputStream is, String encoding)
125: throws UnsupportedEncodingException {
126: return getReadFactory(encoding).create(is);
127: }
128:
129: /**
130: * Returns a Reader to translate bytes to characters. If a specialized
131: * reader exists in com.caucho.vfs.i18n, use it.
132: *
133: * @param is the input stream.
134: * @param encoding the encoding name.
135: *
136: * @return a reader for the translation
137: */
138: public static EncodingReader getReadFactory(String encoding)
139: throws UnsupportedEncodingException {
140: EncodingReader factory = null;
141:
142: synchronized (_readEncodingFactories) {
143: factory = _readEncodingFactories.get(encoding);
144:
145: if (factory == null) {
146: try {
147: String javaEncoding = Encoding
148: .getJavaName(encoding);
149:
150: if (javaEncoding == null)
151: javaEncoding = "ISO8859_1";
152:
153: String className = "com.caucho.vfs.i18n."
154: + javaEncoding + "Reader";
155:
156: Class cl = Class.forName(className);
157:
158: factory = (EncodingReader) cl.newInstance();
159: factory.setJavaEncoding(javaEncoding);
160: } catch (Throwable e) {
161: }
162:
163: if (factory == null) {
164: String javaEncoding = Encoding
165: .getJavaName(encoding);
166:
167: if (javaEncoding == null)
168: javaEncoding = "ISO8859_1";
169:
170: factory = new JDKReader();
171: factory.setJavaEncoding(javaEncoding);
172: }
173:
174: _readEncodingFactories.put(encoding, factory);
175: }
176: }
177:
178: return factory;
179: }
180:
181: /**
182: * Returns an EncodingWriter to translate characters to bytes.
183: *
184: * @param encoding the encoding name.
185: *
186: * @return a writer for the translation
187: */
188: public static EncodingWriter getWriteEncoding(String encoding) {
189: EncodingWriter factory = _writeEncodingFactories.get(encoding);
190:
191: if (factory != null)
192: return factory.create();
193:
194: synchronized (_writeEncodingFactories) {
195: factory = _writeEncodingFactories.get(encoding);
196:
197: if (factory == null) {
198: try {
199: String javaEncoding = Encoding
200: .getJavaName(encoding);
201:
202: if (javaEncoding == null)
203: javaEncoding = "ISO8859_1";
204:
205: String className = "com.caucho.vfs.i18n."
206: + javaEncoding + "Writer";
207:
208: Class cl = Class.forName(className);
209:
210: factory = (EncodingWriter) cl.newInstance();
211: factory.setJavaEncoding(javaEncoding);
212: } catch (Throwable e) {
213: }
214:
215: if (factory == null) {
216: factory = new JDKWriter();
217: String javaEncoding = Encoding
218: .getJavaName(encoding);
219:
220: if (javaEncoding == null)
221: javaEncoding = "ISO8859_1";
222: factory.setJavaEncoding(javaEncoding);
223: }
224:
225: _writeEncodingFactories.put(encoding, factory);
226: }
227: }
228:
229: // return factory.create(factory.getJavaEncoding());
230: // charset uses the original encoding, not the java encoding
231: return factory.create(encoding);
232: }
233:
234: /**
235: * Returns the latin 1 writer.
236: */
237: public static EncodingWriter getLatin1Writer() {
238: return _latin1Writer;
239: }
240:
241: /**
242: * Returns the Java name for the given encoding.
243: *
244: * @param encoding character encoding name
245: *
246: * @return Java encoding name
247: */
248: public static String getJavaName(String encoding) {
249: if (encoding == null)
250: return null;
251:
252: String upper = normalize(encoding);
253:
254: String javaName = null;
255:
256: javaName = _javaName.get(upper);
257: if (javaName != null)
258: return javaName;
259:
260: String lookup = _mimeName.get(upper);
261:
262: if (lookup != null)
263: javaName = _javaName.get(lookup);
264:
265: return javaName == null ? upper : javaName;
266: }
267:
268: /**
269: * Returns the Java name for the given locale.
270: *
271: * @param locale the locale to use
272: *
273: * @return Java encoding name
274: */
275: public static String getJavaName(Locale locale) {
276: if (locale == null)
277: return null;
278:
279: return getJavaName(getMimeName(locale));
280: }
281:
282: /**
283: * Normalize the user's encoding name to avoid case issues.
284: */
285: private static String normalize(String name) {
286: CharBuffer cb = CharBuffer.allocate();
287:
288: int len = name.length();
289: for (int i = 0; i < len; i++) {
290: char ch = name.charAt(i);
291:
292: if (Character.isLowerCase(ch))
293: cb.append(Character.toUpperCase(ch));
294: else if (ch == '_')
295: cb.append('-');
296: else
297: cb.append(ch);
298: }
299:
300: return cb.close();
301: }
302:
303: static {
304: _javaName = new HashMap<String, String>();
305: _mimeName = new Hashtable<String, String>();
306: _localeName = new HashMap<String, String>();
307:
308: _mimeName.put("ANSI-X3.4-1968", "US-ASCII");
309: _mimeName.put("ISO-IR-6", "US-ASCII");
310: _mimeName.put("ISO-646.IRV:1991", "US-ASCII");
311: _mimeName.put("ASCII", "US-ASCII");
312: _mimeName.put("ISO646-US", "US-ASCII");
313: _mimeName.put("US-ASCII", "US-ASCII");
314: _mimeName.put("us", "US-ASCII");
315: _mimeName.put("IBM367", "US-ASCII");
316: _mimeName.put("CP367", "US-ASCII");
317: _mimeName.put("CSASCII", "US-ASCII");
318: _javaName.put("US-ASCII", "ISO8859_1");
319:
320: _mimeName.put("ISO-2022-KR", "ISO-2022-KR");
321: _mimeName.put("CSISO2022KR", "ISO-2022-KR");
322: _mimeName.put("ISO2022-KR", "ISO-2022-KR");
323: _javaName.put("ISO-2022-KR", "ISO2022_KR");
324:
325: _mimeName.put("EUC-KR", "EUC-KR");
326: _mimeName.put("CSEUCKR", "EUC-KR");
327: _javaName.put("EUC-KR", "EUC_KR");
328:
329: _mimeName.put("ISO-2022-JP", "ISO-2022-JP");
330: _mimeName.put("CSISO2022JP", "ISO-2022-JP");
331: _mimeName.put("ISO2022-JP", "ISO-2022-JP");
332: _javaName.put("ISO-2022-JP", "ISO2022JP");
333:
334: _mimeName.put("ISO-2022-JP-2", "ISO-2022-JP-2");
335: _mimeName.put("CSISO2022JP2", "ISO-2022-JP-2");
336: _mimeName.put("ISO2022-JP2", "ISO-2022-JP-2");
337: _javaName.put("ISO-2022-JP-2", "ISO2022_JP2");
338:
339: _mimeName.put("ISO_8859-1:1987", "ISO-8859-1");
340: _mimeName.put("ISO-IR-100", "ISO-8859-1");
341: _mimeName.put("ISO-8859-1", "ISO-8859-1");
342: _mimeName.put("LATIN1", "ISO-8859-1");
343: _mimeName.put("LATIN-1", "ISO-8859-1");
344: _mimeName.put("L1", "ISO-8859-1");
345: _mimeName.put("IBM819", "ISO-8859-1");
346: _mimeName.put("CP819", "ISO-8859-1");
347: _mimeName.put("CSISOLATIN1", "ISO-8859-1");
348: _mimeName.put("ISO8859-1", "ISO-8859-1");
349: _mimeName.put("8859-1", "ISO-8859-1");
350: _mimeName.put("8859_1", "ISO-8859-1");
351: _javaName.put("ISO-8859-1", "ISO8859_1");
352:
353: _mimeName.put("ISO-8859-2:1987", "ISO-8859-2");
354: _mimeName.put("ISO-IR-101", "ISO-8859-2");
355: _mimeName.put("ISO-8859-2", "ISO-8859-2");
356: _mimeName.put("LATIN2", "ISO-8859-2");
357: _mimeName.put("LATIN-2", "ISO-8859-2");
358: _mimeName.put("L2", "ISO-8859-2");
359: _mimeName.put("CSISOLATIN2", "ISO-8859-2");
360: _mimeName.put("ISO8859-2", "ISO-8859-2");
361: _javaName.put("ISO-8859-2", "ISO8859_2");
362:
363: _mimeName.put("ISO-8859-3:1988", "ISO-8859-3");
364: _mimeName.put("ISO-IR-109", "ISO-8859-3");
365: _mimeName.put("ISO-8859-3", "ISO-8859-3");
366: _mimeName.put("ISO-8859-3", "ISO-8859-3");
367: _mimeName.put("LATIN3", "ISO-8859-3");
368: _mimeName.put("LATIN-3", "ISO-8859-3");
369: _mimeName.put("L3", "ISO-8859-3");
370: _mimeName.put("CSISOLATIN3", "ISO-8859-3");
371: _mimeName.put("ISO8859-3", "ISO-8859-3");
372: _javaName.put("ISO-8859-3", "ISO8859_3");
373:
374: _mimeName.put("ISO-8859-4:1988", "ISO-8859-4");
375: _mimeName.put("ISO-IR-110", "ISO-8859-4");
376: _mimeName.put("ISO-8859-4", "ISO-8859-4");
377: _mimeName.put("ISO-8859-4", "ISO-8859-4");
378: _mimeName.put("LATIN4", "ISO-8859-4");
379: _mimeName.put("LATIN-4", "ISO-8859-4");
380: _mimeName.put("L4", "ISO-8859-4");
381: _mimeName.put("CSISOLATIN4", "ISO-8859-4");
382: _mimeName.put("ISO8859-4", "ISO-8859-4");
383: _javaName.put("ISO-8859-4", "ISO8859_4");
384:
385: _mimeName.put("ISO-8859-5:1988", "ISO-8859-5");
386: _mimeName.put("ISO-IR-144", "ISO-8859-5");
387: _mimeName.put("ISO-8859-5", "ISO-8859-5");
388: _mimeName.put("ISO-8859-5", "ISO-8859-5");
389: _mimeName.put("CYRILLIC", "ISO-8859-5");
390: _mimeName.put("CSISOLATINCYRILLIC", "ISO-8859-5");
391: _mimeName.put("ISO8859-5", "ISO-8859-5");
392: _javaName.put("ISO-8859-5", "ISO8859_5");
393:
394: _mimeName.put("ISO-8859-6:1987", "ISO-8859-6");
395: _mimeName.put("ISO-IR-127", "ISO-8859-6");
396: _mimeName.put("ISO-8859-6", "ISO-8859-6");
397: _mimeName.put("ISO-8859-6", "ISO-8859-6");
398: _mimeName.put("ECMA-114", "ISO-8859-6");
399: _mimeName.put("ASMO-708", "ISO-8859-6");
400: _mimeName.put("ARABIC", "ISO-8859-6");
401: _mimeName.put("CSISOLATINARABIC", "ISO-8859-6");
402: _mimeName.put("ISO8859-6", "ISO-8859-6");
403: _javaName.put("ISO-8859-6", "ISO8859_6");
404:
405: _mimeName.put("ISO-8859-7:1987", "ISO-8859-7");
406: _mimeName.put("ISO-IR-126", "ISO-8859-7");
407: _mimeName.put("ISO-8859-7", "ISO-8859-7");
408: _mimeName.put("ISO-8859-7", "ISO-8859-7");
409: _mimeName.put("ELOT-928", "ISO-8859-7");
410: _mimeName.put("ECMA-118", "ISO-8859-7");
411: _mimeName.put("GREEK", "ISO-8859-7");
412: _mimeName.put("GREEK8", "ISO-8859-7");
413: _mimeName.put("CSISOLATINGREEN", "ISO-8859-7");
414: _mimeName.put("ISO8859-7", "ISO-8859-7");
415: _javaName.put("ISO-8859-7", "ISO8859_7");
416:
417: _mimeName.put("ISO-8859-8:1988", "ISO-8859-8");
418: _mimeName.put("ISO-IR-138", "ISO-8859-8");
419: _mimeName.put("ISO-8859-8", "ISO-8859-8");
420: _mimeName.put("ISO-8859-8", "ISO-8859-8");
421: _mimeName.put("HEBREW", "ISO-8859-8");
422: _mimeName.put("CSISOLATINHEBREW", "ISO-8859-8");
423: _mimeName.put("ISO8859-8", "ISO-8859-8");
424: _javaName.put("ISO-8859-8", "ISO8859_8");
425:
426: _mimeName.put("ISO-8859-9:1989", "ISO-8859-9");
427: _mimeName.put("ISO-IR-148", "ISO-8859-9");
428: _mimeName.put("ISO-8859-9", "ISO-8859-9");
429: _mimeName.put("ISO-8859-9", "ISO-8859-9");
430: _mimeName.put("LATIN5", "ISO-8859-9");
431: _mimeName.put("LATIN-5", "ISO-8859-9");
432: _mimeName.put("L5", "ISO-8859-9");
433: _mimeName.put("CSISOLATIN5", "ISO-8859-9");
434: _mimeName.put("ISO8859-9", "ISO-8859-9");
435: _javaName.put("ISO-8859-9", "ISO8859_9");
436:
437: _mimeName.put("ISO_8859-10:1992", "ISO-8859-10");
438: _mimeName.put("iso-ir-157", "ISO-8859-10");
439: _mimeName.put("I6", "ISO-8859-10");
440: _mimeName.put("cslSOLatin6", "ISO-8859-10");
441: _mimeName.put("latin6", "ISO-8859-10");
442: _javaName.put("ISO-8859-10", "ISO8859_10");
443:
444: _mimeName.put("UTF-7", "UTF-7");
445: _mimeName.put("UTF7", "UTF-7");
446: _javaName.put("UTF-7", "UTF7");
447:
448: _mimeName.put("UTF-8", "UTF-8");
449: _mimeName.put("UTF8", "UTF-8");
450: _javaName.put("UTF-8", "UTF8");
451:
452: _mimeName.put("UTF-16", "UTF-16");
453: _mimeName.put("UTF16", "UTF-16");
454: _javaName.put("UTF-16", "UTF16");
455:
456: _mimeName.put("UTF-16-REV", "UTF-16-REV");
457: _mimeName.put("UTF16-REV", "UTF-16-REV");
458: _javaName.put("UTF-16-REV", "UTF16_REV");
459:
460: _mimeName.put("JIS-ENCODING", "JIS_Encoding");
461: _mimeName.put("JIS-ENCODING", "JIS_Encoding");
462: _mimeName.put("CSJISENCODING", "JIS_Encoding");
463: _javaName.put("JIS_Encoding", "JIS_ENCODING");
464:
465: _mimeName.put("SHIFT-JIS", "Shift_JIS");
466: _mimeName.put("SHIFT_JIS", "Shift_JIS");
467: _mimeName.put("CSSHIFTJIS", "Shift_JIS");
468: _mimeName.put("SJIS", "Shift_JIS");
469: _javaName.put("Shift_JIS", "SJIS");
470:
471: _mimeName.put("EUC-JP", "EUC-JP");
472: _mimeName.put("EUC-JP", "EUC-JP");
473: _mimeName.put("EUCJP", "EUC-JP");
474: _mimeName.put("EUC-JP-LINUX", "EUC-JP");
475: _javaName.put("EUC-JP", "EUC_JP");
476:
477: _mimeName.put("GB2312", "GB2312");
478: _mimeName.put("CSGB2312", "GB2312");
479: _javaName.put("GB2312", "GB2312");
480:
481: _mimeName.put("GBK", "GBK");
482: _javaName.put("GBK", "GBK");
483:
484: _mimeName.put("BIG5", "Big5");
485: _mimeName.put("BIG-5", "Big5");
486: _mimeName.put("CSBIG5", "Big5");
487: _javaName.put("Big5", "BIG5");
488:
489: _mimeName.put("KOI8-R", "KOI8-R");
490: _mimeName.put("KOI-8-R", "KOI8-R");
491: _mimeName.put("KOI8-R", "KOI8-R");
492: _javaName.put("KOI8-R", "KOI8-R");
493:
494: _mimeName.put("MS950", "ms950");
495: _javaName.put("ms950", "MS950");
496:
497: _javaName.put("JAVA", "JAVA");
498:
499: _mimeName.put("windows-hack", "ISO-8859-1");
500: _mimeName.put("WINDOWS-HACK", "ISO-8859-1");
501: _javaName.put("WINDOWS-HACK", "WindowsHack");
502:
503: _mimeName.put("MACROMAN", "MacRoman");
504: _javaName.put("MacRoman", "MacRoman");
505:
506: _mimeName.put("KS_C_5601-1987", "ks_c_5601-1987");
507: _javaName.put("ks_c_5601-1987", "Cp949");
508:
509: _javaName.put("IBM500", "Cp500");
510:
511: String[] cp = new String[] { "037", "1006", "1025", "1026",
512: "1046", "1097", "1098", "1112", "1122", "1123", "1124",
513: "1250", "1251", "1252", "1253", "1254", "1255", "1256",
514: "1257", "1258", "1381", "273", "277", "278", "280",
515: "284", "285", "297", "33722", "420", "424", "437",
516: "500", "737", "775", "838", "850", "852", "855", "857",
517: "860", "861", "862", "863", "864", "865", "866", "868",
518: "869", "870", "871", "874", "875", "918", "921", "922",
519: "930", "933", "935", "937", "939", "942", "948", "949",
520: "964", "970" };
521:
522: for (int i = 0; i < cp.length; i++) {
523: _mimeName.put("CP" + cp[i], "windows-" + cp[i]);
524: _mimeName.put("WINDOWS-" + cp[i], "windows-" + cp[i]);
525: _javaName.put("windows-" + cp[i], "Cp" + cp[i]);
526: }
527:
528: // from http://www.w3c.org/International/O-charset-lang.html
529: _localeName = new HashMap<String, String>();
530: _localeName.put("af", "ISO-8859-1");
531: _localeName.put("sq", "ISO-8859-1");
532: _localeName.put("ar", "ISO-8859-6");
533: _localeName.put("eu", "ISO-8859-1");
534: _localeName.put("bg", "ISO-8859-5");
535: _localeName.put("be", "ISO-8859-5");
536: _localeName.put("ca", "ISO-8859-1");
537: _localeName.put("hr", "ISO-8859-2");
538: _localeName.put("cs", "ISO-8859-2");
539: _localeName.put("da", "ISO-8859-1");
540: _localeName.put("nl", "ISO-8859-1");
541: _localeName.put("en", "ISO-8859-1");
542: _localeName.put("eo", "ISO-8859-3");
543: _localeName.put("et", "ISO-8859-10");
544: _localeName.put("fo", "ISO-8859-1");
545: _localeName.put("fi", "ISO-8859-1");
546: _localeName.put("fr", "ISO-8859-1");
547: _localeName.put("gl", "ISO-8859-1");
548: _localeName.put("de", "ISO-8859-1");
549: _localeName.put("el", "ISO-8859-7");
550: _localeName.put("iw", "ISO-8859-8");
551: _localeName.put("hu", "ISO-8859-2");
552: _localeName.put("is", "ISO-8859-1");
553: _localeName.put("ga", "ISO-8859-1");
554: _localeName.put("it", "ISO-8859-1");
555: _localeName.put("ja", "Shift_JIS");
556: _localeName.put("lv", "ISO-8859-10");
557: _localeName.put("lt", "ISO-8859-10");
558: _localeName.put("mk", "ISO-8859-5");
559: _localeName.put("mt", "ISO-8859-3");
560: _localeName.put("no", "ISO-8859-1");
561: _localeName.put("pl", "ISO-8859-2");
562: _localeName.put("pt", "ISO-8859-1");
563: _localeName.put("ro", "ISO-8859-2");
564: // _localeName.put("ru", "KOI8-R");
565: _localeName.put("ru", "ISO-8859-5");
566: _localeName.put("gd", "ISO-8859-1");
567: _localeName.put("sr", "ISO-8859-5");
568: _localeName.put("sk", "ISO-8859-2");
569: _localeName.put("sl", "ISO-8859-2");
570: _localeName.put("es", "ISO-8859-1");
571: _localeName.put("sv", "ISO-8859-1");
572: _localeName.put("tr", "ISO-8859-9");
573: _localeName.put("uk", "ISO-8859-5");
574:
575: _localeName.put("ko", "EUC-KR");
576: _localeName.put("zh", "GB2312");
577: _localeName.put("zh_TW", "Big5");
578: }
579: }
|