001: /*
002: * Java HTML Tidy - JTidy
003: * HTML parser and pretty printer
004: *
005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
006: * Institute of Technology, Institut National de Recherche en
007: * Informatique et en Automatique, Keio University). All Rights
008: * Reserved.
009: *
010: * Contributing Author(s):
011: *
012: * Dave Raggett <dsr@w3.org>
013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
014: * Gary L Peskin <garyp@firstech.com> (Java development)
015: * Sami Lempinen <sami@lempinen.net> (release management)
016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
017: *
018: * The contributing author(s) would like to thank all those who
019: * helped with testing, bug fixes, and patience. This wouldn't
020: * have been possible without all of you.
021: *
022: * COPYRIGHT NOTICE:
023: *
024: * This software and documentation is provided "as is," and
025: * the copyright holders and contributing author(s) make no
026: * representations or warranties, express or implied, including
027: * but not limited to, warranties of merchantability or fitness
028: * for any particular purpose or that the use of the software or
029: * documentation will not infringe any third party patents,
030: * copyrights, trademarks or other rights.
031: *
032: * The copyright holders and contributing author(s) will not be
033: * liable for any direct, indirect, special or consequential damages
034: * arising out of any use of the software or documentation, even if
035: * advised of the possibility of such damage.
036: *
037: * Permission is hereby granted to use, copy, modify, and distribute
038: * this source code, or portions hereof, documentation and executables,
039: * for any purpose, without fee, subject to the following restrictions:
040: *
041: * 1. The origin of this source code must not be misrepresented.
042: * 2. Altered versions must be plainly marked as such and must
043: * not be misrepresented as being the original source.
044: * 3. This Copyright notice may not be removed or altered from any
045: * source or altered source distribution.
046: *
047: * The copyright holders and contributing author(s) specifically
048: * permit, without fee, and encourage the use of this source code
049: * as a component for supporting the Hypertext Markup Language in
050: * commercial products. If you use this source code in a product,
051: * acknowledgment is not required but would be appreciated.
052: *
053: */
054: package org.w3c.tidy;
055:
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
058:
/**
 * Maps between Java and IANA character encoding names. Also handles the encoding aliases used by the C version of
 * tidy.
 * <p>
 * Lookups are case-insensitive. Before the lookup a handful of common vendor prefixes (IBM, CP, WINDOWS-, ...) are
 * stripped from the requested name, so the table only needs one entry per code page number.
 * </p>
 * @author Fabrizio Giustina
 * @version $Revision: 1.2 $ ($Author: fgiust $)
 * @see <a href="http://www.iana.org/assignments/character-sets">IANA character sets registry</a>
 */
public abstract class EncodingNameMapper {

    /**
     * Lookup table: normalized alias (upper case, common prefixes removed) - {standard iana, standard java}. Filled
     * once by the static initializer below and only read afterwards.
     */
    private static final Map encodingNameMap = new HashMap();

    static {
        // ISO-8859-1 (Latin 1)
        register("ISO-8859-1", "ISO-8859-1", "ISO8859_1");
        register("ISO8859_1", "ISO-8859-1", "ISO8859_1");
        register("ISO-IR-100", "ISO-8859-1", "ISO8859_1");
        register("LATIN1", "ISO-8859-1", "ISO8859_1");
        register("CSISOLATIN1", "ISO-8859-1", "ISO8859_1");
        register("L1", "ISO-8859-1", "ISO8859_1");
        register("819", "ISO-8859-1", "ISO8859_1");

        // US-ASCII
        register("US-ASCII", "US-ASCII", "ASCII");
        register("ASCII", "US-ASCII", "ASCII");
        register("ISO-IR-6", "US-ASCII", "ASCII");
        register("CSASCII", "US-ASCII", "ASCII");
        register("ISO646-US", "US-ASCII", "ASCII");
        register("US", "US-ASCII", "ASCII");
        register("367", "US-ASCII", "ASCII");

        // Unicode
        register("UTF-8", "UTF-8", "UTF8");
        register("UTF8", "UTF-8", "UTF8");
        register("UTF-16", "UTF-16", "Unicode");
        register("UNICODE", "UTF-16", "Unicode");
        register("UTF16", "UTF-16", "Unicode"); // tidy

        register("UTF-16BE", "UTF-16BE", "UnicodeBig");
        register("UNICODEBIG", "UTF-16BE", "UnicodeBig");
        register("UTF16-BE", "UTF-16BE", "UnicodeBig");
        register("UTF-16LE", "UTF-16LE", "UnicodeLittle");
        register("UNICODELITTLE", "UTF-16LE", "UnicodeLittle");
        register("UTF16-LE", "UTF-16LE", "UnicodeLittle");
        register("UTF16BE", "UTF-16BE", "UnicodeBig"); // tidy
        register("UTF16LE", "UTF-16LE", "UnicodeLittle"); // tidy

        // Big5
        register("BIG5", "BIG5", "Big5");
        register("CSBIG5", "BIG5", "Big5");

        // Shift_JIS
        register("SJIS", "SHIFT_JIS", "SJIS");
        register("SHIFT_JIS", "SHIFT_JIS", "SJIS");
        register("CSSHIFTJIS", "CSSHIFTJIS", "SJIS");
        register("MS_KANJI", "MS_KANJI", "SJIS");
        register("SHIFTJIS", "SHIFT_JIS", "SJIS"); // tidy

        // ISO-2022 family
        register("JIS", "ISO-2022-JP", "JIS");
        register("ISO-2022-JP", "ISO-2022-JP", "JIS");
        register("CSISO2022JP", "CSISO2022JP", "JIS");
        register("ISO2022", "ISO-2022-JP", "JIS"); // tidy

        register("ISO2022KR", "ISO-2022-KR", "ISO2022KR");
        register("ISO-2022-KR", "ISO-2022-KR", "ISO2022KR");
        register("CSISO2022KR", "CSISO2022KR", "ISO2022KR");
        register("ISO-2022-CN", "ISO-2022-CN", "ISO2022CN");
        register("ISO2022CN", "ISO-2022-CN", "ISO2022CN");

        // Macintosh
        register("MACROMAN", "macintosh", "MacRoman"); // tidy
        register("MACINTOSH", "macintosh", "MacRoman");
        register("MACINTOSH ROMAN", "macintosh", "MacRoman");

        // IBM / Windows code pages. Keys are bare numbers because handlecommonAlias() strips the vendor prefix.
        register("37", "IBM037", "CP037");
        register("273", "IBM273", "CP273");
        register("277", "IBM277", "CP277");
        register("278", "IBM278", "CP278");
        register("280", "IBM280", "CP280");
        register("284", "IBM284", "CP284");
        register("285", "IBM285", "CP285");
        register("290", "IBM290", "CP290");
        register("297", "IBM297", "CP297");
        register("420", "IBM420", "CP420");
        register("424", "IBM424", "CP424");
        register("437", "IBM437", "CP437");
        register("500", "IBM500", "CP500");
        register("775", "IBM775", "CP775");
        register("850", "IBM850", "CP850");
        register("852", "IBM852", "CP852");
        register("CSPCP852", "IBM852", "CP852");
        register("855", "IBM855", "CP855");
        register("857", "IBM857", "CP857");
        register("858", "IBM00858", "Cp858");
        register("0858", "IBM00858", "Cp858");
        register("860", "IBM860", "CP860");
        register("861", "IBM861", "CP861");
        register("IS", "IBM861", "CP861");
        register("862", "IBM862", "CP862");
        register("863", "IBM863", "CP863");
        register("864", "IBM864", "CP864");
        register("865", "IBM865", "CP865");
        register("866", "IBM866", "CP866");
        register("868", "IBM868", "CP868");
        register("AR", "IBM868", "CP868");
        register("869", "IBM869", "CP869");
        register("GR", "IBM869", "CP869");
        register("870", "IBM870", "CP870");
        register("871", "IBM871", "CP871");
        register("EBCDIC-CP-IS", "IBM871", "CP871");
        register("918", "CP918", "CP918");
        register("924", "IBM00924", "CP924");
        register("0924", "IBM00924", "CP924");
        register("1026", "IBM1026", "CP1026");
        register("1047", "IBM1047", "Cp1047");
        register("1140", "IBM01140", "Cp1140");
        register("1141", "IBM01141", "Cp1141");
        register("1142", "IBM01142", "Cp1142");
        register("1143", "IBM01143", "Cp1143");
        register("1144", "IBM01144", "Cp1144");
        register("1145", "IBM01145", "Cp1145");
        register("1146", "IBM01146", "Cp1146");
        register("1147", "IBM01147", "Cp1147");
        register("1148", "IBM01148", "Cp1148");
        register("1149", "IBM01149", "Cp1149");
        register("1250", "WINDOWS-1250", "Cp1250");
        register("1251", "WINDOWS-1251", "Cp1251");
        register("1252", "WINDOWS-1252", "Cp1252");
        register("WIN1252", "WINDOWS-1252", "Cp1252"); // tidy
        register("1253", "WINDOWS-1253", "Cp1253");
        register("1254", "WINDOWS-1254", "Cp1254");
        register("1255", "WINDOWS-1255", "Cp1255");
        register("1256", "WINDOWS-1256", "Cp1256");
        register("1257", "WINDOWS-1257", "Cp1257");
        register("1258", "WINDOWS-1258", "Cp1258");

        // East Asian encodings
        register("EUC-JP", "EUC-JP", "EUCJIS");
        register("EUCJIS", "EUC-JP", "EUCJIS");
        register("EUC-KR", "EUC-KR", "KSC5601");
        register("KSC5601", "EUC-KR", "KSC5601");
        register("GB2312", "GB2312", "GB2312");
        register("CSGB2312", "GB2312", "GB2312");
        register("X0201", "X0201", "JIS0201");
        register("JIS0201", "X0201", "JIS0201");
        register("X0208", "X0208", "JIS0208");
        register("ISO-IR-87", "ISO-IR-87", "JIS0208");
        // JIS0208 used to be registered twice (X0208, then ISO-IR-87); the second registration won, so that
        // effective value is the one kept here.
        register("JIS0208", "ISO-IR-87", "JIS0208");
        register("X0212", "X0212", "JIS0212");
        register("JIS0212", "X0212", "JIS0212");
        register("ISO-IR-159", "X0212", "JIS0212");
        register("GB18030", "GB18030", "GB18030");

        register("936", "GBK", "GBK");
        register("MS936", "GBK", "GBK");

        register("MS932", "WINDOWS-31J", "MS932");
        // "WINDOWS-31J" reaches the table as "31J" because handlecommonAlias() strips the "WINDOWS-" prefix.
        register("31J", "WINDOWS-31J", "MS932");
        register("CSWINDOWS31J", "WINDOWS-31J", "MS932");
        register("TIS-620", "TIS-620", "TIS620");
        register("TIS620", "TIS-620", "TIS620");

        // ISO-8859-2 (Latin 2)
        register("ISO-8859-2", "ISO-8859-2", "ISO8859_2");
        register("ISO8859_2", "ISO-8859-2", "ISO8859_2");
        register("ISO-IR-101", "ISO-8859-2", "ISO8859_2");
        register("LATIN2", "ISO-8859-2", "ISO8859_2");
        register("L2", "ISO-8859-2", "ISO8859_2");

        // ISO-8859-3 (Latin 3)
        register("ISO-8859-3", "ISO-8859-3", "ISO8859_3");
        register("ISO8859_3", "ISO-8859-3", "ISO8859_3");
        register("ISO-IR-109", "ISO-8859-3", "ISO8859_3");
        register("LATIN3", "ISO-8859-3", "ISO8859_3");
        register("L3", "ISO-8859-3", "ISO8859_3");

        // ISO-8859-4 (Latin 4)
        register("ISO-8859-4", "ISO-8859-4", "ISO8859_4");
        register("ISO8859_4", "ISO-8859-4", "ISO8859_4");
        register("ISO-IR-110", "ISO-8859-4", "ISO8859_4");
        register("LATIN4", "ISO-8859-4", "ISO8859_4");
        register("L4", "ISO-8859-4", "ISO8859_4");

        // ISO-8859-5 (Cyrillic)
        register("ISO-8859-5", "ISO-8859-5", "ISO8859_5");
        register("ISO8859_5", "ISO-8859-5", "ISO8859_5");
        register("ISO-IR-144", "ISO-8859-5", "ISO8859_5");
        register("CYRILLIC", "ISO-8859-5", "ISO8859_5");

        // ISO-8859-6 (Arabic)
        register("ISO-8859-6", "ISO-8859-6", "ISO8859_6");
        register("ISO8859_6", "ISO-8859-6", "ISO8859_6");
        register("ISO-IR-127", "ISO-8859-6", "ISO8859_6");
        register("ARABIC", "ISO-8859-6", "ISO8859_6");

        // ISO-8859-7 (Greek)
        register("ISO-8859-7", "ISO-8859-7", "ISO8859_7");
        register("ISO8859_7", "ISO-8859-7", "ISO8859_7");
        register("ISO-IR-126", "ISO-8859-7", "ISO8859_7");
        register("GREEK", "ISO-8859-7", "ISO8859_7");

        // ISO-8859-8 (Hebrew)
        register("ISO-8859-8", "ISO-8859-8", "ISO8859_8");
        register("ISO8859_8", "ISO-8859-8", "ISO8859_8");
        register("ISO-8859-8-I", "ISO-8859-8", "ISO8859_8");
        register("ISO-IR-138", "ISO-8859-8", "ISO8859_8");
        register("HEBREW", "ISO-8859-8", "ISO8859_8");
        register("CSISOLATINHEBREW", "ISO-8859-8", "ISO8859_8");

        // ISO-8859-9 (Latin 5)
        register("ISO-8859-9", "ISO-8859-9", "ISO8859_9");
        register("ISO8859_9", "ISO-8859-9", "ISO8859_9");
        register("ISO-IR-148", "ISO-8859-9", "ISO8859_9");
        register("LATIN5", "ISO-8859-9", "ISO8859_9");
        register("CSISOLATIN5", "ISO-8859-9", "ISO8859_9");
        register("L5", "ISO-8859-9", "ISO8859_9");

        // ISO-8859-15 (Latin 9)
        register("ISO-8859-15", "ISO-8859-15", "ISO8859_15");
        register("ISO8859_15", "ISO-8859-15", "ISO8859_15");

        // KOI8-R
        register("KOI8-R", "KOI8-R", "KOI8_R");
        register("KOI8_R", "KOI8-R", "KOI8_R");
        register("CSKOI8R", "CSKOI8R", "KOI8_R");
    }

    /**
     * Adds one entry to the lookup table.
     * @param alias lookup key; must already be in the form produced by {@link #handlecommonAlias(String)} (upper
     * case, common prefixes removed), otherwise it can never be matched
     * @param ianaName name returned by {@link #toIana(String)} for this alias
     * @param javaName name returned by {@link #toJava(String)} for this alias
     */
    private static void register(String alias, String ianaName, String javaName) {
        encodingNameMap.put(alias, new String[] { ianaName, javaName });
    }

    /**
     * Finds the table entry for the given encoding name.
     * @param encoding encoding name or alias, may be <code>null</code>
     * @return a two-element array {iana name, java name}, or <code>null</code> if <code>encoding</code> is
     * <code>null</code> or unknown
     */
    private static String[] lookup(String encoding) {
        if (encoding == null) {
            return null;
        }
        return (String[]) encodingNameMap.get(handlecommonAlias(encoding));
    }

    /**
     * Convert a Java character encoding name to its IANA equivalent.
     * @param encoding java encoding name or alias (case-insensitive), may be <code>null</code>
     * @return iana equivalent or <code>null</code> if no match is found.
     */
    public static String toIana(String encoding) {
        String[] names = lookup(encoding);
        return names == null ? null : names[0];
    }

    /**
     * "Fix" the name for common alias to reduce the number of entries needed in the hashmap. The name is converted
     * to upper case and then the first matching rule of the following list is applied:
     * <ul>
     * <li>CSIBM, CCSID prefixes are removed</li>
     * <li>IBM-, IBM0, CP-0 prefixes are removed</li>
     * <li>IBM, CP0, CP- prefixes are removed</li>
     * <li>CP prefix is removed</li>
     * <li>WINDOWS- prefix is removed</li>
     * <li>ISO_ prefix is replaced by ISO-</li>
     * </ul>
     * Longer prefixes are tested before the shorter ones they contain, so the order of the checks matters.
     * @param encoding encoding name, must not be <code>null</code>
     * @return "fixed" encoding.
     */
    private static String handlecommonAlias(String encoding) {
        // Fixed locale: with the default locale a Turkish environment would upper-case "i" to a dotted capital I
        // and names such as "iso-8859-1" would never match the table.
        String key = encoding.toUpperCase(Locale.ENGLISH);

        // handle common alias
        if (key.startsWith("CSIBM") || key.startsWith("CCSID")) {
            key = key.substring(5);
        } else if (key.startsWith("IBM-") || key.startsWith("IBM0") || key.startsWith("CP-0")) {
            key = key.substring(4);
        } else if (key.startsWith("IBM") || key.startsWith("CP0") || key.startsWith("CP-")) {
            key = key.substring(3);
        } else if (key.startsWith("CP")) {
            key = key.substring(2);
        } else if (key.startsWith("WINDOWS-")) {
            key = key.substring(8);
        } else if (key.startsWith("ISO_")) {
            key = "ISO-" + key.substring(4);
        }

        return key;
    }

    /**
     * Converts an encoding name to the standard java name. Handles IANA names, legacy names used in tidy and different
     * java encoding alias. See http://www.iana.org/assignments/character-sets.
     * @param encoding IANA encoding name or alias (case-insensitive), may be <code>null</code>
     * @return java equivalent or <code>null</code> if no match is found.
     */
    public static String toJava(String encoding) {
        String[] names = lookup(encoding);
        return names == null ? null : names[1];
    }
}
|