001: /**
002: *******************************************************************************
003: * Copyright (C) 2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: *
007: *******************************************************************************
008: */package com.ibm.icu.charset;
009:
010: import java.io.ByteArrayInputStream;
011: import java.io.InputStreamReader;
012: import java.lang.reflect.Constructor;
013:
014: import java.lang.reflect.InvocationTargetException;
015: import java.nio.charset.Charset;
016: import java.nio.charset.IllegalCharsetNameException;
017: import java.nio.charset.UnsupportedCharsetException;
018: import java.util.HashMap;
019:
020: import com.ibm.icu.lang.UCharacter;
021:
022: /**
023: * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
024: * This API is used to convert codepage or character encoded data to and
025: * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that
026: * converter, you can get its properties, set options, convert your data.</p>
027: *
028: * <p>Since many software programs recogize different converter names for
029: * different types of converters, there are other functions in this API to
030: * iterate over the converter aliases.
031: *
032: * @draft ICU 3.6
033: * @provisional This API might change or be removed in a future release.
034: */
035: public abstract class CharsetICU extends Charset {
036:
037: String icuCanonicalName;
038: String javaCanonicalName;
039: int options;
040:
041: float maxCharsPerByte;
042:
043: boolean useFallback;
044:
045: String name; /* +4: 60 internal name of the converter- invariant chars */
046:
047: int codepage; /* +64: 4 codepage # (now IBM-$codepage) */
048:
049: byte platform; /* +68: 1 platform of the converter (only IBM now) */
050: byte conversionType; /* +69: 1 conversion type */
051:
052: int minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
053: int maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */
054:
055: byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */
056: byte subCharLen; /* +76: 1 */
057:
058: byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
059: byte hasFromUnicodeFallback; /* +78: 1 */
060: short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
061: byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
062: byte reserved[/*19*/]; /* +81: 19 to round out the structure */
063:
064: /**
065: *
066: * @param icuCanonicalName
067: * @param canonicalName
068: * @param aliases
069: * @draft ICU 3.6
070: * @provisional This API might change or be removed in a future release.
071: */
072: protected CharsetICU(String icuCanonicalName, String canonicalName,
073: String[] aliases) {
074: super (canonicalName, aliases);
075: if (canonicalName.length() == 0) {
076: throw new IllegalCharsetNameException(canonicalName);
077: }
078: this .javaCanonicalName = canonicalName;
079: this .icuCanonicalName = icuCanonicalName;
080: }
081:
082: /**
083: * Ascertains if a charset is a sub set of this charset
084: * Implements the abstract method of super class.
085: * @param cs charset to test
086: * @return true if the given charset is a subset of this charset
087: * @stable ICU 3.6
088: */
089: public boolean contains(Charset cs) {
090: if (null == cs) {
091: return false;
092: } else if (this .equals(cs)) {
093: return true;
094: }
095: return false;
096: }
097:
098: private static final HashMap algorithmicCharsets = new HashMap();
099: static {
100: /*algorithmicCharsets.put("BOCU-1", "com.ibm.icu.charset.CharsetBOCU1" );
101: algorithmicCharsets.put("CESU-8", "com.ibm.icu.charset.CharsetCESU8" );
102: algorithmicCharsets.put("HZ", "com.ibm.icu.charset.CharsetHZ" );
103: algorithmicCharsets.put("imapmailboxname", "com.ibm.icu.charset.CharsetIMAP" );
104: algorithmicCharsets.put("ISCII", "com.ibm.icu.charset.CharsetISCII" );
105: algorithmicCharsets.put("iso2022", "com.ibm.icu.charset.CharsetISO2022" );*/
106: /*algorithmicCharsets.put("lmbcs1", "com.ibm.icu.charset.CharsetLMBCS1" );
107: algorithmicCharsets.put("lmbcs11", "com.ibm.icu.charset.CharsetLMBCS11" );
108: algorithmicCharsets.put("lmbcs16", "com.ibm.icu.charset.CharsetLMBCS16" );
109: algorithmicCharsets.put("lmbcs17", "com.ibm.icu.charset.CharsetLMBCS17" );
110: algorithmicCharsets.put("lmbcs18", "com.ibm.icu.charset.CharsetLMBCS18" );
111: algorithmicCharsets.put("lmbcs19", "com.ibm.icu.charset.CharsetLMBCS19" );
112: algorithmicCharsets.put("lmbcs2", "com.ibm.icu.charset.CharsetLMBCS2" );
113: algorithmicCharsets.put("lmbcs3", "com.ibm.icu.charset.CharsetLMBCS3" );
114: algorithmicCharsets.put("lmbcs4", "com.ibm.icu.charset.CharsetLMBCS4" );
115: algorithmicCharsets.put("lmbcs5", "com.ibm.icu.charset.CharsetLMBCS5" );
116: algorithmicCharsets.put("lmbcs6", "com.ibm.icu.charset.CharsetLMBCS6" );
117: algorithmicCharsets.put("lmbcs8", "com.ibm.icu.charset.CharsetLMBCS8" )
118: algorithmicCharsets.put("scsu", "com.ibm.icu.charset.CharsetSCSU" ); */
119: algorithmicCharsets.put("US-ASCII",
120: "com.ibm.icu.charset.CharsetASCII");
121: algorithmicCharsets.put("ISO-8859-1",
122: "com.ibm.icu.charset.Charset88591");
123: algorithmicCharsets.put("UTF-16",
124: "com.ibm.icu.charset.CharsetUTF16");
125: algorithmicCharsets.put("UTF-16BE",
126: "com.ibm.icu.charset.CharsetUTF16");
127: algorithmicCharsets.put("UTF-16LE",
128: "com.ibm.icu.charset.CharsetUTF16LE");
129: algorithmicCharsets.put("UTF16_OppositeEndian",
130: "com.ibm.icu.charset.CharsetUTF16LE");
131: algorithmicCharsets.put("UTF16_PlatformEndian",
132: "com.ibm.icu.charset.CharsetUTF16");
133: algorithmicCharsets.put("UTF-32",
134: "com.ibm.icu.charset.CharsetUTF32");
135: algorithmicCharsets.put("UTF-32BE",
136: "com.ibm.icu.charset.CharsetUTF32");
137: algorithmicCharsets.put("UTF-32LE",
138: "com.ibm.icu.charset.CharsetUTF32LE");
139: algorithmicCharsets.put("UTF32_PlatformEndian",
140: "com.ibm.icu.charset.CharsetUTF32LE");
141: algorithmicCharsets.put("UTF32_OppositeEndian",
142: "com.ibm.icu.charset.CharsetUTF32");
143: algorithmicCharsets.put("UTF-7",
144: "com.ibm.icu.charset.CharsetUTF7");
145: algorithmicCharsets.put("UTF-8",
146: "com.ibm.icu.charset.CharsetUTF8");
147: }
148:
149: /*public*/static final Charset getCharset(String icuCanonicalName,
150: String javaCanonicalName, String[] aliases) {
151: String className = (String) algorithmicCharsets
152: .get(icuCanonicalName);
153: if (className == null) {
154: //all the cnv files are loaded as MBCS
155: className = "com.ibm.icu.charset.CharsetMBCS";
156: }
157: try {
158: CharsetICU conv = null;
159: Class cs = Class.forName(className);
160: Class[] paramTypes = new Class[] { String.class,
161: String.class, String[].class };
162: final Constructor c = cs.getConstructor(paramTypes);
163: Object[] params = new Object[] { icuCanonicalName,
164: javaCanonicalName, aliases };
165:
166: // Run constructor
167: try {
168: Object obj = c.newInstance(params);
169: if (obj != null && obj instanceof CharsetICU) {
170: conv = (CharsetICU) obj;
171: return conv;
172: }
173: } catch (InvocationTargetException e) {
174: throw new UnsupportedCharsetException(icuCanonicalName
175: + ": " + "Could not load " + className
176: + ". Exception:" + e.getTargetException());
177: }
178: } catch (ClassNotFoundException ex) {
179: } catch (NoSuchMethodException ex) {
180: } catch (IllegalAccessException ex) {
181: } catch (InstantiationException ex) {
182: }
183: throw new UnsupportedCharsetException(icuCanonicalName + ": "
184: + "Could not load " + className);
185: }
186:
187: static final boolean isSurrogate(int c) {
188: return (((c) & 0xfffff800) == 0xd800);
189: }
190:
191: /**
192: * Always use fallbacks from codepage to Unicode?
193: * @draft ICU 3.6
194: * @provisional This API might change or be removed in a future release.
195: */
196: final boolean isToUUseFallback() {
197: return true;
198: }
199:
200: /**
201: * Use fallbacks from Unicode to codepage when useFallback or for private-use code points
202: * @param c A codepoint
203: * @draft ICU 3.6
204: * @provisional This API might change or be removed in a future release.
205: */
206: final boolean isFromUUseFallback(int c) {
207: return (useFallback) || isPrivateUse(c);
208: }
209:
210: /**
211: * Returns the default charset name
212: * @draft ICU 3.6
213: * @provisional This API might change or be removed in a future release.
214: */
215: static final String getDefaultCharsetName() {
216: String defaultEncoding = new InputStreamReader(
217: new ByteArrayInputStream(new byte[0])).getEncoding();
218: return defaultEncoding;
219: }
220:
221: static final boolean isPrivateUse(int c) {
222: return (UCharacter.getType(c) == UCharacter.PRIVATE_USE);
223: }
224:
225: /**
226: * Returns a charset object for the named charset.
227: * This method gurantee that ICU charset is returned when
228: * available. If the ICU charset provider does not support
229: * the specified charset, then try other charset providers
230: * including the standard Java charset provider.
231: *
232: * @param charsetName The name of the requested charset,
233: * may be either a canonical name or an alias
234: * @return A charset object for the named charset
235: * @throws IllegalCharsetNameException If the given charset name
236: * is illegal
237: * @throws UnsupportedCharsetException If no support for the
238: * named charset is available in this instance of th Java
239: * virtual machine
240: * @draft ICU 3.6
241: * @provisional This API might change or be removed in a future release.
242: */
243: public static Charset forNameICU(String charsetName)
244: throws IllegalCharsetNameException,
245: UnsupportedCharsetException {
246: CharsetProviderICU icuProvider = new CharsetProviderICU();
247: Charset cs = icuProvider.charsetForName(charsetName);
248: if (cs != null) {
249: return cs;
250: }
251: return Charset.forName(charsetName);
252: }
253: }
|