001: /*
002: * Copyright 1999-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: /*
017: * $Id: EncodingInfo.java,v 1.6 2005/08/04 23:57:06 minchau Exp $
018: */
019: package org.apache.xml.serializer;
020:
021: import java.io.UnsupportedEncodingException;
022:
023: /**
 * Holds information about a given encoding: the Java name for the
 * encoding and the equivalent ISO name.
026: * <p>
027: * An object of this type has two useful methods
028: * <pre>
029: * isInEncoding(char ch);
030: * </pre>
031: * which can be called if the character is not the high one in
032: * a surrogate pair and:
033: * <pre>
034: * isInEncoding(char high, char low);
035: * </pre>
 * which can be called if the two characters form a high/low surrogate pair.
037: * <p>
038: * An EncodingInfo object is a node in a binary search tree. Such a node
039: * will answer if a character is in the encoding, and do so for a given
040: * range of unicode values (<code>m_first</code> to
041: * <code>m_last</code>). It will handle a certain range of values
042: * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
 * If the unicode point is before that explicit range, that is it
 * is in the range <code>m_first &lt;= value &lt; m_explFirst</code>, then it
 * will delegate to another object of the same kind, <code>m_before</code>,
 * which is the root of the subtree for that range. Likewise for values in
 * the range <code>m_explLast &lt; value &lt;= m_last</code>, but delegating
 * to <code>m_after</code>.
047: * <p>
048: * Actually figuring out if a code point is in the encoding is expensive. So the
049: * purpose of this tree is to cache such determinations, and not to build the
050: * entire tree of information at the start, but only build up as much of the
051: * tree as is used during the transformation.
052: * <p>
053: * This Class is not a public API, and should only be used internally within
054: * the serializer.
055: *
056: * @xsl.usage internal
057: */
058: public final class EncodingInfo extends Object {
059:
/**
 * The ISO encoding name, exactly as it was passed to the constructor.
 * Package-private and never reassigned after construction.
 */
final String name;

/**
 * The name used by the Java converter for this encoding, exactly as it
 * was passed to the constructor. It may be <code>null</code>, which
 * denotes that there is no associated encoding.
 */
final String javaName;

/**
 * A helper object that we can ask if a
 * single char, or a surrogate UTF-16 pair
 * of chars that form a single character,
 * is in this encoding.
 * <p>
 * It is created lazily: the field stays <code>null</code> until one of
 * the <code>isInEncoding</code> methods of this class is first called.
 */
private InEncoding m_encoding;
077:
078: /**
079: * This is not a public API. It returns true if the
080: * char in question is in the encoding.
081: * @param ch the char in question.
082: * @xsl.usage internal
083: */
084: public boolean isInEncoding(char ch) {
085: if (m_encoding == null) {
086: m_encoding = new EncodingImpl();
087:
088: // One could put alternate logic in here to
089: // instantiate another object that implements the
090: // InEncoding interface. For example if the JRE is 1.4 or up
091: // we could have an object that uses JRE 1.4 methods
092: }
093: return m_encoding.isInEncoding(ch);
094: }
095:
096: /**
097: * This is not a public API. It returns true if the
098: * character formed by the high/low pair is in the encoding.
099: * @param high a char that the a high char of a high/low surrogate pair.
100: * @param low a char that is the low char of a high/low surrogate pair.
101: * @xsl.usage internal
102: */
103: public boolean isInEncoding(char high, char low) {
104: if (m_encoding == null) {
105: m_encoding = new EncodingImpl();
106:
107: // One could put alternate logic in here to
108: // instantiate another object that implements the
109: // InEncoding interface. For example if the JRE is 1.4 or up
110: // we could have an object that uses JRE 1.4 methods
111: }
112: return m_encoding.isInEncoding(high, low);
113: }
114:
/**
 * Creates an EncodingInfo from an ISO encoding name and the matching
 * Java encoding name. Both references are stored as given.
 * <p>
 * Passing <code>null</code> for both parameters is meant for a serializer
 * that is in temporary output state and has no associated encoding; in
 * that situation every character is intended to be treated as being in
 * the encoding.
 *
 * @param name reference to the ISO name.
 * @param javaName reference to the Java encoding name.
 */
public EncodingInfo(String name, String javaName) {
    this.javaName = javaName;
    this.name = name;
}
129:
130: /**
131: * A simple interface to isolate the implementation.
132: * We could also use some new JRE 1.4 methods in another implementation
133: * provided we use reflection with them.
134: * <p>
135: * This interface is not a public API,
136: * and should only be used internally within the serializer.
137: * @xsl.usage internal
138: */
139: private interface InEncoding {
140: /**
141: * Returns true if the char is in the encoding
142: */
143: public boolean isInEncoding(char ch);
144:
145: /**
146: * Returns true if the high/low surrogate pair forms
147: * a character that is in the encoding.
148: */
149: public boolean isInEncoding(char high, char low);
150: }
151:
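/**
 * This class implements the InEncoding interface as a node of a lazily
 * built tree that caches, per unicode value, whether that value is in
 * the encoding.
 */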
155: private class EncodingImpl implements InEncoding {
156:
157: public boolean isInEncoding(char ch1) {
158: final boolean ret;
159: int codePoint = Encodings.toCodePoint(ch1);
160: if (codePoint < m_explFirst) {
161: // The unicode value is before the range
162: // that we explictly manage, so we delegate the answer.
163:
164: // If we don't have an m_before object to delegate to, make one.
165: if (m_before == null)
166: m_before = new EncodingImpl(m_encoding, m_first,
167: m_explFirst - 1, codePoint);
168: ret = m_before.isInEncoding(ch1);
169: } else if (m_explLast < codePoint) {
170: // The unicode value is after the range
171: // that we explictly manage, so we delegate the answer.
172:
173: // If we don't have an m_after object to delegate to, make one.
174: if (m_after == null)
175: m_after = new EncodingImpl(m_encoding,
176: m_explLast + 1, m_last, codePoint);
177: ret = m_after.isInEncoding(ch1);
178: } else {
179: // The unicode value is in the range we explitly handle
180: final int idx = codePoint - m_explFirst;
181:
182: // If we already know the answer, just return it.
183: if (m_alreadyKnown[idx])
184: ret = m_isInEncoding[idx];
185: else {
186: // We don't know the answer, so find out,
187: // which may be expensive, then cache the answer
188: ret = inEncoding(ch1, m_encoding);
189: m_alreadyKnown[idx] = true;
190: m_isInEncoding[idx] = ret;
191: }
192: }
193: return ret;
194: }
195:
196: public boolean isInEncoding(char high, char low) {
197: final boolean ret;
198: int codePoint = Encodings.toCodePoint(high, low);
199: if (codePoint < m_explFirst) {
200: // The unicode value is before the range
201: // that we explictly manage, so we delegate the answer.
202:
203: // If we don't have an m_before object to delegate to, make one.
204: if (m_before == null)
205: m_before = new EncodingImpl(m_encoding, m_first,
206: m_explFirst - 1, codePoint);
207: ret = m_before.isInEncoding(high, low);
208: } else if (m_explLast < codePoint) {
209: // The unicode value is after the range
210: // that we explictly manage, so we delegate the answer.
211:
212: // If we don't have an m_after object to delegate to, make one.
213: if (m_after == null)
214: m_after = new EncodingImpl(m_encoding,
215: m_explLast + 1, m_last, codePoint);
216: ret = m_after.isInEncoding(high, low);
217: } else {
218: // The unicode value is in the range we explitly handle
219: final int idx = codePoint - m_explFirst;
220:
221: // If we already know the answer, just return it.
222: if (m_alreadyKnown[idx])
223: ret = m_isInEncoding[idx];
224: else {
225: // We don't know the answer, so find out,
226: // which may be expensive, then cache the answer
227: ret = inEncoding(high, low, m_encoding);
228: m_alreadyKnown[idx] = true;
229: m_isInEncoding[idx] = ret;
230: }
231: }
232: return ret;
233: }
234:
235: /**
236: * The encoding.
237: */
238: final private String m_encoding;
239: /**
240: * m_first through m_last is the range of unicode
241: * values that this object will return an answer on.
242: * It may delegate to a similar object with a different
243: * range
244: */
245: final private int m_first;
246:
247: /**
248: * m_explFirst through m_explLast is the range of unicode
249: * value that this object handles explicitly and does not
250: * delegate to a similar object.
251: */
252: final private int m_explFirst;
253: final private int m_explLast;
254: final private int m_last;
255:
256: /**
257: * The object, of the same type as this one,
258: * that handles unicode values in a range before
259: * the range explictly handled by this object, and
260: * to which this object may delegate.
261: */
262: private InEncoding m_before;
263: /**
264: * The object, of the same type as this one,
265: * that handles unicode values in a range after
266: * the range explictly handled by this object, and
267: * to which this object may delegate.
268: */
269: private InEncoding m_after;
270:
271: /**
272: * The number of unicode values explicitly handled
273: * by a single EncodingInfo object. This value is
274: * tuneable, but is set to 128 because that covers the
275: * entire low range of ASCII type chars within a single
276: * object.
277: */
278: private static final int RANGE = 128;
279:
280: /**
281: * A flag to record if we already know the answer
282: * for the given unicode value.
283: */
284: final private boolean m_alreadyKnown[] = new boolean[RANGE];
285: /**
286: * A table holding the answer on whether the given unicode
287: * value is in the encoding.
288: */
289: final private boolean m_isInEncoding[] = new boolean[RANGE];
290:
/**
 * Creates the root node for the enclosing object's Java encoding name.
 * The node is responsible for the unicode values 0 through
 * Integer.MAX_VALUE, and its explicitly handled range starts at 0.
 */
private EncodingImpl() {
    this(javaName, 0, Integer.MAX_VALUE, 0);
}
296:
/**
 * Creates a node that is responsible for the unicode values
 * <code>first</code> through <code>last</code> and that explicitly
 * caches the answers for RANGE values starting at <code>codePoint</code>.
 * <p>
 * Two kinds of answers are filled in ahead of time so that the expensive
 * per-character determination is avoided:
 * <ul>
 * <li>When there is no encoding (<code>encoding</code> is null) every
 * value handled by this node is marked as being in the encoding. This is
 * meaningful when the serializer is in temporary output state and is not
 * writing to the final output tree.</li>
 * <li>For a fixed list of common encodings, the unicode values 1 through
 * 126 that fall in this node's explicit range are marked as being in the
 * encoding.</li>
 * </ul>
 *
 * @param encoding the Java name of the encoding, or null if there is none.
 * @param first the first unicode value this node answers for.
 * @param last the last unicode value this node answers for.
 * @param codePoint the first unicode value this node handles explicitly.
 */
private EncodingImpl(String encoding, int first, int last,
        int codePoint) {
    // The range of unicode values that this object manages,
    // either explicitly or by delegation.
    m_first = first;
    m_last = last;

    // The range of unicode values that this object explicitly manages.
    m_explFirst = codePoint;
    m_explLast = codePoint + (RANGE - 1);

    m_encoding = encoding;

    if (encoding == null) {
        // A little bit more than optimization: without an encoding, any
        // character is said to be in the encoding. This check used to be
        // nested inside the "encoding is not null" branch, where it could
        // never run.
        for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
            m_alreadyKnown[idx] = true;
            m_isInEncoding[idx] = true;
        }
        return;
    }

    // Some optimization: only relevant when this node's explicit range
    // starts somewhere in the low (0..127) range.
    final boolean startsInLowRange = 0 <= m_explFirst && m_explFirst <= 127;
    if (!startsInLowRange) {
        return;
    }

    final boolean knownEncoding = "UTF8".equals(encoding)
            || "UTF-16".equals(encoding)
            || "ASCII".equals(encoding)
            || "US-ASCII".equals(encoding)
            || "Unicode".equals(encoding)
            || "UNICODE".equals(encoding)
            || encoding.startsWith("ISO8859");
    if (!knownEncoding) {
        return;
    }

    // This is an encoding we know something about, so initialize the
    // table ahead of time for the values 1 through 126 and thereby avoid
    // the expensive inEncoding(char ch, String encoding) call for them.
    // Because m_explFirst is in 0..127 here, every index computed below
    // lies in 0..126 and is therefore within the tables.
    for (int unicode = Math.max(1, m_explFirst); unicode < 127; unicode++) {
        final int idx = unicode - m_explFirst;
        m_alreadyKnown[idx] = true;
        m_isInEncoding[idx] = true;
    }
}
363: }
364:
365: /**
366: * This is heart of the code that determines if a given character
367: * is in the given encoding. This method is probably expensive,
368: * and the answer should be cached.
369: * <p>
370: * This method is not a public API,
371: * and should only be used internally within the serializer.
372: * @param ch the char in question, that is not a high char of
373: * a high/low surrogate pair.
374: * @param encoding the Java name of the enocding.
375: *
376: * @xsl.usage internal
377: *
378: */
379: private static boolean inEncoding(char ch, String encoding) {
380: boolean isInEncoding;
381: try {
382: char cArray[] = new char[1];
383: cArray[0] = ch;
384: // Construct a String from the char
385: String s = new String(cArray);
386: // Encode the String into a sequence of bytes
387: // using the given, named charset.
388: byte[] bArray = s.getBytes(encoding);
389: isInEncoding = inEncoding(ch, bArray);
390:
391: } catch (Exception e) {
392: isInEncoding = false;
393:
394: // If for some reason the encoding is null, e.g.
395: // for a temporary result tree, we should just
396: // say that every character is in the encoding.
397: if (encoding == null)
398: isInEncoding = true;
399: }
400: return isInEncoding;
401: }
402:
403: /**
404: * This is heart of the code that determines if a given high/low
405: * surrogate pair forms a character that is in the given encoding.
406: * This method is probably expensive, and the answer should be cached.
407: * <p>
408: * This method is not a public API,
409: * and should only be used internally within the serializer.
410: * @param high the high char of
411: * a high/low surrogate pair.
412: * @param low the low char of a high/low surrogate pair.
413: * @param encoding the Java name of the encoding.
414: *
415: * @xsl.usage internal
416: *
417: */
418: private static boolean inEncoding(char high, char low,
419: String encoding) {
420: boolean isInEncoding;
421: try {
422: char cArray[] = new char[2];
423: cArray[0] = high;
424: cArray[1] = low;
425: // Construct a String from the char
426: String s = new String(cArray);
427: // Encode the String into a sequence of bytes
428: // using the given, named charset.
429: byte[] bArray = s.getBytes(encoding);
430: isInEncoding = inEncoding(high, bArray);
431: } catch (Exception e) {
432: isInEncoding = false;
433: }
434:
435: return isInEncoding;
436: }
437:
438: /**
439: * This method is the core of determining if character
440: * is in the encoding. The method is not foolproof, because
441: * s.getBytes(encoding) has specified behavior only if the
442: * characters are in the specified encoding. However this
443: * method tries it's best.
444: * @param ch the char that was converted using getBytes, or
445: * the first char of a high/low pair that was converted.
446: * @param data the bytes written out by the call to s.getBytes(encoding);
447: * @return true if the character is in the encoding.
448: */
449: private static boolean inEncoding(char ch, byte[] data) {
450: final boolean isInEncoding;
451: // If the string written out as data is not in the encoding,
452: // the output is not specified according to the documentation
453: // on the String.getBytes(encoding) method,
454: // but we do our best here.
455: if (data == null || data.length == 0) {
456: isInEncoding = false;
457: } else {
458: if (data[0] == 0)
459: isInEncoding = false;
460: else if (data[0] == '?' && ch != '?')
461: isInEncoding = false;
462: /*
463: * else if (isJapanese) {
464: * // isJapanese is really
465: * // ( "EUC-JP".equals(javaName)
466: * // || "EUC_JP".equals(javaName)
467: * // || "SJIS".equals(javaName) )
468: *
469: * // Work around some bugs in JRE for Japanese
470: * if(data[0] == 0x21)
471: * isInEncoding = false;
472: * else if (ch == 0xA5)
473: * isInEncoding = false;
474: * else
475: * isInEncoding = true;
476: * }
477: */
478:
479: else {
480: // We don't know for sure, but it looks like it is in the encoding
481: isInEncoding = true;
482: }
483: }
484: return isInEncoding;
485: }
486:
487: }
|