001: /*
002: * Copyright 1999-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: /*
017: * $Id: CharInfo.java,v 1.19 2005/04/07 03:47:14 minchau Exp $
018: */
019: package org.apache.xml.serializer;
020:
021: import java.io.BufferedReader;
022: import java.io.InputStream;
023: import java.io.InputStreamReader;
024: import java.io.UnsupportedEncodingException;
025: import java.net.URL;
026: import java.util.Enumeration;
027: import java.util.Hashtable;
028: import java.util.PropertyResourceBundle;
029: import java.util.ResourceBundle;
030: import java.security.AccessController;
031: import java.security.PrivilegedAction;
032:
033: import javax.xml.transform.TransformerException;
034:
035: import org.apache.xml.serializer.utils.MsgKey;
036: import org.apache.xml.serializer.utils.SystemIDResolver;
037: import org.apache.xml.serializer.utils.Utils;
038: import org.apache.xml.serializer.utils.WrappedRuntimeException;
039:
040: /**
041: * This class provides services that tell if a character should have
042: * special treatement, such as entity reference substitution or normalization
043: * of a newline character. It also provides character to entity reference
044: * lookup.
045: *
046: * DEVELOPERS: See Known Issue in the constructor.
047: *
048: * @xsl.usage internal
049: */
050: final class CharInfo {
051: /** Given a character, lookup a String to output (e.g. a decorated entity reference). */
052: private Hashtable m_charToString = new Hashtable();
053:
054: /**
055: * The name of the HTML entities file.
056: * If specified, the file will be resource loaded with the default class loader.
057: */
058: public static final String HTML_ENTITIES_RESOURCE = "org.apache.xml.serializer.HTMLEntities";
059:
060: /**
061: * The name of the XML entities file.
062: * If specified, the file will be resource loaded with the default class loader.
063: */
064: public static final String XML_ENTITIES_RESOURCE = "org.apache.xml.serializer.XMLEntities";
065:
066: /** The horizontal tab character, which the parser should always normalize. */
067: public static final char S_HORIZONAL_TAB = 0x09;
068:
069: /** The linefeed character, which the parser should always normalize. */
070: public static final char S_LINEFEED = 0x0A;
071:
072: /** The carriage return character, which the parser should always normalize. */
073: public static final char S_CARRIAGERETURN = 0x0D;
074:
075: /** This flag is an optimization for HTML entities. It false if entities
076: * other than quot (34), amp (38), lt (60) and gt (62) are defined
077: * in the range 0 to 127.
078: * @xsl.usage internal
079: */
080: final boolean onlyQuotAmpLtGt;
081:
082: /** Copy the first 0,1 ... ASCII_MAX values into an array */
083: private static final int ASCII_MAX = 128;
084:
085: /** Array of values is faster access than a set of bits
086: * to quickly check ASCII characters in attribute values.
087: */
088: private boolean[] isSpecialAttrASCII = new boolean[ASCII_MAX];
089:
090: /** Array of values is faster access than a set of bits
091: * to quickly check ASCII characters in text nodes.
092: */
093: private boolean[] isSpecialTextASCII = new boolean[ASCII_MAX];
094:
095: private boolean[] isCleanTextASCII = new boolean[ASCII_MAX];
096:
097: /** An array of bits to record if the character is in the set.
098: * Although information in this array is complete, the
099: * isSpecialAttrASCII array is used first because access to its values
100: * is common and faster.
101: */
102: private int array_of_bits[] = createEmptySetOfIntegers(65535);
103:
104: // 5 for 32 bit words, 6 for 64 bit words ...
105: /*
106: * This constant is used to shift an integer to quickly
107: * calculate which element its bit is stored in.
108: * 5 for 32 bit words (int) , 6 for 64 bit words (long)
109: */
110: private static final int SHIFT_PER_WORD = 5;
111:
112: /*
113: * A mask to get the low order bits which are used to
114: * calculate the value of the bit within a given word,
115: * that will represent the presence of the integer in the
116: * set.
117: *
118: * 0x1F for 32 bit words (int),
119: * or 0x3F for 64 bit words (long)
120: */
121: private static final int LOW_ORDER_BITMASK = 0x1f;
122:
123: /*
124: * This is used for optimizing the lookup of bits representing
125: * the integers in the set. It is the index of the first element
126: * in the array array_of_bits[] that is not used.
127: */
128: private int firstWordNotUsed;
129:
130: /**
131: * Constructor that reads in a resource file that describes the mapping of
132: * characters to entity references.
133: * This constructor is private, just to force the use
134: * of the getCharInfo(entitiesResource) factory
135: *
136: * Resource files must be encoded in UTF-8 and can either be properties
137: * files with a .properties extension assumed. Alternatively, they can
138: * have the following form, with no particular extension assumed:
139: *
140: * <pre>
141: * # First char # is a comment
142: * Entity numericValue
143: * quot 34
144: * amp 38
145: * </pre>
146: *
147: * @param entitiesResource Name of properties or resource file that should
148: * be loaded, which describes that mapping of characters to entity
149: * references.
150: */
151: private CharInfo(String entitiesResource, String method) {
152: this (entitiesResource, method, false);
153: }
154:
155: private CharInfo(String entitiesResource, String method,
156: boolean internal) {
157: ResourceBundle entities = null;
158: boolean noExtraEntities = true;
159:
160: // Make various attempts to interpret the parameter as a properties
161: // file or resource file, as follows:
162: //
163: // 1) attempt to load .properties file using ResourceBundle
164: // 2) try using the class loader to find the specified file a resource
165: // file
166: // 3) try treating the resource a URI
167:
168: if (internal) {
169: try {
170: // Load entity property files by using PropertyResourceBundle,
171: // cause of security issure for applets
172: entities = PropertyResourceBundle
173: .getBundle(entitiesResource);
174: } catch (Exception e) {
175: }
176: }
177:
178: if (entities != null) {
179: Enumeration keys = entities.getKeys();
180: while (keys.hasMoreElements()) {
181: String name = (String) keys.nextElement();
182: String value = entities.getString(name);
183: int code = Integer.parseInt(value);
184: defineEntity(name, (char) code);
185: if (extraEntity(code))
186: noExtraEntities = false;
187: }
188: set(S_LINEFEED);
189: set(S_CARRIAGERETURN);
190: } else {
191: InputStream is = null;
192:
193: // Load user specified resource file by using URL loading, it
194: // requires a valid URI as parameter
195: try {
196: if (internal) {
197: is = CharInfo.class
198: .getResourceAsStream(entitiesResource);
199: } else {
200: ClassLoader cl = ObjectFactory.findClassLoader();
201: if (cl == null) {
202: is = ClassLoader
203: .getSystemResourceAsStream(entitiesResource);
204: } else {
205: is = cl.getResourceAsStream(entitiesResource);
206: }
207:
208: if (is == null) {
209: try {
210: URL url = new URL(entitiesResource);
211: is = url.openStream();
212: } catch (Exception e) {
213: }
214: }
215: }
216:
217: if (is == null) {
218: throw new RuntimeException(Utils.messages
219: .createMessage(
220: MsgKey.ER_RESOURCE_COULD_NOT_FIND,
221: new Object[] { entitiesResource,
222: entitiesResource }));
223: }
224:
225: // Fix Bugzilla#4000: force reading in UTF-8
226: // This creates the de facto standard that Xalan's resource
227: // files must be encoded in UTF-8. This should work in all
228: // JVMs.
229: //
230: // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which
231: // didn't implement the UTF-8 encoding. Theoretically, we should
232: // simply let it fail in that case, since the JVM is obviously
233: // broken if it doesn't support such a basic standard. But
234: // since there are still some users attempting to use VJ++ for
235: // development, we have dropped in a fallback which makes a
236: // second attempt using the platform's default encoding. In VJ++
237: // this is apparently ASCII, which is subset of UTF-8... and
238: // since the strings we'll be reading here are also primarily
239: // limited to the 7-bit ASCII range (at least, in English
240: // versions of Xalan), this should work well enough to keep us
241: // on the air until we're ready to officially decommit from
242: // VJ++.
243:
244: BufferedReader reader;
245: try {
246: reader = new BufferedReader(new InputStreamReader(
247: is, "UTF-8"));
248: } catch (UnsupportedEncodingException e) {
249: reader = new BufferedReader(new InputStreamReader(
250: is));
251: }
252:
253: String line = reader.readLine();
254:
255: while (line != null) {
256: if (line.length() == 0 || line.charAt(0) == '#') {
257: line = reader.readLine();
258:
259: continue;
260: }
261:
262: int index = line.indexOf(' ');
263:
264: if (index > 1) {
265: String name = line.substring(0, index);
266:
267: ++index;
268:
269: if (index < line.length()) {
270: String value = line.substring(index);
271: index = value.indexOf(' ');
272:
273: if (index > 0) {
274: value = value.substring(0, index);
275: }
276:
277: int code = Integer.parseInt(value);
278:
279: defineEntity(name, (char) code);
280: if (extraEntity(code))
281: noExtraEntities = false;
282: }
283: }
284:
285: line = reader.readLine();
286: }
287:
288: is.close();
289: set(S_LINEFEED);
290: set(S_CARRIAGERETURN);
291: } catch (Exception e) {
292: throw new RuntimeException(Utils.messages
293: .createMessage(
294: MsgKey.ER_RESOURCE_COULD_NOT_LOAD,
295: new Object[] { entitiesResource,
296: e.toString(), entitiesResource,
297: e.toString() }));
298: } finally {
299: if (is != null) {
300: try {
301: is.close();
302: } catch (Exception except) {
303: }
304: }
305: }
306: }
307:
308: /* initialize the array isCleanTextASCII[] with a cache of values
309: * for use by ToStream.character(char[], int , int)
310: * and the array isSpecialTextASCII[] with the opposite values
311: * (all in the name of performance!)
312: */
313: for (int ch = 0; ch < ASCII_MAX; ch++)
314: if ((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch))) && (!get(ch)))
315: || ('"' == ch)) {
316: isCleanTextASCII[ch] = true;
317: isSpecialTextASCII[ch] = false;
318: } else {
319: isCleanTextASCII[ch] = false;
320: isSpecialTextASCII[ch] = true;
321: }
322:
323: onlyQuotAmpLtGt = noExtraEntities;
324:
325: // initialize the array with a cache of the BitSet values
326: for (int i = 0; i < ASCII_MAX; i++)
327: isSpecialAttrASCII[i] = get(i);
328:
329: /* Now that we've used get(ch) just above to initialize the
330: * two arrays we will change by adding a tab to the set of
331: * special chars for XML (but not HTML!).
332: * We do this because a tab is always a
333: * special character in an XML attribute,
334: * but only a special character in XML text
335: * if it has an entity defined for it.
336: * This is the reason for this delay.
337: */
338: if (Method.XML.equals(method)) {
339: isSpecialAttrASCII[S_HORIZONAL_TAB] = true;
340: }
341: }
342:
343: /**
344: * Defines a new character reference. The reference's name and value are
345: * supplied. Nothing happens if the character reference is already defined.
346: * <p>Unlike internal entities, character references are a string to single
347: * character mapping. They are used to map non-ASCII characters both on
348: * parsing and printing, primarily for HTML documents. '<amp;' is an
349: * example of a character reference.</p>
350: *
351: * @param name The entity's name
352: * @param value The entity's value
353: */
354: private void defineEntity(String name, char value) {
355: StringBuffer sb = new StringBuffer("&");
356: sb.append(name);
357: sb.append(';');
358: String entityString = sb.toString();
359:
360: defineChar2StringMapping(entityString, value);
361: }
362:
363: private CharKey m_charKey = new CharKey();
364:
365: /**
366: * Map a character to a String. For example given
367: * the character '>' this method would return the fully decorated
368: * entity name "<".
369: * Strings for entity references are loaded from a properties file,
370: * but additional mappings defined through calls to defineChar2String()
371: * are possible. Such entity reference mappings could be over-ridden.
372: *
373: * This is reusing a stored key object, in an effort to avoid
374: * heap activity. Unfortunately, that introduces a threading risk.
375: * Simplest fix for now is to make it a synchronized method, or to give
376: * up the reuse; I see very little performance difference between them.
377: * Long-term solution would be to replace the hashtable with a sparse array
378: * keyed directly from the character's integer value; see DTM's
379: * string pool for a related solution.
380: *
381: * @param value The character that should be resolved to
382: * a String, e.g. resolve '>' to "<".
383: *
384: * @return The String that the character is mapped to, or null if not found.
385: * @xsl.usage internal
386: */
387: synchronized String getOutputStringForChar(char value) {
388: // CharKey m_charKey = new CharKey(); //Alternative to synchronized
389: m_charKey.setChar(value);
390: return (String) m_charToString.get(m_charKey);
391: }
392:
393: /**
394: * Tell if the character argument that is from
395: * an attribute value should have special treatment.
396: *
397: * @param value the value of a character that is in an attribute value
398: * @return true if the character should have any special treatment,
399: * such as when writing out attribute values,
400: * or entity references.
401: * @xsl.usage internal
402: */
403: final boolean isSpecialAttrChar(int value) {
404: // for performance try the values in the boolean array first,
405: // this is faster access than the BitSet for common ASCII values
406:
407: if (value < ASCII_MAX)
408: return isSpecialAttrASCII[value];
409:
410: // rather than java.util.BitSet, our private
411: // implementation is faster (and less general).
412: return get(value);
413: }
414:
415: /**
416: * Tell if the character argument that is from a
417: * text node should have special treatment.
418: *
419: * @param value the value of a character that is in a text node
420: * @return true if the character should have any special treatment,
421: * such as when writing out attribute values,
422: * or entity references.
423: * @xsl.usage internal
424: */
425: final boolean isSpecialTextChar(int value) {
426: // for performance try the values in the boolean array first,
427: // this is faster access than the BitSet for common ASCII values
428:
429: if (value < ASCII_MAX)
430: return isSpecialTextASCII[value];
431:
432: // rather than java.util.BitSet, our private
433: // implementation is faster (and less general).
434: return get(value);
435: }
436:
437: /**
438: * This method is used to determine if an ASCII character in
439: * a text node (not an attribute value) is "clean".
440: * @param value the character to check (0 to 127).
441: * @return true if the character can go to the writer as-is
442: * @xsl.usage internal
443: */
444: final boolean isTextASCIIClean(int value) {
445: return isCleanTextASCII[value];
446: }
447:
448: // In the future one might want to use the array directly and avoid
449: // the method call, but I think the JIT alreay inlines this well enough
450: // so don't do it (for now) - bjm
451: // public final boolean[] getASCIIClean()
452: // {
453: // return isCleanTextASCII;
454: // }
455:
456: private static CharInfo getCharInfoBasedOnPrivilege(
457: final String entitiesFileName, final String method,
458: final boolean internal) {
459: return (CharInfo) AccessController
460: .doPrivileged(new PrivilegedAction() {
461: public Object run() {
462: return new CharInfo(entitiesFileName, method,
463: internal);
464: }
465: });
466: }
467:
468: /**
469: * Factory that reads in a resource file that describes the mapping of
470: * characters to entity references.
471: *
472: * Resource files must be encoded in UTF-8 and have a format like:
473: * <pre>
474: * # First char # is a comment
475: * Entity numericValue
476: * quot 34
477: * amp 38
478: * </pre>
479: * (Note: Why don't we just switch to .properties files? Oct-01 -sc)
480: *
481: * @param entitiesResource Name of entities resource file that should
482: * be loaded, which describes that mapping of characters to entity references.
483: * @param method the output method type, which should be one of "xml", "html", "text"...
484: *
485: * @xsl.usage internal
486: */
487: static CharInfo getCharInfo(String entitiesFileName, String method) {
488: CharInfo charInfo = (CharInfo) m_getCharInfoCache
489: .get(entitiesFileName);
490: if (charInfo != null) {
491: return charInfo;
492: }
493:
494: // try to load it internally - cache
495: try {
496: charInfo = getCharInfoBasedOnPrivilege(entitiesFileName,
497: method, true);
498: m_getCharInfoCache.put(entitiesFileName, charInfo);
499: return charInfo;
500: } catch (Exception e) {
501: }
502:
503: // try to load it externally - do not cache
504: try {
505: return getCharInfoBasedOnPrivilege(entitiesFileName,
506: method, false);
507: } catch (Exception e) {
508: }
509:
510: String absoluteEntitiesFileName;
511:
512: if (entitiesFileName.indexOf(':') < 0) {
513: absoluteEntitiesFileName = SystemIDResolver
514: .getAbsoluteURIFromRelative(entitiesFileName);
515: } else {
516: try {
517: absoluteEntitiesFileName = SystemIDResolver
518: .getAbsoluteURI(entitiesFileName, null);
519: } catch (TransformerException te) {
520: throw new WrappedRuntimeException(te);
521: }
522: }
523:
524: return getCharInfoBasedOnPrivilege(entitiesFileName, method,
525: false);
526: }
527:
528: /** Table of user-specified char infos. */
529: private static Hashtable m_getCharInfoCache = new Hashtable();
530:
531: /**
532: * Returns the array element holding the bit value for the
533: * given integer
534: * @param i the integer that might be in the set of integers
535: *
536: */
537: private static int arrayIndex(int i) {
538: return (i >> SHIFT_PER_WORD);
539: }
540:
541: /**
542: * For a given integer in the set it returns the single bit
543: * value used within a given word that represents whether
544: * the integer is in the set or not.
545: */
546: private static int bit(int i) {
547: int ret = (1 << (i & LOW_ORDER_BITMASK));
548: return ret;
549: }
550:
551: /**
552: * Creates a new empty set of integers (characters)
553: * @param max the maximum integer to be in the set.
554: */
555: private int[] createEmptySetOfIntegers(int max) {
556: firstWordNotUsed = 0; // an optimization
557:
558: int[] arr = new int[arrayIndex(max - 1) + 1];
559: return arr;
560:
561: }
562:
563: /**
564: * Adds the integer (character) to the set of integers.
565: * @param i the integer to add to the set, valid values are
566: * 0, 1, 2 ... up to the maximum that was specified at
567: * the creation of the set.
568: */
569: private final void set(int i) {
570: setASCIIdirty(i);
571:
572: int j = (i >> SHIFT_PER_WORD); // this word is used
573: int k = j + 1;
574:
575: if (firstWordNotUsed < k) // for optimization purposes.
576: firstWordNotUsed = k;
577:
578: array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK));
579: }
580:
581: /**
582: * Return true if the integer (character)is in the set of integers.
583: *
584: * This implementation uses an array of integers with 32 bits per
585: * integer. If a bit is set to 1 the corresponding integer is
586: * in the set of integers.
587: *
588: * @param i an integer that is tested to see if it is the
589: * set of integers, or not.
590: */
591: private final boolean get(int i) {
592:
593: boolean in_the_set = false;
594: int j = (i >> SHIFT_PER_WORD); // wordIndex(i)
595: // an optimization here, ... a quick test to see
596: // if this integer is beyond any of the words in use
597: if (j < firstWordNotUsed)
598: in_the_set = (array_of_bits[j] & (1 << (i & LOW_ORDER_BITMASK))) != 0; // 0L for 64 bit words
599: return in_the_set;
600: }
601:
602: // record if there are any entities other than
603: // quot, amp, lt, gt (probably user defined)
604: /**
605: * @return true if the entity
606: * @param code The value of the character that has an entity defined
607: * for it.
608: */
609: private boolean extraEntity(int entityValue) {
610: boolean extra = false;
611: if (entityValue < 128) {
612: switch (entityValue) {
613: case 34: // quot
614: case 38: // amp
615: case 60: // lt
616: case 62: // gt
617: break;
618: default: // other entity in range 0 to 127
619: extra = true;
620: }
621: }
622: return extra;
623: }
624:
625: /**
626: * If the character is a printable ASCII character then
627: * mark it as not clean and needing replacement with
628: * a String on output.
629: * @param ch
630: */
631: private void setASCIIdirty(int j) {
632: if (0 <= j && j < ASCII_MAX) {
633: isCleanTextASCII[j] = false;
634: isSpecialTextASCII[j] = true;
635: }
636: }
637:
638: /**
639: * If the character is a printable ASCII character then
640: * mark it as and not needing replacement with
641: * a String on output.
642: * @param ch
643: */
644: private void setASCIIclean(int j) {
645: if (0 <= j && j < ASCII_MAX) {
646: isCleanTextASCII[j] = true;
647: isSpecialTextASCII[j] = false;
648: }
649: }
650:
651: private void defineChar2StringMapping(String outputString,
652: char inputChar) {
653: CharKey character = new CharKey(inputChar);
654: m_charToString.put(character, outputString);
655: set(inputChar);
656: }
657:
658: /**
659: * Simple class for fast lookup of char values, when used with
660: * hashtables. You can set the char, then use it as a key.
661: *
662: * This class is a copy of the one in org.apache.xml.utils.
663: * It exists to cut the serializers dependancy on that package.
664: *
665: * @xsl.usage internal
666: */
667: private static class CharKey extends Object {
668:
669: /** String value */
670: private char m_char;
671:
672: /**
673: * Constructor CharKey
674: *
675: * @param key char value of this object.
676: */
677: public CharKey(char key) {
678: m_char = key;
679: }
680:
681: /**
682: * Default constructor for a CharKey.
683: *
684: * @param key char value of this object.
685: */
686: public CharKey() {
687: }
688:
689: /**
690: * Get the hash value of the character.
691: *
692: * @return hash value of the character.
693: */
694: public final void setChar(char c) {
695: m_char = c;
696: }
697:
698: /**
699: * Get the hash value of the character.
700: *
701: * @return hash value of the character.
702: */
703: public final int hashCode() {
704: return (int) m_char;
705: }
706:
707: /**
708: * Override of equals() for this object
709: *
710: * @param obj to compare to
711: *
712: * @return True if this object equals this string value
713: */
714: public final boolean equals(Object obj) {
715: return ((CharKey) obj).m_char == m_char;
716: }
717: }
718:
719: }
|