001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/i18n/CmsEncoder.java,v $
003: * Date : $Date: 2008-02-27 12:05:47 $
004: * Version: $Revision: 1.25 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.i18n;
033:
034: import org.opencms.main.CmsLog;
035: import org.opencms.main.OpenCms;
036: import org.opencms.util.CmsStringUtil;
037:
038: import java.io.UnsupportedEncodingException;
039: import java.net.URLDecoder;
040: import java.net.URLEncoder;
041: import java.nio.CharBuffer;
042: import java.nio.charset.Charset;
043: import java.nio.charset.CharsetEncoder;
044: import java.util.HashMap;
045: import java.util.Map;
046: import java.util.regex.Matcher;
047: import java.util.regex.Pattern;
048:
049: import org.apache.commons.logging.Log;
050:
051: /**
052: * The OpenCms CmsEncoder class provides static methods to decode and encode data.<p>
053: *
054: * The methods in this class are substitutes for <code>java.net.URLEncoder.encode()</code> and
055: * <code>java.net.URLDecoder.decode()</code>. Use the methods from this class in all OpenCms
056: * core classes to ensure the encoding is always handled the same way.<p>
057: *
058: * The de- and encoding uses the same coding mechanism as JavaScript, special characters are
059: * replaced with <code>%hex</code> where hex is a two digit hex number.<p>
060: *
 * <b>Note:</b> On the client side (browser), instead of using the corresponding <code>escape</code>
 * and <code>unescape</code> JavaScript functions, better use the <code>encodeURIComponent</code> and
 * <code>decodeURIComponent</code> functions, which work properly with unicode characters.
 * These functions are supported in IE 5.5+ and NS 6+ only.<p>
065: *
066: * @author Alexander Kandzior
067: *
068: * @version $Revision: 1.25 $
069: *
070: * @since 6.0.0
071: */
072: public final class CmsEncoder {
073:
074: /** Constant for the standard <code>ISO-8859-1</code> encoding. */
075: public static final String ENCODING_ISO_8859_1 = "ISO-8859-1";
076:
077: /** Constant for the standard <code>US-ASCII</code> encoding. */
078: public static final String ENCODING_US_ASCII = "US-ASCII";
079:
080: /**
081: * Constant for the standard <code>UTF-8</code> encoding.<p>
082: *
083: * Default encoding for JavaScript decodeUriComponent methods is <code>UTF-8</code> by w3c standard.
084: */
085: public static final String ENCODING_UTF_8 = "UTF-8";
086:
087: /** The regex pattern to match HTML entities. */
088: private static final Pattern ENTITIY_PATTERN = Pattern
089: .compile("\\&#\\d+;");
090:
091: /** The prefix for HTML entities. */
092: private static final String ENTITY_PREFIX = "&#";
093:
094: /** The replacement for HTML entity prefix in parameters. */
095: private static final String ENTITY_REPLACEMENT = "$$";
096:
097: /** The log object for this class. */
098: private static final Log LOG = CmsLog.getLog(CmsEncoder.class);
099:
100: /** A cache for encoding name lookup. */
101: private static Map m_encodingCache = new HashMap(16);
102:
103: /** The plus entity. */
104: private static final String PLUS_ENTITY = ENTITY_PREFIX + "043;";
105:
/**
 * Hidden constructor, this class only offers static utility methods.<p>
 */
private CmsEncoder() {
    // intentionally left empty
}
113:
114: /**
115: * Adjusts the given String by making sure all characters that can be displayed
116: * in the given charset are contained as chars, whereas all other non-displayable
117: * characters are converted to HTML entities.<p>
118: *
119: * Just calls {@link #decodeHtmlEntities(String, String)} first and feeds the result
120: * to {@link #encodeHtmlEntities(String, String)}. <p>
121: *
122: * @param input the input to adjust the HTML encoding for
123: * @param encoding the charset to encode the result with\
124: *
125: * @return the input with the decoded/encoded HTML entities
126: */
127: public static String adjustHtmlEncoding(String input,
128: String encoding) {
129:
130: return encodeHtmlEntities(decodeHtmlEntities(input, encoding),
131: encoding);
132: }
133:
134: /**
135: * Changes the encoding of a byte array that represents a String.<p>
136: *
137: * @param input the byte array to convert
138: * @param oldEncoding the current encoding of the byte array
139: * @param newEncoding the new encoding of the byte array
140: *
141: * @return the byte array encoded in the new encoding
142: */
143: public static byte[] changeEncoding(byte[] input,
144: String oldEncoding, String newEncoding) {
145:
146: if ((oldEncoding == null) || (newEncoding == null)) {
147: return input;
148: }
149: if (oldEncoding.trim().equalsIgnoreCase(newEncoding.trim())) {
150: return input;
151: }
152: byte[] result = input;
153: try {
154: result = (new String(input, oldEncoding))
155: .getBytes(newEncoding);
156: } catch (UnsupportedEncodingException e) {
157: // return value will be input value
158: }
159: return result;
160: }
161:
162: /**
163: * Creates a String out of a byte array with the specified encoding, falling back
164: * to the system default in case the encoding name is not valid.<p>
165: *
166: * Use this method as a replacement for <code>new String(byte[], encoding)</code>
167: * to avoid possible encoding problems.<p>
168: *
169: * @param bytes the bytes to decode
170: * @param encoding the encoding scheme to use for decoding the bytes
171: *
172: * @return the bytes decoded to a String
173: */
174: public static String createString(byte[] bytes, String encoding) {
175:
176: String enc = encoding.intern();
177: if (enc != OpenCms.getSystemInfo().getDefaultEncoding()) {
178: enc = lookupEncoding(enc, null);
179: }
180: if (enc != null) {
181: try {
182: return new String(bytes, enc);
183: } catch (UnsupportedEncodingException e) {
184: // this can _never_ happen since the charset was looked up first
185: }
186: } else {
187: if (LOG.isWarnEnabled()) {
188: LOG.warn(Messages.get().getBundle().key(
189: Messages.ERR_UNSUPPORTED_VM_ENCODING_1,
190: encoding));
191: }
192: enc = OpenCms.getSystemInfo().getDefaultEncoding();
193: try {
194: return new String(bytes, enc);
195: } catch (UnsupportedEncodingException e) {
196: // this can also _never_ happen since the default encoding is always valid
197: }
198: }
199: // this code is unreachable in practice
200: LOG.error(Messages.get().getBundle().key(
201: Messages.ERR_ENCODING_ISSUES_1, encoding));
202: return null;
203: }
204:
205: /**
206: * Decodes a String using UTF-8 encoding, which is the standard for http data transmission
207: * with GET ant POST requests.<p>
208: *
209: * @param source the String to decode
210: *
211: * @return String the decoded source String
212: */
213: public static String decode(String source) {
214:
215: return decode(source, ENCODING_UTF_8);
216: }
217:
218: /**
219: * This method is a substitute for <code>URLDecoder.decode()</code>.
220: * Use this in all OpenCms core classes to ensure the encoding is
221: * always handled the same way.<p>
222: *
223: * In case you don't know what encoding to use, set the value of
224: * the <code>encoding</code> parameter to <code>null</code>.
225: * This method will then default to UTF-8 encoding, which is probably the right one.<p>
226: *
227: * @param source The string to decode
228: * @param encoding The encoding to use (if null, the system default is used)
229: *
230: * @return The decoded source String
231: */
232: public static String decode(String source, String encoding) {
233:
234: if (source == null) {
235: return null;
236: }
237: if (encoding != null) {
238: try {
239: return URLDecoder.decode(source, encoding);
240: } catch (java.io.UnsupportedEncodingException e) {
241: // will fallback to default
242: }
243: }
244: // fallback to default decoding
245: try {
246: return URLDecoder.decode(source, ENCODING_UTF_8);
247: } catch (java.io.UnsupportedEncodingException e) {
248: // ignore
249: }
250: return source;
251: }
252:
253: /**
254: * Decodes HTML entity references like <code>&#8364;</code> that are contained in the
255: * String to a regular character, but only if that character is contained in the given
256: * encodings charset.<p>
257: *
258: * @param input the input to decode the HTML entities in
259: * @param encoding the charset to decode the input for
260: * @return the input with the decoded HTML entities
261: *
262: * @see #encodeHtmlEntities(String, String)
263: */
264: public static String decodeHtmlEntities(String input,
265: String encoding) {
266:
267: Matcher matcher = ENTITIY_PATTERN.matcher(input);
268: StringBuffer result = new StringBuffer(input.length());
269: Charset charset = Charset.forName(encoding);
270: CharsetEncoder encoder = charset.newEncoder();
271:
272: while (matcher.find()) {
273: String entity = matcher.group();
274: String value = entity.substring(2, entity.length() - 1);
275: int c = Integer.valueOf(value).intValue();
276: if (c < 128) {
277: // first 128 chars are contained in almost every charset
278: entity = new String(new char[] { (char) c });
279: // this is intended as performance improvement since
280: // the canEncode() operation appears quite CPU heavy
281: } else if (encoder.canEncode((char) c)) {
282: // encoder can encode this char
283: entity = new String(new char[] { (char) c });
284: }
285: matcher.appendReplacement(result, entity);
286: }
287: matcher.appendTail(result);
288: return result.toString();
289: }
290:
291: /**
292: * Decodes a string used as parameter in an uri in a way independent of other encodings/decodings applied before.<p>
293: *
294: * @param input the encoded parameter string
295: *
296: * @return the decoded parameter string
297: *
298: * @see #encodeParameter(String)
299: */
300: public static String decodeParameter(String input) {
301:
302: String result = CmsStringUtil.substitute(input,
303: ENTITY_REPLACEMENT, ENTITY_PREFIX);
304: return CmsEncoder.decodeHtmlEntities(result, OpenCms
305: .getSystemInfo().getDefaultEncoding());
306: }
307:
308: /**
309: * Encodes a String using UTF-8 encoding, which is the standard for http data transmission
310: * with GET ant POST requests.<p>
311: *
312: * @param source the String to encode
313: *
314: * @return String the encoded source String
315: */
316: public static String encode(String source) {
317:
318: return encode(source, ENCODING_UTF_8);
319: }
320:
321: /**
322: * This method is a substitute for <code>URLEncoder.encode()</code>.
323: * Use this in all OpenCms core classes to ensure the encoding is
324: * always handled the same way.<p>
325: *
326: * In case you don't know what encoding to use, set the value of
327: * the <code>encoding</code> parameter to <code>null</code>.
328: * This method will then default to UTF-8 encoding, which is probably the right one.<p>
329: *
330: * @param source the String to encode
331: * @param encoding the encoding to use (if null, the system default is used)
332: *
333: * @return the encoded source String
334: */
335: public static String encode(String source, String encoding) {
336:
337: if (source == null) {
338: return null;
339: }
340: if (encoding != null) {
341: try {
342: return URLEncoder.encode(source, encoding);
343: } catch (java.io.UnsupportedEncodingException e) {
344: // will fallback to default
345: }
346: }
347: // fallback to default encoding
348: try {
349: return URLEncoder.encode(source, ENCODING_UTF_8);
350: } catch (java.io.UnsupportedEncodingException e) {
351: // ignore
352: }
353: return source;
354: }
355:
356: /**
357: * Encodes all characters that are contained in the String which can not displayed
358: * in the given encodings charset with HTML entity references
359: * like <code>&#8364;</code>.<p>
360: *
361: * This is required since a Java String is
362: * internally always stored as Unicode, meaning it can contain almost every character, but
363: * the HTML charset used might not support all such characters.<p>
364: *
365: * @param input the input to encode for HTML
366: * @param encoding the charset to encode the result with
367: *
368: * @return the input with the encoded HTML entities
369: *
370: * @see #decodeHtmlEntities(String, String)
371: */
372: public static String encodeHtmlEntities(String input,
373: String encoding) {
374:
375: StringBuffer result = new StringBuffer(input.length() * 2);
376: CharBuffer buffer = CharBuffer.wrap(input.toCharArray());
377: Charset charset = Charset.forName(encoding);
378: CharsetEncoder encoder = charset.newEncoder();
379: for (int i = 0; i < buffer.length(); i++) {
380: int c = buffer.get(i);
381: if (c < 128) {
382: // first 128 chars are contained in almost every charset
383: result.append((char) c);
384: // this is intended as performance improvement since
385: // the canEncode() operation appears quite CPU heavy
386: } else if (encoder.canEncode((char) c)) {
387: // encoder can encode this char
388: result.append((char) c);
389: } else {
390: // append HTML entity reference
391: result.append(ENTITY_PREFIX);
392: result.append(c);
393: result.append(";");
394: }
395: }
396: return result.toString();
397: }
398:
399: /**
400: * Encodes all characters that are contained in the String which can not displayed
401: * in the given encodings charset with Java escaping like <code>\u20ac</code>.<p>
402: *
403: * This can be used to escape values used in Java property files.<p>
404: *
405: * @param input the input to encode for Java
406: * @param encoding the charset to encode the result with
407: *
408: * @return the input with the encoded Java entities
409: */
410: public static String encodeJavaEntities(String input,
411: String encoding) {
412:
413: StringBuffer result = new StringBuffer(input.length() * 2);
414: CharBuffer buffer = CharBuffer.wrap(input.toCharArray());
415: Charset charset = Charset.forName(encoding);
416: CharsetEncoder encoder = charset.newEncoder();
417: for (int i = 0; i < buffer.length(); i++) {
418: int c = buffer.get(i);
419: if (c < 128) {
420: // first 128 chars are contained in almost every charset
421: result.append((char) c);
422: // this is intended as performance improvement since
423: // the canEncode() operation appears quite CPU heavy
424: } else if (encoder.canEncode((char) c)) {
425: // encoder can encode this char
426: result.append((char) c);
427: } else {
428: // append Java entity reference
429: result.append("\\u");
430: String hex = Integer.toHexString(c);
431: int pad = 4 - hex.length();
432: for (int p = 0; p < pad; p++) {
433: result.append('0');
434: }
435: result.append(hex);
436: }
437: }
438: return result.toString();
439: }
440:
441: /**
442: * Encodes a string used as parameter in an uri in a way independent of other encodings/decodings applied later.<p>
443: *
444: * Used to ensure that GET parameters are not wrecked by wrong or incompatible configuration settings.
445: * In order to ensure this, the String is first encoded with html entities for any character that cannot encoded
446: * in US-ASCII; additionally, the plus sign is also encoded to avoid problems with the white-space replacer.
447: * Finally, the entity prefix is replaced with characters not used as delimiters in urls.<p>
448: *
449: * @param input the parameter string
450: *
451: * @return the encoded parameter string
452: */
453: public static String encodeParameter(String input) {
454:
455: String result = CmsEncoder.encodeHtmlEntities(input,
456: CmsEncoder.ENCODING_US_ASCII);
457: result = CmsStringUtil.substitute(result, "+", PLUS_ENTITY);
458: return CmsStringUtil.substitute(result, ENTITY_PREFIX,
459: ENTITY_REPLACEMENT);
460: }
461:
462: /**
463: * Encodes a String in a way that is compatible with the JavaScript escape function.
464: *
465: * @param source The text to be encoded
466: * @param encoding the encoding type
467: *
468: * @return The JavaScript escaped string
469: */
470: public static String escape(String source, String encoding) {
471:
472: // the blank is encoded into "+" not "%20" when using standard encode call
473: return CmsStringUtil.substitute(encode(source, encoding), "+",
474: "%20");
475: }
476:
477: /**
478: * Escapes special characters in a HTML-String with their number-based
479: * entity representation, for example & becomes &#38;.<p>
480: *
481: * A character <code>num</code> is replaced if<br>
482: * <code>((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62)))</code><p>
483: *
484: * @param source the String to escape
485: *
486: * @return String the escaped String
487: *
488: * @see #escapeXml(String)
489: */
490: public static String escapeHtml(String source) {
491:
492: int terminatorIndex;
493: if (source == null) {
494: return null;
495: }
496: StringBuffer result = new StringBuffer(source.length() * 2);
497: for (int i = 0; i < source.length(); i++) {
498: int ch = source.charAt(i);
499: // avoid escaping already escaped characters
500: if (ch == 38) {
501: terminatorIndex = source.indexOf(";", i);
502: if (terminatorIndex > 0) {
503: if (source.substring(i + 1, terminatorIndex)
504: .matches("#[0-9]+|lt|gt|amp|quote")) {
505: result.append(source.substring(i,
506: terminatorIndex + 1));
507: // Skip remaining chars up to (and including) ";"
508: i = terminatorIndex;
509: continue;
510: }
511: }
512: }
513: if ((ch != 32)
514: && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62))) {
515: result.append(ENTITY_PREFIX);
516: result.append(ch);
517: result.append(";");
518: } else {
519: result.append((char) ch);
520: }
521: }
522: return new String(result);
523: }
524:
525: /**
526: * Escapes non ASCII characters in a HTML-String with their number-based
527: * entity representation, for example & becomes &#38;.<p>
528: *
529: * A character <code>num</code> is replaced if<br>
530: * <code>(ch > 255)</code><p>
531: *
532: * @param source the String to escape
533: *
534: * @return String the escaped String
535: *
536: * @see #escapeXml(String)
537: */
538: public static String escapeNonAscii(String source) {
539:
540: if (source == null) {
541: return null;
542: }
543: StringBuffer result = new StringBuffer(source.length() * 2);
544: for (int i = 0; i < source.length(); i++) {
545: int ch = source.charAt(i);
546: if (ch > 255) {
547: result.append(ENTITY_PREFIX);
548: result.append(ch);
549: result.append(";");
550: } else {
551: result.append((char) ch);
552: }
553: }
554: return new String(result);
555: }
556:
557: /**
558: * Encodes a String in a way that is compatible with the JavaScript escape function.
559: * Multiple blanks are encoded _multiply _with <code>%20</code>.<p>
560: *
561: * @param source The text to be encoded
562: * @param encoding the encoding type
563: *
564: * @return The JavaScript escaped string
565: */
566: public static String escapeWBlanks(String source, String encoding) {
567:
568: if (CmsStringUtil.isEmpty(source)) {
569: return source;
570: }
571: StringBuffer ret = new StringBuffer(source.length() * 2);
572:
573: // URLEncode the text string
574: // this produces a very similar encoding to JavaSscript encoding,
575: // except the blank which is not encoded into "%20" instead of "+"
576:
577: String enc = encode(source, encoding);
578: for (int z = 0; z < enc.length(); z++) {
579: char c = enc.charAt(z);
580: if (c == '+') {
581: ret.append("%20");
582: } else {
583: ret.append(c);
584: }
585: }
586: return ret.toString();
587: }
588:
589: /**
590: * Escapes a String so it may be printed as text content or attribute
591: * value in a HTML page or an XML file.<p>
592: *
593: * This method replaces the following characters in a String:
594: * <ul>
595: * <li><b><</b> with &lt;
596: * <li><b>></b> with &gt;
597: * <li><b>&</b> with &amp;
598: * <li><b>"</b> with &quot;
599: * </ul><p>
600: *
601: * @param source the string to escape
602: *
603: * @return the escaped string
604: *
605: * @see #escapeHtml(String)
606: */
607: public static String escapeXml(String source) {
608:
609: return escapeXml(source, false);
610: }
611:
612: /**
613: * Escapes a String so it may be printed as text content or attribute
614: * value in a HTML page or an XML file.<p>
615: *
616: * This method replaces the following characters in a String:
617: * <ul>
618: * <li><b><</b> with &lt;
619: * <li><b>></b> with &gt;
620: * <li><b>&</b> with &amp;
621: * <li><b>"</b> with &quot;
622: * </ul><p>
623: *
624: * @param source the string to escape
625: * @param doubleEscape if <code>false</code>, all entities that already are escaped are left untouched
626: *
627: * @return the escaped string
628: *
629: * @see #escapeHtml(String)
630: */
631: public static String escapeXml(String source, boolean doubleEscape) {
632:
633: if (source == null) {
634: return null;
635: }
636: StringBuffer result = new StringBuffer(source.length() * 2);
637:
638: for (int i = 0; i < source.length(); ++i) {
639: char ch = source.charAt(i);
640: switch (ch) {
641: case '<':
642: result.append("<");
643: break;
644: case '>':
645: result.append(">");
646: break;
647: case '&':
648: // don't escape already escaped international and special characters
649: if (!doubleEscape) {
650: int terminatorIndex = source.indexOf(";", i);
651: if (terminatorIndex > 0) {
652: if (source.substring(i + 1, terminatorIndex)
653: .matches("#[0-9]+")) {
654: result.append(ch);
655: break;
656: }
657: }
658: }
659: // note that to other "break" in the above "if" block
660: result.append("&");
661: break;
662: case '"':
663: result.append(""");
664: break;
665: default:
666: result.append(ch);
667: }
668: }
669: return new String(result);
670: }
671:
672: /**
673: * Checks if a given encoding name is actually supported, and if so
674: * resolves it to it's canonical name, if not it returns the given fallback
675: * value.<p>
676: *
677: * Charsets have a set of aliases. For example, valid aliases for "UTF-8"
678: * are "UTF8", "utf-8" or "utf8". This method resolves any given valid charset name
679: * to it's "canonical" form, so that simple String comparison can be used
680: * when checking charset names internally later.<p>
681: *
682: * Please see <a href="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a>
683: * for a list of valid charset alias names.<p>
684: *
685: * @param encoding the encoding to check and resolve
686: * @param fallback the fallback encoding scheme
687: *
688: * @return the resolved encoding name, or the fallback value
689: */
690: public static String lookupEncoding(String encoding, String fallback) {
691:
692: String result = (String) m_encodingCache.get(encoding);
693: if (result != null) {
694: return result;
695: }
696:
697: try {
698: result = Charset.forName(encoding).name();
699: m_encodingCache.put(encoding, result);
700: return result;
701: } catch (Throwable t) {
702: // we will use the default value as fallback
703: }
704:
705: return fallback;
706: }
707:
708: /**
709: * Re-decodes a String that has not been correctly decoded and thus has scrambled
710: * character bytes.<p>
711: *
712: * This is an equivalent to the JavaScript "decodeURIComponent" function.
713: * It converts from the default "UTF-8" to the currently selected system encoding.<p>
714: *
715: * @param input the String to convert
716: *
717: * @return String the converted String
718: */
719: public static String redecodeUriComponent(String input) {
720:
721: if (input == null) {
722: return input;
723: }
724: return new String(changeEncoding(input.getBytes(),
725: ENCODING_UTF_8, OpenCms.getSystemInfo()
726: .getDefaultEncoding()));
727: }
728:
729: /**
730: * Decodes a String in a way that is compatible with the JavaScript
731: * unescape function.<p>
732: *
733: * @param source The String to be decoded
734: * @param encoding the encoding type
735: *
736: * @return The JavaScript unescaped String
737: */
738: public static String unescape(String source, String encoding) {
739:
740: if (source == null) {
741: return null;
742: }
743: int len = source.length();
744: // to use standard decoder we need to replace '+' with "%20" (space)
745: StringBuffer preparedSource = new StringBuffer(len);
746: for (int i = 0; i < len; i++) {
747: char c = source.charAt(i);
748: if (c == '+') {
749: preparedSource.append("%20");
750: } else {
751: preparedSource.append(c);
752: }
753: }
754: return decode(preparedSource.toString(), encoding);
755: }
756: }
|