01: package net.sf.saxon.charcode;
02:
03: /**
04: * This class defines properties of the Unicode character set
05: */
06:
07: public final class UnicodeCharacterSet implements CharacterSet {
08:
09: private static UnicodeCharacterSet theInstance = new UnicodeCharacterSet();
10:
11: /**
12: * Private constructor to force the singular instance to be used
13: */
14:
15: private UnicodeCharacterSet() {
16: }
17:
18: public static UnicodeCharacterSet getInstance() {
19: return theInstance;
20: }
21:
22: public boolean inCharset(int c) {
23: return true;
24: }
25:
26: /**
27: * Static method to generate the UTF-8 representation of a Unicode character
28: * @param in the Unicode character, or the high half of a surrogate pair
29: * @param in2 the low half of a surrogate pair (ignored unless the first argument is in the
30: * range for a surrogate pair)
31: * @param out an array of at least 4 bytes to hold the UTF-8 representation.
32: * @return the number of bytes in the UTF-8 representation
33: */
34:
35: public static int getUTF8Encoding(char in, char in2, byte[] out) {
36: // See Tony Graham, "Unicode, a Primer", page 92
37: int i = (int) in;
38: if (i <= 0x7f) {
39: out[0] = (byte) i;
40: return 1;
41: } else if (i <= 0x7ff) {
42: out[0] = (byte) (0xc0 | ((in >> 6) & 0x1f));
43: out[1] = (byte) (0x80 | (in & 0x3f));
44: return 2;
45: } else if (i >= 0xd800 && i <= 0xdbff) {
46: // surrogate pair
47: int j = (int) in2;
48: if (!(j >= 0xdc00 && j <= 0xdfff)) {
49: throw new IllegalArgumentException(
50: "Malformed Unicode Surrogate Pair (" + i + ','
51: + j + ')');
52: }
53: byte xxxxxx = (byte) (j & 0x3f);
54: byte yyyyyy = (byte) (((i & 0x03) << 4) | ((j >> 6) & 0x0f));
55: byte zzzz = (byte) ((i >> 2) & 0x0f);
56: byte uuuuu = (byte) (((i >> 6) & 0x0f) + 1);
57: out[0] = (byte) (0xf0 | ((uuuuu >> 2) & 0x07));
58: out[1] = (byte) (0x80 | ((uuuuu & 0x03) << 4) | zzzz);
59: out[2] = (byte) (0x80 | yyyyyy);
60: out[3] = (byte) (0x80 | xxxxxx);
61: return 4;
62: } else if (i >= 0xdc00 && i <= 0xdfff) {
63: // second half of surrogate pair - ignore it
64: return 0;
65: } else {
66: out[0] = (byte) (0xe0 | ((in >> 12) & 0x0f));
67: out[1] = (byte) (0x80 | ((in >> 6) & 0x3f));
68: out[2] = (byte) (0x80 | (in & 0x3f));
69: return 3;
70: }
71: }
72:
73: }
74:
75: //
76: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
77: // you may not use this file except in compliance with the License. You may obtain a copy of the
78: // License at http://www.mozilla.org/MPL/
79: //
80: // Software distributed under the License is distributed on an "AS IS" basis,
81: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
82: // See the License for the specific language governing rights and limitations under the License.
83: //
84: // The Original Code is: all this file.
85: //
86: // The Initial Developer of the Original Code is
87: // Aleksei Makarov [makarov@iitam.omsk.net.ru]
88: //
89: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
90: //
91: // Contributor(s): none.
92: //
|