001: /*
002: * The contents of this file are subject to the terms
003: * of the Common Development and Distribution License
004: * (the "License"). You may not use this file except
005: * in compliance with the License.
006: *
007: * You can obtain a copy of the license at
008: * https://jwsdp.dev.java.net/CDDLv1.0.html
009: * See the License for the specific language governing
010: * permissions and limitations under the License.
011: *
012: * When distributing Covered Code, include this CDDL
013: * HEADER in each file and include the License file at
014: * https://jwsdp.dev.java.net/CDDLv1.0.html If applicable,
015: * add the following below this CDDL HEADER, with the
016: * fields enclosed by brackets "[]" replaced with your
017: * own identifying information: Portions Copyright [yyyy]
018: * [name of copyright owner]
019: */
020:
021: package com.sun.codemodel.util;
022:
023: import java.nio.CharBuffer;
024: import java.nio.charset.CoderResult;
025:
/**
 * Utility class for dealing with UTF-16 surrogates.
 *
 * <p>Offers static predicates and conversions between UCS-4 values and
 * UTF-16 surrogate pairs, plus two small stateful helpers: {@link Parser}
 * (UTF-16 to UCS-4) and {@link Generator} (UCS-4 to UTF-16). Both helpers
 * keep the outcome of their most recent operation in instance fields.
 *
 * @author Mark Reinhold
 * @version 1.11, 03/01/23
 */
final class Surrogate {

    private Surrogate() {
    }

    // UTF-16 surrogate-character ranges
    //
    public static final char MIN_HIGH = '\uD800';
    public static final char MAX_HIGH = '\uDBFF';
    public static final char MIN_LOW = '\uDC00';
    public static final char MAX_LOW = '\uDFFF';
    public static final char MIN = MIN_HIGH;
    public static final char MAX = MAX_LOW;

    // Range of UCS-4 values that need surrogates in UTF-16
    //
    public static final int UCS4_MIN = 0x10000;
    public static final int UCS4_MAX = (1 << 20) + UCS4_MIN - 1;

    /**
     * Tells whether or not the given UTF-16 value is a high surrogate.
     *
     * @param c The UTF-16 value to test
     * @return <code>true</code> if c lies in [MIN_HIGH, MAX_HIGH]
     */
    public static boolean isHigh(int c) {
        return (MIN_HIGH <= c) && (c <= MAX_HIGH);
    }

    /**
     * Tells whether or not the given UTF-16 value is a low surrogate.
     *
     * @param c The UTF-16 value to test
     * @return <code>true</code> if c lies in [MIN_LOW, MAX_LOW]
     */
    public static boolean isLow(int c) {
        return (MIN_LOW <= c) && (c <= MAX_LOW);
    }

    /**
     * Tells whether or not the given UTF-16 value is a surrogate character
     * (either high or low).
     *
     * @param c The UTF-16 value to test
     * @return <code>true</code> if c lies in [MIN, MAX]
     */
    public static boolean is(int c) {
        return (MIN <= c) && (c <= MAX);
    }

    /**
     * Tells whether or not the given UCS-4 character must be represented as a
     * surrogate pair in UTF-16.
     *
     * @param uc The UCS-4 value to test
     * @return <code>true</code> if uc lies in [UCS4_MIN, UCS4_MAX]
     */
    public static boolean neededFor(int uc) {
        return (uc >= UCS4_MIN) && (uc <= UCS4_MAX);
    }

    /**
     * Returns the high UTF-16 surrogate for the given UCS-4 character.
     * The argument is not range-checked; only the ten bits selected by the
     * mask contribute to the result.
     *
     * @param uc The UCS-4 character
     * @return The high surrogate built from the upper ten offset bits
     */
    public static char high(int uc) {
        return (char) (0xd800 | (((uc - UCS4_MIN) >> 10) & 0x3ff));
    }

    /**
     * Returns the low UTF-16 surrogate for the given UCS-4 character.
     * The argument is not range-checked; only the ten bits selected by the
     * mask contribute to the result.
     *
     * @param uc The UCS-4 character
     * @return The low surrogate built from the lower ten offset bits
     */
    public static char low(int uc) {
        return (char) (0xdc00 | ((uc - UCS4_MIN) & 0x3ff));
    }

    /**
     * Converts the given surrogate pair into a 32-bit UCS-4 character.
     * The arguments are not validated; only their low ten bits are used.
     *
     * @param c The high surrogate
     * @param d The low surrogate
     * @return The combined UCS-4 value
     */
    public static int toUCS4(char c, char d) {
        return (((c & 0x3ff) << 10) | (d & 0x3ff)) + 0x10000;
    }

    /**
     * Surrogate parsing support. Charset implementations may use instances of
     * this class to handle the details of parsing UTF-16 surrogate pairs.
     */
    public static class Parser {

        public Parser() {
        }

        private int character;          // UCS-4
        private CoderResult error = CoderResult.UNDERFLOW;
        private boolean isPair;

        /**
         * Returns the UCS-4 character previously parsed.
         */
        public int character() {
            return character;
        }

        /**
         * Tells whether or not the previously-parsed UCS-4 character was
         * originally represented by a surrogate pair.
         */
        public boolean isPair() {
            return isPair;
        }

        /**
         * Returns the number of UTF-16 characters consumed by the previous
         * parse.
         */
        public int increment() {
            return isPair ? 2 : 1;
        }

        /**
         * If the previous parse operation detected an error, return the object
         * describing that error. After a successful parse this returns
         * <code>null</code>; before any parse it is the underflow result.
         */
        public CoderResult error() {
            return error;
        }

        /**
         * Returns an unmappable-input result object, with the appropriate
         * input length, for the previously-parsed character.
         */
        public CoderResult unmappableResult() {
            return CoderResult.unmappableForLength(isPair ? 2 : 1);
        }

        /**
         * Parses a UCS-4 character from the given source buffer, handling
         * surrogates.
         *
         * <p>When c is a high surrogate and the buffer has input remaining,
         * the next character is read with a relative get, so it is consumed
         * from the buffer even if it turns out not to be a low surrogate.
         *
         * @param   c    The first character
         * @param   in   The source buffer, from which one more character
         *               will be consumed if c is a high surrogate
         *
         * @return  Either a parsed UCS-4 character, in which case the isPair()
         *          and increment() methods will return meaningful values, or
         *          -1, in which case error() will return a descriptive result
         *          object
         */
        public int parse(char c, CharBuffer in) {
            if (isHigh(c)) {
                if (!in.hasRemaining()) {
                    return fail(CoderResult.UNDERFLOW);
                }
                char d = in.get();
                if (isLow(d)) {
                    return accept(toUCS4(c, d), true);
                }
                return fail(CoderResult.malformedForLength(1));
            }
            if (isLow(c)) {
                // A low surrogate with no preceding high surrogate.
                return fail(CoderResult.malformedForLength(1));
            }
            return accept(c, false);
        }

        /**
         * Parses a UCS-4 character from the given source array, handling
         * surrogates.
         *
         * @param   c    The first character
         * @param   ia   The input array, from which one more character
         *               (at index ip + 1) will be read if c is a high
         *               surrogate
         * @param   ip   The input index
         * @param   il   The input limit
         *
         * @return  Either a parsed UCS-4 character, in which case the isPair()
         *          and increment() methods will return meaningful values, or
         *          -1, in which case error() will return a descriptive result
         *          object
         */
        public int parse(char c, char[] ia, int ip, int il) {
            if (isHigh(c)) {
                if (il - ip < 2) {
                    return fail(CoderResult.UNDERFLOW);
                }
                char d = ia[ip + 1];
                if (isLow(d)) {
                    return accept(toUCS4(c, d), true);
                }
                return fail(CoderResult.malformedForLength(1));
            }
            if (isLow(c)) {
                // A low surrogate with no preceding high surrogate.
                return fail(CoderResult.malformedForLength(1));
            }
            return accept(c, false);
        }

        /** Records a successful parse and returns the parsed character. */
        private int accept(int ucs4, boolean pair) {
            character = ucs4;
            isPair = pair;
            error = null;
            return ucs4;
        }

        /** Records a failed parse; character and isPair are left untouched. */
        private int fail(CoderResult result) {
            error = result;
            return -1;
        }

    }

    /**
     * Surrogate generation support. Charset implementations may use instances
     * of this class to handle the details of generating UTF-16 surrogate
     * pairs.
     */
    public static class Generator {

        public Generator() {
        }

        private CoderResult error = CoderResult.OVERFLOW;

        /**
         * If the previous generation operation detected an error, return the
         * object describing that error. After a successful generation this
         * returns <code>null</code>; before any generation it is the overflow
         * result.
         */
        public CoderResult error() {
            return error;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p>Values outside [0, UCS4_MAX], including negative values, are
         * reported as unmappable and nothing is written. Values inside the
         * surrogate range are reported as malformed.
         *
         * @param   uc   The UCS-4 character
         * @param   len  The number of input bytes from which the UCS-4 value
         *               was constructed (used when creating result objects)
         * @param   dst  The destination buffer, to which one or two UTF-16
         *               characters will be written
         *
         * @return  Either a positive count of the number of UTF-16 characters
         *          written to the destination buffer, or -1, in which case
         *          error() will return a descriptive result object
         */
        public int generate(int uc, int len, CharBuffer dst) {
            if (uc < 0 || uc > UCS4_MAX) {
                // Not a representable character; never truncate it to a char.
                return fail(CoderResult.unmappableForLength(len));
            }
            if (uc < UCS4_MIN) {
                if (is(uc)) {
                    return fail(CoderResult.malformedForLength(len));
                }
                if (dst.remaining() < 1) {
                    return fail(CoderResult.OVERFLOW);
                }
                dst.put((char) uc);
                error = null;
                return 1;
            }
            // Supplementary character: needs room for both halves of the pair.
            if (dst.remaining() < 2) {
                return fail(CoderResult.OVERFLOW);
            }
            dst.put(high(uc));
            dst.put(low(uc));
            error = null;
            return 2;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p>Values outside [0, UCS4_MAX], including negative values, are
         * reported as unmappable and nothing is written. Values inside the
         * surrogate range are reported as malformed.
         *
         * @param   uc   The UCS-4 character
         * @param   len  The number of input bytes from which the UCS-4 value
         *               was constructed (used when creating result objects)
         * @param   da   The destination array, to which one or two UTF-16
         *               characters will be written
         * @param   dp   The destination position
         * @param   dl   The destination limit
         *
         * @return  Either a positive count of the number of UTF-16 characters
         *          written to the destination array, or -1, in which case
         *          error() will return a descriptive result object
         */
        public int generate(int uc, int len, char[] da, int dp, int dl) {
            if (uc < 0 || uc > UCS4_MAX) {
                // Not a representable character; never truncate it to a char.
                return fail(CoderResult.unmappableForLength(len));
            }
            if (uc < UCS4_MIN) {
                if (is(uc)) {
                    return fail(CoderResult.malformedForLength(len));
                }
                if (dl - dp < 1) {
                    return fail(CoderResult.OVERFLOW);
                }
                da[dp] = (char) uc;
                error = null;
                return 1;
            }
            // Supplementary character: needs room for both halves of the pair.
            if (dl - dp < 2) {
                return fail(CoderResult.OVERFLOW);
            }
            da[dp] = high(uc);
            da[dp + 1] = low(uc);
            error = null;
            return 2;
        }

        /** Records a failed generation and returns the -1 sentinel. */
        private int fail(CoderResult result) {
            error = result;
            return -1;
        }

    }

}
|