001: /*
002: * Copyright 2000-2001 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025:
026: package sun.nio.cs;
027:
028: import java.nio.CharBuffer;
029: import java.nio.charset.CoderResult;
030: import java.nio.charset.MalformedInputException;
031: import java.nio.charset.UnmappableCharacterException;
032:
033: /**
034: * Utility class for dealing with surrogates.
035: *
036: * @author Mark Reinhold
037: * @version 1.19, 07/05/05
038: */
039:
public class Surrogate {

    // Static utility holder: not instantiable.
    private Surrogate() {
    }

    // UTF-16 surrogate-character ranges
    //
    public static final char MIN_HIGH = '\uD800';
    public static final char MAX_HIGH = '\uDBFF';
    public static final char MIN_LOW = '\uDC00';
    public static final char MAX_LOW = '\uDFFF';
    public static final char MIN = MIN_HIGH;
    public static final char MAX = MAX_LOW;

    // Range of UCS-4 values that need surrogates in UTF-16
    // (0x10000 through 0x10FFFF inclusive)
    //
    public static final int UCS4_MIN = 0x10000;
    public static final int UCS4_MAX = (1 << 20) + UCS4_MIN - 1;

    /**
     * Tells whether or not the given UTF-16 value is a high surrogate.
     *
     * @param c The value to test
     *
     * @return <tt>true</tt> if, and only if, c lies in the range
     *         MIN_HIGH..MAX_HIGH inclusive
     */
    public static boolean isHigh(int c) {
        return (MIN_HIGH <= c) && (c <= MAX_HIGH);
    }

    /**
     * Tells whether or not the given UTF-16 value is a low surrogate.
     *
     * @param c The value to test
     *
     * @return <tt>true</tt> if, and only if, c lies in the range
     *         MIN_LOW..MAX_LOW inclusive
     */
    public static boolean isLow(int c) {
        return (MIN_LOW <= c) && (c <= MAX_LOW);
    }

    /**
     * Tells whether or not the given UTF-16 value is a surrogate character,
     * either high or low.
     *
     * @param c The value to test
     *
     * @return <tt>true</tt> if, and only if, c lies in the range
     *         MIN..MAX inclusive
     */
    public static boolean is(int c) {
        return (MIN <= c) && (c <= MAX);
    }

    /**
     * Tells whether or not the given UCS-4 character must be represented as a
     * surrogate pair in UTF-16.
     *
     * @param uc The UCS-4 character
     *
     * @return <tt>true</tt> if, and only if, uc lies in the range
     *         UCS4_MIN..UCS4_MAX inclusive
     */
    public static boolean neededFor(int uc) {
        return (uc >= UCS4_MIN) && (uc <= UCS4_MAX);
    }

    /**
     * Returns the high UTF-16 surrogate for the given UCS-4 character.
     *
     * @param uc The UCS-4 character; must satisfy neededFor(uc), which is
     *           checked only when assertions are enabled
     *
     * @return The high (leading) surrogate, built from the upper ten bits
     *         of the offset of uc from UCS4_MIN
     */
    public static char high(int uc) {
        assert neededFor(uc);
        return (char) (0xd800 | (((uc - UCS4_MIN) >> 10) & 0x3ff));
    }

    /**
     * Returns the low UTF-16 surrogate for the given UCS-4 character.
     *
     * @param uc The UCS-4 character; must satisfy neededFor(uc), which is
     *           checked only when assertions are enabled
     *
     * @return The low (trailing) surrogate, built from the lower ten bits
     *         of the offset of uc from UCS4_MIN
     */
    public static char low(int uc) {
        assert neededFor(uc);
        return (char) (0xdc00 | ((uc - UCS4_MIN) & 0x3ff));
    }

    /**
     * Converts the given surrogate pair into a 32-bit UCS-4 character.
     *
     * @param c The high surrogate
     * @param d The low surrogate
     *
     * @return The UCS-4 character encoded by the pair. The arguments are
     *         validated only when assertions are enabled.
     */
    public static int toUCS4(char c, char d) {
        assert isHigh(c) && isLow(d);
        return (((c & 0x3ff) << 10) | (d & 0x3ff)) + 0x10000;
    }

    /**
     * Surrogate parsing support. Charset implementations may use instances of
     * this class to handle the details of parsing UTF-16 surrogate pairs.
     *
     * <p> An instance records the outcome of the most recent parse: after a
     * successful parse the character(), isPair(), increment() and
     * unmappableResult() methods may be used; after a failed parse only
     * error() may be used. These preconditions are checked by assertions. </p>
     */
    public static class Parser {

        public Parser() {
        }

        private int character; // UCS-4
        // null after a successful parse, otherwise the reason for failure
        private CoderResult error = CoderResult.UNDERFLOW;
        private boolean isPair;

        /**
         * Returns the UCS-4 character previously parsed.
         *
         * @return The character produced by the last successful parse
         */
        public int character() {
            assert (error == null);
            return character;
        }

        /**
         * Tells whether or not the previously-parsed UCS-4 character was
         * originally represented by a surrogate pair.
         *
         * @return <tt>true</tt> if the last successful parse consumed a
         *         surrogate pair
         */
        public boolean isPair() {
            assert (error == null);
            return isPair;
        }

        /**
         * Returns the number of UTF-16 characters consumed by the previous
         * parse.
         *
         * @return 2 if the last successful parse consumed a surrogate pair,
         *         otherwise 1
         */
        public int increment() {
            assert (error == null);
            return isPair ? 2 : 1;
        }

        /**
         * If the previous parse operation detected an error, return the object
         * describing that error.
         *
         * @return The result object recorded by the last failed parse
         */
        public CoderResult error() {
            assert (error != null);
            return error;
        }

        /**
         * Returns an unmappable-input result object, with the appropriate
         * input length, for the previously-parsed character.
         *
         * @return An unmappable result of length 2 if the last successful
         *         parse consumed a surrogate pair, otherwise of length 1
         */
        public CoderResult unmappableResult() {
            assert (error == null);
            return CoderResult.unmappableForLength(isPair ? 2 : 1);
        }

        /**
         * Parses a UCS-4 character from the given source buffer, handling
         * surrogates.
         *
         * <p> Note that when c is a high surrogate and the buffer has a
         * character remaining, that character is consumed from the buffer
         * even if it turns out not to be a low surrogate. </p>
         *
         * @param c The first character
         * @param in The source buffer, from which one more character
         *           will be consumed if c is a high surrogate
         *
         * @return Either a parsed UCS-4 character, in which case the isPair()
         *         and increment() methods will return meaningful values, or
         *         -1, in which case error() will return a descriptive result
         *         object: UNDERFLOW if c is a high surrogate and the buffer
         *         is exhausted, or a malformed-input result of length 1 if c
         *         is an unpaired surrogate
         */
        public int parse(char c, CharBuffer in) {
            if (Surrogate.isHigh(c)) {
                if (!in.hasRemaining()) {
                    // Need the second half of the pair before deciding
                    error = CoderResult.UNDERFLOW;
                    return -1;
                }
                char d = in.get();
                if (Surrogate.isLow(d)) {
                    character = toUCS4(c, d);
                    isPair = true;
                    error = null;
                    return character;
                }
                // High surrogate not followed by a low surrogate
                error = CoderResult.malformedForLength(1);
                return -1;
            }
            if (Surrogate.isLow(c)) {
                // Low surrogate with no preceding high surrogate
                error = CoderResult.malformedForLength(1);
                return -1;
            }
            character = c;
            isPair = false;
            error = null;
            return character;
        }

        /**
         * Parses a UCS-4 character from the given source array, handling
         * surrogates. The array is only read; no position is advanced.
         *
         * @param c The first character, which must equal ia[ip] (checked
         *          only when assertions are enabled)
         * @param ia The input array, from which one more character
         *           will be read if c is a high surrogate
         * @param ip The input index
         * @param il The input limit
         *
         * @return Either a parsed UCS-4 character, in which case the isPair()
         *         and increment() methods will return meaningful values, or
         *         -1, in which case error() will return a descriptive result
         *         object: UNDERFLOW if c is a high surrogate and fewer than
         *         two characters lie between ip and il, or a malformed-input
         *         result of length 1 if c is an unpaired surrogate
         */
        public int parse(char c, char[] ia, int ip, int il) {
            assert (ia[ip] == c);
            if (Surrogate.isHigh(c)) {
                if (il - ip < 2) {
                    // Need the second half of the pair before deciding
                    error = CoderResult.UNDERFLOW;
                    return -1;
                }
                char d = ia[ip + 1];
                if (Surrogate.isLow(d)) {
                    character = toUCS4(c, d);
                    isPair = true;
                    error = null;
                    return character;
                }
                // High surrogate not followed by a low surrogate
                error = CoderResult.malformedForLength(1);
                return -1;
            }
            if (Surrogate.isLow(c)) {
                // Low surrogate with no preceding high surrogate
                error = CoderResult.malformedForLength(1);
                return -1;
            }
            character = c;
            isPair = false;
            error = null;
            return character;
        }

    }

    /**
     * Surrogate generation support. Charset implementations may use instances
     * of this class to handle the details of generating UTF-16 surrogate
     * pairs.
     *
     * <p> An instance records the outcome of the most recent generation
     * operation; error() may be used only after a failed operation, which is
     * checked by an assertion. </p>
     */
    public static class Generator {

        public Generator() {
        }

        // null after a successful generate, otherwise the reason for failure
        private CoderResult error = CoderResult.OVERFLOW;

        /**
         * If the previous generation operation detected an error, return the
         * object describing that error.
         *
         * @return The result object recorded by the last failed generation
         */
        public CoderResult error() {
            assert error != null;
            return error;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p> Nothing is written to the destination unless the operation
         * succeeds. On failure error() describes the cause: a malformed-input
         * result if uc lies in the surrogate range, OVERFLOW if the
         * destination has insufficient room, or an unmappable-character
         * result if uc is negative or greater than UCS4_MAX. </p>
         *
         * @param uc The UCS-4 character
         * @param len The number of input bytes from which the UCS-4 value
         *            was constructed (used when creating result objects)
         * @param dst The destination buffer, to which one or two UTF-16
         *            characters will be written
         *
         * @return Either a positive count of the number of UTF-16 characters
         *         written to the destination buffer, or -1, in which case
         *         error() will return a descriptive result object
         */
        public int generate(int uc, int len, CharBuffer dst) {
            if (uc < 0) {
                // Not a UCS-4 value at all; never truncate it into a char
                error = CoderResult.unmappableForLength(len);
                return -1;
            }
            if (uc <= 0xffff) {
                if (Surrogate.is(uc)) {
                    // A lone surrogate value is not a legal character
                    error = CoderResult.malformedForLength(len);
                    return -1;
                }
                if (dst.remaining() < 1) {
                    error = CoderResult.OVERFLOW;
                    return -1;
                }
                dst.put((char) uc);
                error = null;
                return 1;
            }
            // Here uc > 0xffff, i.e. uc >= UCS4_MIN
            if (uc <= Surrogate.UCS4_MAX) {
                if (dst.remaining() < 2) {
                    // Check room for both halves before writing either
                    error = CoderResult.OVERFLOW;
                    return -1;
                }
                dst.put(Surrogate.high(uc));
                dst.put(Surrogate.low(uc));
                error = null;
                return 2;
            }
            error = CoderResult.unmappableForLength(len);
            return -1;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p> Nothing is written to the destination unless the operation
         * succeeds. On failure error() describes the cause: a malformed-input
         * result if uc lies in the surrogate range, OVERFLOW if fewer
         * characters than required lie between dp and dl, or an
         * unmappable-character result if uc is negative or greater than
         * UCS4_MAX. </p>
         *
         * @param uc The UCS-4 character
         * @param len The number of input bytes from which the UCS-4 value
         *            was constructed (used when creating result objects)
         * @param da The destination array, to which one or two UTF-16
         *           characters will be written
         * @param dp The destination position
         * @param dl The destination limit
         *
         * @return Either a positive count of the number of UTF-16 characters
         *         written to the destination array, or -1, in which case
         *         error() will return a descriptive result object
         */
        public int generate(int uc, int len, char[] da, int dp, int dl) {
            if (uc < 0) {
                // Not a UCS-4 value at all; never truncate it into a char
                error = CoderResult.unmappableForLength(len);
                return -1;
            }
            if (uc <= 0xffff) {
                if (Surrogate.is(uc)) {
                    // A lone surrogate value is not a legal character
                    error = CoderResult.malformedForLength(len);
                    return -1;
                }
                if (dl - dp < 1) {
                    error = CoderResult.OVERFLOW;
                    return -1;
                }
                da[dp] = (char) uc;
                error = null;
                return 1;
            }
            // Here uc > 0xffff, i.e. uc >= UCS4_MIN
            if (uc <= Surrogate.UCS4_MAX) {
                if (dl - dp < 2) {
                    // Check room for both halves before writing either
                    error = CoderResult.OVERFLOW;
                    return -1;
                }
                da[dp] = Surrogate.high(uc);
                da[dp + 1] = Surrogate.low(uc);
                error = null;
                return 2;
            }
            error = CoderResult.unmappableForLength(len);
            return -1;
        }

    }

}
|