001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 2000-2002 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: //package org.apache.xerces.impl.io;
059: package org.geoserver.ows.util;
060:
061: import java.io.IOException;
062: import java.io.InputStream;
063: import java.io.Reader;
064:
/**
 * Reader for UCS-2 and UCS-4 encodings (more precisely, the
 * ISO-10646-UCS-(2|4) encodings).
 *
 * <p>This variant is modified to handle supplementary Unicode code points
 * correctly: a UCS-4 code point above U+FFFF is delivered as a UTF-16
 * surrogate pair (RFC 2781). That required an internal char buffer that
 * holds UTF-16 code units which were decoded but could not be delivered
 * yet, and it reduces performance compared to the original Xerces
 * version. -AK</p>
 *
 * <p>UCS-4 values that are not Unicode code points (negative or above
 * U+10FFFF) are delivered as U+FFFD (REPLACEMENT CHARACTER), so that every
 * value returned by this reader is a valid Java <code>char</code>.</p>
 *
 * <p>Instances are not thread-safe.</p>
 *
 * @author Neil Graham, IBM
 *
 * @version $Id: UCSReader.java 6177 2007-02-19 10:11:27Z aaime $
 */
public class UCSReader extends Reader {
    //
    // Constants
    //

    /**
     * Default byte buffer size (8192, larger than that of ASCIIReader
     * since it's reasonable to surmise that the average UCS-4-encoded
     * file should be 4 times as large as the average ASCII-encoded file).
     */
    public static final int DEFAULT_BUFFER_SIZE = 8192;

    /**
     * Starting size of the internal char buffer. The internal char buffer
     * holds excess chars that may be left from a previous read operation
     * when working with UCS-4 data (it is never used for UCS-2).
     */
    public static final int CHAR_BUFFER_INITIAL_SIZE = 1024;

    /** Encoding constant: UCS-2, little endian. */
    public static final short UCS2LE = 1;

    /** Encoding constant: UCS-2, big endian. */
    public static final short UCS2BE = 2;

    /** Encoding constant: UCS-4, little endian. */
    public static final short UCS4LE = 4;

    /** Encoding constant: UCS-4, big endian. */
    public static final short UCS4BE = 8;

    /**
     * The minimum value of a supplementary code point.
     */
    public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;

    /**
     * The minimum value of a Unicode code point.
     */
    public static final int MIN_CODE_POINT = 0x000000;

    /**
     * The maximum value of a Unicode code point.
     */
    public static final int MAX_CODE_POINT = 0x10ffff;

    /** U+FFFD, substituted for UCS-4 values that are not code points. */
    private static final char REPLACEMENT_CHAR = (char) 0xFFFD;

    //
    // Data
    //

    /** Input stream; <code>null</code> once the reader is closed. */
    protected InputStream fInputStream;

    /** Byte buffer; its length is always a positive multiple of 4. */
    protected byte[] fBuffer;

    /** What kind of data we're dealing with (one of the UCS* constants). */
    protected short fEncoding;

    /**
     * Stores "excess" UTF-16 code units that were decoded from the stream
     * but could not be returned yet. This happens because one UCS-4
     * supplementary character yields two Java <code>char</code>s: when
     * <code>read()</code> meets one, it returns the high surrogate and
     * parks the low surrogate here; when
     * <code>read(char[], int, int)</code> decodes more code units than fit
     * into the caller's array, the surplus is parked here as well.
     * The buffer works like a stack: the code unit that must be delivered
     * next is at index <code>fCharCount - 1</code>, i.e. chars and
     * surrogate pairs are stored in reverse order.
     */
    protected char[] fCharBuf;

    /**
     * Count of Java chars currently stored in the <code>fCharBuf</code>
     * array.
     */
    protected int fCharCount;

    /**
     * Copy of the pending chars (<code>fCharBuf[0..fCharCount)</code>) taken
     * by the last <code>mark</code> call, or <code>null</code> if the reader
     * was never marked. Needed because pending chars logically precede the
     * byte position that the underlying stream remembers.
     */
    private char[] fMarkedChars;

    //
    // Constructors
    //

    /**
     * Constructs an <code>ISO-10646-UCS-(2|4)</code> reader from the specified
     * input stream using the default buffer size. The endianness and exact
     * input encoding (<code>UCS-2</code> or <code>UCS-4</code>) must be known
     * in advance.
     *
     * @param inputStream input stream with UCS-2|4 encoded data
     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, short encoding) {
        this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
    } // <init>(InputStream, short)

    /**
     * Constructs an <code>ISO-10646-UCS-(2|4)</code> reader from the source
     * input stream using an explicitly specified buffer size. The endianness
     * and exact input encoding (<code>UCS-2</code> or <code>UCS-4</code>)
     * must be known in advance.
     *
     * @param inputStream input stream with UCS-2|4 encoded data
     * @param size The requested byte buffer size. It is rounded up to
     *             the next multiple of 4 (minimum 4) so that the buffer
     *             always holds whole UCS-2 and UCS-4 code units.
     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE
     *
     * @exception IllegalArgumentException if <code>size</code> is negative
     */
    public UCSReader(InputStream inputStream, int size, short encoding) {
        if (size < 0) {
            throw new IllegalArgumentException(
                "Buffer size must not be negative: " + size);
        }

        // Round up in long arithmetic so that sizes close to
        // Integer.MAX_VALUE cannot wrap around.
        long aligned = ((long) size + 3L) & ~3L;

        if (aligned < 4L) {
            aligned = 4L;
        } else if (aligned > Integer.MAX_VALUE) {
            aligned = Integer.MAX_VALUE & ~3;
        }

        fInputStream = inputStream;
        fBuffer = new byte[(int) aligned];
        fEncoding = encoding;

        fCharBuf = new char[CHAR_BUFFER_INITIAL_SIZE];
        fCharCount = 0;
    } // <init>(InputStream, int, short)

    //
    // Reader methods
    //

    /**
     * Read a single character. This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * If a supplementary Unicode character is encountered in
     * <code>UCS-4</code> input, it is encoded into a <code>UTF-16</code>
     * surrogate pair according to RFC 2781. The high surrogate code unit is
     * returned immediately, and the low surrogate is saved in the internal
     * buffer to be returned by the next <code>read()</code> or
     * <code>read(char[], int, int)</code> invocation. -AK
     *
     * @return Java 16-bit <code>char</code> value (in the range
     *         <code>0 - 0xFFFF</code>) containing a UTF-16 code unit, or
     *         <code>-1</code> if the end of the stream was reached. A
     *         truncated trailing code unit is reported as end of stream.
     *
     * @exception IOException when an I/O error occurs or the reader
     *            is closed
     */
    public int read() throws IOException {
        ensureOpen();

        // If we got something in the char buffer, let's use it.
        if (fCharCount > 0) {
            return fCharBuf[--fCharCount];
        }

        // EOF must be detected on the raw int: a data byte of 0xFF is
        // perfectly legal and must not be confused with -1.
        int b0 = fInputStream.read(); // 1st byte

        if (b0 < 0) {
            return -1;
        }

        int b1 = fInputStream.read(); // 2nd byte

        if (b1 < 0) {
            return -1;
        }

        if (fEncoding < 4) { // UCS-2
            return (fEncoding == UCS2BE) ? ((b0 << 8) | b1)
                                         : ((b1 << 8) | b0);
        }

        // UCS-4
        int b2 = fInputStream.read(); // 3rd byte

        if (b2 < 0) {
            return -1;
        }

        int b3 = fInputStream.read(); // 4th byte

        if (b3 < 0) {
            return -1;
        }

        int codepoint = toCodePoint(b0, b1, b2, b3);

        if (isSupplementaryCodePoint(codepoint)) {
            // The low surrogate will be returned by the next read call.
            ensureCharCapacity(fCharCount + 1);
            fCharBuf[fCharCount++] = lowSurrogate(codepoint);

            return highSurrogate(codepoint);
        }

        return toBmpChar(codepoint);
    } // read():int

    /**
     * Read characters into a portion of an array. This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     *
     * <p>For UCS-2 data the work is delegated to <code>readUCS2</code>.
     * For UCS-4 data the method:</p>
     * <ol>
     * <li>delivers chars pending in the internal char buffer, if any;</li>
     * <li>if there is still room, reads as many bytes as would be needed
     *     if every code point were in the BMP (4 bytes per char);</li>
     * <li>converts the code points to UTF-16 and stores them in
     *     <code>ch</code>; a surrogate pair that does not fit entirely is
     *     split, its low surrogate being delivered by the next read;</li>
     * <li>parks code units that did not fit in the internal char buffer
     *     (in reverse order) for subsequent <code>read</code> calls.</li>
     * </ol>
     *
     * @param ch     Destination buffer
     * @param offset Offset at which to start storing characters
     * @param length Maximum number of characters to read
     *
     * @return The number of characters read, or <code>-1</code> if the
     *         end of the stream has been reached. Note that this is the
     *         number of <code>UTF-16</code> code units, not the number of
     *         <code>UCS-4</code> characters; the two are equal only if no
     *         supplementary code points were read.
     *
     * @exception IOException If an I/O error occurs or the reader is closed
     * @exception IndexOutOfBoundsException if <code>offset</code> and
     *            <code>length</code> do not denote a region of
     *            <code>ch</code>
     */
    public int read(char[] ch, int offset, int length)
        throws IOException {
        ensureOpen();

        if ((0 > offset) || (offset > ch.length) || (0 > length)
                || ((offset + length) > ch.length)
                || (0 > (offset + length))) {
            throw new IndexOutOfBoundsException();
        } else if (0 == length) {
            return 0;
        }

        if (fEncoding < 4) {
            return readUCS2(ch, offset, length);
        }

        // First use chars from the internal char buffer (if any).
        int charsRead = 0;

        while ((charsRead < length) && (fCharCount > 0)) {
            ch[offset + charsRead++] = fCharBuf[--fCharCount];
        }

        if (charsRead == length) {
            return charsRead;
        }

        // From here on the char buffer is empty (fCharCount == 0).

        /*
         * Each output char (two for supplementary characters) requires
         * 4 input bytes. As the number of supplementary chars cannot be
         * predicted, read the maximum that could possibly fit. Clamping
         * the code point count before shifting avoids int overflow.
         */
        int maxCodePoints = Math.min(length - charsRead, fBuffer.length >> 2);
        int count = fInputStream.read(fBuffer, 0, maxCodePoints << 2);

        if (-1 == count) {
            return (0 == charsRead) ? (-1) : charsRead;
        }

        // Make sure the last code point in the byte buffer is complete.
        count = completeLastUnit(count, 4);

        int numChars = count >> 2;
        int charsCount = charsRead; // chars stored in `ch` so far
        int pendingLow = -1;        // low surrogate that did not fit
        int i = 0;                  // index of next undecoded code point

        while ((i < numChars) && (charsCount < length)) {
            int codepoint = codePointAt(i << 2);
            i++;

            if (!isSupplementaryCodePoint(codepoint)) {
                ch[offset + charsCount++] = toBmpChar(codepoint);
            } else {
                ch[offset + charsCount++] = highSurrogate(codepoint);

                if (charsCount < length) {
                    ch[offset + charsCount++] = lowSurrogate(codepoint);
                } else {
                    // Only one free slot was left: split the pair.
                    pendingLow = lowSurrogate(codepoint);
                }
            }
        }

        // Park whatever remains in `fBuffer` in the internal char buffer.
        int leftover = numChars - i;

        if ((leftover > 0) || (pendingLow >= 0)) {
            ensureCharCapacity(fCharCount + (leftover << 1) + 1);

            // The buffer is a stack, so walk the code points backwards.
            for (int k = numChars - 1; k >= i; k--) {
                int codepoint = codePointAt(k << 2);

                if (!isSupplementaryCodePoint(codepoint)) {
                    fCharBuf[fCharCount++] = toBmpChar(codepoint);
                } else {
                    // Low surrogate first, so that it pops after the high.
                    fCharBuf[fCharCount++] = lowSurrogate(codepoint);
                    fCharBuf[fCharCount++] = highSurrogate(codepoint);
                }
            }

            if (pendingLow >= 0) {
                // Must be delivered before anything else: top of stack.
                fCharBuf[fCharCount++] = (char) pendingLow;
            }
        }

        return charsCount;
    } // read(char[],int,int)

    /**
     * Read <code>UCS-2</code> characters into a portion of an array.
     * This method will block until some input is available, an I/O
     * error occurs, or the end of the stream is reached.
     * <p>
     * In the original <code>UCSReader</code> this code was part of the
     * <code>read(char[], int, int)</code> method; it was moved out to
     * reduce the complexity of the latter. The arguments are not
     * validated here.
     * </p>
     *
     * @param ch     destination buffer
     * @param offset offset at which to start storing characters
     * @param length maximum number of characters to read
     *
     * @return The number of characters read, or <code>-1</code>
     *         if the end of the stream has been reached
     *
     * @exception IOException If an I/O error occurs or the reader is closed
     */
    protected int readUCS2(char[] ch, int offset, int length)
        throws IOException {
        ensureOpen();

        // Clamp the char count before shifting to avoid int overflow.
        int byteLength = Math.min(length, fBuffer.length >> 1) << 1;
        int count = fInputStream.read(fBuffer, 0, byteLength);

        if (count == -1) {
            return -1;
        }

        // Make sure the last code unit in the byte buffer is complete.
        count = completeLastUnit(count, 2);

        int numChars = count >> 1;
        int curPos = 0;

        for (int i = 0; i < numChars; i++) {
            int b0 = fBuffer[curPos++] & 0xff;
            int b1 = fBuffer[curPos++] & 0xff;

            if (fEncoding == UCS2BE) {
                ch[offset + i] = (char) ((b0 << 8) | b1);
            } else {
                ch[offset + i] = (char) ((b1 << 8) | b0);
            }
        }

        return numChars;
    } // END readUCS2(char[], int, int)

    /**
     * Skip characters. This method will block until some characters are
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p>Chars pending in the internal char buffer are skipped first. The
     * rest is skipped at the byte level, where one input code unit
     * (2 bytes for UCS-2, 4 bytes for UCS-4) counts as one character; a
     * UCS-4 supplementary character skipped this way therefore counts as
     * one character although it would be read as two chars.</p>
     *
     * @param n The number of characters to skip
     *
     * @return The number of characters actually skipped; <code>0</code> if
     *         <code>n</code> is not positive
     *
     * @exception IOException If an I/O error occurs or the reader is closed
     */
    public long skip(long n) throws IOException {
        ensureOpen();

        if (n <= 0L) {
            return 0L;
        }

        long skipped = 0L;

        // Pending chars logically precede the stream position.
        if (fCharCount > 0) {
            int fromBuffer = (n < fCharCount) ? (int) n : fCharCount;
            fCharCount -= fromBuffer;
            skipped = fromBuffer;

            if (skipped == n) {
                return skipped;
            }
        }

        /*
         * charWidth is the number of bits to shift a char count leftward
         * to get a byte count, and a byte count rightward to get a char
         * count.
         */
        int charWidth = (fEncoding >= 4) ? 2 : 1;
        long wanted = n - skipped;
        long maxUnits = Long.MAX_VALUE >> charWidth;

        if (wanted > maxUnits) {
            wanted = maxUnits; // keep the shift below from overflowing
        }

        long bytesSkipped = fInputStream.skip(wanted << charWidth);

        if (bytesSkipped <= 0L) {
            return skipped;
        }

        int unitMask = (1 << charWidth) - 1;
        int partial = (int) (bytesSkipped & unitMask);
        long units = bytesSkipped >>> charWidth;

        if (partial != 0) {
            // The stream stopped in the middle of a code unit: consume the
            // rest of it so that subsequent reads stay aligned.
            for (int i = partial; i <= unitMask; i++) {
                if (fInputStream.read() == -1) {
                    break;
                }
            }

            units++;
        }

        return skipped + units;
    } // skip(long):long

    /**
     * Tell whether this stream is ready to be read.
     *
     * @return Always <code>false</code>: this reader never guarantees that
     *         the next read will not block. Note that returning false does
     *         not guarantee that the next read will block.
     *
     * @exception IOException If the reader is closed
     */
    public boolean ready() throws IOException {
        ensureOpen();

        return false;
    } // ready()

    /**
     * Tell whether this stream supports the mark() operation.
     *
     * @return <code>true</code> if the reader is open and the underlying
     *         byte stream supports <code>mark</code>
     */
    public boolean markSupported() {
        return (fInputStream != null) && fInputStream.markSupported();
    } // markSupported()

    /**
     * Mark the present position in the stream. Subsequent calls to
     * <code>reset</code> will attempt to reposition the stream to this point.
     * This reader relies on the marking facilities of the underlying
     * byte stream; in addition it remembers the chars pending in its
     * internal char buffer so that <code>reset</code> can restore them.
     *
     * @param readAheadLimit Limit on the number of characters that may be
     *                       read while still preserving the mark. It is
     *                       converted to a byte limit (2 or 4 bytes per
     *                       character) for the underlying stream.
     *
     * @exception IOException If the reader is closed or some other
     *                        I/O error occurs
     */
    public void mark(int readAheadLimit) throws IOException {
        ensureOpen();

        int byteLimit = readAheadLimit;

        if (readAheadLimit > 0) {
            int charWidth = (fEncoding >= 4) ? 2 : 1;
            long scaled = ((long) readAheadLimit) << charWidth;
            byteLimit = (scaled > Integer.MAX_VALUE) ? Integer.MAX_VALUE
                                                     : (int) scaled;
        }

        fInputStream.mark(byteLimit);

        char[] snapshot = new char[fCharCount];
        System.arraycopy(fCharBuf, 0, snapshot, 0, fCharCount);
        fMarkedChars = snapshot;
    } // mark(int)

    /**
     * Reset the stream. If the stream has been marked, then attempt to
     * reposition it at the mark. If the stream has not been marked, the
     * outcome is whatever the underlying byte stream does on
     * <code>reset</code>. The internal char buffer is restored to its state
     * at the time of the mark (or emptied if there was no mark).
     *
     * @exception IOException If the reader is closed,
     *                        or if the underlying stream cannot be reset,
     *                        or if some other I/O error occurs
     */
    public void reset() throws IOException {
        ensureOpen();
        fInputStream.reset();

        if (fMarkedChars == null) {
            fCharCount = 0;
        } else {
            fCharCount = 0; // nothing worth preserving while growing
            ensureCharCapacity(fMarkedChars.length);
            System.arraycopy(fMarkedChars, 0, fCharBuf, 0,
                fMarkedChars.length);
            fCharCount = fMarkedChars.length;
        }
    } // reset()

    /**
     * Close the stream. Once a stream has been closed, further
     * <code>read</code>, <code>ready</code>, <code>mark</code>,
     * <code>skip</code> or <code>reset</code> invocations will throw an
     * IOException. Closing a previously-closed stream has no effect.
     *
     * @exception IOException If an I/O error occurs
     */
    public void close() throws IOException {
        if (fInputStream == null) {
            return; // already closed
        }

        try {
            fInputStream.close();
        } finally {
            fInputStream = null;
            fCharBuf = null;
            fBuffer = null;
            fMarkedChars = null;
            fCharCount = 0;
        }
    } // close()

    /**
     * Returns the encoding currently in use by this character stream.
     *
     * @return Encoding of this stream. Either ISO-10646-UCS-2 or
     *         ISO-10646-UCS-4. This string does not indicate the byte
     *         order of the encoding; use <code>getByteOrder</code> to
     *         find it out.
     */
    public String getEncoding() {
        return (fEncoding < 4) ? "ISO-10646-UCS-2" : "ISO-10646-UCS-4";
    }

    /**
     * Returns the byte order ("endianness") of the encoding currently in
     * use by this character stream.
     *
     * @return <code>LITTLE_ENDIAN</code> if the encoding is UCS2LE or
     *         UCS4LE, <code>BIG_ENDIAN</code> otherwise.
     */
    public String getByteOrder() {
        if ((UCS2LE == fEncoding) || (UCS4LE == fEncoding)) {
            return "LITTLE_ENDIAN";
        } else {
            return "BIG_ENDIAN";
        }
    }

    /**
     * Determines whether the specified character (Unicode code point)
     * is in the supplementary character range. The method call is
     * equivalent to the expression:
     * <blockquote><pre>
     * codePoint &gt;= 0x10000 &amp;&amp; codePoint &lt;= 0x10ffff
     * </pre></blockquote>
     *
     * Modelled on JDK 1.5 <code>java.lang.Character</code> in
     * order to provide JDK 1.4 compatibility.
     *
     * @param codePoint the character (Unicode code point) to be tested
     * @return <code>true</code> if the specified character is in the Unicode
     *         supplementary character range; <code>false</code> otherwise.
     */
    protected boolean isSupplementaryCodePoint(int codePoint) {
        return (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
            && (codePoint <= MAX_CODE_POINT);
    }

    //
    // Private helpers
    //

    /** Throws an IOException if this reader has been closed. */
    private void ensureOpen() throws IOException {
        if (fInputStream == null) {
            throw new IOException("Stream closed");
        }
    }

    /** Combines four unsigned byte values according to the byte order. */
    private int toCodePoint(int b0, int b1, int b2, int b3) {
        if (UCS4BE == fEncoding) {
            return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
        }

        return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
    }

    /** Decodes the UCS-4 value stored at fBuffer[pos .. pos + 3]. */
    private int codePointAt(int pos) {
        return toCodePoint(fBuffer[pos] & 0xff, fBuffer[pos + 1] & 0xff,
            fBuffer[pos + 2] & 0xff, fBuffer[pos + 3] & 0xff);
    }

    /** High (leading) surrogate of a supplementary code point, RFC 2781. */
    private static char highSurrogate(int codepoint) {
        return (char) (0xD800 + (((codepoint - 0x10000) & 0xFFFFF) >>> 10));
    }

    /** Low (trailing) surrogate of a supplementary code point, RFC 2781. */
    private static char lowSurrogate(int codepoint) {
        return (char) (0xDC00 + ((codepoint - 0x10000) & 0x3FF));
    }

    /** Maps a non-supplementary value to a char; invalid ones to U+FFFD. */
    private static char toBmpChar(int codepoint) {
        if ((codepoint >= MIN_CODE_POINT)
                && (codepoint < MIN_SUPPLEMENTARY_CODE_POINT)) {
            return (char) codepoint;
        }

        return REPLACEMENT_CHAR;
    }

    /** Grows fCharBuf (keeping its content) to hold minCapacity chars. */
    private void ensureCharCapacity(int minCapacity) {
        if (fCharBuf.length >= minCapacity) {
            return;
        }

        char[] grown = new char[Math.max(fCharBuf.length << 1, minCapacity)];
        System.arraycopy(fCharBuf, 0, grown, 0, fCharCount);
        fCharBuf = grown;
    }

    /**
     * Completes a partially read trailing code unit in fBuffer by reading
     * the missing bytes one at a time; pads with zeros on end of input.
     * Returns the new byte count, a multiple of unitSize (2 or 4).
     */
    private int completeLastUnit(int count, int unitSize)
        throws IOException {
        int missing = (unitSize - (count & (unitSize - 1))) & (unitSize - 1);
        boolean eof = false;

        for (int i = 0; i < missing; i++) {
            int b = eof ? -1 : fInputStream.read();

            if (b == -1) {
                // end of input; something likely went wrong! Pad with zeros.
                eof = true;
                b = 0;
            }

            fBuffer[count + i] = (byte) b;
        }

        return count + missing;
    }
} // class UCSReader
|