001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.xerces.impl.io;
019:
020: import java.io.InputStream;
021: import java.io.IOException;
022: import java.io.Reader;
023:
/**
 * Reader for UCS-2 and UCS-4 encodings
 * (i.e., encodings from ISO-10646-UCS-(2|4)).
 * <p>
 * Bytes are pulled from the wrapped {@link InputStream} and combined, two
 * or four at a time and in the byte order selected at construction, into
 * character values.
 *
 * @xerces.internal
 *
 * @author Neil Graham, IBM
 *
 * @version $Id: UCSReader.java 449317 2006-09-23 22:12:30Z mrglavas $
 */
public class UCSReader extends Reader {

    //
    // Constants
    //

    /**
     * Default byte buffer size (8192, larger than that of ASCIIReader
     * since it's reasonable to surmise that the average UCS-4-encoded
     * file should be 4 times as large as the average ASCII-encoded file).
     */
    public static final int DEFAULT_BUFFER_SIZE = 8192;

    /** UCS-2, little-endian byte order. */
    public static final short UCS2LE = 1;

    /** UCS-2, big-endian byte order. */
    public static final short UCS2BE = 2;

    /** UCS-4, little-endian byte order. */
    public static final short UCS4LE = 4;

    /** UCS-4, big-endian byte order. */
    public static final short UCS4BE = 8;

    //
    // Data
    //

    /** Input stream. */
    protected final InputStream fInputStream;

    /** Byte buffer. */
    protected final byte[] fBuffer;

    /** What kind of data we're dealing with (one of the UCS* constants). */
    protected final short fEncoding;

    //
    // Constructors
    //

    /**
     * Constructs a UCS reader from the specified input stream
     * using the default buffer size. The Endian-ness and whether this is
     * UCS-2 or UCS-4 needs also to be known in advance.
     *
     * @param inputStream The input stream.
     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, short encoding) {
        this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
    } // <init>(InputStream, short)

    /**
     * Constructs a UCS reader from the specified input stream
     * and buffer size. The Endian-ness and whether this is
     * UCS-2 or UCS-4 needs also to be known in advance.
     *
     * @param inputStream The input stream.
     * @param size The initial buffer size.
     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, int size, short encoding) {
        this(inputStream, new byte[size], encoding);
    } // <init>(InputStream,int,short)

    /**
     * Constructs a UCS reader from the specified input stream
     * and buffer. The Endian-ness and whether this is
     * UCS-2 or UCS-4 needs also to be known in advance.
     *
     * @param inputStream The input stream.
     * @param buffer The byte buffer.
     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, byte[] buffer,
            short encoding) {
        fInputStream = inputStream;
        fBuffer = buffer;
        fEncoding = encoding;
    } // <init>(InputStream,byte[],short)

    //
    // Reader methods
    //

    /**
     * Read a single character. This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     * <p>
     * For the UCS-2 encodings the result is in the range 0 to 65535. For
     * the UCS-4 encodings the result is the int assembled from all four
     * bytes and is not narrowed to 16 bits.
     *
     * @return The character read, or -1 if the end of the stream was
     *         reached before a complete character could be read
     *
     * @exception IOException If an I/O error occurs
     */
    public int read() throws IOException {
        // End of stream must be detected on the raw return value: a
        // data byte of 0xFF is legitimate and must not be mistaken for -1.
        final int b0 = fInputStream.read();
        if (b0 == -1) {
            return -1;
        }
        final int b1 = fInputStream.read();
        if (b1 == -1) {
            return -1;
        }
        // UCS-4
        if (fEncoding >= 4) {
            final int b2 = fInputStream.read();
            if (b2 == -1) {
                return -1;
            }
            final int b3 = fInputStream.read();
            if (b3 == -1) {
                return -1;
            }
            if (fEncoding == UCS4BE) {
                return ((b0 & 0xff) << 24) + ((b1 & 0xff) << 16)
                        + ((b2 & 0xff) << 8) + (b3 & 0xff);
            }
            return ((b3 & 0xff) << 24) + ((b2 & 0xff) << 16)
                    + ((b1 & 0xff) << 8) + (b0 & 0xff);
        }
        // UCS-2
        if (fEncoding == UCS2BE) {
            return ((b0 & 0xff) << 8) + (b1 & 0xff);
        }
        return ((b1 & 0xff) << 8) + (b0 & 0xff);
    } // read():int

    /**
     * Read characters into a portion of an array. This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     * <p>
     * At most as many characters as fit in the internal byte buffer are
     * read per call. If the stream ends in the middle of a character, the
     * missing bytes are treated as zero. UCS-4 values are stored through a
     * <code>(char)</code> cast, so only their low 16 bits are kept.
     *
     * @param ch Destination buffer
     * @param offset Offset at which to start storing characters
     * @param length Maximum number of characters to read
     *
     * @return The number of characters read, or -1 if the end of the
     *         stream has been reached
     *
     * @exception IOException If an I/O error occurs
     */
    public int read(char ch[], int offset, int length)
            throws IOException {
        final int shift = charShift();
        final int width = 1 << shift;

        // Only ever request whole characters' worth of bytes. This keeps
        // the padding below inside fBuffer even when fBuffer.length is not
        // a multiple of the character width, and avoids overflowing
        // "length << shift" for very large lengths.
        final int bufferChars = fBuffer.length >> shift;
        if (bufferChars == 0 && length > 0) {
            // The buffer cannot hold even one character; decode a single
            // character straight from the stream instead.
            final int c = read();
            if (c == -1) {
                return -1;
            }
            ch[offset] = (char) c;
            return 1;
        }
        final int byteLength = Math.min(length, bufferChars) << shift;

        int count = fInputStream.read(fBuffer, 0, byteLength);
        if (count == -1) {
            return -1;
        }

        // The stream may have delivered a partial character; fetch the
        // bytes needed to complete it, one at a time.
        final int missing = (width - (count & (width - 1))) & (width - 1);
        boolean eof = false;
        for (int i = 0; i < missing; i++) {
            final int b = eof ? -1 : fInputStream.read();
            if (b == -1) {
                // end of input; something likely went wrong!
                // Pad buffer with nulls.
                eof = true;
                fBuffer[count + i] = 0;
            } else {
                fBuffer[count + i] = (byte) b;
            }
        }
        count += missing;

        // now count is a multiple of the right number of bytes
        final int numChars = count >> shift;
        int curPos = 0;
        for (int i = 0; i < numChars; i++) {
            final int b0 = fBuffer[curPos++] & 0xff;
            final int b1 = fBuffer[curPos++] & 0xff;
            // UCS-4
            if (fEncoding >= 4) {
                final int b2 = fBuffer[curPos++] & 0xff;
                final int b3 = fBuffer[curPos++] & 0xff;
                if (fEncoding == UCS4BE) {
                    ch[offset + i] = (char) ((b0 << 24) + (b1 << 16)
                            + (b2 << 8) + b3);
                } else {
                    ch[offset + i] = (char) ((b3 << 24) + (b2 << 16)
                            + (b1 << 8) + b0);
                }
            } else { // UCS-2
                if (fEncoding == UCS2BE) {
                    ch[offset + i] = (char) ((b0 << 8) + b1);
                } else {
                    ch[offset + i] = (char) ((b1 << 8) + b0);
                }
            }
        }
        return numChars;
    } // read(char[],int,int)

    /**
     * Skip characters. This method will block until some characters are
     * available, an I/O error occurs, or the end of the stream is reached.
     * <p>
     * If the underlying stream stops in the middle of a character, the
     * rest of that character is consumed as well (and it is counted as
     * skipped) so that subsequent reads stay aligned on character
     * boundaries.
     *
     * @param n The number of characters to skip
     *
     * @return The number of characters actually skipped
     *
     * @exception IOException If an I/O error occurs
     */
    public long skip(long n) throws IOException {
        // shift is the number of bits to move n leftward to get the number
        // of bytes to skip, and to move the result rightward to get the
        // number of characters effectively skipped.
        final int shift = charShift();
        final int width = 1 << shift;

        // Guard the conversion to bytes against long overflow.
        final long byteCount = (n > (Long.MAX_VALUE >> shift))
                ? (Long.MAX_VALUE & ~((long) (width - 1)))
                : (n << shift);

        final long bytesSkipped = fInputStream.skip(byteCount);
        final int partial = (int) (bytesSkipped & (width - 1));
        if (partial == 0) {
            return bytesSkipped >> shift;
        }
        if (bytesSkipped > 0) {
            // Consume the remainder of the partially skipped character.
            for (int i = partial; i < width; i++) {
                if (fInputStream.read() == -1) {
                    break;
                }
            }
        }
        return (bytesSkipped >> shift) + 1;
    } // skip(long):long

    /**
     * Tell whether this stream is ready to be read.
     *
     * @return True if the next read() is guaranteed not to block for input,
     * false otherwise. Note that returning false does not guarantee that the
     * next read will block. This implementation always returns false.
     *
     * @exception IOException If an I/O error occurs
     */
    public boolean ready() throws IOException {
        return false;
    } // ready()

    /**
     * Tell whether this stream supports the mark() operation.
     *
     * @return Whatever the underlying input stream reports.
     */
    public boolean markSupported() {
        return fInputStream.markSupported();
    } // markSupported()

    /**
     * Mark the present position in the stream. Subsequent calls to reset()
     * will attempt to reposition the stream to this point. Not all
     * character-input streams support the mark() operation.
     *
     * @param readAheadLimit Limit on the number of characters that may be
     *                       read while still preserving the mark. After
     *                       reading this many characters, attempting to
     *                       reset the stream may fail.
     *
     * @exception IOException If the stream does not support mark(),
     *                        or if some other I/O error occurs
     */
    public void mark(int readAheadLimit) throws IOException {
        // The limit is given in characters but the underlying stream
        // counts bytes, so scale it by the character width (saturating
        // at Integer.MAX_VALUE).
        final int shift = charShift();
        int byteLimit = readAheadLimit;
        if (readAheadLimit > 0) {
            byteLimit = (readAheadLimit > (Integer.MAX_VALUE >> shift))
                    ? Integer.MAX_VALUE
                    : (readAheadLimit << shift);
        }
        fInputStream.mark(byteLimit);
    } // mark(int)

    /**
     * Reset the stream. If the stream has been marked, then attempt to
     * reposition it at the mark. If the stream has not been marked, then
     * attempt to reset it in some way appropriate to the particular stream,
     * for example by repositioning it to its starting point. Not all
     * character-input streams support the reset() operation, and some support
     * reset() without supporting mark().
     *
     * @exception IOException If the stream has not been marked,
     *                        or if the mark has been invalidated,
     *                        or if the stream does not support reset(),
     *                        or if some other I/O error occurs
     */
    public void reset() throws IOException {
        fInputStream.reset();
    } // reset()

    /**
     * Close the stream by closing the underlying input stream.
     *
     * @exception IOException If an I/O error occurs
     */
    public void close() throws IOException {
        fInputStream.close();
    } // close()

    //
    // Private methods
    //

    /**
     * Returns log2 of the number of bytes per character for the
     * configured encoding: 2 for UCS-4 (4 bytes), 1 for UCS-2 (2 bytes).
     */
    private int charShift() {
        return (fEncoding >= 4) ? 2 : 1;
    } // charShift():int

} // class UCSReader
|