001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 2000-2002 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package com.sun.xml.stream.xerces.impl.io;
059:
060: import java.io.*;
061: import com.sun.xml.stream.util.BufferAllocator;
062: import com.sun.xml.stream.util.ThreadLocalBufferAllocator;
063:
064: /**
065: * Reader for UCS-2 and UCS-4 encodings.
066: * (i.e., encodings from ISO-10646-UCS-(2|4)).
067: *
068: * @author Neil Graham, IBM
069: *
070: * @version $Id: UCSReader.java,v 1.4 2006/11/30 16:17:47 spericas Exp $
071: */
072: public class UCSReader extends Reader {
073:
074: //
075: // Constants
076: //
077:
078: /** Default byte buffer size (8192, larger than that of ASCIIReader
079: * since it's reasonable to surmise that the average UCS-4-encoded
080: * file should be 4 times as large as the average ASCII-encoded file).
081: */
082: public static final int DEFAULT_BUFFER_SIZE = 8192;
083:
084: public static final short UCS2LE = 1;
085: public static final short UCS2BE = 2;
086: public static final short UCS4LE = 4;
087: public static final short UCS4BE = 8;
088:
089: //
090: // Data
091: //
092:
093: /** Input stream. */
094: protected InputStream fInputStream;
095:
096: /** Byte buffer. */
097: protected byte[] fBuffer;
098:
099: // what kind of data we're dealing with
100: protected short fEncoding;
101:
102: //
103: // Constructors
104: //
105:
106: /**
107: * Constructs an ASCII reader from the specified input stream
108: * using the default buffer size. The Endian-ness and whether this is
109: * UCS-2 or UCS-4 needs also to be known in advance.
110: *
111: * @param inputStream The input stream.
112: * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
113: */
114: public UCSReader(InputStream inputStream, short encoding) {
115: this (inputStream, DEFAULT_BUFFER_SIZE, encoding);
116: } // <init>(InputStream, short)
117:
118: /**
119: * Constructs an ASCII reader from the specified input stream
120: * and buffer size. The Endian-ness and whether this is
121: * UCS-2 or UCS-4 needs also to be known in advance.
122: *
123: * @param inputStream The input stream.
124: * @param size The initial buffer size.
125: * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
126: */
127: public UCSReader(InputStream inputStream, int size, short encoding) {
128: fInputStream = inputStream;
129: BufferAllocator ba = ThreadLocalBufferAllocator
130: .getBufferAllocator();
131: fBuffer = ba.getByteBuffer(size);
132: if (fBuffer == null) {
133: fBuffer = new byte[size];
134: }
135: fEncoding = encoding;
136: } // <init>(InputStream,int,short)
137:
138: //
139: // Reader methods
140: //
141:
142: /**
143: * Read a single character. This method will block until a character is
144: * available, an I/O error occurs, or the end of the stream is reached.
145: *
146: * <p> Subclasses that intend to support efficient single-character input
147: * should override this method.
148: *
149: * @return The character read, as an integer in the range 0 to 127
150: * (<tt>0x00-0x7f</tt>), or -1 if the end of the stream has
151: * been reached
152: *
153: * @exception IOException If an I/O error occurs
154: */
155: public int read() throws IOException {
156: int b0 = fInputStream.read() & 0xff;
157: if (b0 == 0xff)
158: return -1;
159: int b1 = fInputStream.read() & 0xff;
160: if (b1 == 0xff)
161: return -1;
162: if (fEncoding >= 4) {
163: int b2 = fInputStream.read() & 0xff;
164: if (b2 == 0xff)
165: return -1;
166: int b3 = fInputStream.read() & 0xff;
167: if (b3 == 0xff)
168: return -1;
169: System.err.println("b0 is " + (b0 & 0xff) + " b1 "
170: + (b1 & 0xff) + " b2 " + (b2 & 0xff) + " b3 "
171: + (b3 & 0xff));
172: if (fEncoding == UCS4BE)
173: return (b0 << 24) + (b1 << 16) + (b2 << 8) + b3;
174: else
175: return (b3 << 24) + (b2 << 16) + (b1 << 8) + b0;
176: } else { // UCS-2
177: if (fEncoding == UCS2BE)
178: return (b0 << 8) + b1;
179: else
180: return (b1 << 8) + b0;
181: }
182: } // read():int
183:
184: /**
185: * Read characters into a portion of an array. This method will block
186: * until some input is available, an I/O error occurs, or the end of the
187: * stream is reached.
188: *
189: * @param ch Destination buffer
190: * @param offset Offset at which to start storing characters
191: * @param length Maximum number of characters to read
192: *
193: * @return The number of characters read, or -1 if the end of the
194: * stream has been reached
195: *
196: * @exception IOException If an I/O error occurs
197: */
198: public int read(char ch[], int offset, int length)
199: throws IOException {
200: int byteLength = length << ((fEncoding >= 4) ? 2 : 1);
201: if (byteLength > fBuffer.length) {
202: byteLength = fBuffer.length;
203: }
204: int count = fInputStream.read(fBuffer, 0, byteLength);
205: if (count == -1)
206: return -1;
207: // try and make count be a multiple of the number of bytes we're looking for
208: if (fEncoding >= 4) { // BigEndian
209: // this looks ugly, but it avoids an if at any rate...
210: int numToRead = (4 - (count & 3) & 3);
211: for (int i = 0; i < numToRead; i++) {
212: int charRead = fInputStream.read();
213: if (charRead == -1) { // end of input; something likely went wrong!A Pad buffer with nulls.
214: for (int j = i; j < numToRead; j++)
215: fBuffer[count + j] = 0;
216: break;
217: } else {
218: fBuffer[count + i] = (byte) charRead;
219: }
220: }
221: count += numToRead;
222: } else {
223: int numToRead = count & 1;
224: if (numToRead != 0) {
225: count++;
226: int charRead = fInputStream.read();
227: if (charRead == -1) { // end of input; something likely went wrong!A Pad buffer with nulls.
228: fBuffer[count] = 0;
229: } else {
230: fBuffer[count] = (byte) charRead;
231: }
232: }
233: }
234:
235: // now count is a multiple of the right number of bytes
236: int numChars = count >> ((fEncoding >= 4) ? 2 : 1);
237: int curPos = 0;
238: for (int i = 0; i < numChars; i++) {
239: int b0 = fBuffer[curPos++] & 0xff;
240: int b1 = fBuffer[curPos++] & 0xff;
241: if (fEncoding >= 4) {
242: int b2 = fBuffer[curPos++] & 0xff;
243: int b3 = fBuffer[curPos++] & 0xff;
244: if (fEncoding == UCS4BE)
245: ch[offset + i] = (char) ((b0 << 24) + (b1 << 16)
246: + (b2 << 8) + b3);
247: else
248: ch[offset + i] = (char) ((b3 << 24) + (b2 << 16)
249: + (b1 << 8) + b0);
250: } else { // UCS-2
251: if (fEncoding == UCS2BE)
252: ch[offset + i] = (char) ((b0 << 8) + b1);
253: else
254: ch[offset + i] = (char) ((b1 << 8) + b0);
255: }
256: }
257: return numChars;
258: } // read(char[],int,int)
259:
260: /**
261: * Skip characters. This method will block until some characters are
262: * available, an I/O error occurs, or the end of the stream is reached.
263: *
264: * @param n The number of characters to skip
265: *
266: * @return The number of characters actually skipped
267: *
268: * @exception IOException If an I/O error occurs
269: */
270: public long skip(long n) throws IOException {
271: // charWidth will represent the number of bits to move
272: // n leftward to get num of bytes to skip, and then move the result rightward
273: // to get num of chars effectively skipped.
274: // The trick with &'ing, as with elsewhere in this dcode, is
275: // intended to avoid an expensive use of / that might not be optimized
276: // away.
277: int charWidth = (fEncoding >= 4) ? 2 : 1;
278: long bytesSkipped = fInputStream.skip(n << charWidth);
279: if ((bytesSkipped & (charWidth | 1)) == 0)
280: return bytesSkipped >> charWidth;
281: return (bytesSkipped >> charWidth) + 1;
282: } // skip(long):long
283:
284: /**
285: * Tell whether this stream is ready to be read.
286: *
287: * @return True if the next read() is guaranteed not to block for input,
288: * false otherwise. Note that returning false does not guarantee that the
289: * next read will block.
290: *
291: * @exception IOException If an I/O error occurs
292: */
293: public boolean ready() throws IOException {
294: return false;
295: } // ready()
296:
297: /**
298: * Tell whether this stream supports the mark() operation.
299: */
300: public boolean markSupported() {
301: return fInputStream.markSupported();
302: } // markSupported()
303:
304: /**
305: * Mark the present position in the stream. Subsequent calls to reset()
306: * will attempt to reposition the stream to this point. Not all
307: * character-input streams support the mark() operation.
308: *
309: * @param readAheadLimit Limit on the number of characters that may be
310: * read while still preserving the mark. After
311: * reading this many characters, attempting to
312: * reset the stream may fail.
313: *
314: * @exception IOException If the stream does not support mark(),
315: * or if some other I/O error occurs
316: */
317: public void mark(int readAheadLimit) throws IOException {
318: fInputStream.mark(readAheadLimit);
319: } // mark(int)
320:
321: /**
322: * Reset the stream. If the stream has been marked, then attempt to
323: * reposition it at the mark. If the stream has not been marked, then
324: * attempt to reset it in some way appropriate to the particular stream,
325: * for example by repositioning it to its starting point. Not all
326: * character-input streams support the reset() operation, and some support
327: * reset() without supporting mark().
328: *
329: * @exception IOException If the stream has not been marked,
330: * or if the mark has been invalidated,
331: * or if the stream does not support reset(),
332: * or if some other I/O error occurs
333: */
334: public void reset() throws IOException {
335: fInputStream.reset();
336: } // reset()
337:
338: /**
339: * Close the stream. Once a stream has been closed, further read(),
340: * ready(), mark(), or reset() invocations will throw an IOException.
341: * Closing a previously-closed stream, however, has no effect.
342: *
343: * @exception IOException If an I/O error occurs
344: */
345: public void close() throws IOException {
346: BufferAllocator ba = ThreadLocalBufferAllocator
347: .getBufferAllocator();
348: ba.returnByteBuffer(fBuffer);
349: fBuffer = null;
350: fInputStream.close();
351: } // close()
352:
353: } // class UCSReader
|