001: /*
002: *
003: *
004: * Copyright 1990-2007 Sun Microsystems, Inc. All Rights Reserved.
005: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License version
009: * 2 only, as published by the Free Software Foundation.
010: *
011: * This program is distributed in the hope that it will be useful, but
012: * WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014: * General Public License version 2 for more details (a copy is
015: * included at /legal/license.txt).
016: *
017: * You should have received a copy of the GNU General Public License
018: * version 2 along with this work; if not, write to the Free Software
019: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020: * 02110-1301 USA
021: *
022: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
023: * Clara, CA 95054 or visit www.sun.com if you need additional
024: * information or have any questions.
025: */
026:
027: package com.sun.ukit.jaxp;
028:
029: import java.io.Reader;
030: import java.io.InputStream;
031: import java.io.IOException;
032: import java.io.UnsupportedEncodingException;
033:
/**
 * UTF-8 transformed UCS-2 character stream reader.
 *
 * This reader converts UTF-8 transformed UCS-2 characters to Java characters.
 * The UCS-2 subset of UTF-8 transformation is described in RFC-2279 #2
 * "UTF-8 definition":
 *   0000 0000-0000 007F   0xxxxxxx
 *   0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *   0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *
 * Decoding is lenient: the two high bits of continuation bytes are not
 * checked, and a byte in the range 0x80-0xBF found where a lead byte is
 * expected is passed through unchanged as a character. A stream which ends
 * in the middle of a multi-byte sequence is reported with an
 * <code>IOException</code>.
 */
public class ReaderUTF8 extends Reader {
    private InputStream is;

    /**
     * Constructor.
     *
     * @param is A byte input stream.
     */
    public ReaderUTF8(InputStream is) {
        this.is = is;
    }

    /**
     * Reads characters into a portion of an array.
     *
     * Bytes are pulled from the underlying stream one at a time, and the
     * method keeps reading until <code>len</code> characters are stored or
     * the end of the stream is reached.
     *
     * @param cbuf Destination buffer.
     * @param off Offset at which to start storing characters.
     * @param len Maximum number of characters to read.
     * @return The number of characters stored, or -1 if the end of the
     *   stream was reached before any character could be read. Zero is
     *   returned when <code>len</code> is not positive.
     * @exception IOException If any IO errors occur or the stream ends in
     *   the middle of a multi-byte sequence.
     * @exception UnsupportedEncodingException If UCS-4 character occur in the stream.
     */
    public int read(char[] cbuf, int off, int len) throws IOException {
        int num = 0;
        while (num < len) {
            int lead = is.read();
            if (lead < 0) {
                return (num != 0) ? num : -1;
            }
            cbuf[off + num] = (char) decodeChar(lead);
            num++;
        }
        return num;
    }

    /**
     * Reads a single character.
     *
     * @return The character read, as an integer in the range 0 to 65535
     *   (0x00-0xffff), or -1 if the end of the stream has been reached.
     * @exception IOException If any IO errors occur or the stream ends in
     *   the middle of a multi-byte sequence.
     * @exception UnsupportedEncodingException If UCS-4 character occur in the stream.
     */
    public int read() throws IOException {
        int lead = is.read();
        if (lead < 0) {
            return -1;
        }
        return decodeChar(lead);
    }

    /**
     * Closes the stream.
     *
     * @exception IOException If any IO errors occur.
     */
    public void close() throws IOException {
        is.close();
    }

    /**
     * Completes the decoding of one character whose first byte has already
     * been read, pulling continuation bytes from the stream as required.
     *
     * Kept private so that both public read methods share exactly the same
     * decoding rules regardless of subclass overrides.
     *
     * @param lead First byte of the sequence (0x00-0xff).
     * @return The decoded character value (0x0000-0xffff).
     * @exception IOException If any IO errors occur or the sequence is
     *   cut short by the end of the stream.
     * @exception UnsupportedEncodingException If the lead byte is 0xf0 or above.
     */
    private int decodeChar(int lead) throws IOException {
        switch (lead & 0xf0) {
        case 0xc0:
        case 0xd0:
            // 110xxxxx 10xxxxxx
            return ((lead & 0x1f) << 6) | readContinuation();

        case 0xe0: {
            // 1110xxxx 10xxxxxx 10xxxxxx
            int middle = readContinuation();
            int last = readContinuation();
            return ((lead & 0x0f) << 12) | (middle << 6) | last;
        }

        case 0xf0: // UCS-4 character
            throw new UnsupportedEncodingException();

        default:
            // Single byte; 0x80-0xBF lead bytes are passed through as is.
            return lead;
        }
    }

    /**
     * Reads one continuation byte and returns its six payload bits.
     *
     * Without the end-of-stream check the -1 returned by the stream would
     * be masked to 0x3f and silently merged into the decoded character.
     *
     * @return The low six bits of the next byte (0x00-0x3f).
     * @exception IOException If any IO errors occur or the stream has ended.
     */
    private int readContinuation() throws IOException {
        int next = is.read();
        if (next < 0) {
            throw new IOException("Truncated UTF-8 sequence at end of stream");
        }
        return next & 0x3f;
    }
}
|