001: /*
002: * Copyright (c) 1998-2008 Caucho Technology -- all rights reserved
003: *
004: * This file is part of Resin(R) Open Source
005: *
006: * Each copy or derived work must preserve the copyright notice and this
007: * notice unmodified.
008: *
009: * Resin Open Source is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU General Public License as published by
011: * the Free Software Foundation; either version 2 of the License, or
012: * (at your option) any later version.
013: *
014: * Resin Open Source is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
017: * of NON-INFRINGEMENT. See the GNU General Public License for more
018: * details.
019: *
020: * You should have received a copy of the GNU General Public License
021: * along with Resin Open Source; if not, write to the
022: * Free SoftwareFoundation, Inc.
023: * 59 Temple Place, Suite 330
024: * Boston, MA 02111-1307 USA
025: *
026: * @author Scott Ferguson
027: */
028:
029: package com.caucho.vfs.i18n;
030:
031: import java.io.CharConversionException;
032: import java.io.EOFException;
033: import java.io.IOException;
034: import java.io.InputStream;
035: import java.io.Reader;
036:
037: /**
038: * Implements an encoding reader for UTF8.
039: */
040: public class UTF8Reader extends EncodingReader {
041: private InputStream _is;
042: private int _peek = -1;
043:
044: /**
045: * Null-arg constructor for instantiation by com.caucho.vfs.Encoding only.
046: */
047: public UTF8Reader() {
048: }
049:
050: /**
051: * Create a UTF-8 reader based on the readStream.
052: */
053: private UTF8Reader(InputStream is) {
054: _is = is;
055: }
056:
057: /**
058: * Create a UTF-8 reader based on the readStream.
059: *
060: * @param is the input stream providing the bytes.
061: * @param javaEncoding the JDK name for the encoding.
062: *
063: * @return the UTF-8 reader.
064: */
065: public Reader create(InputStream is, String javaEncoding) {
066: return new UTF8Reader(is);
067: }
068:
069: /**
070: * Reads into a character buffer using the correct encoding.
071: */
072: public int read() throws IOException {
073: if (_peek >= 0) {
074: int peek = _peek;
075: _peek = -1;
076: return peek;
077: }
078:
079: InputStream is = _is;
080:
081: int ch1 = is.read();
082:
083: if (ch1 < 0x80) {
084: return ch1;
085: }
086: if ((ch1 & 0xe0) == 0xc0) {
087: int ch2 = is.read();
088: if (ch2 < 0)
089: throw new EOFException(
090: "unexpected end of file in utf8 character");
091: else if ((ch2 & 0xc0) != 0x80)
092: throw new CharConversionException(
093: "illegal utf8 encoding at 0x"
094: + Integer.toHexString(ch1) + ", "
095: + Integer.toHexString(ch2));
096:
097: return ((ch1 & 0x1f) << 6) + (ch2 & 0x3f);
098: } else if ((ch1 & 0xf0) == 0xe0) {
099: int ch2 = is.read();
100: int ch3 = is.read();
101:
102: if (ch2 < 0)
103: throw new EOFException(
104: "unexpected end of file in utf8 character");
105: else if ((ch2 & 0xc0) != 0x80)
106: throw new CharConversionException(
107: "illegal utf8 encoding at 0x"
108: + Integer.toHexString(ch2));
109:
110: if (ch3 < 0)
111: throw new EOFException(
112: "unexpected end of file in utf8 character");
113: else if ((ch3 & 0xc0) != 0x80)
114: throw new CharConversionException(
115: "illegal utf8 encoding at 0x"
116: + Integer.toHexString(ch3));
117:
118: int ch = ((ch1 & 0x1f) << 12) + ((ch2 & 0x3f) << 6)
119: + (ch3 & 0x3f);
120:
121: if (ch == 0xfeff) // handle some writers, e.g. microsoft
122: return is.read();
123: else
124: return ch;
125: } else if ((ch1 & 0xf0) == 0xf0) {
126: int ch2 = is.read();
127: int ch3 = is.read();
128: int ch4 = is.read();
129:
130: if (ch2 < 0)
131: throw new EOFException(
132: "unexpected end of file in utf8 character");
133: else if ((ch2 & 0xc0) != 0x80)
134: throw new CharConversionException(
135: "illegal utf8 encoding at 0x"
136: + Integer.toHexString(ch2));
137:
138: if (ch3 < 0)
139: throw new EOFException(
140: "unexpected end of file in utf8 character");
141: else if ((ch3 & 0xc0) != 0x80)
142: throw new CharConversionException(
143: "illegal utf8 encoding at 0x"
144: + Integer.toHexString(ch3));
145:
146: if (ch4 < 0)
147: throw new EOFException(
148: "unexpected end of file in utf8 character");
149: else if ((ch4 & 0xc0) != 0x80)
150: throw new CharConversionException(
151: "illegal utf8 encoding at 0x"
152: + Integer.toHexString(ch4));
153:
154: int ch = (((ch1 & 0xf) << 18) + ((ch2 & 0x3f) << 12)
155: + ((ch3 & 0x3f) << 6) + ((ch4 & 0x3f)));
156:
157: _peek = 0xdc00 + (ch & 0x3ff);
158:
159: return 0xd800 + ((ch - 0x10000) / 0x400);
160: } else
161: throw new CharConversionException(
162: "illegal utf8 encoding at (" + (int) ch1 + ")");
163: }
164:
165: /**
166: * Reads into a character buffer using the correct encoding.
167: *
168: * @param cbuf character buffer receiving the data.
169: * @param off starting offset into the buffer.
170: * @param len number of characters to read.
171: *
172: * @return the number of characters read or -1 on end of file.
173: */
174: public int read(char[] cbuf, int off, int len) throws IOException {
175: int i = 0;
176:
177: for (i = 0; i < len; i++) {
178: int ch = read();
179:
180: if (ch < 0)
181: return i == 0 ? -1 : i;
182:
183: cbuf[off + i] = (char) ch;
184: }
185:
186: return i;
187: }
188: }
|