001: package org.apache.velocity.io;
002:
003: /*
004: * Licensed to the Apache Software Foundation (ASF) under one
005: * or more contributor license agreements. See the NOTICE file
006: * distributed with this work for additional information
007: * regarding copyright ownership. The ASF licenses this file
008: * to you under the Apache License, Version 2.0 (the
009: * "License"); you may not use this file except in compliance
010: * with the License. You may obtain a copy of the License at
011: *
012: * http://www.apache.org/licenses/LICENSE-2.0
013: *
014: * Unless required by applicable law or agreed to in writing,
015: * software distributed under the License is distributed on an
016: * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017: * KIND, either express or implied. See the License for the
018: * specific language governing permissions and limitations
019: * under the License.
020: */
021:
022: import java.io.IOException;
023: import java.io.InputStream;
024: import java.io.PushbackInputStream;
025:
026: import org.apache.velocity.util.ExceptionUtils;
027:
/**
 * An input stream that is aware of Unicode byte order marks (BOMs). This allows, for
 * example, reading files saved as "Unicode" by Windows Notepad as Velocity templates.
 *
 * <p>The first bytes of the wrapped stream are inspected once, during construction. The
 * encoding announced by a recognized BOM can afterwards be queried with
 * {@link #getEncodingFromStream()}. Depending on the {@code skipBOM} setting, the BOM
 * bytes are either dropped or handed back to the reader as regular content.</p>
 *
 * <p>This class is not thread safe! When more than one thread wants to use an instance of
 * UnicodeInputStream, the caller must provide synchronization.</p>
 *
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id: UnicodeInputStream.java 500638 2007-01-27 22:16:10Z henning $
 */
public class UnicodeInputStream extends InputStream {

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8",
            new byte[] { (byte) 0xef, (byte) 0xbb, (byte) 0xbf });

    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE",
            new byte[] { (byte) 0xff, (byte) 0xfe });

    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE",
            new byte[] { (byte) 0xfe, (byte) 0xff });

    /**
     * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * NOTE: UTF-32LE is not one of the charsets every Java platform is required to support.
     * Check {@code java.nio.charset.Charset.isSupported("UTF-32LE")} before decoding with it.
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE",
            new byte[] { (byte) 0xff, (byte) 0xfe, (byte) 0x00, (byte) 0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * NOTE: UTF-32BE is not one of the charsets every Java platform is required to support.
     * Check {@code java.nio.charset.Charset.isSupported("UTF-32BE")} before decoding with it.
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE",
            new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xfe, (byte) 0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer holding the bytes that were read while probing for a BOM. */
    private final byte[] buf = new byte[MAX_BOM_SIZE];

    /** Number of valid bytes in {@link #buf}; also the index of the next free slot. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** The wrapped stream; its pushback capacity covers the longest supported BOM. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param inputStream The input stream to use for reading.
     *
     * @throws IllegalStateException When the first bytes of the stream could not be read.
     * @throws IOException Declared for backward compatibility; read errors during BOM
     *         detection are reported as IllegalStateException.
     */
    public UnicodeInputStream(final InputStream inputStream)
            throws IllegalStateException, IOException {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object.
     *
     * @param inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded.
     *        This parameter should normally be true.
     *
     * @throws IllegalStateException When the first bytes of the stream could not be read.
     *         The original IOException is available as the cause.
     * @throws IOException Declared for backward compatibility; read errors during BOM
     *         detection are reported as IllegalStateException.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
            throws IllegalStateException, IOException {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        // readEncoding() is protected and invoked from this constructor, so an overriding
        // implementation runs before the subclass' own fields have been initialized.
        try {
            this.encoding = readEncoding();
        } catch (IOException ioe) {
            IllegalStateException ex = new IllegalStateException(
                    "Could not read BOM from Stream");
            // Chain the root cause with the standard Throwable API.
            ex.initCause(ioe);
            throw ex;
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return True if the input stream discards the BOM.
     */
    public boolean isSkipBOM() {
        return skipBOM;
    }

    /**
     * Returns the encoding that was detected from the BOM when this stream was created.
     * The stream is not accessed by this method.
     *
     * @return The encoding based on the BOM or null if no BOM was found.
     */
    public String getEncodingFromStream() {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM
     * exists, the encoding is undefined. All bytes that were read while probing are pushed
     * back onto the stream, except for the bytes of a matched BOM when BOM skipping is
     * enabled.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no
     *         BOM was found.
     *
     * @throws IOException When the underlying stream could not be read.
     */
    protected String readEncoding() throws IOException {
        pos = 0;

        UnicodeBOM bom = null;

        // The first byte decides which BOM candidates are worth checking:
        //
        // 00 00 FE FF --> UTF 32 BE
        // EF BB BF    --> UTF 8
        // FE FF       --> UTF 16 BE
        // FF FE       --> UTF 16 LE
        // FF FE 00 00 --> UTF 32 LE
        if (readByte()) {
            switch (buf[0]) {
            case (byte) 0x00: // UTF32 BE
                bom = match(UTF32BE_BOM, null);
                break;
            case (byte) 0xef: // UTF8
                bom = match(UTF8_BOM, null);
                break;
            case (byte) 0xfe: // UTF16 BE
                bom = match(UTF16BE_BOM, null);
                break;
            case (byte) 0xff: // UTF16/32 LE
                bom = match(UTF16LE_BOM, null);

                // The UTF-32 LE BOM starts with the UTF-16 LE BOM. Try the longer one
                // and fall back to UTF-16 LE if the two extra bytes do not match.
                if (bom != null) {
                    bom = match(UTF32LE_BOM, bom);
                }
                break;

            default:
                bom = null;
                break;
            }
        }

        pushback(bom);

        return (bom != null) ? bom.getEncoding() : null;
    }

    /**
     * Compares the start of the stream with the given BOM, reading further bytes into the
     * probe buffer only as far as needed.
     *
     * @param matchEncoding The BOM to test for.
     * @param noMatchEncoding The value to return if the BOM does not match or the stream
     *        ends before the BOM is complete. May be null.
     *
     * @return matchEncoding if all BOM bytes match, noMatchEncoding otherwise.
     */
    private UnicodeBOM match(final UnicodeBOM matchEncoding,
            final UnicodeBOM noMatchEncoding) throws IOException {
        byte[] bom = matchEncoding.getBytes();

        for (int i = 0; i < bom.length; i++) {
            // Byte has not yet been read
            if (pos <= i && !readByte()) {
                return noMatchEncoding;
            }

            if (bom[i] != buf[i]) {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the stream and appends it to the probe buffer.
     *
     * @return True if a byte was stored, false if the end of the stream was reached.
     *
     * @throws IOException When the probe buffer is already full or the stream fails.
     */
    private boolean readByte() throws IOException {
        // Check the capacity before touching the stream so that no byte is consumed
        // (and lost) when the buffer cannot take it.
        if (pos >= buf.length) {
            throw new IOException("BOM read error");
        }

        int res = inputStream.read();
        if (res == -1) {
            return false;
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns the probed bytes to the stream so that they can be read as regular content.
     *
     * @param matchBOM The BOM that was detected or null if there was none.
     */
    private void pushback(final UnicodeBOM matchBOM) throws IOException {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM) {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.getBytes().length;
            count = (pos - start);

            if (count < 0) {
                throw new IllegalStateException(
                        "Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * @see java.io.InputStream#close()
     */
    public void close() throws IOException {
        inputStream.close();
    }

    /**
     * @see java.io.InputStream#available()
     */
    public int available() throws IOException {
        return inputStream.available();
    }

    /**
     * Delegates to the wrapped {@link PushbackInputStream}, which does not support marking.
     *
     * @see java.io.InputStream#mark(int)
     */
    public void mark(final int readlimit) {
        inputStream.mark(readlimit);
    }

    /**
     * Delegates to the wrapped {@link PushbackInputStream}, which does not support marking.
     *
     * @see java.io.InputStream#markSupported()
     */
    public boolean markSupported() {
        return inputStream.markSupported();
    }

    /**
     * @see java.io.InputStream#read()
     */
    public int read() throws IOException {
        return inputStream.read();
    }

    /**
     * @see java.io.InputStream#read(byte[])
     */
    public int read(final byte[] b) throws IOException {
        return inputStream.read(b);
    }

    /**
     * @see java.io.InputStream#read(byte[], int, int)
     */
    public int read(final byte[] b, final int off, final int len)
            throws IOException {
        return inputStream.read(b, off, len);
    }

    /**
     * Delegates to the wrapped {@link PushbackInputStream}, which does not support marking.
     *
     * @see java.io.InputStream#reset()
     */
    public void reset() throws IOException {
        inputStream.reset();
    }

    /**
     * @see java.io.InputStream#skip(long)
     */
    public long skip(final long n) throws IOException {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id: UnicodeInputStream.java 500638 2007-01-27 22:16:10Z henning $
     */
    static final class UnicodeBOM {
        /** Name of the encoding announced by the BOM. */
        private final String encoding;

        /** The byte sequence that makes up the BOM. */
        private final byte[] bytes;

        private UnicodeBOM(final String encoding, final byte[] bytes) {
            this.encoding = encoding;
            this.bytes = bytes;
        }

        /** @return The name of the encoding announced by this BOM. */
        String getEncoding() {
            return encoding;
        }

        /** @return The BOM bytes. The internal array is returned; do not modify it. */
        byte[] getBytes() {
            return bytes;
        }
    }
}
|