001: /*
002: * Copyright 2005-2008 Andy Clark
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.cyberneko.html;
018:
019: import java.io.*;
020:
/**
 * An input stream filter that silently drops a leading UTF-8 byte order
 * mark (the three bytes 0xEF 0xBB 0xBF) from the wrapped stream. The
 * UTF-8 BOM is most often generated by Windows&reg; tools.
 * <p>
 * If the stream does not begin with a complete BOM, every byte that was
 * consumed while looking for it is handed back to the caller unchanged
 * and in order, so the filter is transparent for BOM-less input
 * (including input shorter than three bytes).
 * <p>
 * Instances keep unsynchronized state and are meant to be used by one
 * thread at a time.
 *
 * @author Andy Clark
 */
public class UTF8BOMSkipper extends FilterInputStream {

    //
    // Constants
    //

    /** The UTF-8 byte order mark. */
    private static final int[] UTF8_BOM = { 0xEF, 0xBB, 0xBF };

    //
    // Data
    //

    /** True until the beginning of the stream has been inspected. */
    private boolean fStart = true;

    /** Index of the next look-ahead byte to hand out. */
    private int fOffset;

    /** Number of valid bytes stored in {@link #fFirst3Bytes}. */
    private int fLength;

    /**
     * Bytes consumed while looking for the BOM that still have to be
     * returned to the caller; <code>null</code> when nothing is pending.
     */
    private int[] fFirst3Bytes;

    //
    // Constructors
    //

    /**
     * Constructs a UTF-8 BOM skipper.
     *
     * @param stream the stream whose leading BOM, if any, is removed
     */
    public UTF8BOMSkipper(InputStream stream) {
        super(stream);
    } // <init>(InputStream)

    //
    // InputStream methods
    //

    /**
     * Returns the next byte of the stream, never returning the bytes of
     * a leading UTF-8 BOM.
     *
     * @return the next byte (0-255), or -1 at the end of the stream
     * @throws IOException if the wrapped stream fails
     */
    public int read() throws IOException {

        if (fStart) {
            inspectStart();
        }

        // hand out look-ahead bytes first
        if (fFirst3Bytes != null) {
            int b = fFirst3Bytes[fOffset++];
            if (fOffset == fLength) {
                fFirst3Bytes = null;
            }
            return b;
        }

        return super.read();

    } // read():int

    /**
     * Reads up to <code>length</code> bytes into the buffer.
     * <p>
     * Pending look-ahead bytes are delivered first. The wrapped stream
     * is then only asked for more data if it reports bytes as available,
     * so this method does not block once it has something to return.
     *
     * @param buffer destination array
     * @param offset index of the first byte to write
     * @param length maximum number of bytes to read
     * @return the number of bytes read, or -1 at the end of the stream
     * @throws NullPointerException if <code>buffer</code> is null
     * @throws IndexOutOfBoundsException if the range lies outside the buffer
     * @throws IOException if the wrapped stream fails
     */
    public int read(byte[] buffer, int offset, int length)
        throws IOException {

        // validate before consuming anything from the stream
        if (buffer == null) {
            throw new NullPointerException("buffer");
        }
        if (offset < 0 || length < 0 || length > buffer.length - offset) {
            throw new IndexOutOfBoundsException(
                "offset=" + offset + ", length=" + length
                + ", buffer.length=" + buffer.length);
        }
        if (length == 0) {
            return 0;
        }

        if (fStart) {
            inspectStart();
        }

        if (fFirst3Bytes == null) {
            return super.read(buffer, offset, length);
        }

        // drain the look-ahead bytes
        int copied = 0;
        while (fFirst3Bytes != null && copied < length) {
            buffer[offset + copied++] = (byte) fFirst3Bytes[fOffset++];
            if (fOffset == fLength) {
                fFirst3Bytes = null;
            }
        }

        // top up from the wrapped stream, but only without blocking
        if (copied < length && super.available() > 0) {
            int more = super.read(buffer, offset + copied, length - copied);
            if (more > 0) {
                copied += more;
            }
        }
        return copied;

    } // read(byte[],int,int):int

    /**
     * Skips over up to <code>n</code> bytes of BOM-free content. The
     * start of the stream is inspected first so that a BOM is neither
     * counted as skipped data nor mistaken for content afterwards.
     *
     * @param n the number of bytes to skip
     * @return the number of bytes actually skipped
     * @throws IOException if the wrapped stream fails
     */
    public long skip(long n) throws IOException {

        if (n <= 0) {
            return 0;
        }
        if (fStart) {
            inspectStart();
        }

        // look-ahead bytes are logically in front of the wrapped stream
        long skipped = 0;
        while (fFirst3Bytes != null && skipped < n) {
            fOffset++;
            skipped++;
            if (fOffset == fLength) {
                fFirst3Bytes = null;
            }
        }
        if (skipped < n) {
            skipped += super.skip(n - skipped);
        }
        return skipped;

    } // skip(long):long

    /** Mark is not supported for this input stream. */
    public boolean markSupported() {
        return false;
    } // markSupported():boolean

    /**
     * Returns an estimate of the number of bytes that can be read
     * without blocking: the pending look-ahead bytes plus whatever the
     * wrapped stream reports.
     * <p>
     * Before the first read the start of the stream has not been
     * inspected yet, so the value may still include the bytes of a BOM.
     *
     * @return the estimated number of readable bytes
     * @throws IOException if the wrapped stream fails
     */
    public int available() throws IOException {
        int pending = fFirst3Bytes != null ? fLength - fOffset : 0;
        int underlying = super.available();
        // guard against overflow when the wrapped stream reports a huge value
        if (underlying > Integer.MAX_VALUE - pending) {
            return Integer.MAX_VALUE;
        }
        return pending + underlying;
    } // available():int

    //
    // Private methods
    //

    /**
     * Inspects the beginning of the wrapped stream for a UTF-8 BOM.
     * Reads ahead only while the bytes seen so far still match the BOM;
     * anything read that turns out not to be a complete BOM is stored
     * so it can be returned to the caller.
     */
    private void inspectStart() throws IOException {
        fStart = false;
        int[] seen = new int[UTF8_BOM.length];
        int count = 0;
        boolean isBOM = true;
        while (isBOM && count < UTF8_BOM.length) {
            int b = super.read();
            if (b == -1) {
                // stream ended before a complete BOM was seen
                isBOM = false;
            }
            else {
                seen[count] = b;
                isBOM = b == UTF8_BOM[count];
                count++;
            }
        }
        if (!isBOM && count > 0) {
            fFirst3Bytes = seen;
            fLength = count;
            fOffset = 0;
        }
    } // inspectStart()

} // class UTF8BOMSkipper
|