001: /*
002: * Copyright (c) 1998-2008 Caucho Technology -- all rights reserved
003: *
004: * This file is part of Resin(R) Open Source
005: *
006: * Each copy or derived work must preserve the copyright notice and this
007: * notice unmodified.
008: *
009: * Resin Open Source is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU General Public License as published by
011: * the Free Software Foundation; either version 2 of the License, or
012: * (at your option) any later version.
013: *
014: * Resin Open Source is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
017: * of NON-INFRINGEMENT. See the GNU General Public License for more
018: * details.
019: *
020: * You should have received a copy of the GNU General Public License
021: * along with Resin Open Source; if not, write to the
022: * Free SoftwareFoundation, Inc.
023: * 59 Temple Place, Suite 330
024: * Boston, MA 02111-1307 USA
025: *
026: * @author Scott Ferguson
027: */
028:
029: package com.caucho.xml2.readers;
030:
031: import com.caucho.util.CharBuffer;
032: import com.caucho.vfs.ReadStream;
033: import com.caucho.xml2.XmlParser;
034:
035: import java.io.CharConversionException;
036: import java.io.EOFException;
037: import java.io.IOException;
038:
039: /**
040: * A fast reader to convert bytes to characters for parsing XML.
041: */
042: public class Utf8Reader extends XmlReader {
043: /**
044: * Create a new reader.
045: */
046: public Utf8Reader() {
047: }
048:
049: /**
050: * Create a new reader with the given read stream.
051: */
052: public Utf8Reader(XmlParser parser, ReadStream is) {
053: super (parser, is);
054: }
055:
056: /**
057: * Read the next character, returning -1 on end of file..
058: */
059: public int read() throws IOException {
060: int ch1 = _is.read();
061:
062: if (ch1 == '\n') {
063: _parser.setLine(++_line);
064: return ch1;
065: } else if (ch1 == '\r') {
066: _parser.setLine(++_line);
067:
068: int ch2 = _is.read();
069: if (ch2 == '\n')
070: return '\n';
071:
072: if (ch2 < 0) {
073: } else if (ch2 < 0x80)
074: _parser.unread(ch2);
075: else
076: _parser.unread(readSecond(ch2));
077:
078: return '\n';
079: } else if (ch1 < 0x80)
080: return ch1;
081: else
082: return readSecond(ch1);
083: }
084:
085: private int readSecond(int ch1) throws IOException {
086: if ((ch1 & 0xe0) == 0xc0) {
087: int ch2 = _is.read();
088: if (ch2 < 0)
089: throw new EOFException(
090: "unexpected end of file in utf8 character");
091: else if ((ch2 & 0xc0) != 0x80)
092: throw error(L.l("illegal utf8 encoding {0}", hex(ch1)));
093:
094: return ((ch1 & 0x1f) << 6) + (ch2 & 0x3f);
095: } else if ((ch1 & 0xf0) == 0xe0) {
096: int ch2 = _is.read();
097: int ch3 = _is.read();
098:
099: if (ch2 < 0)
100: throw new EOFException(
101: "unexpected end of file in utf8 character");
102: else if ((ch2 & 0xc0) != 0x80)
103: throw error(L.l("illegal utf8 encoding at {0} {1} {2}",
104: hex(ch1), hex(ch2), hex(ch3)));
105:
106: if (ch3 < 0)
107: throw new EOFException(
108: "unexpected end of file in utf8 character");
109: else if ((ch3 & 0xc0) != 0x80)
110: throw error(L.l("illegal utf8 encoding {0} {1} {2}",
111: hex(ch1), hex(ch2), hex(ch3)));
112:
113: int ch = ((ch1 & 0x1f) << 12) + ((ch2 & 0x3f) << 6)
114: + (ch3 & 0x3f);
115:
116: if (ch == 0xfeff) // handle some writers, e.g. microsoft
117: return read();
118: else
119: return ch;
120: } else
121: throw error(L.l("illegal utf8 encoding at {0}", hex(ch1)));
122: }
123:
124: private String hex(int n) {
125: n = n & 0xff;
126:
127: CharBuffer cb = CharBuffer.allocate();
128:
129: cb.append("0x");
130:
131: int d = n / 16;
132: if (d >= 0 && d <= 9)
133: cb.append((char) ('0' + d));
134: else
135: cb.append((char) ('a' + d - 10));
136:
137: d = n % 16;
138: if (d >= 0 && d <= 9)
139: cb.append((char) ('0' + d));
140: else
141: cb.append((char) ('a' + d - 10));
142:
143: return cb.close();
144: }
145:
146: private CharConversionException error(String msg) {
147: String filename = _parser.getFilename();
148: int line = _parser.getLine();
149:
150: if (filename != null)
151: return new CharConversionException(filename + ":" + line
152: + ": " + msg);
153: else
154: return new CharConversionException(msg);
155: }
156: }
|