001: /*
002: * $Id: UnicodeLittleXMLDecoder.java,v 1.5 2004/07/11 09:37:37 yuvalo Exp $
003: *
004: * (C) Copyright 2002-2004 by Yuval Oren. All rights reserved.
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: package com.bluecast.xml;
020:
021: import com.bluecast.io.CharsetDecoder;
022: import com.bluecast.io.IllegalCharException;
023:
024: import java.io.CharConversionException;
025:
026: /**
027: * A decoder for little-endian Unicode text. Also converts
028: * carriage returns into linefeeds and CRLF into LF.
029: *
030: * @author Yuval Oren, yuval@bluecast.com
031: * @version $Revision: 1.5 $
032: */
033: final public class UnicodeLittleXMLDecoder implements XMLDecoder {
034: private boolean sawCR = false;
035:
036: public CharsetDecoder newCharsetDecoder() {
037: return newXMLDecoder();
038: }
039:
040: public XMLDecoder newXMLDecoder() {
041: return new UnicodeLittleXMLDecoder();
042: }
043:
044: public int minBytesPerChar() {
045: return 2;
046: }
047:
048: public int maxBytesPerChar() {
049: return 2;
050: }
051:
052: public void reset() {
053: sawCR = false;
054: }
055:
056: public void decode(byte[] in_buf, int in_off, int in_len,
057: char[] out_buf, int out_off, int out_len, int[] result)
058: throws CharConversionException {
059: int i, o;
060: for (i = o = 0; i + 1 < in_len && o < out_len; i += 2) {
061: char c = (char) (((0xFF & in_buf[in_off + i + 1]) << 8) | (0xFF & in_buf[in_off
062: + i]));
063:
064: if (c >= 0x20) {
065: if ((c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD)
066: || (c >= 0x10000 && c <= 0x10FFFF)) {
067: sawCR = false;
068: out_buf[out_off + (o++)] = (char) c;
069: } else
070: throw new IllegalCharException(
071: "Illegal XML Character: 0x"
072: + Integer.toHexString(c));
073: } else {
074: switch (c) {
075: case '\n':
076: if (sawCR) {
077: sawCR = false;
078: } else
079: out_buf[out_off + o++] = '\n';
080: break;
081:
082: case '\r':
083: sawCR = true;
084: out_buf[out_off + o++] = '\n';
085: break;
086:
087: case '\t':
088: out_buf[out_off + o++] = '\t';
089: break;
090:
091: default:
092: throw new IllegalCharException(
093: "Illegal XML character: 0x"
094: + Integer.toHexString(c));
095: }
096: }
097: }
098: result[0] = i;
099: result[1] = o;
100: }
101:
102: public void decodeXMLDecl(byte[] in_buf, int in_off, int in_len,
103: char[] out_buf, int out_off, int out_len, int[] result)
104: throws CharConversionException {
105: int i, o;
106: inputLoop: for (i = o = 0; i + 1 < in_len && o < out_len; i += 2) {
107: char c = (char) (((0xFF & in_buf[in_off + i + 1]) << 8) | (0xFF & in_buf[in_off
108: + i]));
109:
110: if (c >= 0x20) {
111: if ((c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD)
112: || (c >= 0x10000 && c <= 0x10FFFF)) {
113: sawCR = false;
114: out_buf[out_off + (o++)] = (char) c;
115:
116: // XML declaration is definitely over. Stop decoding.
117: if (c == '>') {
118: i += 2;
119: break inputLoop;
120: }
121: } else {
122: // Decoding error. Stop reading.
123: break inputLoop;
124: }
125: } else {
126: switch (c) {
127: case '\n':
128: if (sawCR) {
129: sawCR = false;
130: } else
131: out_buf[out_off + o++] = '\n';
132: break;
133:
134: case '\r':
135: sawCR = true;
136: out_buf[out_off + o++] = '\n';
137: break;
138:
139: case '\t':
140: out_buf[out_off + o++] = '\t';
141: break;
142:
143: default:
144: // Illegal character. Stop reading.
145: break inputLoop;
146: }
147: }
148: }
149: result[0] = i;
150: result[1] = o;
151: }
152:
153: }
|