001: /*
002: * $Id: UnicodeBigXMLDecoder.java,v 1.5 2004/07/11 09:37:37 yuvalo Exp $
003: *
004: * (C) Copyright 2002-2004 by Yuval Oren. All rights reserved.
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: package com.bluecast.xml;
020:
021: import com.bluecast.io.CharsetDecoder;
022: import com.bluecast.io.IllegalCharException;
023:
024: import java.io.CharConversionException;
025:
026: /**
027: * A decoder for big-endian Unicode text. Also converts
028: * carriage returns into linefeeds and CRLF into LF.
029: *
030: * @author Yuval Oren, yuval@bluecast.com
031: * @version $Revision: 1.5 $
032: */
033:
034: final public class UnicodeBigXMLDecoder implements XMLDecoder {
035: private boolean sawCR = false;
036:
037: public CharsetDecoder newCharsetDecoder() {
038: return newXMLDecoder();
039: }
040:
041: public XMLDecoder newXMLDecoder() {
042: return new UnicodeBigXMLDecoder();
043: }
044:
045: public int minBytesPerChar() {
046: return 2;
047: }
048:
049: public int maxBytesPerChar() {
050: return 2;
051: }
052:
053: public void reset() {
054: sawCR = false;
055: }
056:
057: public void decode(byte[] in_buf, int in_off, int in_len,
058: char[] out_buf, int out_off, int out_len, int[] result)
059: throws CharConversionException {
060: int i, o;
061: for (i = o = 0; i + 1 < in_len && o < out_len; i += 2) {
062: char c = (char) (((0xFF & in_buf[in_off + i]) << 8) | (0xFF & in_buf[in_off
063: + i + 1]));
064: if (c >= 0x20) {
065: if ((c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD)
066: || (c >= 0x10000 && c <= 0x10FFFF)) {
067: sawCR = false;
068: out_buf[out_off + o++] = (char) c;
069: } else
070: throw new IllegalCharException(
071: "Illegal XML Character: 0x"
072: + Integer.toHexString(c));
073: } else {
074: switch (c) {
075: case '\n':
076: if (sawCR) {
077: sawCR = false;
078: } else
079: out_buf[out_off + o++] = '\n';
080: break;
081:
082: case '\r':
083: sawCR = true;
084: out_buf[out_off + o++] = '\n';
085: break;
086:
087: case '\t':
088: out_buf[out_off + o++] = '\t';
089: break;
090:
091: default:
092: throw new IllegalCharException(
093: "Illegal XML character: 0x"
094: + Integer.toHexString(c));
095: }
096: }
097: }
098: result[0] = i;
099: result[1] = o;
100: }
101:
102: public void decodeXMLDecl(byte[] in_buf, int in_off, int in_len,
103: char[] out_buf, int out_off, int out_len, int[] result)
104: throws CharConversionException {
105: int i, o;
106: inputLoop: for (i = o = 0; i + 1 < in_len && o < out_len; i += 2) {
107: char c = (char) (((0xFF & in_buf[in_off + i]) << 8) | (0xFF & in_buf[in_off
108: + i + 1]));
109: if (c >= 0x20) {
110: if ((c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD)
111: || (c >= 0x10000 && c <= 0x10FFFF)) {
112: sawCR = false;
113: out_buf[out_off + o++] = (char) c;
114:
115: // XML declaration is definitely over. Stop decoding.
116: if (c == '>') {
117: i += 2;
118: break inputLoop;
119: }
120: } else {
121: // Decoding error. Stop reading.
122: break inputLoop;
123: }
124: } else {
125: switch (c) {
126: case '\n':
127: if (sawCR) {
128: sawCR = false;
129: } else
130: out_buf[out_off + o++] = '\n';
131: break;
132:
133: case '\r':
134: sawCR = true;
135: out_buf[out_off + o++] = '\n';
136: break;
137:
138: case '\t':
139: out_buf[out_off + o++] = '\t';
140: break;
141:
142: default:
143: // Illegal character. Stop reading.
144: break inputLoop;
145: }
146: }
147: }
148: result[0] = i;
149: result[1] = o;
150: }
151:
152: }
|