001: package com.jclark.xml.tok;
002:
003: /**
004: * An <CODE>Encoding</CODE> for UTF-8.
005: * @version $Revision: 1.5 $ $Date: 1998/05/27 05:43:15 $
006: */
007: final class UTF8Encoding extends Encoding {
008: private static final byte[] utf8HiTypeTable = {
009: /* 0x80 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
010: /* 0x84 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
011: /* 0x88 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
012: /* 0x8C */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
013: /* 0x90 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
014: /* 0x94 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
015: /* 0x98 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
016: /* 0x9C */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
017: /* 0xA0 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
018: /* 0xA4 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
019: /* 0xA8 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
020: /* 0xAC */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
021: /* 0xB0 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
022: /* 0xB4 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
023: /* 0xB8 */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
024: /* 0xBC */BT_MALFORM, BT_MALFORM, BT_MALFORM, BT_MALFORM,
025: /* 0xC0 */BT_LEAD2, BT_LEAD2, BT_LEAD2, BT_LEAD2,
026: /* 0xC4 */BT_LEAD2, BT_LEAD2, BT_LEAD2, BT_LEAD2,
027: /* 0xC8 */BT_LEAD2, BT_LEAD2, BT_LEAD2, BT_LEAD2,
028: /* 0xCC */BT_LEAD2, BT_LEAD2, BT_LEAD2, BT_LEAD2,
029: /* 0xD0 */BT_LEAD2, BT_LEAD2, BT_LEAD2, BT_LEAD2,
030: /* 0xD4 */BT_LEAD2, BT_LEAD2, BT_LEAD2, BT_LEAD2,
031: /* 0xD8 */BT_LEAD2, BT_LEAD2, BT_LEAD2, BT_LEAD2,
032: /* 0xDC */BT_LEAD2, BT_LEAD2, BT_LEAD2, BT_LEAD2,
033: /* 0xE0 */BT_LEAD3, BT_LEAD3, BT_LEAD3, BT_LEAD3,
034: /* 0xE4 */BT_LEAD3, BT_LEAD3, BT_LEAD3, BT_LEAD3,
035: /* 0xE8 */BT_LEAD3, BT_LEAD3, BT_LEAD3, BT_LEAD3,
036: /* 0xEC */BT_LEAD3, BT_LEAD3, BT_LEAD3, BT_LEAD3,
037: /* 0xF0 */BT_LEAD4, BT_LEAD4, BT_LEAD4, BT_LEAD4,
038: /* 0xF4 */BT_LEAD4, BT_LEAD4, BT_LEAD4, BT_LEAD4,
039: /* 0xF8 */BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
040: /* 0xFC */BT_NONXML, BT_NONXML, BT_MALFORM, BT_MALFORM };
041:
042: private static final byte[] utf8TypeTable = new byte[256];
043:
044: static {
045: System.arraycopy(asciiTypeTable, 0, utf8TypeTable, 0, 128);
046: System.arraycopy(utf8HiTypeTable, 0, utf8TypeTable, 128, 128);
047: }
048:
049: UTF8Encoding() {
050: super (1);
051: }
052:
053: int byteType(byte[] buf, int off) {
054: return utf8TypeTable[buf[off] & 0xFF];
055: }
056:
057: int byteToAscii(byte[] buf, int off) {
058: return (char) buf[off];
059: }
060:
061: // c is a significant ASCII character
062: boolean charMatches(byte[] buf, int off, char c) {
063: return (char) buf[off] == c;
064: }
065:
066: /* A 2 byte UTF-8 representation splits the characters 11 bits
067: between the bottom 5 and 6 bits of the bytes. */
068:
069: int byteType2(byte[] buf, int off) {
070: final byte[] page = charTypeTable[(buf[off] >>> 2) & 0x7];
071: return page[((buf[off] & 3) << 6) | (buf[off + 1] & 0x3F)];
072: }
073:
074: /* A 3 byte UTF-8 representation splits the characters 16 bits
075: between the bottom 4, 6 and 6 bits of the bytes. */
076:
077: /* This will (incorrectly) return BT_LEAD4 for surrogates, but that
078: doesn't matter. */
079: int byteType3(byte[] buf, int off) {
080: final byte[] page = charTypeTable[((buf[off] & 0xF) << 4)
081: | ((buf[off + 1] >>> 2) & 0xF)];
082: return page[((buf[off + 1] & 3) << 6) | (buf[off + 2] & 0x3F)];
083: }
084:
085: void check3(byte[] buf, int off) throws InvalidTokenException {
086: switch (buf[off]) {
087: case 0xEF - 0x100:
088: /* 0xFFFF 0xFFFE */
089: if (buf[off + 1] == (0xBF - 0x100)
090: && (buf[off + 2] == (0xBF - 0x100) || buf[off + 2] == (0xBE - 0x100)))
091: break;
092: return;
093: case 0xED - 0x100:
094: /* 0xD800..0xDFFF <=> top 5 bits are 11011 */
095: if ((buf[off + 1] & 0x20) != 0)
096: break;
097: return;
098: default:
099: return;
100: }
101: throw new InvalidTokenException(off);
102: }
103:
104: void check4(byte[] buf, int off) throws InvalidTokenException {
105: switch (buf[off] & 0x7) {
106: default:
107: return;
108: case 5:
109: case 6:
110: case 7:
111: break;
112: case 4:
113: if ((buf[off + 1] & 0x30) == 0)
114: return;
115: break;
116: }
117: throw new InvalidTokenException(off);
118: }
119:
120: public int convert(byte[] sourceBuf, int sourceStart,
121: int sourceEnd, char[] targetBuf, int targetStart) {
122: int initTargetStart = targetStart;
123: int c;
124: while (sourceStart != sourceEnd) {
125: byte b = sourceBuf[sourceStart++];
126: if (b >= 0)
127: targetBuf[targetStart++] = (char) b;
128: else {
129: switch (utf8TypeTable[b & 0xFF]) {
130: case BT_LEAD2:
131: /* 5, 6 */
132: targetBuf[targetStart++] = (char) (((b & 0x1F) << 6) | (sourceBuf[sourceStart++] & 0x3F));
133: break;
134: case BT_LEAD3:
135: /* 4, 6, 6 */
136: c = (b & 0xF) << 12;
137: c |= (sourceBuf[sourceStart++] & 0x3F) << 6;
138: c |= (sourceBuf[sourceStart++] & 0x3F);
139: targetBuf[targetStart++] = (char) c;
140: break;
141: case BT_LEAD4:
142: /* 3, 6, 6, 6 */
143: c = (b & 0x7) << 18;
144: c |= (sourceBuf[sourceStart++] & 0x3F) << 12;
145: c |= (sourceBuf[sourceStart++] & 0x3F) << 6;
146: c |= (sourceBuf[sourceStart++] & 0x3F);
147: c -= 0x10000;
148: targetBuf[targetStart++] = (char) ((c >> 10) | 0xD800);
149: targetBuf[targetStart++] = (char) ((c & ((1 << 10) - 1)) | 0xDC00);
150: break;
151: }
152: }
153: }
154: return targetStart - initTargetStart;
155: }
156:
157: public int getFixedBytesPerChar() {
158: return 0;
159: }
160:
161: public void movePosition(final byte[] buf, int off, int end,
162: Position pos) {
163: /* Maintain the invariant: off - colDiff == colNumber. */
164: int colDiff = off - pos.columnNumber;
165: int lineNumber = pos.lineNumber;
166: while (off != end) {
167: byte b = buf[off];
168: if (b >= 0) {
169: ++off;
170: switch (b) {
171: case (byte) '\n':
172: lineNumber += 1;
173: colDiff = off;
174: break;
175: case (byte) '\r':
176: lineNumber += 1;
177: if (off != end && buf[off] == '\n')
178: off++;
179: colDiff = off;
180: break;
181: }
182: } else {
183: switch (utf8TypeTable[b & 0xFF]) {
184: default:
185: off += 1;
186: break;
187: case BT_LEAD2:
188: off += 2;
189: colDiff++;
190: break;
191: case BT_LEAD3:
192: off += 3;
193: colDiff += 2;
194: break;
195: case BT_LEAD4:
196: off += 4;
197: colDiff += 3;
198: break;
199: }
200: }
201: }
202: pos.columnNumber = off - colDiff;
203: pos.lineNumber = lineNumber;
204: }
205:
206: int extendData(final byte[] buf, int off, final int end)
207: throws InvalidTokenException {
208: while (off != end) {
209: int type = utf8TypeTable[buf[off] & 0xFF];
210: if (type >= 0)
211: off++;
212: else if (type < BT_LEAD4)
213: break;
214: else {
215: if (end - off + type < 0)
216: break;
217: switch (type) {
218: case BT_LEAD3:
219: check3(buf, off);
220: break;
221: case BT_LEAD4:
222: check4(buf, off);
223: break;
224: }
225: off -= type;
226: }
227: }
228: return off;
229: }
230:
231: }
|