001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.harmony.niochar.charset;
019:
020: import java.nio.ByteBuffer;
021: import java.nio.CharBuffer;
022: import java.nio.charset.Charset;
023: import java.nio.charset.CharsetDecoder;
024: import java.nio.charset.CharsetEncoder;
025: import java.nio.charset.CoderResult;
026:
/**
 * The UTF-8 charset as specified by RFC 3629
 * (http://www.ietf.org/rfc/rfc3629.txt).
 * <p>
 * The decoder rejects every byte sequence the RFC forbids: stray continuation
 * bytes, lead bytes that can never start a valid sequence, overlong forms,
 * encoded surrogates (U+D800..U+DFFF) and values above U+10FFFF. Code points
 * above U+FFFF are delivered as a surrogate pair. The encoder accepts only
 * well-formed UTF-16: an unpaired surrogate is reported as malformed input.
 */
public class UTF_8 extends Charset {

    // Layout of a UTF-8 sequence and the value it encodes. The first byte
    // determines how many continuation bytes follow.
    //
    // Note: the letter 'o' in the bit patterns below stands for a literal 0 bit.
    // -------------------------------------------------------------------
    // byte 0    byte 1    byte 2    byte 3    Value
    // -------------------------------------------------------------------
    // oxxxxxxx                                00000000 00000000 0xxxxxxx
    // 11oyyyyy  1oxxxxxx                      00000000 00000yyy yyxxxxxx
    // 111ozzzz  1oyyyyyy  1oxxxxxx            00000000 zzzzyyyy yyxxxxxx
    // 1111ouuu  1ouuzzzz  1oyyyyyy  1oxxxxxx  000uuuuu zzzzyyyy yyxxxxxx

    /**
     * Indexed by the unsigned value of a lead byte: the number of continuation
     * bytes that must follow it, or -1 if the byte cannot start a sequence.
     */
    private static final int[] REMAINING_BYTES = buildRemainingBytes();

    /**
     * Indexed by the number of continuation bytes: the sum of all marker bits
     * (lead byte prefix and the 1o prefix of every continuation byte) that has
     * to be subtracted after the raw bytes were accumulated with
     * {@code (value << 6) + nextByte}.
     */
    private static final int[] REMAINING_NUMBERS = {
        0,
        0x3080,    // (11o00000b << 6) + 1o000000b
        0xE2080,   // (111o0000b << 12) + (1o000000b << 6) + 1o000000b
        0x3C82080  // (1111o000b << 18) + (1o000000b << 12) + (1o000000b << 6) + 1o000000b
    };

    /**
     * Indexed by the number of continuation bytes: the smallest code point that
     * legitimately needs a sequence of that length. Anything smaller is an
     * overlong encoding.
     */
    private static final int[] MIN_CODE_POINT = { 0, 0x80, 0x800, 0x10000 };

    /** Canonical names of the charsets reported as contained in UTF-8. */
    private static final String[] CONTAINED_CHARSETS = {
        "UTF-8", "US-ASCII", "KOI8-R",
        "windows-1250", "windows-1251", "windows-1252", "windows-1253",
        "windows-1254", "windows-1257",
        "ISO-8859-1", "ISO-8859-2", "ISO-8859-4", "ISO-8859-5", "ISO-8859-7",
        "ISO-8859-9", "ISO-8859-10", "ISO-8859-13", "ISO-8859-14",
        "ISO-8859-15", "ISO-8859-16",
        "UTF-16", "UTF-16LE", "UTF-16BE"
    };

    /**
     * Creates the charset.
     *
     * @param canonicalName the canonical name of this charset
     * @param aliases the aliases of this charset
     */
    public UTF_8(String canonicalName, String[] aliases) {
        super(canonicalName, aliases);
    }

    /**
     * Tells whether the given charset is one of the charsets this
     * implementation lists as contained in UTF-8. The comparison is made on
     * the canonical name, ignoring case.
     *
     * @param cs the charset to test
     * @return {@code true} if the name of {@code cs} is in the list
     */
    @Override
    public boolean contains(Charset cs) {
        final String name = cs.name();
        for (int i = 0; i < CONTAINED_CHARSETS.length; i++) {
            if (name.equalsIgnoreCase(CONTAINED_CHARSETS[i])) {
                return true;
            }
        }
        return false;
    }

    /** @return a new decoder for this charset */
    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /** @return a new encoder for this charset */
    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /** Fills the lead-byte table; see {@link #REMAINING_BYTES}. */
    private static int[] buildRemainingBytes() {
        final int[] table = new int[256];
        // 0x00..0x7F stay 0: single-byte (ASCII) sequences.
        for (int b = 0x80; b <= 0xFF; b++) {
            if (b < 0xC2) {
                // 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 could only
                // start an overlong two-byte form.
                table[b] = -1;
            } else if (b < 0xE0) {
                table[b] = 1;
            } else if (b < 0xF0) {
                table[b] = 2;
            } else if (b < 0xF5) {
                table[b] = 3;
            } else {
                // 0xF5..0xFF would encode values beyond U+10FFFF.
                table[b] = -1;
            }
        }
        return table;
    }

    /**
     * Checks a value assembled from a structurally complete sequence.
     *
     * @param codePoint the decoded value
     * @param tail the number of continuation bytes the sequence used
     * @return {@code false} for overlong forms, surrogates and values above
     *         U+10FFFF
     */
    private static boolean isLegalScalar(int codePoint, int tail) {
        return codePoint >= MIN_CODE_POINT[tail]
                && codePoint <= 0x10FFFF
                && (codePoint < 0xD800 || codePoint > 0xDFFF);
    }

    /** Number of UTF-8 bytes needed for the given code point (1 to 4). */
    private static int encodedLength(int codePoint) {
        if (codePoint <= 0x7F) {
            return 1;
        }
        if (codePoint <= 0x7FF) {
            return 2;
        }
        if (codePoint <= 0xFFFF) {
            return 3;
        }
        return 4;
    }

    /**
     * Writes the UTF-8 form of {@code codePoint} into {@code dst} starting at
     * index {@code at}; {@code length} must be
     * {@code encodedLength(codePoint)}.
     */
    private static void putCodePoint(byte[] dst, int at, int codePoint, int length) {
        switch (length) {
        case 1:
            dst[at] = (byte) codePoint;
            break;
        case 2:
            dst[at] = (byte) (0xC0 | (codePoint >> 6));
            dst[at + 1] = (byte) (0x80 | (codePoint & 0x3F));
            break;
        case 3:
            dst[at] = (byte) (0xE0 | (codePoint >> 12));
            dst[at + 1] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
            dst[at + 2] = (byte) (0x80 | (codePoint & 0x3F));
            break;
        default:
            dst[at] = (byte) (0xF0 | (codePoint >> 18));
            dst[at + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
            dst[at + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
            dst[at + 3] = (byte) (0x80 | (codePoint & 0x3F));
            break;
        }
    }

    /**
     * UTF-8 to UTF-16 decoder. On every return the input position is left at
     * the first byte of the sequence that could not be processed, so the
     * caller can retry or report the error from there.
     */
    private static final class Decoder extends CharsetDecoder {

        private Decoder(Charset cs) {
            // A byte yields at most one char: 4 bytes give a surrogate pair.
            super(cs, 1.0f, 1.0f);
        }

        /**
         * Decodes as many complete sequences as fit.
         *
         * @return OVERFLOW if {@code out} cannot take the next character,
         *         a malformed-input result for an illegal sequence, otherwise
         *         UNDERFLOW (also when the input ends inside a sequence)
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            if (in.hasArray() && out.hasArray()) {
                return decodeArrays(in, out);
            }
            return decodeBuffers(in, out);
        }

        /** Fast path working directly on the backing arrays. */
        private CoderResult decodeArrays(ByteBuffer in, CharBuffer out) {
            final byte[] src = in.array();
            final char[] dst = out.array();
            // Buffer positions are relative to arrayOffset(), which is
            // non-zero for sliced buffers.
            final int srcOffset = in.arrayOffset();
            final int dstOffset = out.arrayOffset();
            final int srcLimit = srcOffset + in.limit();
            final int dstLimit = dstOffset + out.limit();
            int srcPos = srcOffset + in.position();
            int dstPos = dstOffset + out.position();
            try {
                while (srcPos < srcLimit) {
                    if (dstPos == dstLimit) {
                        return CoderResult.OVERFLOW;
                    }
                    int codePoint = src[srcPos] & 0xFF;
                    final int tail = REMAINING_BYTES[codePoint];
                    if (tail < 0) {
                        return CoderResult.malformedForLength(1);
                    }
                    for (int i = 1; i <= tail; i++) {
                        if (srcPos + i >= srcLimit) {
                            // Everything seen so far is valid; wait for more.
                            return CoderResult.UNDERFLOW;
                        }
                        final int next = src[srcPos + i] & 0xFF;
                        if ((next & 0xC0) != 0x80) {
                            // The lead byte plus the valid continuation bytes.
                            return CoderResult.malformedForLength(i);
                        }
                        codePoint = (codePoint << 6) + next;
                    }
                    codePoint -= REMAINING_NUMBERS[tail];
                    if (!isLegalScalar(codePoint, tail)) {
                        return CoderResult.malformedForLength(1);
                    }
                    if (codePoint < 0x10000) {
                        dst[dstPos++] = (char) codePoint;
                    } else {
                        if (dstLimit - dstPos < 2) {
                            return CoderResult.OVERFLOW;
                        }
                        dst[dstPos++] = (char) ((codePoint >> 10) + 0xD7C0);
                        dst[dstPos++] = (char) ((codePoint & 0x3FF) + 0xDC00);
                    }
                    srcPos += 1 + tail;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                in.position(srcPos - srcOffset);
                out.position(dstPos - dstOffset);
            }
        }

        /**
         * General path for buffers without an accessible array. Input is read
         * with absolute gets so that the position only moves past fully
         * decoded sequences.
         */
        private CoderResult decodeBuffers(ByteBuffer in, CharBuffer out) {
            final int limit = in.limit();
            int pos = in.position();
            try {
                while (pos < limit) {
                    if (!out.hasRemaining()) {
                        return CoderResult.OVERFLOW;
                    }
                    int codePoint = in.get(pos) & 0xFF;
                    final int tail = REMAINING_BYTES[codePoint];
                    if (tail < 0) {
                        return CoderResult.malformedForLength(1);
                    }
                    for (int i = 1; i <= tail; i++) {
                        if (pos + i >= limit) {
                            return CoderResult.UNDERFLOW;
                        }
                        final int next = in.get(pos + i) & 0xFF;
                        if ((next & 0xC0) != 0x80) {
                            return CoderResult.malformedForLength(i);
                        }
                        codePoint = (codePoint << 6) + next;
                    }
                    codePoint -= REMAINING_NUMBERS[tail];
                    if (!isLegalScalar(codePoint, tail)) {
                        return CoderResult.malformedForLength(1);
                    }
                    if (codePoint < 0x10000) {
                        out.put((char) codePoint);
                    } else {
                        if (out.remaining() < 2) {
                            return CoderResult.OVERFLOW;
                        }
                        out.put((char) ((codePoint >> 10) + 0xD7C0));
                        out.put((char) ((codePoint & 0x3FF) + 0xDC00));
                    }
                    pos += 1 + tail;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                in.position(pos);
            }
        }
    }

    /**
     * UTF-16 to UTF-8 encoder. On every return the input position is left at
     * the first char that was not encoded.
     */
    private static final class Encoder extends CharsetEncoder {

        private Encoder(Charset cs) {
            super(cs, 1.1f, 4.0f);
        }

        /**
         * Encodes as many characters as fit.
         *
         * @return OVERFLOW if {@code out} cannot take the next character,
         *         a malformed-input result of length 1 for an unpaired
         *         surrogate, otherwise UNDERFLOW (also when the input ends
         *         with a high surrogate whose partner has not arrived yet)
         */
        @Override
        protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
            if (in.hasArray() && out.hasArray()) {
                return encodeArrays(in, out);
            }
            return encodeBuffers(in, out);
        }

        /** Fast path working directly on the backing arrays. */
        private CoderResult encodeArrays(CharBuffer in, ByteBuffer out) {
            final char[] src = in.array();
            final byte[] dst = out.array();
            // Buffer positions are relative to arrayOffset(), which is
            // non-zero for sliced buffers.
            final int srcOffset = in.arrayOffset();
            final int dstOffset = out.arrayOffset();
            final int srcLimit = srcOffset + in.limit();
            final int dstLimit = dstOffset + out.limit();
            int srcPos = srcOffset + in.position();
            int dstPos = dstOffset + out.position();
            try {
                while (srcPos < srcLimit) {
                    final char ch = src[srcPos];
                    int codePoint = ch;
                    int consumed = 1;
                    if (Character.isLowSurrogate(ch)) {
                        // A pair must start with a high surrogate.
                        return CoderResult.malformedForLength(1);
                    }
                    if (Character.isHighSurrogate(ch)) {
                        if (srcPos + 1 >= srcLimit) {
                            // The low surrogate may arrive with the next chunk.
                            return CoderResult.UNDERFLOW;
                        }
                        final char low = src[srcPos + 1];
                        if (!Character.isLowSurrogate(low)) {
                            return CoderResult.malformedForLength(1);
                        }
                        codePoint = Character.toCodePoint(ch, low);
                        consumed = 2;
                    }
                    final int length = encodedLength(codePoint);
                    if (dstLimit - dstPos < length) {
                        return CoderResult.OVERFLOW;
                    }
                    putCodePoint(dst, dstPos, codePoint, length);
                    dstPos += length;
                    srcPos += consumed;
                }
                // All input consumed: this is UNDERFLOW even if the output
                // happens to be exactly full.
                return CoderResult.UNDERFLOW;
            } finally {
                in.position(srcPos - srcOffset);
                out.position(dstPos - dstOffset);
            }
        }

        /**
         * General path for buffers without an accessible array. Input is read
         * with absolute gets so that the position only moves past fully
         * encoded characters.
         */
        private CoderResult encodeBuffers(CharBuffer in, ByteBuffer out) {
            final byte[] scratch = new byte[4];
            final int limit = in.limit();
            int pos = in.position();
            try {
                while (pos < limit) {
                    final char ch = in.get(pos);
                    int codePoint = ch;
                    int consumed = 1;
                    if (Character.isLowSurrogate(ch)) {
                        return CoderResult.malformedForLength(1);
                    }
                    if (Character.isHighSurrogate(ch)) {
                        if (pos + 1 >= limit) {
                            return CoderResult.UNDERFLOW;
                        }
                        final char low = in.get(pos + 1);
                        if (!Character.isLowSurrogate(low)) {
                            return CoderResult.malformedForLength(1);
                        }
                        codePoint = Character.toCodePoint(ch, low);
                        consumed = 2;
                    }
                    final int length = encodedLength(codePoint);
                    if (out.remaining() < length) {
                        return CoderResult.OVERFLOW;
                    }
                    putCodePoint(scratch, 0, codePoint, length);
                    out.put(scratch, 0, length);
                    pos += consumed;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                in.position(pos);
            }
        }
    }
}
|