001: /*
002: * Copyright 1997-2006 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025: package sun.io;
026:
027: import sun.nio.cs.ext.IBM933;
028:
029: /**
030: * @author Malcolm Ayres
031: */
032:
033: public class CharToByteCp933 extends CharToByteConverter {
034: private static final char SBase = '\uAC00';
035: private static final char LBase = '\u1100';
036: private static final char VBase = '\u1161';
037: private static final char TBase = '\u11A7';
038: private static final int VCount = 21;
039: private static final int TCount = 28;
040: private static final byte G0 = 0;
041: private static final byte G1 = 1;
042: private static final byte G2 = 2;
043: private static final byte G3 = 3;
044: private byte charState = G0;
045: private char l, v, t;
046:
047: private int byteState;
048: private byte[] outputByte;
049: private static final int SBCS = 0;
050: private static final int DBCS = 1;
051: private static final byte SO = 0x0e;
052: private static final byte SI = 0x0f;
053:
054: private char highHalfZoneCode;
055:
056: private short[] index1;
057: private String index2;
058: private String index2a;
059: private int mask1;
060: private int mask2;
061: private int shift;
062:
063: private final static IBM933 nioCoder = new IBM933();
064:
065: public CharToByteCp933() {
066: super ();
067: byteState = doSBCS() ? SBCS : DBCS;
068: highHalfZoneCode = 0;
069: outputByte = new byte[2];
070: mask1 = 0xFFF8;
071: mask2 = 0x0007;
072: shift = 3;
073: index1 = nioCoder.getEncoderIndex1();
074: index2 = nioCoder.getEncoderIndex2();
075: index2a = nioCoder.getEncoderIndex2a();
076: subBytes = new byte[1];
077: subBytes[0] = 0x6f;
078: }
079:
080: /**
081: * flush out any residual data and reset the buffer state
082: */
083: public int flush(byte[] output, int outStart, int outEnd)
084: throws MalformedInputException,
085: ConversionBufferFullException {
086: int bytesOut;
087:
088: byteOff = outStart;
089:
090: if (highHalfZoneCode != 0) {
091: reset();
092: badInputLength = 0;
093: throw new MalformedInputException();
094: }
095:
096: if (charState != G0) {
097: try {
098: unicodeToBuffer(composeHangul(), output, outEnd);
099: } catch (UnknownCharacterException e) {
100: reset();
101: badInputLength = 0;
102: throw new MalformedInputException();
103: }
104: charState = G0;
105: }
106:
107: if (byteState == DBCS && doSBCS()) {
108: if (byteOff >= outEnd)
109: throw new ConversionBufferFullException();
110: output[byteOff++] = SI;
111: byteState = SBCS;
112: }
113:
114: bytesOut = byteOff - outStart;
115:
116: reset();
117: return bytesOut;
118: }
119:
120: /**
121: * Resets converter to its initial state.
122: */
123: public void reset() {
124: byteState = doSBCS() ? SBCS : DBCS;
125: highHalfZoneCode = 0;
126: charState = G0;
127: charOff = byteOff = 0;
128: }
129:
130: /**
131: * Returns true if the given character can be converted to the
132: * target character encoding.
133: */
134: public boolean canConvert(char ch) {
135: return encodeHangul(ch) != -1;
136: }
137:
138: /**
139: * Sets the substitution bytes to use when the converter is in
140: * substitution mode. The given bytes should represent a valid
141: * character in the target character encoding.
142: */
143:
144: public void setSubstitutionBytes(byte[] newSubBytes)
145: throws IllegalArgumentException {
146: if (newSubBytes.length > 2 || newSubBytes.length == 0) {
147: throw new IllegalArgumentException();
148: }
149:
150: subBytes = new byte[newSubBytes.length];
151: System.arraycopy(newSubBytes, 0, subBytes, 0,
152: newSubBytes.length);
153:
154: }
155:
156: /**
157: * Character conversion
158: */
159:
160: public int convert(char[] input, int inOff, int inEnd,
161: byte[] output, int outOff, int outEnd)
162: throws UnknownCharacterException, MalformedInputException,
163: ConversionBufferFullException {
164: char inputChar;
165: int inputSize;
166:
167: charOff = inOff;
168: byteOff = outOff;
169:
170: while (charOff < inEnd) {
171:
172: if (highHalfZoneCode == 0) {
173: inputChar = input[charOff];
174: inputSize = 1;
175: } else {
176: inputChar = highHalfZoneCode;
177: inputSize = 0;
178: highHalfZoneCode = 0;
179: }
180:
181: switch (charState) {
182: case G0:
183:
184: l = LBase;
185: v = VBase;
186: t = TBase;
187:
188: if (isLeadingC(inputChar)) { // Leading Consonant
189: l = inputChar;
190: charState = G1;
191: break;
192: }
193:
194: if (isVowel(inputChar)) { // Vowel
195: v = inputChar;
196: charState = G2;
197: break;
198: }
199:
200: if (isTrailingC(inputChar)) { // Trailing Consonant
201: t = inputChar;
202: charState = G3;
203: break;
204: }
205:
206: break;
207:
208: case G1:
209: if (isLeadingC(inputChar)) { // Leading Consonant
210: l = composeLL(l, inputChar);
211: break;
212: }
213:
214: if (isVowel(inputChar)) { // Vowel
215: v = inputChar;
216: charState = G2;
217: break;
218: }
219:
220: if (isTrailingC(inputChar)) { // Trailing Consonant
221: t = inputChar;
222: charState = G3;
223: break;
224: }
225:
226: unicodeToBuffer(composeHangul(), output, outEnd);
227:
228: charState = G0;
229: break;
230:
231: case G2:
232: if (isLeadingC(inputChar)) { // Leading Consonant
233:
234: unicodeToBuffer(composeHangul(), output, outEnd);
235:
236: l = inputChar;
237: v = VBase;
238: t = TBase;
239: charState = G1;
240: break;
241: }
242:
243: if (isVowel(inputChar)) { // Vowel
244: v = composeVV(l, inputChar);
245: charState = G2;
246: break;
247: }
248:
249: if (isTrailingC(inputChar)) { // Trailing Consonant
250: t = inputChar;
251: charState = G3;
252: break;
253: }
254:
255: unicodeToBuffer(composeHangul(), output, outEnd);
256:
257: charState = G0;
258:
259: break;
260:
261: case G3:
262: if (isTrailingC(inputChar)) { // Trailing Consonant
263: t = composeTT(t, inputChar);
264: charState = G3;
265: break;
266: }
267:
268: unicodeToBuffer(composeHangul(), output, outEnd);
269:
270: charState = G0;
271:
272: break;
273: }
274:
275: if (charState != G0)
276: charOff++;
277: else {
278:
279: // Is this a high surrogate?
280: if (inputChar >= '\ud800' && inputChar <= '\udbff') {
281: // Is this the last character of the input?
282: if (charOff + inputSize >= inEnd) {
283: highHalfZoneCode = inputChar;
284: charOff += inputSize;
285: break;
286: }
287:
288: // Is there a low surrogate following?
289: inputChar = input[charOff + inputSize];
290: if (inputChar >= '\udc00' && inputChar <= '\udfff') {
291: // We have a valid surrogate pair. Too bad we don't do
292: // surrogates. Is substitution enabled?
293: if (subMode) {
294: if (subBytes.length == 1) {
295: outputByte[0] = 0x00;
296: outputByte[1] = subBytes[0];
297: } else {
298: outputByte[0] = subBytes[0];
299: outputByte[1] = subBytes[1];
300: }
301:
302: bytesToBuffer(outputByte, output, outEnd);
303: inputSize++;
304: } else {
305: badInputLength = 2;
306: throw new UnknownCharacterException();
307: }
308: } else {
309: // We have a malformed surrogate pair
310: badInputLength = 1;
311: throw new MalformedInputException();
312: }
313: }
314:
315: // Is this an unaccompanied low surrogate?
316: else if (inputChar >= '\uDC00' && inputChar <= '\uDFFF') {
317: badInputLength = 1;
318: throw new MalformedInputException();
319: } else {
320: unicodeToBuffer(inputChar, output, outEnd);
321: }
322:
323: charOff += inputSize;
324:
325: }
326:
327: }
328:
329: return byteOff - outOff;
330:
331: }
332:
333: private char composeHangul() {
334: int lIndex, vIndex, tIndex;
335:
336: lIndex = l - LBase;
337: vIndex = v - VBase;
338: tIndex = t - TBase;
339:
340: return (char) ((lIndex * VCount + vIndex) * TCount + tIndex + SBase);
341: }
342:
343: private char composeLL(char l1, char l2) {
344: return l2;
345: }
346:
347: private char composeVV(char v1, char v2) {
348: return v2;
349: }
350:
351: private char composeTT(char t1, char t2) {
352: return t2;
353: }
354:
355: private boolean isLeadingC(char c) {
356: return (c >= LBase && c <= '\u1159');
357: }
358:
359: private boolean isVowel(char c) {
360: return (c >= VBase && c <= '\u11a2');
361: }
362:
363: private boolean isTrailingC(char c) {
364: return (c >= TBase && c <= '\u11f9');
365: }
366:
367: /**
368: * returns the maximum number of bytes needed to convert a char
369: */
370: public int getMaxBytesPerChar() {
371: return 4;
372: }
373:
374: /**
375: * Return the character set ID
376: */
377: public String getCharacterEncoding() {
378: return "Cp933";
379: }
380:
381: /**
382: * private function to add the bytes to the output buffer
383: */
384: private void bytesToBuffer(byte[] theBytes, byte[] output,
385: int outEnd) throws ConversionBufferFullException,
386: UnknownCharacterException {
387:
388: int spaceNeeded;
389:
390: // Set the output buffer into the correct state
391:
392: if (byteState == DBCS && theBytes[0] == 0x00) {
393: if (byteOff >= outEnd)
394: throw new ConversionBufferFullException();
395: byteState = SBCS;
396: output[byteOff++] = SI;
397: } else if (byteState == SBCS && theBytes[0] != 0x00) {
398: if (byteOff >= outEnd)
399: throw new ConversionBufferFullException();
400: byteState = DBCS;
401: output[byteOff++] = SO;
402: }
403:
404: // ensure sufficient space for the bytes(s)
405:
406: if (byteState == DBCS)
407: spaceNeeded = 2;
408: else
409: spaceNeeded = 1;
410:
411: if (byteOff + spaceNeeded > outEnd)
412: throw new ConversionBufferFullException();
413:
414: // move the data into the buffer
415:
416: if (byteState == SBCS)
417: output[byteOff++] = theBytes[1];
418: else {
419: output[byteOff++] = theBytes[0];
420: output[byteOff++] = theBytes[1];
421: }
422: }
423:
424: // return -1 for unmappable character
425: protected int encodeHangul(char unicode) {
426: int theBytes;
427: int index;
428: index = index1[((unicode & mask1) >> shift)]
429: + (unicode & mask2);
430: if (index < 15000)
431: theBytes = (int) (index2.charAt(index));
432: else
433: theBytes = (int) (index2a.charAt(index - 15000));
434:
435: // The input char is undefined if theBytes is 0 and the char is NOT unicode 0
436: if (theBytes == 0 && unicode != '\u0000')
437: return -1;
438: return theBytes;
439: }
440:
441: /**
442: * private function to add a unicode character to the output buffer
443: */
444: private void unicodeToBuffer(char unicode, byte[] output, int outEnd)
445: throws ConversionBufferFullException,
446: UnknownCharacterException {
447:
448: // first we convert the unicode to its byte representation
449: int theBytes = encodeHangul(unicode);
450:
451: // if the unicode was not mappable - look for the substitution bytes
452: if (theBytes == -1) {
453: if (subMode) {
454: if (subBytes.length == 1) {
455: outputByte[0] = 0x00;
456: outputByte[1] = subBytes[0];
457: } else {
458: outputByte[0] = subBytes[0];
459: outputByte[1] = subBytes[1];
460: }
461: } else {
462: badInputLength = 1;
463: throw new UnknownCharacterException();
464: }
465: } else {
466: outputByte[0] = (byte) ((theBytes & 0x0000ff00) >> 8);
467: outputByte[1] = (byte) (theBytes & 0x000000ff);
468: }
469:
470: // now put the bytes in the buffer
471: bytesToBuffer(outputByte, output, outEnd);
472: }
473:
474: //Methods below are for subclass Cp834
475: protected boolean doSBCS() {
476: return true;
477: }
478: }
|