0001: /**
0002: *******************************************************************************
0003: * Copyright (C) 2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: *
0007: *******************************************************************************
0008: */package com.ibm.icu.charset;
0009:
0010: import java.io.BufferedInputStream;
0011: import java.io.IOException;
0012: import java.io.InputStream;
0013: import java.nio.Buffer;
0014: import java.nio.BufferOverflowException;
0015: import java.nio.ByteBuffer;
0016: import java.nio.CharBuffer;
0017: import java.nio.IntBuffer;
0018: import java.nio.charset.CharsetDecoder;
0019: import java.nio.charset.CharsetEncoder;
0020: import java.nio.charset.CoderResult;
0021:
0022: import com.ibm.icu.charset.UConverterSharedData.UConverterType;
0023: import com.ibm.icu.impl.ICUData;
0024: import com.ibm.icu.impl.ICUResourceBundle;
0025: import com.ibm.icu.impl.InvalidFormatException;
0026: import com.ibm.icu.lang.UCharacter;
0027: import com.ibm.icu.text.UTF16;
0028:
0029: class CharsetMBCS extends CharsetICU {
0030:
0031: protected byte[] fromUSubstitution = null;
0032: protected UConverterSharedData sharedData = null;
0033: static final int MAX_VERSION_LENGTH = 4;
0034:
/**
 * One fallback mapping to Unicode.
 * These mappings are kept apart from the normal state table and code point structures,
 * in a vector of items of this type that is sorted by {@link #offset}.
 */
final class MBCSToUFallback {
    /** Sort key of this item within the fallback vector. */
    int offset;

    /** Code point stored for this fallback. */
    int codePoint;
}
0043:
0044: /**
0045: * This is the MBCS part of the UConverterTable union (a runtime data structure).
0046: * It keeps all the per-converter data and points into the loaded mapping tables.
0047: */
0048: static final class UConverterMBCSTable {
0049: /* toUnicode */
0050: short countStates;
0051: byte dbcsOnlyState;
0052: boolean stateTableOwned;
0053: int countToUFallbacks;
0054:
0055: int stateTable[/*countStates*/][/*256*/];
0056: int swapLFNLStateTable[/*countStates*/][/*256*/]; /* for swaplfnl */
0057: char unicodeCodeUnits[/*countUnicodeResults*/];
0058: MBCSToUFallback toUFallbacks[/*countToUFallbacks*/];
0059:
0060: /* fromUnicode */
0061: char fromUnicodeTable[];
0062: byte fromUnicodeBytes[];
0063: byte swapLFNLFromUnicodeBytes[]; /* for swaplfnl */
0064: int fromUBytesLength;
0065: short outputType, unicodeMask;
0066:
0067: /* converter name for swaplfnl */
0068: String swapLFNLName;
0069:
0070: /* extension data */
0071: UConverterSharedData baseSharedData;
0072: //int extIndexes[];
0073: ByteBuffer extIndexes; // create int[] view etc. as needed
0074:
0075: UConverterMBCSTable() {
0076: }
0077:
0078: /* UConverterMBCSTable(UConverterMBCSTable t)
0079: {
0080: countStates = t.countStates;
0081: dbcsOnlyState = t.dbcsOnlyState;
0082: stateTableOwned = t.stateTableOwned;
0083: countToUFallbacks = t.countToUFallbacks;
0084: stateTable = t.stateTable;
0085: swapLFNLStateTable = t.swapLFNLStateTable;
0086: unicodeCodeUnits = t.unicodeCodeUnits;
0087: toUFallbacks = t.toUFallbacks;
0088: fromUnicodeTable = t.fromUnicodeTable;
0089: fromUnicodeBytes = t.fromUnicodeBytes;
0090: swapLFNLFromUnicodeBytes = t.swapLFNLFromUnicodeBytes;
0091: fromUBytesLength = t.fromUBytesLength;
0092: outputType = t.outputType;
0093: unicodeMask = t.unicodeMask;
0094: swapLFNLName = t.swapLFNLName;
0095: baseSharedData = t.baseSharedData;
0096: extIndexes = t.extIndexes;
0097: }*/
0098: }
0099:
0100: /**
0101: * MBCS data header. See data format description above.
0102: */
0103: final class MBCSHeader {
0104: byte version[/*U_MAX_VERSION_LENGTH*/];
0105: int countStates, countToUFallbacks, offsetToUCodeUnits,
0106: offsetFromUTable, offsetFromUBytes;
0107: int flags;
0108: int fromUBytesLength;
0109:
0110: MBCSHeader() {
0111: version = new byte[MAX_VERSION_LENGTH];
0112: }
0113: }
0114:
/**
 * Creates the charset by loading the converter data named by {@code icuCanonicalName}
 * and copying its static settings (byte/char ratios and substitution characters)
 * into this object.
 *
 * @param icuCanonicalName  ICU name of the converter; also the name of the data to load
 * @param javaCanonicalName passed through to the superclass constructor
 * @param aliases           passed through to the superclass constructor
 * @throws InvalidFormatException if loadConverter() rejects the converter data
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases)
        throws InvalidFormatException {
    super(icuCanonicalName, javaCanonicalName, aliases);

    // Load the data; 1 marks this as the outermost loadConverter() call.
    sharedData = loadConverter(new LoadArguments(1, icuCanonicalName));

    maxBytesPerChar = sharedData.staticData.maxBytesPerChar;
    minBytesPerChar = sharedData.staticData.minBytesPerChar;
    maxCharsPerByte = 1;

    subChar = sharedData.staticData.subChar;
    subCharLen = sharedData.staticData.subCharLen;
    subChar1 = sharedData.staticData.subChar1;

    // Keep a private copy of the first subCharLen substitution bytes.
    fromUSubstitution = new byte[sharedData.staticData.subCharLen];
    System.arraycopy(sharedData.staticData.subChar, 0,
            fromUSubstitution, 0, sharedData.staticData.subCharLen);

    // TODO: pass options
    initializeConverter(0);
}
0143:
0144: class LoadArguments {
0145: int nestedLoads; /* count nested loadConverter() calls */
0146: // int reserved; /* reserved - for good alignment of the pointers */
0147: // long options;
0148: // String pkg;
0149: String name;
0150:
0151: LoadArguments(int nestedLoads, String name) {
0152: this .nestedLoads = nestedLoads;
0153: this .name = name;
0154: }
0155: }
0156:
0157: protected UConverterSharedData loadConverter(LoadArguments args)
0158: throws InvalidFormatException {
0159: // Read converter data from file
0160: UConverterStaticData staticData = new UConverterStaticData();
0161: UConverterDataReader reader = null;
0162: try {
0163: InputStream i = ICUData
0164: .getRequiredStream(ICUResourceBundle.ICU_BUNDLE
0165: + "/" + args.name + "."
0166: + UConverterSharedData.DATA_TYPE);
0167: BufferedInputStream b = new BufferedInputStream(i,
0168: UConverterConstants.CNV_DATA_BUFFER_SIZE);
0169: reader = new UConverterDataReader(b);
0170: reader.readStaticData(staticData);
0171: } catch (IOException e) {
0172: throw new InvalidFormatException();
0173: } catch (Exception e) {
0174: throw new InvalidFormatException();
0175: }
0176:
0177: UConverterSharedData data = null;
0178: int type = staticData.conversionType;
0179:
0180: if (type != UConverterSharedData.UConverterType.MBCS
0181: || staticData.structSize != UConverterSharedData.SIZE_OF_UCONVERTER_SHARED_DATA) {
0182: throw new InvalidFormatException();
0183: }
0184:
0185: data = new UConverterSharedData(
0186: UConverterSharedData.SIZE_OF_UCONVERTER_SHARED_DATA, 1,
0187: null, false, 0);
0188: data.dataReader = reader;
0189: data.staticData = staticData;
0190: data.sharedDataCached = false;
0191:
0192: // Load data
0193: UConverterMBCSTable mbcsTable = data.mbcs;
0194: MBCSHeader header = new MBCSHeader();
0195: try {
0196: reader.readMBCSHeader(header);
0197: } catch (IOException e) {
0198: throw new InvalidFormatException();
0199: }
0200:
0201: int offset;
0202: //int[] extIndexesArray = null;
0203: String baseNameString = null;
0204: int[][] stateTableArray = null;
0205: MBCSToUFallback[] toUFallbacksArray = null;
0206: char[] unicodeCodeUnitsArray = null;
0207: char[] fromUnicodeTableArray = null;
0208: byte[] fromUnicodeBytesArray = null;
0209:
0210: if (header.version[0] != 4) {
0211: throw new InvalidFormatException();
0212: }
0213:
0214: mbcsTable.outputType = (byte) header.flags;
0215:
0216: /* extension data, header version 4.2 and higher */
0217: offset = header.flags >>> 8;
0218: //if(offset!=0 && mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
0219: if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
0220: try {
0221: baseNameString = reader.readBaseTableName();
0222: if (offset != 0) {
0223: //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read;
0224: mbcsTable.extIndexes = reader.readExtIndexes(offset
0225: - 32 - baseNameString.length() - 1);
0226: }
0227: } catch (IOException e) {
0228: throw new InvalidFormatException();
0229: }
0230: }
0231: /*
0232: if(offset != 0) {
0233: try {
0234: //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read;
0235: int namelen = baseNameString != null? baseNameString.length() + 1: 0;
0236: mbcsTable.extIndexes=dataReader.readExtIndexes(offset - 32 - namelen);
0237:
0238: }
0239: catch(IOException e) {
0240: if(debug) System.err.println("Caught IOException: " + e.getMessage());
0241: pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR;
0242: return;
0243: }
0244: }
0245: */
0246: //agljport:add this would be unnecessary if extIndexes were memory mapped
0247: if (mbcsTable.extIndexes != null) {
0248: /*
0249: try {
0250: //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_LENGTH]*4 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_UCHARS_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_LENGTH]*6 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_BYTES_LENGTH] + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_12_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3B_LENGTH]*4;
0251: //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_SIZE]
0252: //byte[] extTables = dataReader.readExtTables(nbytes);
0253: //mbcsTable.extTables = ByteBuffer.wrap(extTables);
0254: }
0255: catch(IOException e) {
0256: System.err.println("Caught IOException: " + e.getMessage());
0257: pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR;
0258: return;
0259: }
0260: */
0261: }
0262:
0263: if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
0264: UConverterSharedData baseSharedData = null;
0265: ByteBuffer extIndexes;
0266: String baseName;
0267:
0268: /* extension-only file, load the base table and set values appropriately */
0269: if ((extIndexes = mbcsTable.extIndexes) == null) {
0270: /* extension-only file without extension */
0271: throw new InvalidFormatException();
0272: }
0273:
0274: if (args.nestedLoads != 1) {
0275: /* an extension table must not be loaded as a base table */
0276: throw new InvalidFormatException();
0277: }
0278:
0279: /* load the base table */
0280: baseName = baseNameString;
0281: if (baseName.equals(staticData.name)) {
0282: /* forbid loading this same extension-only file */
0283: throw new InvalidFormatException();
0284: }
0285:
0286: /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
0287: //agljport:fix args.size=sizeof(UConverterLoadArgs);
0288: LoadArguments args2 = new LoadArguments(2, baseName);
0289: baseSharedData = loadConverter(args2);
0290:
0291: if (baseSharedData.staticData.conversionType != UConverterType.MBCS
0292: || baseSharedData.mbcs.baseSharedData != null) {
0293: //agljport:fix ucnv_unload(baseSharedData);
0294: throw new InvalidFormatException();
0295: }
0296:
0297: /* copy the base table data */
0298: //agljport:comment deep copy in C changes mbcs through local reference mbcsTable; in java we probably don't need the deep copy so can just make sure mbcs and its local reference both refer to the same new object
0299: mbcsTable = data.mbcs = baseSharedData.mbcs;
0300:
0301: /* overwrite values with relevant ones for the extension converter */
0302: mbcsTable.baseSharedData = baseSharedData;
0303: mbcsTable.extIndexes = extIndexes;
0304:
0305: /*
0306: * It would be possible to share the swapLFNL data with a base converter,
0307: * but the generated name would have to be different, and the memory
0308: * would have to be free'd only once.
0309: * It is easier to just create the data for the extension converter
0310: * separately when it is requested.
0311: */
0312: mbcsTable.swapLFNLStateTable = null;
0313: mbcsTable.swapLFNLFromUnicodeBytes = null;
0314: mbcsTable.swapLFNLName = null;
0315:
0316: /*
0317: * Set a special, runtime-only outputType if the extension converter
0318: * is a DBCS version of a base converter that also maps single bytes.
0319: */
0320: if (staticData.conversionType == UConverterType.DBCS
0321: || (staticData.conversionType == UConverterType.MBCS && staticData.minBytesPerChar >= 2)) {
0322:
0323: if (baseSharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) {
0324: /* the base converter is SI/SO-stateful */
0325: int entry;
0326:
0327: /* get the dbcs state from the state table entry for SO=0x0e */
0328: entry = mbcsTable.stateTable[0][0xe];
0329: if (MBCS_ENTRY_IS_FINAL(entry)
0330: && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_CHANGE_ONLY
0331: && MBCS_ENTRY_FINAL_STATE(entry) != 0) {
0332: mbcsTable.dbcsOnlyState = (byte) MBCS_ENTRY_FINAL_STATE(entry);
0333:
0334: mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY;
0335: }
0336: } else if (baseSharedData.staticData.conversionType == UConverterType.MBCS
0337: && baseSharedData.staticData.minBytesPerChar == 1
0338: && baseSharedData.staticData.maxBytesPerChar == 2
0339: && mbcsTable.countStates <= 127) {
0340:
0341: /* non-stateful base converter, need to modify the state table */
0342: int newStateTable[][/*256*/];
0343: int state[]; // this works because java 2-D array is array of references and we can have state = newStateTable[i];
0344: int i, count;
0345:
0346: /* allocate a new state table and copy the base state table contents */
0347: count = mbcsTable.countStates;
0348: newStateTable = new int[(count + 1) * 1024][256];
0349:
0350: for (i = 0; i < mbcsTable.stateTable.length; ++i)
0351: System.arraycopy(mbcsTable.stateTable[i], 0,
0352: newStateTable[i], 0,
0353: mbcsTable.stateTable[i].length);
0354:
0355: /* change all final single-byte entries to go to a new all-illegal state */
0356: state = newStateTable[0];
0357: for (i = 0; i < 256; ++i) {
0358: if (MBCS_ENTRY_IS_FINAL(state[i])) {
0359: state[i] = MBCS_ENTRY_TRANSITION(count, 0);
0360: }
0361: }
0362:
0363: /* build the new all-illegal state */
0364: state = newStateTable[count];
0365: for (i = 0; i < 256; ++i) {
0366: state[i] = MBCS_ENTRY_FINAL(0,
0367: MBCS_STATE_ILLEGAL, 0);
0368: }
0369: mbcsTable.stateTable = newStateTable;
0370: mbcsTable.countStates = (byte) (count + 1);
0371: mbcsTable.stateTableOwned = true;
0372:
0373: mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY;
0374: }
0375: }
0376:
0377: /*
0378: * unlike below for files with base tables, do not get the unicodeMask
0379: * from the sharedData; instead, use the base table's unicodeMask,
0380: * which we copied in the memcpy above;
0381: * this is necessary because the static data unicodeMask, especially
0382: * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
0383: */
0384: } else {
0385: /* conversion file with a base table; an additional extension table is optional */
0386: /* make sure that the output type is known */
0387: switch (mbcsTable.outputType) {
0388: case MBCS_OUTPUT_1:
0389: case MBCS_OUTPUT_2:
0390: case MBCS_OUTPUT_3:
0391: case MBCS_OUTPUT_4:
0392: case MBCS_OUTPUT_3_EUC:
0393: case MBCS_OUTPUT_4_EUC:
0394: case MBCS_OUTPUT_2_SISO:
0395: /* OK */
0396: break;
0397: default:
0398: throw new InvalidFormatException();
0399: }
0400:
0401: stateTableArray = new int[header.countStates][256];
0402: toUFallbacksArray = new MBCSToUFallback[header.countToUFallbacks];
0403: for (int i = 0; i < toUFallbacksArray.length; ++i)
0404: toUFallbacksArray[i] = new MBCSToUFallback();
0405: unicodeCodeUnitsArray = new char[(header.offsetFromUTable - header.offsetToUCodeUnits) / 2];
0406: fromUnicodeTableArray = new char[(header.offsetFromUBytes - header.offsetFromUTable) / 2];
0407: fromUnicodeBytesArray = new byte[header.fromUBytesLength];
0408: try {
0409: reader.readMBCSTable(stateTableArray,
0410: toUFallbacksArray, unicodeCodeUnitsArray,
0411: fromUnicodeTableArray, fromUnicodeBytesArray);
0412: } catch (IOException e) {
0413: throw new InvalidFormatException();
0414: }
0415:
0416: mbcsTable.countStates = (byte) header.countStates;
0417: mbcsTable.countToUFallbacks = header.countToUFallbacks;
0418: mbcsTable.stateTable = stateTableArray;
0419: mbcsTable.toUFallbacks = toUFallbacksArray;
0420: mbcsTable.unicodeCodeUnits = unicodeCodeUnitsArray;
0421:
0422: mbcsTable.fromUnicodeTable = fromUnicodeTableArray;
0423: mbcsTable.fromUnicodeBytes = fromUnicodeBytesArray;
0424: mbcsTable.fromUBytesLength = header.fromUBytesLength;
0425:
0426: /*
0427: * converter versions 6.1 and up contain a unicodeMask that is
0428: * used here to select the most efficient function implementations
0429: */
0430: //agljport:fix info.size=sizeof(UDataInfo);
0431: //agljport:fix udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
0432: //agljport:fix if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
0433: /* mask off possible future extensions to be safe */
0434: mbcsTable.unicodeMask = (short) (staticData.unicodeMask & 3);
0435: //agljport:fix } else {
0436: /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
0437: //agljport:fix mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
0438: //agljport:fix }
0439: if (offset != 0) {
0440: try {
0441: //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read;
0442: //int namelen = baseNameString != null? baseNameString.length() + 1: 0;
0443: //mbcsTable.extIndexes=dataReader.readExtIndexes(offset - 32 - namelen);
0444: mbcsTable.extIndexes = reader.readExtIndexes(0);
0445: } catch (IOException e) {
0446: throw new InvalidFormatException();
0447: }
0448: }
0449: }
0450: return data;
0451: }
0452:
0453: protected void initializeConverter(int options) {
0454: UConverterMBCSTable mbcsTable;
0455: ByteBuffer extIndexes;
0456: short outputType;
0457: byte maxBytesPerUChar;
0458:
0459: mbcsTable = sharedData.mbcs;
0460: outputType = mbcsTable.outputType;
0461:
0462: if (outputType == MBCS_OUTPUT_DBCS_ONLY) {
0463: /* the swaplfnl option does not apply, remove it */
0464: this .options = options &= ~UConverterConstants.OPTION_SWAP_LFNL;
0465: }
0466:
0467: if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
0468: /* do this because double-checked locking is broken */
0469: boolean isCached;
0470:
0471: //agljport:todo umtx_lock(NULL);
0472: isCached = mbcsTable.swapLFNLStateTable != null;
0473: //agljport:todo umtx_unlock(NULL);
0474:
0475: if (!isCached) {
0476: //agljport:fix if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
0477: //agljport:fix if(U_FAILURE(*pErrorCode)) {
0478: //agljport:fix return; /* something went wrong */
0479: //agljport:fix }
0480:
0481: /* the option does not apply, remove it */
0482: //agljport:fix cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
0483: //agljport:fix }
0484: }
0485: }
0486:
0487: if (icuCanonicalName.toLowerCase().indexOf("gb18030") >= 0) {
0488: /* set a flag for GB 18030 mode, which changes the callback behavior */
0489: this .options |= MBCS_OPTION_GB18030;
0490: }
0491:
0492: /* fix maxBytesPerUChar depending on outputType and options etc. */
0493: if (outputType == MBCS_OUTPUT_2_SISO) {
0494: maxBytesPerChar = 3; /* SO+DBCS */
0495: }
0496:
0497: extIndexes = mbcsTable.extIndexes;
0498: if (extIndexes != null) {
0499: maxBytesPerUChar = (byte) GET_MAX_BYTES_PER_UCHAR(extIndexes);
0500: if (outputType == MBCS_OUTPUT_2_SISO) {
0501: ++maxBytesPerUChar; /* SO + multiple DBCS */
0502: }
0503:
0504: if (maxBytesPerUChar > maxBytesPerChar) {
0505: maxBytesPerChar = maxBytesPerUChar;
0506: }
0507: }
0508: }
0509:
0510: /**
0511: * MBCS output types for conversions from Unicode.
0512: * These per-converter types determine the storage method in stage 3 of the lookup table,
0513: * mostly how many bytes are stored per entry.
0514: */
0515: protected static final int MBCS_OUTPUT_1 = 0; /* 0 */
0516: protected static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */
0517: protected static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */
0518: protected static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */
0519: protected static final int MBCS_OUTPUT_3_EUC = 8; /* 8 */
0520: protected static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */
0521: protected static final int MBCS_OUTPUT_2_SISO = 12; /* c */
0522: protected static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */
0523: protected static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */
0524: protected static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1;
0525: protected static final int MBCS_OUTPUT_DBCS_ONLY = 0xdb; /* runtime-only type for DBCS-only handling of SISO tables */
0526:
0527: /* GB 18030 data ------------------------------------------------------------ */
0528:
0529: /* helper macros for linear values for GB 18030 four-byte sequences */
0530: protected static long LINEAR_18030(long a, long b, long c, long d) {
0531: return ((((a) * 10 + (b)) * 126L + (c)) * 10L + (d));
0532: }
0533:
0534: protected static long LINEAR_18030_BASE = LINEAR_18030(0x81, 0x30,
0535: 0x81, 0x30);
0536:
0537: protected static long LINEAR(long x) {
0538: return LINEAR_18030(x >>> 24, (x >>> 16) & 0xff,
0539: (x >>> 8) & 0xff, x & 0xff);
0540: }
0541:
0542: /*
0543: * Some ranges of GB 18030 where both the Unicode code points and the
0544: * GB four-byte sequences are contiguous and are handled algorithmically by
0545: * the special callback functions below.
0546: * The values are start & end of Unicode & GB codes.
0547: *
0548: * Note that single surrogates are not mapped by GB 18030
0549: * as of the re-released mapping tables from 2000-nov-30.
0550: */
0551: protected static final long gb18030Ranges[][] = new long[/*13*/][/*4*/] {
0552: { 0x10000L, 0x10FFFFL, LINEAR(0x90308130L),
0553: LINEAR(0xE3329A35L) },
0554: { 0x9FA6L, 0xD7FFL, LINEAR(0x82358F33L),
0555: LINEAR(0x8336C738L) },
0556: { 0x0452L, 0x200FL, LINEAR(0x8130D330L),
0557: LINEAR(0x8136A531L) },
0558: { 0xE865L, 0xF92BL, LINEAR(0x8336D030L),
0559: LINEAR(0x84308534L) },
0560: { 0x2643L, 0x2E80L, LINEAR(0x8137A839L),
0561: LINEAR(0x8138FD38L) },
0562: { 0xFA2AL, 0xFE2FL, LINEAR(0x84309C38L),
0563: LINEAR(0x84318537L) },
0564: { 0x3CE1L, 0x4055L, LINEAR(0x8231D438L),
0565: LINEAR(0x8232AF32L) },
0566: { 0x361BL, 0x3917L, LINEAR(0x8230A633L),
0567: LINEAR(0x8230F237L) },
0568: { 0x49B8L, 0x4C76L, LINEAR(0x8234A131L),
0569: LINEAR(0x8234E733L) },
0570: { 0x4160L, 0x4336L, LINEAR(0x8232C937L),
0571: LINEAR(0x8232F837L) },
0572: { 0x478EL, 0x4946L, LINEAR(0x8233E838L),
0573: LINEAR(0x82349638L) },
0574: { 0x44D7L, 0x464BL, LINEAR(0x8233A339L),
0575: LINEAR(0x8233C931L) },
0576: { 0xFFE6L, 0xFFFFL, LINEAR(0x8431A234L),
0577: LINEAR(0x8431A439L) } };
0578:
0579: /* bit flag for UConverter.options indicating GB 18030 special handling */
0580: protected static final int MBCS_OPTION_GB18030 = 0x8000;
0581:
0582: /**
0583: * MBCS action codes for conversions to Unicode.
0584: * These values are in bits 23..20 of the state table entries.
0585: */
0586: protected static final int MBCS_STATE_VALID_DIRECT_16 = 0;
0587: protected static final int MBCS_STATE_VALID_DIRECT_20 = MBCS_STATE_VALID_DIRECT_16 + 1;
0588: protected static final int MBCS_STATE_FALLBACK_DIRECT_16 = MBCS_STATE_VALID_DIRECT_20 + 1;
0589: protected static final int MBCS_STATE_FALLBACK_DIRECT_20 = MBCS_STATE_FALLBACK_DIRECT_16 + 1;
0590: protected static final int MBCS_STATE_VALID_16 = MBCS_STATE_FALLBACK_DIRECT_20 + 1;
0591: protected static final int MBCS_STATE_VALID_16_PAIR = MBCS_STATE_VALID_16 + 1;
0592: protected static final int MBCS_STATE_UNASSIGNED = MBCS_STATE_VALID_16_PAIR + 1;
0593: protected static final int MBCS_STATE_ILLEGAL = MBCS_STATE_UNASSIGNED + 1;
0594: protected static final int MBCS_STATE_CHANGE_ONLY = MBCS_STATE_ILLEGAL + 1;
0595:
0596: /* Methods for state table entries */
0597: protected static int MBCS_ENTRY_TRANSITION(int state, int offset) {
0598: return (state << 24L) | offset;
0599: }
0600:
0601: protected static int MBCS_ENTRY_FINAL(int state, int action,
0602: int value) {
0603: return (int) (0x80000000 | ((int) (state) << 24L)
0604: | ((action) << 20L) | (value));
0605: }
0606:
0607: protected static boolean MBCS_ENTRY_IS_TRANSITION(int entry) {
0608: return (entry) >= 0;
0609: }
0610:
0611: protected static boolean MBCS_ENTRY_IS_FINAL(int entry) {
0612: return (entry) < 0;
0613: }
0614:
0615: protected static int MBCS_ENTRY_TRANSITION_STATE(int entry) {
0616: return ((entry) >>> 24);
0617: }
0618:
0619: protected static int MBCS_ENTRY_TRANSITION_OFFSET(int entry) {
0620: return ((entry) & 0xffffff);
0621: }
0622:
0623: protected static int MBCS_ENTRY_FINAL_STATE(int entry) {
0624: return ((entry) >>> 24) & 0x7f;
0625: }
0626:
0627: protected static boolean MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(
0628: int entry) {
0629: return ((entry) < 0x80100000);
0630: }
0631:
0632: protected static int MBCS_ENTRY_FINAL_ACTION(int entry) {
0633: return ((entry) >>> 20) & 0xf;
0634: }
0635:
0636: protected static int MBCS_ENTRY_FINAL_VALUE(int entry) {
0637: return ((entry) & 0xfffff);
0638: }
0639:
0640: protected static char MBCS_ENTRY_FINAL_VALUE_16(int entry) {
0641: return (char) (entry);
0642: }
0643:
0644: /**
0645: * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte.
0646: * It works for single-byte, single-state codepages that only map
0647: * to and from BMP code points, and it always
0648: * returns fallback values.
0649: */
0650: protected static char MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
0651: UConverterMBCSTable mbcs, final int b) {
0652: return MBCS_ENTRY_FINAL_VALUE_16(mbcs.stateTable[0][b
0653: & UConverterConstants.UNSIGNED_BYTE_MASK]);
0654: }
0655:
0656: /* single-byte fromUnicode: get the 16-bit result word */
0657: protected static char MBCS_SINGLE_RESULT_FROM_U(char[] table,
0658: byte[] results, int c) {
0659: int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f);
0660: int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array
0661: return (char) (((results[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (results[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
0662: }
0663:
0664: /* multi-byte fromUnicode: get the 32-bit stage 2 entry */
0665: protected static int MBCS_STAGE_2_FROM_U(char[] table, int c) {
0666: int i = 2 * (table[(c) >>> 10] + ((c >>> 4) & 0x3f)); // 2x because used as index into char[] array treated as int[] array
0667: return ((table[i] & UConverterConstants.UNSIGNED_SHORT_MASK) << 16)
0668: | (table[i + 1] & UConverterConstants.UNSIGNED_SHORT_MASK);
0669: }
0670:
0671: protected static boolean MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry,
0672: int c) {
0673: return (((stage2Entry) & (1 << (16 + ((c) & 0xf)))) != 0);
0674: }
0675:
0676: protected static char MBCS_VALUE_2_FROM_STAGE_2(byte[] bytes,
0677: int stage2Entry, int c) {
0678: int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
0679: return (char) (((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
0680: }
0681:
0682: protected static int MBCS_VALUE_4_FROM_STAGE_2(byte[] bytes,
0683: int stage2Entry, int c) {
0684: int i = 4 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
0685: return ((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 24)
0686: | ((bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
0687: | ((bytes[i + 2] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
0688: | (bytes[i + 3] & UConverterConstants.UNSIGNED_BYTE_MASK);
0689: }
0690:
0691: protected static int MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes,
0692: int stage2Entry, int c) {
0693: return ((16 * ((char) (stage2Entry) & UConverterConstants.UNSIGNED_SHORT_MASK) + ((c) & 0xf)) * 3);
0694: }
0695:
0696: //------------UConverterExt-------------------------------------------------------
0697:
0698: protected static final int EXT_INDEXES_LENGTH = 0; /* 0 */
0699:
0700: protected static final int EXT_TO_U_INDEX = EXT_INDEXES_LENGTH + 1; /* 1 */
0701: protected static final int EXT_TO_U_LENGTH = EXT_TO_U_INDEX + 1;
0702: protected static final int EXT_TO_U_UCHARS_INDEX = EXT_TO_U_LENGTH + 1;
0703: protected static final int EXT_TO_U_UCHARS_LENGTH = EXT_TO_U_UCHARS_INDEX + 1;
0704:
0705: protected static final int EXT_FROM_U_UCHARS_INDEX = EXT_TO_U_UCHARS_LENGTH + 1; /* 5 */
0706: protected static final int EXT_FROM_U_VALUES_INDEX = EXT_FROM_U_UCHARS_INDEX + 1;
0707: protected static final int EXT_FROM_U_LENGTH = EXT_FROM_U_VALUES_INDEX + 1;
0708: protected static final int EXT_FROM_U_BYTES_INDEX = EXT_FROM_U_LENGTH + 1;
0709: protected static final int EXT_FROM_U_BYTES_LENGTH = EXT_FROM_U_BYTES_INDEX + 1;
0710:
0711: protected static final int EXT_FROM_U_STAGE_12_INDEX = EXT_FROM_U_BYTES_LENGTH + 1; /* 10 */
0712: protected static final int EXT_FROM_U_STAGE_1_LENGTH = EXT_FROM_U_STAGE_12_INDEX + 1;
0713: protected static final int EXT_FROM_U_STAGE_12_LENGTH = EXT_FROM_U_STAGE_1_LENGTH + 1;
0714: protected static final int EXT_FROM_U_STAGE_3_INDEX = EXT_FROM_U_STAGE_12_LENGTH + 1;
0715: protected static final int EXT_FROM_U_STAGE_3_LENGTH = EXT_FROM_U_STAGE_3_INDEX + 1;
0716: protected static final int EXT_FROM_U_STAGE_3B_INDEX = EXT_FROM_U_STAGE_3_LENGTH + 1;
0717: protected static final int EXT_FROM_U_STAGE_3B_LENGTH = EXT_FROM_U_STAGE_3B_INDEX + 1;
0718:
0719: protected static final int EXT_COUNT_BYTES = EXT_FROM_U_STAGE_3B_LENGTH + 1; /* 17 */
0720: protected static final int EXT_COUNT_UCHARS = EXT_COUNT_BYTES + 1;
0721: protected static final int EXT_FLAGS = EXT_COUNT_UCHARS + 1;
0722:
0723: protected static final int EXT_RESERVED_INDEX = EXT_FLAGS + 1; /* 20, moves with additional indexes */
0724:
0725: protected static final int EXT_SIZE = 31;
0726: protected static final int EXT_INDEXES_MIN_LENGTH = 32;
0727:
0728: /* toUnicode helpers -------------------------------------------------------- */
0729:
0730: protected static final int TO_U_BYTE_SHIFT = 24;
0731: protected static final int TO_U_VALUE_MASK = 0xffffff;
0732: protected static final int TO_U_MIN_CODE_POINT = 0x1f0000;
0733: protected static final int TO_U_MAX_CODE_POINT = 0x2fffff;
0734: protected static final int TO_U_ROUNDTRIP_FLAG = (1 << 23);
0735: protected static final int TO_U_INDEX_MASK = 0x3ffff;
0736: protected static final int TO_U_LENGTH_SHIFT = 18;
0737: protected static final int TO_U_LENGTH_OFFSET = 12;
0738:
0739: /* maximum number of indexed UChars */
0740: protected static final int MAX_UCHARS = 19;
0741:
0742: protected static int TO_U_GET_BYTE(int word) {
0743: return word >>> TO_U_BYTE_SHIFT;
0744: }
0745:
0746: protected static int TO_U_GET_VALUE(int word) {
0747: return word & TO_U_VALUE_MASK;
0748: }
0749:
0750: protected static boolean TO_U_IS_ROUNDTRIP(int value) {
0751: return (value & TO_U_ROUNDTRIP_FLAG) != 0;
0752: }
0753:
0754: protected static boolean TO_U_IS_PARTIAL(int value) {
0755: return (value & UConverterConstants.UNSIGNED_INT_MASK) < TO_U_MIN_CODE_POINT;
0756: }
0757:
0758: protected static int TO_U_GET_PARTIAL_INDEX(int value) {
0759: return value;
0760: }
0761:
0762: protected static int TO_U_MASK_ROUNDTRIP(int value) {
0763: return value & ~TO_U_ROUNDTRIP_FLAG;
0764: }
0765:
0766: protected static int TO_U_MAKE_WORD(byte b, int value) {
0767: return ((b & UConverterConstants.UNSIGNED_BYTE_MASK) << TO_U_BYTE_SHIFT)
0768: | value;
0769: }
0770:
0771: /* use after masking off the roundtrip flag */
0772: protected static boolean TO_U_IS_CODE_POINT(int value) {
0773: return (value & UConverterConstants.UNSIGNED_INT_MASK) <= TO_U_MAX_CODE_POINT;
0774: }
0775:
0776: protected static int TO_U_GET_CODE_POINT(int value) {
0777: return (int) ((value & UConverterConstants.UNSIGNED_INT_MASK) - TO_U_MIN_CODE_POINT);
0778: }
0779:
0780: protected static int TO_U_GET_INDEX(int value) {
0781: return value & TO_U_INDEX_MASK;
0782: }
0783:
0784: protected static int TO_U_GET_LENGTH(int value) {
0785: return (value >>> TO_U_LENGTH_SHIFT) - TO_U_LENGTH_OFFSET;
0786: }
0787:
0788: /* fromUnicode helpers ------------------------------------------------------ */
0789:
0790: /* most trie constants are shared with ucnvmbcs.h */
0791: protected static final int STAGE_2_LEFT_SHIFT = 2;
0792: protected static final int STAGE_3_GRANULARITY = 4;
0793:
0794: /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */
0795: protected static int FROM_U(CharBuffer stage12, CharBuffer stage3,
0796: int s1Index, int c) {
0797: return stage3
0798: .get(stage3.position()
0799: + ((int) stage12
0800: .get(stage12.position()
0801: + (stage12.get(stage12
0802: .position()
0803: + s1Index) + ((c >>> 4) & 0x3f))) << STAGE_2_LEFT_SHIFT)
0804: + (c & 0xf));
0805: }
0806:
0807: protected static final int FROM_U_LENGTH_SHIFT = 24;
0808: protected static final int FROM_U_ROUNDTRIP_FLAG = 1 << 31;
0809: protected static final int FROM_U_RESERVED_MASK = 0x60000000;
0810: protected static final int FROM_U_DATA_MASK = 0xffffff;
0811:
0812: /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
0813: protected static final int FROM_U_SUBCHAR1 = 0x80000001;
0814:
0815: /* at most 3 bytes in the lower part of the value */
0816: protected static final int FROM_U_MAX_DIRECT_LENGTH = 3;
0817:
0818: /* maximum number of indexed bytes */
0819: protected static final int MAX_BYTES = 0x1f;
0820:
0821: protected static boolean FROM_U_IS_PARTIAL(int value) {
0822: return (value >>> FROM_U_LENGTH_SHIFT) == 0;
0823: }
0824:
0825: protected static int FROM_U_GET_PARTIAL_INDEX(int value) {
0826: return value;
0827: }
0828:
0829: protected static boolean FROM_U_IS_ROUNDTRIP(int value) {
0830: return (value & FROM_U_ROUNDTRIP_FLAG) != 0;
0831: }
0832:
0833: protected static int FROM_U_MASK_ROUNDTRIP(int value) {
0834: return value & ~FROM_U_ROUNDTRIP_FLAG;
0835: }
0836:
0837: /* use after masking off the roundtrip flag */
0838: protected static int FROM_U_GET_LENGTH(int value) {
0839: return (value >>> FROM_U_LENGTH_SHIFT) & MAX_BYTES;
0840: }
0841:
0842: /* get bytes or bytes index */
0843: protected static int FROM_U_GET_DATA(int value) {
0844: return value & FROM_U_DATA_MASK;
0845: }
0846:
0847: /* get the pointer to an extension array from indexes[index] */
0848: protected static Buffer ARRAY(ByteBuffer indexes, int index,
0849: Class itemType) {
0850: int oldpos = indexes.position();
0851: Buffer b;
0852:
0853: indexes.position(indexes.getInt(index * 4));
0854: if (itemType == int.class)
0855: b = indexes.asIntBuffer();
0856: else if (itemType == short.class)
0857: b = indexes.asShortBuffer();
0858: else if (itemType == byte.class)
0859: b = indexes.slice();
0860: else if (itemType == char.class)
0861: b = indexes.asCharBuffer();
0862: else
0863: b = indexes.slice();
0864: indexes.position(oldpos);
0865: return b;
0866: }
0867:
0868: protected static int GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes) {
0869: indexes.position(0);
0870: return indexes.getInt(EXT_COUNT_BYTES) & 0xff;
0871: }
0872:
0873: /*
0874: * @return index of the UChar, if found; else <0
0875: */
0876: protected static int findFromU(CharBuffer fromUSection, int length,
0877: char u) {
0878: int i, start, limit;
0879:
0880: /* binary search */
0881: start = 0;
0882: limit = length;
0883: for (;;) {
0884: i = limit - start;
0885: if (i <= 1) {
0886: break; /* done */
0887: }
0888: /* start<limit-1 */
0889:
0890: if (i <= 4) {
0891: /* linear search for the last part */
0892: if (u <= fromUSection.get(fromUSection.position()
0893: + start)) {
0894: break;
0895: }
0896: if (++start < limit
0897: && u <= fromUSection.get(fromUSection
0898: .position()
0899: + start)) {
0900: break;
0901: }
0902: if (++start < limit
0903: && u <= fromUSection.get(fromUSection
0904: .position()
0905: + start)) {
0906: break;
0907: }
0908: /* always break at start==limit-1 */
0909: ++start;
0910: break;
0911: }
0912:
0913: i = (start + limit) / 2;
0914: if (u < fromUSection.get(fromUSection.position() + i)) {
0915: limit = i;
0916: } else {
0917: start = i;
0918: }
0919: }
0920:
0921: /* did we really find it? */
0922: if (start < limit
0923: && u == fromUSection.get(fromUSection.position()
0924: + start)) {
0925: return start;
0926: } else {
0927: return -1; /* not found */
0928: }
0929: }
0930:
0931: /*
0932: * @return lookup value for the byte, if found; else 0
0933: */
0934: protected static int findToU(IntBuffer toUSection, int length,
0935: short byt) {
0936: long word0, word;
0937: int i, start, limit;
0938:
0939: /* check the input byte against the lowest and highest section bytes */
0940: //agljport:comment instead of receiving a start position parameter for toUSection we'll rely on its position property
0941: start = TO_U_GET_BYTE(toUSection.get(toUSection.position()));
0942: limit = TO_U_GET_BYTE(toUSection.get(toUSection.position()
0943: + length - 1));
0944: if (byt < start || limit < byt) {
0945: return 0; /* the byte is out of range */
0946: }
0947:
0948: if (length == ((limit - start) + 1)) {
0949: /* direct access on a linear array */
0950: return TO_U_GET_VALUE(toUSection.get(toUSection.position()
0951: + byt - start)); /* could be 0 */
0952: }
0953:
0954: /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
0955: word0 = TO_U_MAKE_WORD((byte) byt, 0)
0956: & UConverterConstants.UNSIGNED_INT_MASK;
0957:
0958: /*
0959: * Shift byte once instead of each section word and add 0xffffff.
0960: * We will compare the shifted/added byte (bbffffff) against
0961: * section words which have byte values in the same bit position.
0962: * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
0963: * for all v=0..f
0964: * so we need not mask off the lower 24 bits of each section word.
0965: */
0966: word = word0 | TO_U_VALUE_MASK;
0967:
0968: /* binary search */
0969: start = 0;
0970: limit = length;
0971: for (;;) {
0972: i = limit - start;
0973: if (i <= 1) {
0974: break; /* done */
0975: }
0976: /* start<limit-1 */
0977:
0978: if (i <= 4) {
0979: /* linear search for the last part */
0980: if (word0 <= (toUSection.get(toUSection.position()
0981: + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
0982: break;
0983: }
0984: if (++start < limit
0985: && word0 <= (toUSection.get(toUSection
0986: .position()
0987: + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
0988: break;
0989: }
0990: if (++start < limit
0991: && word0 <= (toUSection.get(toUSection
0992: .position()
0993: + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
0994: break;
0995: }
0996: /* always break at start==limit-1 */
0997: ++start;
0998: break;
0999: }
1000:
1001: i = (start + limit) / 2;
1002: if (word < (toUSection.get(toUSection.position() + i) & UConverterConstants.UNSIGNED_INT_MASK)) {
1003: limit = i;
1004: } else {
1005: start = i;
1006: }
1007: }
1008:
1009: /* did we really find it? */
1010: if (start < limit
1011: && byt == TO_U_GET_BYTE((int) (word = (toUSection
1012: .get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)))) {
1013: return TO_U_GET_VALUE((int) word); /* never 0 */
1014: } else {
1015: return 0; /* not found */
1016: }
1017: }
1018:
1019: /*
1020: * TRUE if not an SI/SO stateful converter,
1021: * or if the match length fits with the current converter state
1022: */
1023: protected static boolean TO_U_VERIFY_SISO_MATCH(byte sisoState,
1024: int match) {
1025: return sisoState < 0 || (sisoState == 0) == (match == 1);
1026: }
1027:
1028: /*
1029: * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS),
1030: * or 1 for DBCS-only,
1031: * or -1 if the converter is not SI/SO stateful
1032: *
1033: * Note: For SI/SO stateful converters getting here,
1034: * cnv->mode==0 is equivalent to firstLength==1.
1035: */
1036: protected static int SISO_STATE(UConverterSharedData sharedData,
1037: int mode) {
1038: return sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO ? (byte) mode
1039: : sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY ? 1
1040: : -1;
1041: }
1042:
1043: class CharsetDecoderMBCS extends CharsetDecoderICU {
1044:
/**
 * Creates a decoder for the given charset; all set-up is delegated to the superclass.
 *
 * @param cs the charset this decoder belongs to
 */
CharsetDecoderMBCS(CharsetICU cs) {
    super(cs);
}
1048:
1049: protected CoderResult decodeLoop(ByteBuffer source,
1050: CharBuffer target, IntBuffer offsets, boolean flush) {
1051: CoderResult[] cr = { CoderResult.UNDERFLOW };
1052:
1053: int sourceArrayIndex;
1054: int stateTable[][/*256*/];
1055: char[] unicodeCodeUnits;
1056:
1057: int offset;
1058: byte state;
1059: int byteIndex;
1060: byte[] bytes;
1061:
1062: int sourceIndex, nextSourceIndex;
1063:
1064: int entry = 0;
1065: char c;
1066: byte action;
1067:
1068: if (preToULength > 0) {
1069: /*
1070: * pass sourceIndex=-1 because we continue from an earlier buffer
1071: * in the future, this may change with continuous offsets
1072: */
1073: cr[0] = continueMatchToU(source, target, offsets, -1,
1074: flush);
1075:
1076: if (cr[0].isError() || preToULength < 0) {
1077: return cr[0];
1078: }
1079: }
1080:
1081: if (sharedData.mbcs.countStates == 1) {
1082: if ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
1083: cr[0] = cnvMBCSSingleToBMPWithOffsets(source,
1084: target, offsets, flush);
1085: } else {
1086: cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source,
1087: target, offsets, flush);
1088: }
1089: return cr[0];
1090: }
1091:
1092: /* set up the local pointers */
1093: sourceArrayIndex = source.position();
1094:
1095: if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
1096: stateTable = sharedData.mbcs.swapLFNLStateTable;
1097: } else {
1098: stateTable = sharedData.mbcs.stateTable;
1099: }
1100: unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits;
1101:
1102: /* get the converter state from UConverter */
1103: offset = (int) toUnicodeStatus;
1104: byteIndex = toULength;
1105: bytes = toUBytesArray;
1106:
1107: /*
1108: * if we are in the SBCS state for a DBCS-only converter,
1109: * then load the DBCS state from the MBCS data
1110: * (dbcsOnlyState==0 if it is not a DBCS-only converter)
1111: */
1112: if ((state = (byte) (mode)) == 0) {
1113: state = sharedData.mbcs.dbcsOnlyState;
1114: }
1115:
1116: /* sourceIndex=-1 if the current character began in the previous buffer */
1117: sourceIndex = byteIndex == 0 ? 0 : -1;
1118: nextSourceIndex = 0;
1119:
1120: /* conversion loop */
1121: while (sourceArrayIndex < source.limit()) {
1122: /*
1123: * This following test is to see if available input would overflow the output.
1124: * It does not catch output of more than one code unit that
1125: * overflows as a result of a surrogate pair or callback output
1126: * from the last source byte.
1127: * Therefore, those situations also test for overflows and will
1128: * then break the loop, too.
1129: */
1130: if (!target.hasRemaining()) {
1131: /* target is full */
1132: cr[0] = CoderResult.OVERFLOW;
1133: break;
1134: }
1135:
1136: if (byteIndex == 0) {
1137: /* optimized loop for 1/2-byte input and BMP output */
1138: if (offsets == null) {
1139: do {
1140: entry = stateTable[state][source
1141: .get(sourceArrayIndex)
1142: & UConverterConstants.UNSIGNED_BYTE_MASK];
1143: if (MBCS_ENTRY_IS_TRANSITION(entry)) {
1144: state = (byte) MBCS_ENTRY_TRANSITION_STATE(entry);
1145: offset = MBCS_ENTRY_TRANSITION_OFFSET(entry);
1146:
1147: ++sourceArrayIndex;
1148: if (sourceArrayIndex < source.limit()
1149: && MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source
1150: .get(sourceArrayIndex)
1151: & UConverterConstants.UNSIGNED_BYTE_MASK])
1152: && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16
1153: && (c = unicodeCodeUnits[offset
1154: + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) {
1155: ++sourceArrayIndex;
1156: target.put(c);
1157: state = (byte) MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1158: offset = 0;
1159: } else {
1160: /* set the state and leave the optimized loop */
1161: bytes[0] = source
1162: .get(sourceArrayIndex - 1);
1163: byteIndex = 1;
1164: break;
1165: }
1166: } else {
1167: if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1168: /* output BMP code point */
1169: ++sourceArrayIndex;
1170: target
1171: .put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
1172: state = (byte) MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1173: } else {
1174: /* leave the optimized loop */
1175: break;
1176: }
1177: }
1178: } while (sourceArrayIndex < source.limit()
1179: && target.hasRemaining());
1180: } else /* offsets!=NULL */{
1181: //agljport:todo see ucnvmbcs.c for deleted block
1182: do {
1183: entry = stateTable[state][source
1184: .get(sourceArrayIndex)];
1185: if (MBCS_ENTRY_IS_TRANSITION(entry)) {
1186: state = (byte) MBCS_ENTRY_TRANSITION_STATE(entry);
1187: offset = MBCS_ENTRY_TRANSITION_OFFSET(entry);
1188:
1189: ++sourceArrayIndex;
1190: if (sourceArrayIndex < source.limit()
1191: && MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source
1192: .get(sourceArrayIndex)])
1193: && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16
1194: && (c = unicodeCodeUnits[offset
1195: + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) {
1196:
1197: ++sourceArrayIndex;
1198: target.put(c);
1199: if (offsets != null) {
1200: offsets.put(sourceIndex);
1201: sourceIndex = (nextSourceIndex += 2);
1202: }
1203: state = (byte) MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1204: offset = 0;
1205: } else {
1206: /* set the state and leave the optimized loop */
1207: ++nextSourceIndex;
1208: bytes[0] = source
1209: .get(sourceArrayIndex - 1);
1210: byteIndex = 1;
1211: break;
1212: }
1213: } else {
1214: if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1215: /* output BMP code point */
1216: ++sourceArrayIndex;
1217: target
1218: .put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
1219: if (offsets != null) {
1220: offsets.put(sourceIndex);
1221: sourceIndex = ++nextSourceIndex;
1222: }
1223: state = (byte) MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1224: } else {
1225: /* leave the optimized loop */
1226: break;
1227: }
1228: }
1229: } while (sourceArrayIndex < source.limit()
1230: && target.hasRemaining());
1231: }
1232:
1233: /*
1234: * these tests and break statements could be put inside the loop
1235: * if C had "break outerLoop" like Java
1236: */
1237: if (sourceArrayIndex >= source.limit()) {
1238: break;
1239: }
1240: if (!target.hasRemaining()) {
1241: /* target is full */
1242: cr[0] = CoderResult.OVERFLOW;
1243: break;
1244: }
1245:
1246: ++nextSourceIndex;
1247: bytes[byteIndex++] = source.get(sourceArrayIndex++);
1248: } else /* byteIndex>0 */{
1249: ++nextSourceIndex;
1250: entry = stateTable[state][(bytes[byteIndex++] = source
1251: .get(sourceArrayIndex++))
1252: & UConverterConstants.UNSIGNED_BYTE_MASK];
1253: }
1254:
1255: if (MBCS_ENTRY_IS_TRANSITION(entry)) {
1256: state = (byte) MBCS_ENTRY_TRANSITION_STATE(entry);
1257: offset += MBCS_ENTRY_TRANSITION_OFFSET(entry);
1258: continue;
1259: }
1260:
1261: /* save the previous state for proper extension mapping with SI/SO-stateful converters */
1262: mode = state;
1263:
1264: /* set the next state early so that we can reuse the entry variable */
1265: state = (byte) MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1266:
1267: /*
1268: * An if-else-if chain provides more reliable performance for
1269: * the most common cases compared to a switch.
1270: */
1271: action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
1272: if (action == MBCS_STATE_VALID_16) {
1273: offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
1274: c = unicodeCodeUnits[offset];
1275: if (c < 0xfffe) {
1276: /* output BMP code point */
1277: target.put(c);
1278: if (offsets != null) {
1279: offsets.put(sourceIndex);
1280: }
1281: byteIndex = 0;
1282: } else if (c == 0xfffe) {
1283: if (isToUUseFallback()
1284: && (entry = (int) getFallback(
1285: sharedData.mbcs, offset)) != 0xfffe) {
1286: /* output fallback BMP code point */
1287: target.put((char) entry);
1288: if (offsets != null) {
1289: offsets.put(sourceIndex);
1290: }
1291: byteIndex = 0;
1292: }
1293: } else {
1294: /* callback(illegal) */
1295: cr[0] = CoderResult
1296: .malformedForLength(byteIndex);
1297: }
1298: } else if (action == MBCS_STATE_VALID_DIRECT_16) {
1299: /* output BMP code point */
1300: target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
1301: if (offsets != null) {
1302: offsets.put(sourceIndex);
1303: }
1304: byteIndex = 0;
1305: } else if (action == MBCS_STATE_VALID_16_PAIR) {
1306: offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
1307: c = unicodeCodeUnits[offset++];
1308: if (c < 0xd800) {
1309: /* output BMP code point below 0xd800 */
1310: target.put(c);
1311: if (offsets != null) {
1312: offsets.put(sourceIndex);
1313: }
1314: byteIndex = 0;
1315: } else if (isToUUseFallback() ? c <= 0xdfff
1316: : c <= 0xdbff) {
1317: /* output roundtrip or fallback surrogate pair */
1318: target.put((char) (c & 0xdbff));
1319: if (offsets != null) {
1320: offsets.put(sourceIndex);
1321: }
1322: byteIndex = 0;
1323: if (target.hasRemaining()) {
1324: target.put(unicodeCodeUnits[offset]);
1325: if (offsets != null) {
1326: offsets.put(sourceIndex);
1327: }
1328: } else {
1329: /* target overflow */
1330: charErrorBufferArray[0] = unicodeCodeUnits[offset];
1331: charErrorBufferLength = 1;
1332: cr[0] = CoderResult.OVERFLOW;
1333:
1334: offset = 0;
1335: break;
1336: }
1337: } else if (isToUUseFallback() ? (c & 0xfffe) == 0xe000
1338: : c == 0xe000) {
1339: /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1340: target.put(unicodeCodeUnits[offset]);
1341: if (offsets != null) {
1342: offsets.put(sourceIndex);
1343: }
1344: byteIndex = 0;
1345: } else if (c == 0xffff) {
1346: /* callback(illegal) */
1347: cr[0] = CoderResult
1348: .malformedForLength(byteIndex);
1349: }
1350: } else if (action == MBCS_STATE_VALID_DIRECT_20
1351: || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isToUUseFallback())) {
1352: entry = MBCS_ENTRY_FINAL_VALUE(entry);
1353: /* output surrogate pair */
1354: target.put((char) (0xd800 | (char) (entry >> 10)));
1355: if (offsets != null) {
1356: offsets.put(sourceIndex);
1357: }
1358: byteIndex = 0;
1359: c = (char) (0xdc00 | (char) (entry & 0x3ff));
1360: if (target.hasRemaining()) {
1361: target.put(c);
1362: if (offsets != null) {
1363: offsets.put(sourceIndex);
1364: }
1365: } else {
1366: /* target overflow */
1367: charErrorBufferArray[0] = c;
1368: charErrorBufferLength = 1;
1369: cr[0] = CoderResult.OVERFLOW;
1370:
1371: offset = 0;
1372: break;
1373: }
1374: } else if (action == MBCS_STATE_CHANGE_ONLY) {
1375: /*
1376: * This serves as a state change without any output.
1377: * It is useful for reading simple stateful encodings,
1378: * for example using just Shift-In/Shift-Out codes.
1379: * The 21 unused bits may later be used for more sophisticated
1380: * state transitions.
1381: */
1382: if (sharedData.mbcs.dbcsOnlyState == 0) {
1383: byteIndex = 0;
1384: } else {
1385: /* SI/SO are illegal for DBCS-only conversion */
1386: state = (byte) (mode); /* restore the previous state */
1387:
1388: /* callback(illegal) */
1389: cr[0] = CoderResult
1390: .malformedForLength(byteIndex);
1391: }
1392: } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
1393: if (isToUUseFallback()) {
1394: /* output BMP code point */
1395: target
1396: .put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
1397: if (offsets != null) {
1398: offsets.put(sourceIndex);
1399: }
1400: byteIndex = 0;
1401: }
1402: } else if (action == MBCS_STATE_UNASSIGNED) {
1403: /* just fall through */
1404: } else if (action == MBCS_STATE_ILLEGAL) {
1405: /* callback(illegal) */
1406: cr[0] = CoderResult.malformedForLength(byteIndex);
1407: } else {
1408: /* reserved, must never occur */
1409: byteIndex = 0;
1410: }
1411:
1412: /* end of action codes: prepare for a new character */
1413: offset = 0;
1414:
1415: if (byteIndex == 0) {
1416: sourceIndex = nextSourceIndex;
1417: } else if (cr[0].isError()) {
1418: /* callback(illegal) */
1419: break;
1420: } else /* unassigned sequences indicated with byteIndex>0 */{
1421: /* try an extension mapping */
1422: int sourceBeginIndex = sourceArrayIndex;
1423: source.position(sourceArrayIndex);
1424: byteIndex = toU(byteIndex, source, target, offsets,
1425: sourceIndex, flush, cr);
1426: sourceArrayIndex = source.position();
1427: sourceIndex = nextSourceIndex
1428: + (int) (sourceArrayIndex - sourceBeginIndex);
1429:
1430: if (cr[0].isError() || cr[0].isOverflow()) {
1431: /* not mappable or buffer overflow */
1432: break;
1433: }
1434: }
1435: }
1436:
1437: /* set the converter state back into UConverter */
1438: toUnicodeStatus = offset;
1439: mode = state;
1440: toULength = byteIndex;
1441:
1442: /* write back the updated pointers */
1443: source.position(sourceArrayIndex);
1444:
1445: return cr[0];
1446: }
1447:
1448: /*
1449: * continue partial match with new input
1450: * never called for simple, single-character conversion
1451: */
1452: protected CoderResult continueMatchToU(ByteBuffer source,
1453: CharBuffer target, IntBuffer offsets, int srcIndex,
1454: boolean flush) {
1455: CoderResult cr = CoderResult.UNDERFLOW;
1456:
1457: int[] value = new int[1];
1458: int match, length;
1459:
1460: match = matchToU((byte) SISO_STATE(sharedData, mode),
1461: preToUArray, preToUBegin, preToULength, source,
1462: value, flush);
1463:
1464: if (match > 0) {
1465: if (match >= preToULength) {
1466: /* advance src pointer for the consumed input */
1467: source.position(source.position() + match
1468: - preToULength);
1469: preToULength = 0;
1470: } else {
1471: /* the match did not use all of preToU[] - keep the rest for replay */
1472: length = preToULength - match;
1473: System.arraycopy(preToUArray, preToUBegin + match,
1474: preToUArray, preToUBegin, length);
1475: preToULength = (byte) -length;
1476: }
1477:
1478: /* write result */
1479: cr = writeToU(value[0], target, offsets, srcIndex);
1480: } else if (match < 0) {
1481: /* save state for partial match */
1482: int j, sArrayIndex;
1483:
1484: /* just _append_ the newly consumed input to preToU[] */
1485: sArrayIndex = source.position();
1486: match = -match;
1487: for (j = preToULength; j < match; ++j) {
1488: preToUArray[j] = source.get(sArrayIndex++);
1489: }
1490: source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
1491: preToULength = (byte) match;
1492: } else /* match==0 */{
1493: /*
1494: * no match
1495: *
1496: * We need to split the previous input into two parts:
1497: *
1498: * 1. The first codepage character is unmappable - that's how we got into
1499: * trying the extension data in the first place.
1500: * We need to move it from the preToU buffer
1501: * to the error buffer, set an error code,
1502: * and prepare the rest of the previous input for 2.
1503: *
1504: * 2. The rest of the previous input must be converted once we
1505: * come back from the callback for the first character.
1506: * At that time, we have to try again from scratch to convert
1507: * these input characters.
1508: * The replay will be handled by the ucnv.c conversion code.
1509: */
1510:
1511: /* move the first codepage character to the error field */
1512: System
1513: .arraycopy(preToUArray, preToUBegin,
1514: toUBytesArray, toUBytesBegin,
1515: preToUFirstLength);
1516: toULength = preToUFirstLength;
1517:
1518: /* move the rest up inside the buffer */
1519: length = preToULength - preToUFirstLength;
1520: if (length > 0) {
1521: System.arraycopy(preToUArray, preToUBegin
1522: + preToUFirstLength, preToUArray,
1523: preToUBegin, length);
1524: }
1525:
1526: /* mark preToU for replay */
1527: preToULength = (byte) -length;
1528:
1529: /* set the error code for unassigned */
1530: cr = CoderResult.unmappableForLength(preToUFirstLength);
1531: }
1532: return cr;
1533: }
1534:
1535: /*
1536: * this works like natchFromU() except
1537: * - the first character is in pre
1538: * - no trie is used
1539: * - the returned matchLength is not offset by 2
1540: */
1541: protected int matchToU(byte sisoState, byte[] preArray,
1542: int preArrayBegin, int preLength, ByteBuffer source,
1543: int[] pMatchValue, boolean flush) {
1544: ByteBuffer cx = sharedData.mbcs.extIndexes;
1545: IntBuffer toUTable, toUSection;
1546:
1547: int value, matchValue, srcLength;
1548: int i, j, index, length, matchLength;
1549: short b;
1550:
1551: if (cx == null
1552: || cx.asIntBuffer().get(EXT_TO_U_LENGTH) <= 0) {
1553: return 0; /* no extension data, no match */
1554: }
1555:
1556: /* initialize */
1557: toUTable = (IntBuffer) ARRAY(cx, EXT_TO_U_INDEX, int.class);
1558: index = 0;
1559:
1560: matchValue = 0;
1561: i = j = matchLength = 0;
1562: srcLength = source.remaining();
1563:
1564: if (sisoState == 0) {
1565: /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
1566: if (preLength > 1) {
1567: return 0; /* no match of a DBCS sequence in SBCS mode */
1568: } else if (preLength == 1) {
1569: srcLength = 0;
1570: } else /* preLength==0 */{
1571: if (srcLength > 1) {
1572: srcLength = 1;
1573: }
1574: }
1575: flush = true;
1576: }
1577:
1578: /* we must not remember fallback matches when not using fallbacks */
1579:
1580: /* match input units until there is a full match or the input is consumed */
1581: for (;;) {
1582: /* go to the next section */
1583: int oldpos = toUTable.position();
1584: toUSection = ((IntBuffer) toUTable.position(index))
1585: .slice();
1586: toUTable.position(oldpos);
1587:
1588: /* read first pair of the section */
1589: value = toUSection.get();
1590: length = TO_U_GET_BYTE(value);
1591: value = TO_U_GET_VALUE(value);
1592: if (value != 0
1593: && (TO_U_IS_ROUNDTRIP(value) || isToUUseFallback())
1594: && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) {
1595: /* remember longest match so far */
1596: matchValue = value;
1597: matchLength = i + j;
1598: }
1599:
1600: /* match pre[] then src[] */
1601: if (i < preLength) {
1602: b = (short) (preArray[preArrayBegin + i++] & UConverterConstants.UNSIGNED_BYTE_MASK);
1603: } else if (j < srcLength) {
1604: b = (short) (source.get(source.position() + j++) & UConverterConstants.UNSIGNED_BYTE_MASK);
1605: } else {
1606: /* all input consumed, partial match */
1607: if (flush || (length = (i + j)) > MAX_BYTES) {
1608: /*
1609: * end of the entire input stream, stop with the longest match so far
1610: * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
1611: * because it must fit into state buffers
1612: */
1613: break;
1614: } else {
1615: /* continue with more input next time */
1616: return -length;
1617: }
1618: }
1619:
1620: /* search for the current UChar */
1621: value = findToU(toUSection, length, b);
1622: if (value == 0) {
1623: /* no match here, stop with the longest match so far */
1624: break;
1625: } else {
1626: if (TO_U_IS_PARTIAL(value)) {
1627: /* partial match, continue */
1628: index = TO_U_GET_PARTIAL_INDEX(value);
1629: } else {
1630: if ((TO_U_IS_ROUNDTRIP(value) || isToUUseFallback())
1631: && TO_U_VERIFY_SISO_MATCH(sisoState, i
1632: + j)) {
1633: /* full match, stop with result */
1634: matchValue = value;
1635: matchLength = i + j;
1636: } else {
1637: /* full match on fallback not taken, stop with the longest match so far */
1638: }
1639: break;
1640: }
1641: }
1642: }
1643:
1644: if (matchLength == 0) {
1645: /* no match at all */
1646: return 0;
1647: }
1648:
1649: /* return result */
1650: pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue);
1651: return matchLength;
1652: }
1653:
1654: protected CoderResult writeToU(int value, CharBuffer target,
1655: IntBuffer offsets, int srcIndex) {
1656: ByteBuffer cx = sharedData.mbcs.extIndexes;
1657: /* output the result */
1658: if (TO_U_IS_CODE_POINT(value)) {
1659: /* output a single code point */
1660: return toUWriteCodePoint(TO_U_GET_CODE_POINT(value),
1661: target, offsets, srcIndex);
1662: } else {
1663: /* output a string - with correct data we have resultLength>0 */
1664:
1665: char[] a = new char[TO_U_GET_LENGTH(value)];
1666: CharBuffer cb = ((CharBuffer) ARRAY(cx,
1667: EXT_TO_U_UCHARS_INDEX, char.class));
1668: cb.position(TO_U_GET_INDEX(value));
1669: cb.get(a, 0, a.length);
1670: return toUWriteUChars(this , a, 0, a.length, target,
1671: offsets, srcIndex);
1672: }
1673: }
1674:
1675: protected CoderResult toUWriteCodePoint(int c,
1676: CharBuffer target, IntBuffer offsets, int sourceIndex) {
1677: CoderResult cr = CoderResult.UNDERFLOW;
1678: int tBeginIndex = target.position();
1679:
1680: if (target.hasRemaining()) {
1681: if (c <= 0xffff) {
1682: target.put((char) c);
1683: c = UConverterConstants.U_SENTINEL;
1684: } else /* c is a supplementary code point */{
1685: target.put(UTF16.getLeadSurrogate(c));
1686: c = UTF16.getTrailSurrogate(c);
1687: if (target.hasRemaining()) {
1688: target.put((char) c);
1689: c = UConverterConstants.U_SENTINEL;
1690: }
1691: }
1692:
1693: /* write offsets */
1694: if (offsets != null) {
1695: offsets.put(sourceIndex);
1696: if ((tBeginIndex + 1) < target.position()) {
1697: offsets.put(sourceIndex);
1698: }
1699: }
1700: }
1701:
1702: /* write overflow from c */
1703: if (c >= 0) {
1704: charErrorBufferLength = UTF16.append(
1705: charErrorBufferArray, 0, c);
1706: cr = CoderResult.OVERFLOW;
1707: }
1708:
1709: return cr;
1710: }
1711:
1712: /*
1713: * Input sequence: cnv->toUBytes[0..length[
1714: * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
1715: * else return 0 after output has been written to the target
1716: */
1717: protected int toU(int length, ByteBuffer source,
1718: CharBuffer target, IntBuffer offsets, int sourceIndex,
1719: boolean flush, CoderResult[] cr) {
1720: //ByteBuffer cx;
1721:
1722: if (sharedData.mbcs.extIndexes != null
1723: && initialMatchToU(length, source, target, offsets,
1724: sourceIndex, flush, cr)) {
1725: return 0; /* an extension mapping handled the input */
1726: }
1727:
1728: /* GB 18030 */
1729: if (length == 4 && (options & MBCS_OPTION_GB18030) != 0) {
1730: long[] range;
1731: long linear;
1732: int i;
1733:
1734: linear = LINEAR_18030(toUBytesArray[0],
1735: toUBytesArray[1], toUBytesArray[2],
1736: toUBytesArray[3]);
1737: range = gb18030Ranges[0];
1738: for (i = 0; i < gb18030Ranges.length
1739: / gb18030Ranges[0].length; range = gb18030Ranges[++i]) {
1740: if (range[2] <= linear && linear <= range[3]) {
1741: /* found the sequence, output the Unicode code point for it */
1742: cr[0] = CoderResult.UNDERFLOW;
1743:
1744: /* add the linear difference between the input and start sequences to the start code point */
1745: linear = range[0] + (linear - range[2]);
1746:
1747: /* output this code point */
1748: cr[0] = toUWriteCodePoint((int) linear, target,
1749: offsets, sourceIndex);
1750:
1751: return 0;
1752: }
1753: }
1754: }
1755:
1756: /* no mapping */
1757: cr[0] = CoderResult.unmappableForLength(length);
1758: return length;
1759: }
1760:
1761: /*
1762: * target<targetLimit; set error code for overflow
1763: */
1764: protected boolean initialMatchToU(int firstLength,
1765: ByteBuffer source, CharBuffer target,
1766: IntBuffer offsets, int srcIndex, boolean flush,
1767: CoderResult[] cr) {
1768: int[] value = new int[1];
1769: int match = 0;
1770:
1771: /* try to match */
1772: match = matchToU((byte) SISO_STATE(sharedData, mode),
1773: toUBytesArray, toUBytesBegin, firstLength, source,
1774: value, flush);
1775: if (match > 0) {
1776: /* advance src pointer for the consumed input */
1777: source
1778: .position(source.position() + match
1779: - firstLength);
1780:
1781: /* write result to target */
1782: cr[0] = writeToU(value[0], target, offsets, srcIndex);
1783: return true;
1784: } else if (match < 0) {
1785: /* save state for partial match */
1786: byte[] sArray;
1787: int sArrayIndex;
1788: int j;
1789:
1790: /* copy the first code point */
1791: sArray = toUBytesArray;
1792: sArrayIndex = toUBytesBegin;
1793: preToUFirstLength = (byte) firstLength;
1794: for (j = 0; j < firstLength; ++j) {
1795: preToUArray[j] = sArray[sArrayIndex++];
1796: }
1797:
1798: /* now copy the newly consumed input */
1799: sArrayIndex = source.position();
1800: match = -match;
1801: for (; j < match; ++j) {
1802: preToUArray[j] = source.get(sArrayIndex++);
1803: }
1804: source.position(sArrayIndex);
1805: preToULength = (byte) match;
1806: return true;
1807: } else /* match==0 no match */{
1808: return false;
1809: }
1810: }
1811:
1812: /*
1813: * This version of cnvMBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1814: * that only map to and from the BMP.
1815: * In addition to single-byte optimizations, the offset calculations
1816: * become much easier.
1817: */
1818: protected CoderResult cnvMBCSSingleToBMPWithOffsets(
1819: ByteBuffer source, CharBuffer target,
1820: IntBuffer offsets, boolean flush) {
1821: CoderResult[] cr = { CoderResult.UNDERFLOW };
1822:
1823: int sourceArrayIndex, lastSource;
1824: int targetCapacity, length;
1825: int[][] stateTable;
1826:
1827: int sourceIndex;
1828:
1829: int entry;
1830: byte action;
1831:
1832: /* set up the local pointers */
1833: sourceArrayIndex = source.position();
1834: targetCapacity = target.remaining();
1835:
1836: if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
1837: stateTable = sharedData.mbcs.swapLFNLStateTable;
1838: } else {
1839: stateTable = sharedData.mbcs.stateTable;
1840: }
1841:
1842: /* sourceIndex=-1 if the current character began in the previous buffer */
1843: sourceIndex = 0;
1844: lastSource = sourceArrayIndex;
1845:
1846: /*
1847: * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1848: * for the minimum of the sourceLength and targetCapacity
1849: */
1850: length = source.remaining();
1851: if (length < targetCapacity) {
1852: targetCapacity = length;
1853: }
1854:
1855: /* conversion loop */
1856: while (targetCapacity > 0) {
1857: entry = stateTable[0][source.get(sourceArrayIndex++)
1858: & UConverterConstants.UNSIGNED_BYTE_MASK];
1859: /* MBCS_ENTRY_IS_FINAL(entry) */
1860:
1861: /* test the most common case first */
1862: if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1863: /* output BMP code point */
1864: target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
1865: --targetCapacity;
1866: continue;
1867: }
1868:
1869: /*
1870: * An if-else-if chain provides more reliable performance for
1871: * the most common cases compared to a switch.
1872: */
1873: action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
1874: if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
1875: if (isToUUseFallback()) {
1876: /* output BMP code point */
1877: target
1878: .put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
1879: --targetCapacity;
1880: continue;
1881: }
1882: } else if (action == MBCS_STATE_UNASSIGNED) {
1883: /* just fall through */
1884: } else if (action == MBCS_STATE_ILLEGAL) {
1885: /* callback(illegal) */
1886: cr[0] = CoderResult
1887: .malformedForLength(sourceArrayIndex
1888: - lastSource);
1889: } else {
1890: /* reserved, must never occur */
1891: continue;
1892: }
1893:
1894: /* set offsets since the start or the last extension */
1895: if (offsets != null) {
1896: int count = sourceArrayIndex - lastSource;
1897:
1898: /* predecrement: do not set the offset for the callback-causing character */
1899: while (--count > 0) {
1900: offsets.put(sourceIndex++);
1901: }
1902: /* offset and sourceIndex are now set for the current character */
1903: }
1904:
1905: if (cr[0].isError()) {
1906: /* callback(illegal) */
1907: break;
1908: } else /* unassigned sequences indicated with byteIndex>0 */{
1909: /* try an extension mapping */
1910: lastSource = sourceArrayIndex;
1911: toUBytesArray[0] = source.get(sourceArrayIndex - 1);
1912: source.position(sourceArrayIndex);
1913: toULength = toU((byte) 1, source, target, offsets,
1914: sourceIndex, flush, cr);
1915: sourceArrayIndex = source.position();
1916: sourceIndex += 1 + (int) (sourceArrayIndex - lastSource);
1917:
1918: if (cr[0].isError()) {
1919: /* not mappable or buffer overflow */
1920: break;
1921: }
1922:
1923: /* recalculate the targetCapacity after an extension mapping */
1924: targetCapacity = target.remaining();
1925: length = source.remaining();
1926: if (length < targetCapacity) {
1927: targetCapacity = length;
1928: }
1929: }
1930: }
1931:
1932: if (!cr[0].isError()
1933: && sourceArrayIndex < source.capacity()
1934: && !target.hasRemaining()) {
1935: /* target is full */
1936: cr[0] = CoderResult.OVERFLOW;
1937: }
1938:
1939: /* set offsets since the start or the last callback */
1940: if (offsets != null) {
1941: int count = sourceArrayIndex - lastSource;
1942: while (count > 0) {
1943: offsets.put(sourceIndex++);
1944: --count;
1945: }
1946: }
1947:
1948: /* write back the updated pointers */
1949: source.position(sourceArrayIndex);
1950:
1951: return cr[0];
1952: }
1953:
1954: /* This version of cnvMBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
1955: protected CoderResult cnvMBCSSingleToUnicodeWithOffsets(
1956: ByteBuffer source, CharBuffer target,
1957: IntBuffer offsets, boolean flush) {
1958: CoderResult[] cr = { CoderResult.UNDERFLOW };
1959:
1960: int sourceArrayIndex;
1961: int[][] stateTable;
1962:
1963: int sourceIndex;
1964:
1965: int entry;
1966: char c;
1967: byte action;
1968:
1969: /* set up the local pointers */
1970: sourceArrayIndex = source.position();
1971:
1972: if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
1973: stateTable = sharedData.mbcs.swapLFNLStateTable;
1974: } else {
1975: stateTable = sharedData.mbcs.stateTable;
1976: }
1977:
1978: /* sourceIndex=-1 if the current character began in the previous buffer */
1979: sourceIndex = 0;
1980:
1981: /* conversion loop */
1982: while (sourceArrayIndex < source.limit()) {
1983: /*
1984: * This following test is to see if available input would overflow the output.
1985: * It does not catch output of more than one code unit that
1986: * overflows as a result of a surrogate pair or callback output
1987: * from the last source byte.
1988: * Therefore, those situations also test for overflows and will
1989: * then break the loop, too.
1990: */
1991: if (!target.hasRemaining()) {
1992: /* target is full */
1993: cr[0] = CoderResult.OVERFLOW;
1994: break;
1995: }
1996:
1997: entry = stateTable[0][source.get(sourceArrayIndex++)
1998: & UConverterConstants.UNSIGNED_BYTE_MASK];
1999: /* MBCS_ENTRY_IS_FINAL(entry) */
2000:
2001: /* test the most common case first */
2002: if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2003: /* output BMP code point */
2004: target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
2005: if (offsets != null) {
2006: offsets.put(sourceIndex);
2007: }
2008:
2009: /* normal end of action codes: prepare for a new character */
2010: ++sourceIndex;
2011: continue;
2012: }
2013:
2014: /*
2015: * An if-else-if chain provides more reliable performance for
2016: * the most common cases compared to a switch.
2017: */
2018: action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
2019: if (action == MBCS_STATE_VALID_DIRECT_20
2020: || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isToUUseFallback())) {
2021:
2022: entry = MBCS_ENTRY_FINAL_VALUE(entry);
2023: /* output surrogate pair */
2024: target.put((char) (0xd800 | (char) (entry >>> 10)));
2025: if (offsets != null) {
2026: offsets.put(sourceIndex);
2027: }
2028: c = (char) (0xdc00 | (char) (entry & 0x3ff));
2029: if (target.hasRemaining()) {
2030: target.put(c);
2031: if (offsets != null) {
2032: offsets.put(sourceIndex);
2033: }
2034: } else {
2035: /* target overflow */
2036: charErrorBufferArray[0] = c;
2037: charErrorBufferLength = 1;
2038: cr[0] = CoderResult.OVERFLOW;
2039: break;
2040: }
2041:
2042: ++sourceIndex;
2043: continue;
2044: } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
2045: if (isToUUseFallback()) {
2046: /* output BMP code point */
2047: target
2048: .put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
2049: if (offsets != null) {
2050: offsets.put(sourceIndex);
2051: }
2052:
2053: ++sourceIndex;
2054: continue;
2055: }
2056: } else if (action == MBCS_STATE_UNASSIGNED) {
2057: /* just fall through */
2058: } else if (action == MBCS_STATE_ILLEGAL) {
2059: /* callback(illegal) */
2060: cr[0] = CoderResult.malformedForLength(1);
2061: } else {
2062: /* reserved, must never occur */
2063: ++sourceIndex;
2064: continue;
2065: }
2066:
2067: if (cr[0].isError()) {
2068: /* callback(illegal) */
2069: break;
2070: } else /* unassigned sequences indicated with byteIndex>0 */{
2071: /* try an extension mapping */
2072: int sourceBeginIndex = sourceArrayIndex;
2073: toUBytesArray[0] = source.get(sourceArrayIndex - 1);
2074: source.position(sourceArrayIndex);
2075: toULength = toU((byte) 1, source, target, offsets,
2076: sourceIndex, flush, cr);
2077: sourceArrayIndex = source.position();
2078: sourceIndex += 1 + (int) (sourceArrayIndex - sourceBeginIndex);
2079:
2080: if (cr[0].isError()) {
2081: /* not mappable or buffer overflow */
2082: break;
2083: }
2084: }
2085: }
2086:
2087: /* write back the updated pointers */
2088: source.position(sourceArrayIndex);
2089:
2090: return cr[0];
2091: }
2092:
2093: protected int getFallback(UConverterMBCSTable mbcsTable,
2094: int offset) {
2095: MBCSToUFallback[] toUFallbacks;
2096: int i, start, limit;
2097:
2098: limit = mbcsTable.countToUFallbacks;
2099: if (limit > 0) {
2100: /* do a binary search for the fallback mapping */
2101: toUFallbacks = mbcsTable.toUFallbacks;
2102: start = 0;
2103: while (start < limit - 1) {
2104: i = (start + limit) / 2;
2105: if (offset < toUFallbacks[i].offset) {
2106: limit = i;
2107: } else {
2108: start = i;
2109: }
2110: }
2111:
2112: /* did we really find it? */
2113: if (offset == toUFallbacks[start].offset) {
2114: return toUFallbacks[start].codePoint;
2115: }
2116: }
2117:
2118: return 0xfffe;
2119: }
2120:
2121: }
2122:
2123: class CharsetEncoderMBCS extends CharsetEncoderICU {
2124:
/*
 * Creates the encoder for the given charset, handing fromUSubstitution to
 * the superclass constructor, and then puts the encoder into its initial
 * state via implReset().
 */
CharsetEncoderMBCS(CharsetICU cs) {
    super (cs, fromUSubstitution);
    implReset();
}
2129:
/*
 * Resets the encoder: runs the superclass reset and then clears any
 * pending partial match by setting preFromUFirstCP to U_SENTINEL
 * (encodeLoop() treats preFromUFirstCP >= 0 as "partial match pending").
 */
protected void implReset() {
    super .implReset();
    preFromUFirstCP = UConverterConstants.U_SENTINEL;
}
2134:
/*
 * Converts UTF-16 input from source into codepage bytes in target.
 *
 * Steps, in order:
 *  1. If a partial match is pending (preFromUFirstCP >= 0) it is continued
 *     with continueMatchFromU(); the method returns right away when that
 *     reports an error or leaves preFromULength negative.
 *  2. MBCS_OUTPUT_1 tables without surrogate mappings and MBCS_OUTPUT_2
 *     tables are delegated to the specialised cnvMBCSSingle.../Double...
 *     loops.
 *  3. Every other output type runs through the generic loop below: trie
 *     lookup per code point, length/value computation per output type,
 *     then byte output (with spill-over into errorBuffer on overflow).
 *  4. For MBCS_OUTPUT_2_SISO a closing SI is emitted when flushing at the
 *     end of input while in double-byte mode.
 *
 * source  - input chars; its position is updated at the end of the generic
 *           loop
 * target  - output bytes
 * offsets - optional (may be null); one source index per output byte
 * flush   - true if no further input will follow
 *
 * Returns the CoderResult of the conversion; a BufferOverflowException
 * thrown by a put() is reported as OVERFLOW.
 */
protected CoderResult encodeLoop(CharBuffer source,
        ByteBuffer target, IntBuffer offsets, boolean flush) {

    CoderResult[] cr = { CoderResult.UNDERFLOW };

    int sourceArrayIndex;
    char[] table;
    byte[] pArray, bytes;
    int pArrayIndex, outputType, c;
    int prevSourceIndex, sourceIndex, nextSourceIndex;
    int stage2Entry, value, length, prevLength;
    short unicodeMask;

    try {

        if (preFromUFirstCP >= 0) {
            /*
             * pass sourceIndex=-1 because we continue from an earlier buffer
             * in the future, this may change with continuous offsets
             */
            cr[0] = continueMatchFromU(source, target, offsets,
                    flush, -1);

            /* stop on error, or when preFromULength was left negative (replay pending) */
            if (cr[0].isError() || preFromULength < 0) {
                return cr[0];
            }
        }

        /* use optimized function if possible */
        outputType = sharedData.mbcs.outputType;
        unicodeMask = sharedData.mbcs.unicodeMask;
        if (outputType == MBCS_OUTPUT_1
                && (unicodeMask & UConverterConstants.HAS_SURROGATES) == 0) {
            if ((unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
                cr[0] = cnvMBCSSingleFromBMPWithOffsets(source,
                        target, offsets, flush);
            } else {
                cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(
                        source, target, offsets, flush);
            }
            return cr[0];
        } else if (outputType == MBCS_OUTPUT_2) {
            cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source,
                    target, offsets, flush);
            return cr[0];
        }

        table = sharedData.mbcs.fromUnicodeTable;
        sourceArrayIndex = source.position();

        if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
            bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes;
        } else {
            bytes = sharedData.mbcs.fromUnicodeBytes;
        }

        /* get the converter state from UConverter */
        c = fromUChar32;

        if (outputType == MBCS_OUTPUT_2_SISO) {
            /* for SI/SO converters prevLength tracks the current mode: 1=single-byte, 2=double-byte */
            prevLength = (int) fromUnicodeStatus;
            if (prevLength == 0) {
                /* set the real value */
                prevLength = 1;
            }
        } else {
            /* prevent fromUnicodeStatus from being set to something non-0 */
            prevLength = 0;
        }

        /* sourceIndex=-1 if the current character began in the previous buffer */
        prevSourceIndex = -1;
        sourceIndex = c == 0 ? 0 : -1;
        nextSourceIndex = 0;

        /* conversion loop */
        /*
         * This is another piece of ugly code:
         * A goto into the loop if the converter state contains a first surrogate
         * from the previous function call.
         * It saves me to check in each loop iteration a check of if(c==0)
         * and duplicating the trail-surrogate-handling code in the else
         * branch of that check.
         * I could not find any other way to get around this other than
         * using a function call for the conversion and callback, which would
         * be even more inefficient.
         *
         * Markus Scherer 2000-jul-19
         *
         * In this Java version the goto is emulated: the shared loop state
         * is packed into a SideEffects object, handed to a helper, and then
         * copied back; the helper's boolean result says whether to keep
         * looping.
         */
        boolean doloop = true;
        if (c != 0 && target.hasRemaining()) {
            SideEffects x = new SideEffects(c,
                    sourceArrayIndex, sourceIndex,
                    nextSourceIndex, prevSourceIndex,
                    prevLength);
            doloop = getTrail(source, target, unicodeMask, x,
                    flush, cr);
            c = x.c;
            sourceArrayIndex = x.sourceArrayIndex;
            sourceIndex = x.sourceIndex;
            nextSourceIndex = x.nextSourceIndex;
            prevSourceIndex = x.prevSourceIndex;
            prevLength = x.prevLength;
        }

        if (doloop) {
            while (sourceArrayIndex < source.limit()) {
                /*
                 * This following test is to see if available input would overflow the output.
                 * It does not catch output of more than one byte that
                 * overflows as a result of a multi-byte character or callback output
                 * from the last source character.
                 * Therefore, those situations also test for overflows and will
                 * then break the loop, too.
                 */
                if (target.hasRemaining()) {
                    /*
                     * Get a correct Unicode code point:
                     * a single UChar for a BMP code point or
                     * a matched surrogate pair for a "supplementary code point".
                     */
                    c = source.get(sourceArrayIndex++);
                    ++nextSourceIndex;
                    /*
                     * This also tests if the codepage maps single surrogates.
                     * If it does, then surrogates are not paired but mapped separately.
                     * Note that in this case unmatched surrogates are not detected.
                     */
                    if (UTF16.isSurrogate((char) c)
                            && (unicodeMask & UConverterConstants.HAS_SURROGATES) == 0) {
                        if (UTF16.isLeadSurrogate((char) c)) {
                            //getTrail:
                            SideEffects x = new SideEffects(c,
                                    sourceArrayIndex,
                                    sourceIndex,
                                    nextSourceIndex,
                                    prevSourceIndex, prevLength);
                            doloop = getTrail(source, target,
                                    unicodeMask, x, flush, cr);
                            c = x.c;
                            sourceArrayIndex = x.sourceArrayIndex;
                            sourceIndex = x.sourceIndex;
                            nextSourceIndex = x.nextSourceIndex;
                            prevSourceIndex = x.prevSourceIndex;
                            /*
                             * NOTE(review): unlike the getTrail call before the loop,
                             * x.prevLength is not copied back here - confirm that
                             * getTrail never changes it.
                             */

                            if (doloop)
                                continue;
                            else
                                break;
                        } else {
                            /* this is an unmatched trail code unit (2nd surrogate) */
                            /* callback(illegal) */
                            cr[0] = CoderResult
                                    .malformedForLength(1);
                            break;
                        }
                    }

                    /* convert the Unicode code point in c into codepage bytes */

                    /*
                     * The basic lookup is a triple-stage compact array (trie) lookup.
                     * For details see the beginning of this file.
                     *
                     * Single-byte codepages are handled with a different data structure
                     * by _MBCSSingle... functions.
                     *
                     * The result consists of a 32-bit value from stage 2 and
                     * a pointer to as many bytes as are stored per character.
                     * The pointer points to the character's bytes in stage 3.
                     * Bits 15..0 of the stage 2 entry contain the stage 3 index
                     * for that pointer, while bits 31..16 are flags for which of
                     * the 16 characters in the block are roundtrip-assigned.
                     *
                     * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
                     * respectively as uint32_t, in the platform encoding.
                     * For 3-byte codepages, the bytes are always stored in big-endian order.
                     *
                     * For EUC encodings that use only either 0x8e or 0x8f as the first
                     * byte of their longest byte sequences, the first two bytes in
                     * this third stage indicate with their 7th bits whether these bytes
                     * are to be written directly or actually need to be preceded by
                     * one of the two Single-Shift codes. With this, the third stage
                     * stores one byte fewer per character than the actual maximum length of
                     * EUC byte sequences.
                     *
                     * Other than that, leading zero bytes are removed and the other
                     * bytes output. A single zero byte may be output if the "assigned"
                     * bit in stage 2 was on.
                     * The data structure does not support zero byte output as a fallback,
                     * and also does not allow output of leading zeros.
                     */
                    stage2Entry = MBCS_STAGE_2_FROM_U(table, c);

                    /* get the bytes and the length for the output */
                    switch (outputType) {
                    case MBCS_OUTPUT_2:
                        /*
                         * NOTE(review): MBCS_OUTPUT_2 already returned through
                         * cnvMBCSDoubleFromUnicodeWithOffsets() above, so this
                         * case looks unreachable here.
                         */
                        value = MBCS_VALUE_2_FROM_STAGE_2(
                                bytes, stage2Entry, c);
                        if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
                            length = 1;
                        } else {
                            length = 2;
                        }
                        break;
                    case MBCS_OUTPUT_2_SISO:
                        /* 1/2-byte stateful with Shift-In/Shift-Out */
                        /*
                         * Save the old state in the converter object
                         * right here, then change the local prevLength state variable if necessary.
                         * Then, if this character turns out to be unassigned or a fallback that
                         * is not taken, the callback code must not save the new state in the converter
                         * because the new state is for a character that is not output.
                         * However, the callback must still restore the state from the converter
                         * in case the callback function changed it for its output.
                         */
                        fromUnicodeStatus = prevLength; /* save the old state */
                        value = MBCS_VALUE_2_FROM_STAGE_2(
                                bytes, stage2Entry, c);
                        if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
                            if (value == 0
                                    && MBCS_FROM_U_IS_ROUNDTRIP(
                                            stage2Entry, c) == false) {
                                /* no mapping, leave value==0 */
                                length = 0;
                            } else if (prevLength <= 1) {
                                length = 1;
                            } else {
                                /* change from double-byte mode to single-byte */
                                value |= UConverterConstants.SI << 8;
                                length = 2;
                                prevLength = 1;
                            }
                        } else {
                            if (prevLength == 2) {
                                length = 2;
                            } else {
                                /* change from single-byte mode to double-byte */
                                value |= UConverterConstants.SO << 16;
                                length = 3;
                                prevLength = 2;
                            }
                        }
                        break;
                    case MBCS_OUTPUT_DBCS_ONLY:
                        /* table with single-byte results, but only DBCS mappings used */
                        value = MBCS_VALUE_2_FROM_STAGE_2(
                                bytes, stage2Entry, c);
                        if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
                            /* no mapping or SBCS result, not taken for DBCS-only */
                            value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
                            length = 0;
                        } else {
                            length = 2;
                        }
                        break;
                    case MBCS_OUTPUT_3:
                        pArray = bytes;
                        pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(
                                bytes, stage2Entry, c);
                        /* assemble the three stored bytes, big-endian */
                        value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
                                | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
                                | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
                        if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
                            length = 1;
                        } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
                            length = 2;
                        } else {
                            length = 3;
                        }
                        break;
                    case MBCS_OUTPUT_4:
                        value = MBCS_VALUE_4_FROM_STAGE_2(
                                bytes, stage2Entry, c);
                        if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
                            length = 1;
                        } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
                            length = 2;
                        } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffffff) {
                            length = 3;
                        } else {
                            length = 4;
                        }
                        break;
                    case MBCS_OUTPUT_3_EUC:
                        value = MBCS_VALUE_2_FROM_STAGE_2(
                                bytes, stage2Entry, c);
                        /* EUC 16-bit fixed-length representation */
                        if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
                            length = 1;
                        } else if ((value & 0x8000) == 0) {
                            /* prefix with 0x8e and set the cleared high bit of the first stored byte */
                            value |= 0x8e8000;
                            length = 3;
                        } else if ((value & 0x80) == 0) {
                            /* prefix with 0x8f and set the cleared high bit of the second stored byte */
                            value |= 0x8f0080;
                            length = 3;
                        } else {
                            length = 2;
                        }
                        break;
                    case MBCS_OUTPUT_4_EUC:
                        pArray = bytes;
                        pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(
                                bytes, stage2Entry, c);
                        /* assemble the three stored bytes, big-endian */
                        value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
                                | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
                                | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
                        /* EUC 16-bit fixed-length representation applied to the first two bytes */
                        if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
                            length = 1;
                        } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
                            length = 2;
                        } else if ((value & 0x800000) == 0) {
                            value |= 0x8e800000;
                            length = 4;
                        } else if ((value & 0x8000) == 0) {
                            value |= 0x8f008000;
                            length = 4;
                        } else {
                            length = 3;
                        }
                        break;
                    default:
                        /* must not occur */
                        /*
                         * To avoid compiler warnings that value & length may be
                         * used without having been initialized, we set them here.
                         * In reality, this is unreachable code.
                         * Not having a default branch also causes warnings with
                         * some compilers.
                         */
                        value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
                        length = 0;
                        break;
                    }

                    /* is this code point assigned, or do we use fallbacks? */
                    if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry,
                            c) || (isFromUUseFallback(c) && value != 0))) {
                        /*
                         * We allow a 0 byte output if the "assigned" bit is set for this entry.
                         * There is no way with this data structure for fallback output
                         * to be a zero byte.
                         */

                        //unassigned:
                        SideEffects x = new SideEffects(c,
                                sourceArrayIndex, sourceIndex,
                                nextSourceIndex,
                                prevSourceIndex, prevLength);
                        doloop = unassigned(source, target,
                                offsets, x, flush, cr);
                        c = x.c;
                        sourceArrayIndex = x.sourceArrayIndex;
                        sourceIndex = x.sourceIndex;
                        nextSourceIndex = x.nextSourceIndex;
                        prevSourceIndex = x.prevSourceIndex;
                        prevLength = x.prevLength;
                        if (doloop)
                            continue;
                        else
                            break;
                    }

                    /* write the output character bytes from value and length */
                    /* from the first if in the loop we know that targetCapacity>0 */
                    if (length <= target.remaining()) {
                        /* everything fits: write the bytes most-significant first */
                        if (offsets == null) {
                            switch (length) {
                            /* each branch falls through to the next one */
                            case 4:
                                target
                                        .put((byte) (value >>> 24));
                            case 3:
                                target
                                        .put((byte) (value >>> 16));
                            case 2:
                                target
                                        .put((byte) (value >>> 8));
                            case 1:
                                target.put((byte) value);
                            default:
                                /* will never occur */
                                break;
                            }
                        } else {
                            switch (length) {
                            /* each branch falls through to the next one */
                            case 4:
                                target
                                        .put((byte) (value >>> 24));
                                offsets.put(sourceIndex);
                            case 3:
                                target
                                        .put((byte) (value >>> 16));
                                offsets.put(sourceIndex);
                            case 2:
                                target
                                        .put((byte) (value >>> 8));
                                offsets.put(sourceIndex);
                            case 1:
                                target.put((byte) value);
                                offsets.put(sourceIndex);
                            default:
                                /* will never occur */
                                break;
                            }
                        }
                    } else {
                        int errorBufferArrayIndex;

                        /*
                         * We actually do this backwards here:
                         * In order to save an intermediate variable, we output
                         * first to the overflow buffer what does not fit into the
                         * regular target.
                         */
                        /* we know that 1<=targetCapacity<length<=4 */
                        /* from here on, length is the number of bytes that do NOT fit */
                        length -= target.remaining();

                        errorBufferArrayIndex = 0;
                        switch (length) {
                        /* each branch falls through to the next one */
                        case 3:
                            errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 16);
                        case 2:
                            errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 8);
                        case 1:
                            errorBuffer[errorBufferArrayIndex] = (byte) value;
                        default:
                            /* will never occur */
                            break;
                        }
                        errorBufferLength = (byte) length;

                        /* now output what fits into the regular target */
                        value >>>= 8 * length; /* length was reduced by targetCapacity */
                        switch (target.remaining()) {
                        /* each branch falls through to the next one */
                        case 3:
                            target.put((byte) (value >>> 16));
                            if (offsets != null) {
                                offsets.put(sourceIndex);
                            }
                        case 2:
                            target.put((byte) (value >>> 8));
                            if (offsets != null) {
                                offsets.put(sourceIndex);
                            }
                        case 1:
                            target.put((byte) value);
                            if (offsets != null) {
                                offsets.put(sourceIndex);
                            }
                        default:
                            /* will never occur */
                            break;
                        }

                        /* target overflow */
                        cr[0] = CoderResult.OVERFLOW;
                        c = 0;
                        break;
                    }

                    /* normal end of conversion: prepare for a new character */
                    c = 0;
                    if (offsets != null) {
                        prevSourceIndex = sourceIndex;
                        sourceIndex = nextSourceIndex;
                    }
                    continue;
                } else {
                    /* target is full */
                    cr[0] = CoderResult.OVERFLOW;
                    break;
                }
            }
        }

        /*
         * the end of the input stream and detection of truncated input
         * are handled by the framework, but for EBCDIC_STATEFUL conversion
         * we need to emit an SI at the very end
         *
         * conditions:
         * successful
         * EBCDIC_STATEFUL in DBCS mode
         * end of input and no truncated input
         */
        if (outputType == MBCS_OUTPUT_2_SISO && prevLength == 2
                && flush && sourceArrayIndex >= source.limit()
                && c == 0) {

            /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
            if (target.hasRemaining()) {
                target.put((byte) UConverterConstants.SI);
                if (offsets != null) {
                    /* set the last source character's index (sourceIndex points at sourceLimit now) */
                    offsets.put(prevSourceIndex);
                }
            } else {
                /* target is full */
                errorBuffer[0] = (byte) UConverterConstants.SI;
                errorBufferLength = 1;
                cr[0] = CoderResult.OVERFLOW;
            }
            prevLength = 1; /* we switched into SBCS */
        }

        /* set the converter state back into UConverter */
        fromUChar32 = c;
        fromUnicodeStatus = prevLength;

        source.position(sourceArrayIndex);
    } catch (BufferOverflowException ex) {
        /*
         * A put() ran out of room; report it as a plain overflow.
         * NOTE(review): when this catch fires, the state write-back at the
         * end of the try block (fromUChar32, fromUnicodeStatus, source
         * position) has been skipped - confirm that this is intended.
         */
        cr[0] = CoderResult.OVERFLOW;
    }

    return cr[0];
}
2656:
2657: /*
2658: * continue partial match with new input, requires cnv->preFromUFirstCP>=0
2659: * never called for simple, single-character conversion
2660: */
2661: protected CoderResult continueMatchFromU(CharBuffer source,
2662: ByteBuffer target, IntBuffer offsets, boolean flush,
2663: int srcIndex) {
2664: CoderResult cr = CoderResult.UNDERFLOW;
2665: int[] value = new int[1];
2666: int match;
2667:
2668: match = matchFromU(preFromUFirstCP, preFromUArray,
2669: preFromUBegin, preFromULength, source, target,
2670: value, flush);
2671: if (match >= 2) {
2672: match -= 2; /* remove 2 for the initial code point */
2673:
2674: if (match >= preFromULength) {
2675: /* advance src pointer for the consumed input */
2676: source.position(source.position() + match
2677: - preFromULength);
2678: preFromULength = 0;
2679: } else {
2680: /* the match did not use all of preFromU[] - keep the rest for replay */
2681: int length = preFromULength - match;
2682: System.arraycopy(preFromUArray, preFromUBegin
2683: + match, preFromUArray, preFromUBegin,
2684: length);
2685: preFromULength = (byte) -length;
2686: }
2687:
2688: /* finish the partial match */
2689: preFromUFirstCP = UConverterConstants.U_SENTINEL;
2690:
2691: /* write result */
2692: writeFromU(value[0], target, offsets, srcIndex);
2693: } else if (match < 0) {
2694: /* save state for partial match */
2695: int sArrayIndex;
2696: int j;
2697:
2698: /* just _append_ the newly consumed input to preFromU[] */
2699: sArrayIndex = source.position();
2700: match = -match - 2; /* remove 2 for the initial code point */
2701: for (j = preFromULength; j < match; ++j) {
2702: preFromUArray[j] = source.get(sArrayIndex++);
2703: }
2704: source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
2705: preFromULength = (byte) match;
2706: } else /* match==0 or 1 */{
2707: /*
2708: * no match
2709: *
2710: * We need to split the previous input into two parts:
2711: *
2712: * 1. The first code point is unmappable - that's how we got into
2713: * trying the extension data in the first place.
2714: * We need to move it from the preFromU buffer
2715: * to the error buffer, set an error code,
2716: * and prepare the rest of the previous input for 2.
2717: *
2718: * 2. The rest of the previous input must be converted once we
2719: * come back from the callback for the first code point.
2720: * At that time, we have to try again from scratch to convert
2721: * these input characters.
2722: * The replay will be handled by the ucnv.c conversion code.
2723: */
2724:
2725: if (match == 1) {
2726: /* matched, no mapping but request for <subchar1> */
2727: useSubChar1 = true;
2728: }
2729:
2730: /* move the first code point to the error field */
2731: fromUChar32 = preFromUFirstCP;
2732: preFromUFirstCP = UConverterConstants.U_SENTINEL;
2733:
2734: /* mark preFromU for replay */
2735: preFromULength = (byte) -preFromULength;
2736:
2737: /* set the error code for unassigned */
2738: cr = CoderResult.unmappableForLength(source.position());
2739: }
2740: return cr;
2741: }
2742:
2743: /*
2744: * @param cx pointer to extension data; if NULL, returns 0
2745: * @param firstCP the first code point before all the other UChars
2746: * @param pre UChars that must match; !initialMatch: partial match with them
2747: * @param preLength length of pre, >=0
2748: * @param src UChars that can be used to complete a match
2749: * @param srcLength length of src, >=0
2750: * @param pMatchValue [out] output result value for the match from the data structure
2751: * @param useFallback "use fallback" flag, usually from cnv->useFallback
2752: * @param flush TRUE if the end of the input stream is reached
2753: * @return >1: matched, return value=total match length (number of input units matched)
2754: * 1: matched, no mapping but request for <subchar1>
2755: * (only for the first code point)
2756: * 0: no match
2757: * <0: partial match, return value=negative total match length
2758: * (partial matches are never returned for flush==TRUE)
2759: * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
2760: * the matchLength is 2 if only firstCP matched, and >2 if firstCP and
2761: * further code units matched
2762: */
2763: //static int32_t ucnv_extMatchFromU(const int32_t *cx, UChar32 firstCP, const UChar *pre, int32_t preLength, const UChar *src, int32_t srcLength, uint32_t *pMatchValue, UBool useFallback, UBool flush)
2764: protected int matchFromU(int firstCP, char[] preArray,
2765: int preArrayBegin, int preLength, CharBuffer source,
2766: ByteBuffer target, int[] pMatchValue, boolean flush) {
2767: ByteBuffer cx = sharedData.mbcs.extIndexes;
2768:
2769: CharBuffer stage12, stage3;
2770: IntBuffer stage3b;
2771:
2772: CharBuffer fromUTableUChars, fromUSectionUChars;
2773: IntBuffer fromUTableValues, fromUSectionValues;
2774:
2775: int value, matchValue;
2776: int i, j, index, length, matchLength;
2777: char c;
2778:
2779: if (cx == null) {
2780: return 0; /* no extension data, no match */
2781: }
2782:
2783: /* trie lookup of firstCP */
2784: index = firstCP >>> 10; /* stage 1 index */
2785: if (index >= cx.asIntBuffer()
2786: .get(EXT_FROM_U_STAGE_1_LENGTH)) {
2787: return 0; /* the first code point is outside the trie */
2788: }
2789:
2790: stage12 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX,
2791: char.class);
2792: stage3 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX,
2793: char.class);
2794: index = FROM_U(stage12, stage3, index, firstCP);
2795:
2796: stage3b = (IntBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX,
2797: int.class);
2798: value = stage3b.get(stage3b.position() + index);
2799: if (value == 0) {
2800: return 0;
2801: }
2802:
2803: if (TO_U_IS_PARTIAL(value)) {
2804: /* partial match, enter the loop below */
2805: index = FROM_U_GET_PARTIAL_INDEX(value);
2806:
2807: /* initialize */
2808: fromUTableUChars = (CharBuffer) ARRAY(cx,
2809: EXT_FROM_U_UCHARS_INDEX, char.class);
2810: fromUTableValues = (IntBuffer) ARRAY(cx,
2811: EXT_FROM_U_VALUES_INDEX, int.class);
2812:
2813: matchValue = 0;
2814: i = j = matchLength = 0;
2815:
2816: /* we must not remember fallback matches when not using fallbacks */
2817:
2818: /* match input units until there is a full match or the input is consumed */
2819: for (;;) {
2820: /* go to the next section */
2821: int oldpos = fromUTableUChars.position();
2822: fromUSectionUChars = ((CharBuffer) fromUTableUChars
2823: .position(index)).slice();
2824: fromUTableUChars.position(oldpos);
2825: oldpos = fromUTableValues.position();
2826: fromUSectionValues = ((IntBuffer) fromUTableValues
2827: .position(index)).slice();
2828: fromUTableValues.position(oldpos);
2829:
2830: /* read first pair of the section */
2831: length = fromUSectionUChars.get();
2832: value = fromUSectionValues.get();
2833: if (value != 0
2834: && (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(firstCP))) {
2835: /* remember longest match so far */
2836: matchValue = value;
2837: matchLength = 2 + i + j;
2838: }
2839:
2840: /* match pre[] then src[] */
2841: if (i < preLength) {
2842: c = preArray[preArrayBegin + i++];
2843: } else if (j < source.remaining()) {
2844: c = source.get(source.position() + j++);
2845: } else {
2846: /* all input consumed, partial match */
2847: if (flush || (length = (i + j)) > MAX_UCHARS) {
2848: /*
2849: * end of the entire input stream, stop with the longest match so far
2850: * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
2851: * because it must fit into state buffers
2852: */
2853: break;
2854: } else {
2855: /* continue with more input next time */
2856: return -(2 + length);
2857: }
2858: }
2859:
2860: /* search for the current UChar */
2861: index = findFromU(fromUSectionUChars, length, c);
2862: if (index < 0) {
2863: /* no match here, stop with the longest match so far */
2864: break;
2865: } else {
2866: value = fromUSectionValues
2867: .get(fromUSectionValues.position()
2868: + index);
2869: if (FROM_U_IS_PARTIAL(value)) {
2870: /* partial match, continue */
2871: index = FROM_U_GET_PARTIAL_INDEX(value);
2872: } else {
2873: if (FROM_U_IS_ROUNDTRIP(value)
2874: || isFromUUseFallback(firstCP)) {
2875: /* full match, stop with result */
2876: matchValue = value;
2877: matchLength = 2 + i + j;
2878: } else {
2879: /* full match on fallback not taken, stop with the longest match so far */
2880: }
2881: break;
2882: }
2883: }
2884: }
2885:
2886: if (matchLength == 0) {
2887: /* no match at all */
2888: return 0;
2889: }
2890: } else /* result from firstCP trie lookup */{
2891: if (FROM_U_IS_ROUNDTRIP(value)
2892: || isFromUUseFallback(firstCP)) {
2893: /* full match, stop with result */
2894: matchValue = value;
2895: matchLength = 2;
2896: } else {
2897: /* fallback not taken */
2898: return 0;
2899: }
2900: }
2901:
2902: if ((matchValue & FROM_U_RESERVED_MASK) != 0) {
2903: /* do not interpret values with reserved bits used, for forward compatibility */
2904: return 0;
2905: }
2906:
2907: /* return result */
2908: if (matchValue == FROM_U_SUBCHAR1) {
2909: return 1; /* assert matchLength==2 */
2910: }
2911:
2912: pMatchValue[0] = FROM_U_MASK_ROUNDTRIP(matchValue);
2913: return matchLength;
2914: }
2915:
2916: protected CoderResult writeFromU(int value, ByteBuffer target,
2917: IntBuffer offsets, int srcIndex) {
2918: ByteBuffer cx = sharedData.mbcs.extIndexes;
2919:
2920: byte bufferArray[] = new byte[1 + MAX_BYTES];
2921: int bufferArrayIndex = 0;
2922: byte[] resultArray;
2923: int resultArrayIndex;
2924: int length, prevLength;
2925:
2926: length = FROM_U_GET_LENGTH(value);
2927: value = FROM_U_GET_DATA(value);
2928:
2929: /* output the result */
2930: if (length <= FROM_U_MAX_DIRECT_LENGTH) {
2931: /*
2932: * Generate a byte array and then write it below.
2933: * This is not the fastest possible way, but it should be ok for
2934: * extension mappings, and it is much simpler.
2935: * Offset and overflow handling are only done once this way.
2936: */
2937: int p = bufferArrayIndex + 1; /* reserve buffer[0] for shiftByte below */
2938: switch (length) {
2939: case 3:
2940: bufferArray[p++] = (byte) (value >>> 16);
2941: case 2:
2942: bufferArray[p++] = (byte) (value >>> 8);
2943: case 1:
2944: bufferArray[p++] = (byte) value;
2945: default:
2946: break; /* will never occur */
2947: }
2948: resultArray = bufferArray;
2949: resultArrayIndex = bufferArrayIndex + 1;
2950: } else {
2951: byte[] slice = new byte[length];
2952:
2953: ByteBuffer bb = ((ByteBuffer) ARRAY(cx,
2954: EXT_FROM_U_BYTES_INDEX, byte.class));
2955: bb.position(value);
2956: bb.get(slice, 0, slice.length);
2957:
2958: resultArray = slice;
2959: resultArrayIndex = 0;
2960: }
2961:
2962: /* with correct data we have length>0 */
2963:
2964: if ((prevLength = (int) fromUnicodeStatus) != 0) {
2965: /* handle SI/SO stateful output */
2966: byte shiftByte;
2967:
2968: if (prevLength > 1 && length == 1) {
2969: /* change from double-byte mode to single-byte */
2970: shiftByte = (byte) UConverterConstants.SI;
2971: fromUnicodeStatus = 1;
2972: } else if (prevLength == 1 && length > 1) {
2973: /* change from single-byte mode to double-byte */
2974: shiftByte = (byte) UConverterConstants.SO;
2975: fromUnicodeStatus = 2;
2976: } else {
2977: shiftByte = 0;
2978: }
2979:
2980: if (shiftByte != 0) {
2981: /* prepend the shift byte to the result bytes */
2982: bufferArray[0] = shiftByte;
2983: if (resultArray != bufferArray
2984: || resultArrayIndex != bufferArrayIndex + 1) {
2985: System.arraycopy(resultArray, resultArrayIndex,
2986: bufferArray, bufferArrayIndex + 1,
2987: length);
2988: }
2989: resultArray = bufferArray;
2990: resultArrayIndex = bufferArrayIndex;
2991: ++length;
2992: }
2993: }
2994:
2995: return fromUWriteBytes(this , resultArray, resultArrayIndex,
2996: length, target, offsets, srcIndex);
2997: }
2998:
2999: /*
3000: * @return if(U_FAILURE) return the code point for cnv->fromUChar32
3001: * else return 0 after output has been written to the target
3002: */
3003: protected int fromU(int cp_, CharBuffer source,
3004: ByteBuffer target, IntBuffer offsets, int sourceIndex,
3005: boolean flush, CoderResult[] cr) {
3006: //ByteBuffer cx;
3007: long cp = cp_ & UConverterConstants.UNSIGNED_INT_MASK;
3008:
3009: useSubChar1 = false;
3010:
3011: if (sharedData.mbcs.extIndexes != null
3012: && initialMatchFromU((int) cp, source, target,
3013: offsets, sourceIndex, flush, cr)) {
3014: return 0; /* an extension mapping handled the input */
3015: }
3016:
3017: /* GB 18030 */
3018: if ((options & MBCS_OPTION_GB18030) != 0) {
3019: long[] range;
3020: int i;
3021:
3022: for (i = 0; i < gb18030Ranges.length; ++i) {
3023: range = gb18030Ranges[i];
3024: if (range[0] <= cp && cp <= range[1]) {
3025: /* found the Unicode code point, output the four-byte sequence for it */
3026: long linear;
3027: byte bytes[] = new byte[4];
3028:
3029: /* get the linear value of the first GB 18030 code in this range */
3030: linear = range[2] - LINEAR_18030_BASE;
3031:
3032: /* add the offset from the beginning of the range */
3033: linear += (cp - range[0]);
3034:
3035: bytes[3] = (byte) (0x30 + linear % 10);
3036: linear /= 10;
3037: bytes[2] = (byte) (0x81 + linear % 126);
3038: linear /= 126;
3039: bytes[1] = (byte) (0x30 + linear % 10);
3040: linear /= 10;
3041: bytes[0] = (byte) (0x81 + linear);
3042:
3043: /* output this sequence */
3044: cr[0] = fromUWriteBytes(this , bytes, 0, 4,
3045: target, offsets, sourceIndex);
3046: return 0;
3047: }
3048: }
3049: }
3050:
3051: /* no mapping */
3052: cr[0] = CoderResult.unmappableForLength(1);
3053: return (int) cp;
3054: }
3055:
3056: /*
3057: * target<targetLimit; set error code for overflow
3058: */
3059: protected boolean initialMatchFromU(int cp, CharBuffer source,
3060: ByteBuffer target, IntBuffer offsets, int srcIndex,
3061: boolean flush, CoderResult[] cr) {
3062: int[] value = new int[1];
3063: int match;
3064:
3065: /* try to match */
3066: match = matchFromU(cp, null, 0, 0, source, target, value,
3067: flush);
3068:
3069: /* reject a match if the result is a single byte for DBCS-only */
3070: if (match >= 2
3071: && !(FROM_U_GET_LENGTH(value[0]) == 1 && sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY)) {
3072: /* advance src pointer for the consumed input */
3073: source.position(source.position() + match - 2); /* remove 2 for the initial code point */
3074:
3075: /* write result to target */
3076: cr[0] = writeFromU(value[0], target, offsets, srcIndex);
3077: return true;
3078: } else if (match < 0) {
3079: /* save state for partial match */
3080: int sArrayIndex;
3081: int j;
3082:
3083: /* copy the first code point */
3084: preFromUFirstCP = cp;
3085:
3086: /* now copy the newly consumed input */
3087: sArrayIndex = source.position();
3088: match = -match - 2; /* remove 2 for the initial code point */
3089: for (j = 0; j < match; ++j) {
3090: preFromUArray[j] = source.get(sArrayIndex++);
3091: }
3092: source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
3093: preFromULength = (byte) match;
3094: return true;
3095: } else if (match == 1) {
3096: /* matched, no mapping but request for <subchar1> */
3097: useSubChar1 = true;
3098: return false;
3099: } else /* match==0 no match */{
3100: return false;
3101: }
3102: }
3103:
3104: /*
3105: * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
3106: * that map only to and from the BMP.
3107: * In addition to single-byte/state optimizations, the offset calculations
3108: * become much easier.
3109: */
3110: protected CoderResult cnvMBCSSingleFromBMPWithOffsets(
3111: CharBuffer source, ByteBuffer target,
3112: IntBuffer offsets, boolean flush) {
3113:
3114: CoderResult[] cr = { CoderResult.UNDERFLOW };
3115:
3116: int sourceArrayIndex, lastSource;
3117: int targetCapacity, length;
3118: char[] table;
3119: byte[] results;
3120:
3121: int c, sourceIndex;
3122: char value, minValue;
3123:
3124: /* set up the local pointers */
3125: sourceArrayIndex = source.position();
3126: targetCapacity = target.remaining();
3127: table = sharedData.mbcs.fromUnicodeTable;
3128:
3129: if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
3130: results = sharedData.mbcs.swapLFNLFromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it?
3131: } else {
3132: results = sharedData.mbcs.fromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it?
3133: }
3134:
3135: if (useFallback) {
3136: /* use all roundtrip and fallback results */
3137: minValue = 0x800;
3138: } else {
3139: /* use only roundtrips and fallbacks from private-use characters */
3140: minValue = 0xc00;
3141: }
3142:
3143: /* get the converter state from UConverter */
3144: c = fromUChar32;
3145:
3146: /* sourceIndex=-1 if the current character began in the previous buffer */
3147: sourceIndex = c == 0 ? 0 : -1;
3148: lastSource = sourceArrayIndex;
3149:
3150: /*
3151: * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3152: * for the minimum of the sourceLength and targetCapacity
3153: */
3154: length = source.limit() - sourceArrayIndex;
3155: if (length < targetCapacity) {
3156: targetCapacity = length;
3157: }
3158:
3159: boolean doloop = true;
3160: if (c != 0 && targetCapacity > 0) {
3161: SideEffectsSingleBMP x = new SideEffectsSingleBMP(c,
3162: sourceArrayIndex);
3163: doloop = getTrailSingleBMP(source, x, cr);
3164: c = x.c;
3165: sourceArrayIndex = x.sourceArrayIndex;
3166: }
3167:
3168: if (doloop) {
3169: while (targetCapacity > 0) {
3170: /*
3171: * Get a correct Unicode code point:
3172: * a single UChar for a BMP code point or
3173: * a matched surrogate pair for a "supplementary code point".
3174: */
3175: c = source.get(sourceArrayIndex++);
3176: /*
3177: * Do not immediately check for single surrogates:
3178: * Assume that they are unassigned and check for them in that case.
3179: * This speeds up the conversion of assigned characters.
3180: */
3181: /* convert the Unicode code point in c into codepage bytes */
3182: value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3183:
3184: /* is this code point assigned, or do we use fallbacks? */
3185: if (value >= minValue) {
3186: /* assigned, write the output character bytes from value and length */
3187: /* length==1 */
3188: /* this is easy because we know that there is enough space */
3189: target.put((byte) value);
3190: --targetCapacity;
3191:
3192: /* normal end of conversion: prepare for a new character */
3193: c = 0;
3194: continue;
3195: } else if (!UTF16.isSurrogate((char) c)) {
3196: /* normal, unassigned BMP character */
3197: } else if (UTF16.isLeadSurrogate((char) c)) {
3198: //getTrail:
3199: SideEffectsSingleBMP x = new SideEffectsSingleBMP(
3200: c, sourceArrayIndex);
3201: doloop = getTrailSingleBMP(source, x, cr);
3202: c = x.c;
3203: sourceArrayIndex = x.sourceArrayIndex;
3204: if (!doloop)
3205: break;
3206: } else {
3207: /* this is an unmatched trail code unit (2nd surrogate) */
3208: /* callback(illegal) */
3209: cr[0] = CoderResult.malformedForLength(1);
3210: break;
3211: }
3212:
3213: /* c does not have a mapping */
3214:
3215: /* get the number of code units for c to correctly advance sourceIndex */
3216: length = UTF16.getCharCount(c);
3217:
3218: /* set offsets since the start or the last extension */
3219: if (offsets != null) {
3220: int count = sourceArrayIndex - lastSource;
3221:
3222: /* do not set the offset for this character */
3223: count -= length;
3224:
3225: while (count > 0) {
3226: offsets.put(sourceIndex++);
3227: --count;
3228: }
3229: /* offsets and sourceIndex are now set for the current character */
3230: }
3231:
3232: /* try an extension mapping */
3233: lastSource = sourceArrayIndex;
3234: source.position(sourceArrayIndex);
3235: c = fromU(c, source, target, offsets, sourceIndex,
3236: flush, cr);
3237: sourceArrayIndex = source.position();
3238: sourceIndex += length
3239: + (sourceArrayIndex - lastSource);
3240: lastSource = sourceArrayIndex;
3241:
3242: if (cr[0].isError()) {
3243: /* not mappable or buffer overflow */
3244: break;
3245: } else {
3246: /* a mapping was written to the target, continue */
3247:
3248: /* recalculate the targetCapacity after an extension mapping */
3249: targetCapacity = target.remaining();
3250: length = source.limit() - sourceArrayIndex;
3251: if (length < targetCapacity) {
3252: targetCapacity = length;
3253: }
3254: }
3255: }
3256: }
3257:
3258: if (sourceArrayIndex < source.limit()
3259: && !target.hasRemaining()) {
3260: /* target is full */
3261: cr[0] = CoderResult.OVERFLOW;
3262: }
3263:
3264: /* set offsets since the start or the last callback */
3265: if (offsets != null) {
3266: int count = sourceArrayIndex - lastSource;
3267: while (count > 0) {
3268: offsets.put(sourceIndex++);
3269: --count;
3270: }
3271: }
3272:
3273: /* set the converter state back into UConverter */
3274: fromUChar32 = c;
3275:
3276: /* write back the updated pointers */
3277: source.position(sourceArrayIndex);
3278:
3279: return cr[0];
3280: }
3281:
3282: /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
3283: protected CoderResult cnvMBCSSingleFromUnicodeWithOffsets(
3284: CharBuffer source, ByteBuffer target,
3285: IntBuffer offsets, boolean flush) {
3286:
3287: CoderResult[] cr = { CoderResult.UNDERFLOW };
3288:
3289: int sourceArrayIndex;
3290:
3291: char[] table;
3292: byte[] results; //agljport:comment results is used to to get 16-bit values out of byte[] array
3293:
3294: int c;
3295: int sourceIndex, nextSourceIndex;
3296:
3297: char value, minValue;
3298:
3299: /* set up the local pointers */
3300: short unicodeMask;
3301: sourceArrayIndex = source.position();
3302:
3303: table = sharedData.mbcs.fromUnicodeTable;
3304:
3305: if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
3306: results = sharedData.mbcs.swapLFNLFromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it?
3307: } else {
3308: results = sharedData.mbcs.fromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it?
3309: }
3310:
3311: if (useFallback) {
3312: /* use all roundtrip and fallback results */
3313: minValue = 0x800;
3314: } else {
3315: /* use only roundtrips and fallbacks from private-use characters */
3316: minValue = 0xc00;
3317: }
3318: //agljport:comment hasSupplementary only used in getTrail block which now simply repeats the mask operation
3319: unicodeMask = sharedData.mbcs.unicodeMask;
3320:
3321: /* get the converter state from UConverter */
3322: c = fromUChar32;
3323:
3324: /* sourceIndex=-1 if the current character began in the previous buffer */
3325: sourceIndex = c == 0 ? 0 : -1;
3326: nextSourceIndex = 0;
3327:
3328: boolean doloop = true;
3329: if (c != 0 && target.hasRemaining()) {
3330: SideEffectsDouble x = new SideEffectsDouble(c,
3331: sourceArrayIndex, sourceIndex, nextSourceIndex);
3332: doloop = getTrailDouble(source, target, unicodeMask, x,
3333: flush, cr);
3334: c = x.c;
3335: sourceArrayIndex = x.sourceArrayIndex;
3336: sourceIndex = x.sourceIndex;
3337: nextSourceIndex = x.nextSourceIndex;
3338: }
3339:
3340: if (doloop) {
3341: while (sourceArrayIndex < source.limit()) {
3342: /*
3343: * This following test is to see if available input would overflow the output.
3344: * It does not catch output of more than one byte that
3345: * overflows as a result of a multi-byte character or callback output
3346: * from the last source character.
3347: * Therefore, those situations also test for overflows and will
3348: * then break the loop, too.
3349: */
3350: if (target.hasRemaining()) {
3351: /*
3352: * Get a correct Unicode code point:
3353: * a single UChar for a BMP code point or
3354: * a matched surrogate pair for a "supplementary code point".
3355: */
3356: c = source.get(sourceArrayIndex++);
3357: ++nextSourceIndex;
3358: if (UTF16.isSurrogate((char) c)) {
3359: if (UTF16.isLeadSurrogate((char) c)) {
3360: //getTrail:
3361: SideEffectsDouble x = new SideEffectsDouble(
3362: c, sourceArrayIndex,
3363: sourceIndex, nextSourceIndex);
3364: doloop = getTrailDouble(source, target,
3365: unicodeMask, x, flush, cr);
3366: c = x.c;
3367: sourceArrayIndex = x.sourceArrayIndex;
3368: sourceIndex = x.sourceIndex;
3369: nextSourceIndex = x.nextSourceIndex;
3370: if (doloop)
3371: continue;
3372: else
3373: break;
3374: } else {
3375: /* this is an unmatched trail code unit (2nd surrogate) */
3376: /* callback(illegal) */
3377: cr[0] = CoderResult
3378: .malformedForLength(1);
3379: break;
3380: }
3381: }
3382:
3383: /* convert the Unicode code point in c into codepage bytes */
3384: value = MBCS_SINGLE_RESULT_FROM_U(table,
3385: results, c);
3386:
3387: /* is this code point assigned, or do we use fallbacks? */
3388: if (value >= minValue) {
3389: /* assigned, write the output character bytes from value and length */
3390: /* length==1 */
3391: /* this is easy because we know that there is enough space */
3392: target.put((byte) value);
3393: if (offsets != null) {
3394: offsets.put(sourceIndex);
3395: }
3396:
3397: /* normal end of conversion: prepare for a new character */
3398: c = 0;
3399: sourceIndex = nextSourceIndex;
3400: } else { /* unassigned */
3401: /* try an extension mapping */
3402: SideEffectsDouble x = new SideEffectsDouble(
3403: c, sourceArrayIndex, sourceIndex,
3404: nextSourceIndex);
3405: doloop = unassignedDouble(source, target,
3406: x, flush, cr);
3407: c = x.c;
3408: sourceArrayIndex = x.sourceArrayIndex;
3409: sourceIndex = x.sourceIndex;
3410: nextSourceIndex = x.nextSourceIndex;
3411: if (!doloop)
3412: break;
3413: }
3414: } else {
3415: /* target is full */
3416: cr[0] = CoderResult.OVERFLOW;
3417: break;
3418: }
3419: }
3420: }
3421:
3422: /* set the converter state back into UConverter */
3423: fromUChar32 = c;
3424:
3425: /* write back the updated pointers */
3426: source.position(sourceArrayIndex);
3427:
3428: return cr[0];
3429: }
3430:
3431: /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
3432: protected CoderResult cnvMBCSDoubleFromUnicodeWithOffsets(
3433: CharBuffer source, ByteBuffer target,
3434: IntBuffer offsets, boolean flush) {
3435: CoderResult[] cr = { CoderResult.UNDERFLOW };
3436:
3437: int sourceArrayIndex;
3438:
3439: char[] table;
3440: byte[] bytes;
3441:
3442: int c, sourceIndex, nextSourceIndex;
3443:
3444: int stage2Entry;
3445: int value;
3446: int length;
3447: short unicodeMask;
3448:
3449: /* use optimized function if possible */
3450: unicodeMask = sharedData.mbcs.unicodeMask;
3451:
3452: /* set up the local pointers */
3453: sourceArrayIndex = source.position();
3454:
3455: table = sharedData.mbcs.fromUnicodeTable;
3456:
3457: if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
3458: bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes;
3459: } else {
3460: bytes = sharedData.mbcs.fromUnicodeBytes;
3461: }
3462:
3463: /* get the converter state from UConverter */
3464: c = fromUChar32;
3465:
3466: /* sourceIndex=-1 if the current character began in the previous buffer */
3467: sourceIndex = c == 0 ? 0 : -1;
3468: nextSourceIndex = 0;
3469:
3470: /* conversion loop */
3471: boolean doloop = true;
3472: if (c != 0 && target.hasRemaining()) {
3473: SideEffectsDouble x = new SideEffectsDouble(c,
3474: sourceArrayIndex, sourceIndex, nextSourceIndex);
3475: doloop = getTrailDouble(source, target, unicodeMask, x,
3476: flush, cr);
3477: c = x.c;
3478: sourceArrayIndex = x.sourceArrayIndex;
3479: sourceIndex = x.sourceIndex;
3480: nextSourceIndex = x.nextSourceIndex;
3481: }
3482:
3483: if (doloop) {
3484: while (sourceArrayIndex < source.limit()) {
3485: /*
3486: * This following test is to see if available input would overflow the output.
3487: * It does not catch output of more than one byte that
3488: * overflows as a result of a multi-byte character or callback output
3489: * from the last source character.
3490: * Therefore, those situations also test for overflows and will
3491: * then break the loop, too.
3492: */
3493: if (target.hasRemaining()) {
3494: /*
3495: * Get a correct Unicode code point:
3496: * a single UChar for a BMP code point or
3497: * a matched surrogate pair for a "supplementary code point".
3498: */
3499: c = source.get(sourceArrayIndex++);
3500: ++nextSourceIndex;
3501: /*
3502: * This also tests if the codepage maps single surrogates.
3503: * If it does, then surrogates are not paired but mapped separately.
3504: * Note that in this case unmatched surrogates are not detected.
3505: */
3506: if (UTF16.isSurrogate((char) c)
3507: && (unicodeMask & UConverterConstants.HAS_SURROGATES) == 0) {
3508: if (UTF16.isLeadSurrogate((char) c)) {
3509: //getTrail:
3510: SideEffectsDouble x = new SideEffectsDouble(
3511: c, sourceArrayIndex,
3512: sourceIndex, nextSourceIndex);
3513: doloop = getTrailDouble(source, target,
3514: unicodeMask, x, flush, cr);
3515: c = x.c;
3516: sourceArrayIndex = x.sourceArrayIndex;
3517: sourceIndex = x.sourceIndex;
3518: nextSourceIndex = x.nextSourceIndex;
3519:
3520: if (doloop) {
3521: continue;
3522: } else {
3523: break;
3524: }
3525: } else {
3526: /* this is an unmatched trail code unit (2nd surrogate) */
3527: /* callback(illegal) */
3528: cr[0] = CoderResult
3529: .malformedForLength(1);
3530: break;
3531: }
3532: }
3533:
3534: /* convert the Unicode code point in c into codepage bytes */
3535: stage2Entry = MBCS_STAGE_2_FROM_U(table, c);
3536:
3537: /* get the bytes and the length for the output */
3538: /* MBCS_OUTPUT_2 */
3539: value = MBCS_VALUE_2_FROM_STAGE_2(bytes,
3540: stage2Entry, c);
3541: if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
3542: length = 1;
3543: } else {
3544: length = 2;
3545: }
3546:
3547: /* is this code point assigned, or do we use fallbacks? */
3548: if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0))) {
3549: /*
3550: * We allow a 0 byte output if the "assigned" bit is set for this entry.
3551: * There is no way with this data structure for fallback output
3552: * to be a zero byte.
3553: */
3554:
3555: //unassigned:
3556: SideEffectsDouble x = new SideEffectsDouble(
3557: c, sourceArrayIndex, sourceIndex,
3558: nextSourceIndex);
3559:
3560: doloop = unassignedDouble(source, target,
3561: x, flush, cr);
3562: c = x.c;
3563: sourceArrayIndex = x.sourceArrayIndex;
3564: sourceIndex = x.sourceIndex;
3565: nextSourceIndex = x.nextSourceIndex;
3566: if (doloop)
3567: continue;
3568: else
3569: break;
3570: }
3571:
3572: /* write the output character bytes from value and length */
3573: /* from the first if in the loop we know that targetCapacity>0 */
3574: if (length == 1) {
3575: /* this is easy because we know that there is enough space */
3576: target.put((byte) value);
3577: if (offsets != null) {
3578: offsets.put(sourceIndex);
3579: }
3580: } else /* length==2 */{
3581: target.put((byte) (value >>> 8));
3582: if (2 <= target.remaining()) {
3583: target.put((byte) value);
3584: if (offsets != null) {
3585: offsets.put(sourceIndex);
3586: offsets.put(sourceIndex);
3587: }
3588: } else {
3589: if (offsets != null) {
3590: offsets.put(sourceIndex);
3591: }
3592: errorBuffer[0] = (byte) value;
3593: errorBufferLength = 1;
3594:
3595: /* target overflow */
3596: cr[0] = CoderResult.OVERFLOW;
3597: c = 0;
3598: break;
3599: }
3600: }
3601:
3602: /* normal end of conversion: prepare for a new character */
3603: c = 0;
3604: sourceIndex = nextSourceIndex;
3605: continue;
3606: } else {
3607: /* target is full */
3608: cr[0] = CoderResult.OVERFLOW;
3609: break;
3610: }
3611: }
3612: }
3613:
3614: /* set the converter state back into UConverter */
3615: fromUChar32 = c;
3616:
3617: /* write back the updated pointers */
3618: source.position(sourceArrayIndex);
3619:
3620: return cr[0];
3621: }
3622:
3623: protected final class SideEffectsSingleBMP {
3624: int c, sourceArrayIndex;
3625:
3626: SideEffectsSingleBMP(int c_, int sourceArrayIndex_) {
3627: c = c_;
3628: sourceArrayIndex = sourceArrayIndex_;
3629: }
3630: }
3631:
3632: // function made out of block labeled getTrail in ucnv_MBCSSingleFromUnicodeWithOffsets
3633: // assumes input c is lead surrogate
3634: protected final boolean getTrailSingleBMP(CharBuffer source,
3635: SideEffectsSingleBMP x, CoderResult[] cr) {
3636: if (x.sourceArrayIndex < source.limit()) {
3637: /* test the following code unit */
3638: char trail = source.get(x.sourceArrayIndex);
3639: if (UTF16.isTrailSurrogate(trail)) {
3640: ++x.sourceArrayIndex;
3641: x.c = UCharacter.getCodePoint((char) x.c, trail);
3642: /* this codepage does not map supplementary code points */
3643: /* callback(unassigned) */
3644: cr[0] = CoderResult.unmappableForLength(2);
3645: return false;
3646: } else {
3647: /* this is an unmatched lead code unit (1st surrogate) */
3648: /* callback(illegal) */
3649: cr[0] = CoderResult.malformedForLength(2);
3650: return false;
3651: }
3652: } else {
3653: /* no more input */
3654: return false;
3655: }
3656: //return true;
3657: }
3658:
3659: protected final class SideEffects {
3660: int c, sourceArrayIndex, sourceIndex, nextSourceIndex,
3661: prevSourceIndex, prevLength;
3662:
3663: SideEffects(int c_, int sourceArrayIndex_,
3664: int sourceIndex_, int nextSourceIndex_,
3665: int prevSourceIndex_, int prevLength_) {
3666: c = c_;
3667: sourceArrayIndex = sourceArrayIndex_;
3668: sourceIndex = sourceIndex_;
3669: nextSourceIndex = nextSourceIndex_;
3670: prevSourceIndex = prevSourceIndex_;
3671: prevLength = prevLength_;
3672: }
3673: }
3674:
3675: // function made out of block labeled getTrail in ucnv_MBCSFromUnicodeWithOffsets
3676: // assumes input c is lead surrogate
3677: protected final boolean getTrail(CharBuffer source,
3678: ByteBuffer target, int unicodeMask, SideEffects x,
3679: boolean flush, CoderResult[] cr) {
3680: if (x.sourceArrayIndex < source.limit()) {
3681: /* test the following code unit */
3682: char trail = source.get(x.sourceArrayIndex);
3683: if (UTF16.isTrailSurrogate(trail)) {
3684: ++x.sourceArrayIndex;
3685: ++x.nextSourceIndex;
3686: x.c = UCharacter.getCodePoint((char) x.c, trail);
3687: if ((unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
3688: /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3689: fromUnicodeStatus = x.prevLength; /* save the old state */
3690: /* callback(unassigned) */
3691: return unassigned(source, target, null, x,
3692: flush, cr);
3693: }
3694: /* convert this supplementary code point */
3695: /* exit this condition tree */
3696: } else {
3697: /* this is an unmatched lead code unit (1st surrogate) */
3698: /* callback(illegal) */
3699: cr[0] = CoderResult.malformedForLength(2);
3700: return false;
3701: }
3702: } else {
3703: /* no more input */
3704: return false;
3705: }
3706: return true;
3707: }
3708:
3709: // function made out of block labeled unassigned in ucnv_MBCSFromUnicodeWithOffsets
3710: protected final boolean unassigned(CharBuffer source,
3711: ByteBuffer target, IntBuffer offsets, SideEffects x,
3712: boolean flush, CoderResult[] cr) {
3713: /* try an extension mapping */
3714: int sourceBegin = x.sourceArrayIndex;
3715: source.position(x.sourceArrayIndex);
3716: x.c = fromU(x.c, source, target, null, x.sourceIndex,
3717: flush, cr);
3718: x.sourceArrayIndex = source.position();
3719: x.nextSourceIndex += x.sourceArrayIndex - sourceBegin;
3720: x.prevLength = (int) fromUnicodeStatus;
3721:
3722: if (cr[0].isError()) {
3723: /* not mappable or buffer overflow */
3724: return false;
3725: } else {
3726: /* a mapping was written to the target, continue */
3727:
3728: /* recalculate the targetCapacity after an extension mapping */
3729: //x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex;
3730: /* normal end of conversion: prepare for a new character */
3731: if (offsets != null) {
3732: x.prevSourceIndex = x.sourceIndex;
3733: x.sourceIndex = x.nextSourceIndex;
3734: }
3735: return true;
3736: }
3737: }
3738:
3739: protected final class SideEffectsDouble {
3740: int c, sourceArrayIndex, sourceIndex, nextSourceIndex;
3741:
3742: SideEffectsDouble(int c_, int sourceArrayIndex_,
3743: int sourceIndex_, int nextSourceIndex_) {
3744: c = c_;
3745: sourceArrayIndex = sourceArrayIndex_;
3746: sourceIndex = sourceIndex_;
3747: nextSourceIndex = nextSourceIndex_;
3748: }
3749: }
3750:
3751: // function made out of block labeled getTrail in ucnv_MBCSDoubleFromUnicodeWithOffsets
3752: // assumes input c is lead surrogate
3753: protected final boolean getTrailDouble(CharBuffer source,
3754: ByteBuffer target, int unicodeMask,
3755: SideEffectsDouble x, boolean flush, CoderResult[] cr) {
3756: if (x.sourceArrayIndex < source.limit()) {
3757: /* test the following code unit */
3758: char trail = source.get(x.sourceArrayIndex);
3759: if (UTF16.isTrailSurrogate(trail)) {
3760: ++x.sourceArrayIndex;
3761: ++x.nextSourceIndex;
3762: x.c = UCharacter.getCodePoint((char) x.c, trail);
3763: if ((unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
3764: /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3765: /* callback(unassigned) */
3766: return unassignedDouble(source, target, x,
3767: flush, cr);
3768: }
3769: /* convert this supplementary code point */
3770: /* exit this condition tree */
3771: } else {
3772: /* this is an unmatched lead code unit (1st surrogate) */
3773: /* callback(illegal) */
3774: cr[0] = CoderResult.malformedForLength(2);
3775: return false;
3776: }
3777: } else {
3778: /* no more input */
3779: return false;
3780: }
3781: return true;
3782: }
3783:
3784: // function made out of block labeled unassigned in ucnv_MBCSDoubleFromUnicodeWithOffsets
3785: protected final boolean unassignedDouble(CharBuffer source,
3786: ByteBuffer target, SideEffectsDouble x, boolean flush,
3787: CoderResult[] cr) {
3788: /* try an extension mapping */
3789: int sourceBegin = x.sourceArrayIndex;
3790: source.position(x.sourceArrayIndex);
3791: x.c = fromU(x.c, source, target, null, x.sourceIndex,
3792: flush, cr);
3793: x.sourceArrayIndex = source.position();
3794: x.nextSourceIndex += x.sourceArrayIndex - sourceBegin;
3795:
3796: if (cr[0].isError()) {
3797: /* not mappable or buffer overflow */
3798: return false;
3799: } else {
3800: /* a mapping was written to the target, continue */
3801:
3802: /* recalculate the targetCapacity after an extension mapping */
3803: //x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex;
3804: /* normal end of conversion: prepare for a new character */
3805: x.sourceIndex = x.nextSourceIndex;
3806: return true;
3807: }
3808: }
3809:
3810: /**
3811: * Overrides super class method
3812: * @param encoder
3813: * @param source
3814: * @param target
3815: * @param offsets
3816: * @return
3817: */
3818: protected CoderResult cbFromUWriteSub(
3819: CharsetEncoderICU encoder, CharBuffer source,
3820: ByteBuffer target, IntBuffer offsets) {
3821: CharsetMBCS cs = (CharsetMBCS) encoder.charset();
3822: byte[] subchar, p;
3823: byte[] buffer = new byte[4];
3824: int length, i = 0;
3825: /* first, select between subChar and subChar1 */
3826: if (cs.subChar1 != 0
3827: && (cs.sharedData.mbcs.extIndexes != null ? encoder.useSubChar1
3828: : (encoder.invalidUCharBuffer[0] <= 0xff))) {
3829: /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
3830: subchar = new byte[1];
3831: subchar[0] = cs.subChar1;
3832: length = 1;
3833: } else {
3834: /* select subChar in all other cases */
3835: subchar = cs.subChar;
3836: length = cs.subCharLen;
3837: }
3838:
3839: /* reset the selector for the next code point */
3840: encoder.useSubChar1 = false;
3841:
3842: switch (cs.sharedData.mbcs.outputType) {
3843: case MBCS_OUTPUT_2_SISO:
3844: p = buffer;
3845:
3846: /* fromUnicodeStatus contains prevLength */
3847: switch (length) {
3848: case 1:
3849: if (encoder.fromUnicodeStatus == 2) {
3850: /* DBCS mode and SBCS sub char: change to SBCS */
3851: encoder.fromUnicodeStatus = 1;
3852: p[i++] = UConverterConstants.SI;
3853: }
3854: p[i++] = subchar[0];
3855: break;
3856: case 2:
3857: if (encoder.fromUnicodeStatus <= 1) {
3858: /* SBCS mode and DBCS sub char: change to DBCS */
3859: encoder.fromUnicodeStatus = 2;
3860: p[i++] = UConverterConstants.SO;
3861: }
3862: p[i++] = subchar[0];
3863: p[i++] = subchar[1];
3864: break;
3865: default:
3866: throw new IllegalArgumentException();
3867: }
3868: return super .cbFromUWriteSub(encoder, source, target,
3869: offsets);
3870: default:
3871: return super .cbFromUWriteSub(encoder, source, target,
3872: offsets);
3873: }
3874: }
3875: }
3876:
3877: public CharsetDecoder newDecoder() {
3878: return new CharsetDecoderMBCS(this );
3879: }
3880:
3881: public CharsetEncoder newEncoder() {
3882: return new CharsetEncoderMBCS(this);
3883: }
3884:
3885: }
|