001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: *
007: */
008:
009: package com.ibm.icu.text;
010:
011: /**
012: * This class matches UTF-16 and UTF-32, both big- and little-endian. The
013: * BOM will be used if it is present.
014: *
015: * @internal
016: */
017: abstract class CharsetRecog_Unicode extends CharsetRecognizer {
018:
019: /* (non-Javadoc)
020: * @see com.ibm.icu.text.CharsetRecognizer#getName()
021: */
022: abstract String getName();
023:
024: /* (non-Javadoc)
025: * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
026: */
027: abstract int match(CharsetDetector det);
028:
029: static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
030: String getName() {
031: return "UTF-16BE";
032: }
033:
034: int match(CharsetDetector det) {
035: byte[] input = det.fRawInput;
036:
037: if ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF) {
038: return 100;
039: }
040:
041: // TODO: Do some statastics to check for unsigned UTF-16BE
042: return 0;
043: }
044: }
045:
046: static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode {
047: String getName() {
048: return "UTF-16LE";
049: }
050:
051: int match(CharsetDetector det) {
052: byte[] input = det.fRawInput;
053:
054: if ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE
055: && (input[2] != 0x00 || input[3] != 0x00)) {
056: return 100;
057: }
058:
059: // TODO: Do some statastics to check for unsigned UTF-16LE
060: return 0;
061: }
062: }
063:
064: static abstract class CharsetRecog_UTF_32 extends
065: CharsetRecog_Unicode {
066: abstract int getChar(byte[] input, int index);
067:
068: abstract String getName();
069:
070: int match(CharsetDetector det) {
071: byte[] input = det.fRawInput;
072: int limit = (det.fRawLength / 4) * 4;
073: int numValid = 0;
074: int numInvalid = 0;
075: boolean hasBOM = false;
076: int confidence = 0;
077:
078: if (getChar(input, 0) == 0x0000FEFF) {
079: hasBOM = true;
080: }
081:
082: for (int i = 0; i < limit; i += 4) {
083: int ch = getChar(input, i);
084:
085: if (ch < 0 || ch >= 0x10FFFF
086: || (ch >= 0xD800 && ch <= 0xDFFF)) {
087: numInvalid += 1;
088: } else {
089: numValid += 1;
090: }
091: }
092:
093: // Cook up some sort of confidence score, based on presense of a BOM
094: // and the existence of valid and/or invalid multi-byte sequences.
095: if (hasBOM && numInvalid == 0) {
096: confidence = 100;
097: } else if (hasBOM && numValid > numInvalid * 10) {
098: confidence = 80;
099: } else if (numValid > 3 && numInvalid == 0) {
100: confidence = 100;
101: } else if (numValid > 0 && numInvalid == 0) {
102: confidence = 80;
103: } else if (numValid > numInvalid * 10) {
104: // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance.
105: confidence = 25;
106: }
107:
108: return confidence;
109: }
110: }
111:
112: static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 {
113: int getChar(byte[] input, int index) {
114: return (input[index + 0] & 0xFF) << 24
115: | (input[index + 1] & 0xFF) << 16
116: | (input[index + 2] & 0xFF) << 8
117: | (input[index + 3] & 0xFF);
118: }
119:
120: String getName() {
121: return "UTF-32BE";
122: }
123: }
124:
125: static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 {
126: int getChar(byte[] input, int index) {
127: return (input[index + 3] & 0xFF) << 24
128: | (input[index + 2] & 0xFF) << 16
129: | (input[index + 1] & 0xFF) << 8
130: | (input[index + 0] & 0xFF);
131: }
132:
133: String getName() {
134: return "UTF-32LE";
135: }
136: }
137: }
|