001: /*
002: *******************************************************************************
003: * Copyright (C) 2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: /**
010: * class CharsetRecog_2022 is part of the ICU charset detection implementation.
011: * This is a superclass for the individual detectors for
012: * each of the detectable members of the ISO 2022 family
013: * of encodings.
014: *
015: * The separate classes are nested within this class.
016: *
017: * @internal
018: */
019: abstract class CharsetRecog_2022 extends CharsetRecognizer {
020:
021: /**
022: * Matching function shared among the 2022 detectors JP, CN and KR
023: * Counts up the number of legal an unrecognized escape sequences in
024: * the sample of text, and computes a score based on the total number &
025: * the proportion that fit the encoding.
026: *
027: *
028: * @param text the byte buffer containing text to analyse
029: * @param textLen the size of the text in the byte.
030: * @param escapeSequences the byte escape sequences to test for.
031: * @return match quality, in the range of 0-100.
032: */
033: int match(byte[] text, int textLen, byte[][] escapeSequences) {
034: int i, j;
035: int escN;
036: int hits = 0;
037: int misses = 0;
038: int shifts = 0;
039: int quality;
040: scanInput: for (i = 0; i < textLen; i++) {
041: if (text[i] == 0x1b) {
042: checkEscapes: for (escN = 0; escN < escapeSequences.length; escN++) {
043: byte[] seq = escapeSequences[escN];
044:
045: for (j = 1; j < seq.length; j++) {
046: if (seq[j] != text[i + j]) {
047: continue checkEscapes;
048: }
049: }
050:
051: hits++;
052: i += seq.length - 1;
053: continue scanInput;
054: }
055:
056: misses++;
057: }
058:
059: if (text[i] == 0x0e || text[i] == 0x0f) {
060: // Shift in/out
061: shifts++;
062: }
063: }
064:
065: if (hits == 0) {
066: return 0;
067: }
068:
069: //
070: // Initial quality is based on relative proportion of recongized vs.
071: // unrecognized escape sequences.
072: // All good: quality = 100;
073: // half or less good: quality = 0;
074: // linear inbetween.
075: quality = (100 * hits - 100 * misses) / (hits + misses);
076:
077: // Back off quality if there were too few escape sequences seen.
078: // Include shifts in this computation, so that KR does not get penalized
079: // for having only a single Escape sequence, but many shifts.
080: if (hits + shifts < 5) {
081: quality -= (5 - (hits + shifts)) * 10;
082: }
083:
084: if (quality < 0) {
085: quality = 0;
086: }
087: return quality;
088: }
089:
090: static class CharsetRecog_2022JP extends CharsetRecog_2022 {
091: private byte[][] escapeSequences = {
092: { 0x1b, 0x24, 0x28, 0x43 }, // KS X 1001:1992
093: { 0x1b, 0x24, 0x28, 0x44 }, // JIS X 212-1990
094: { 0x1b, 0x24, 0x40 }, // JIS C 6226-1978
095: { 0x1b, 0x24, 0x41 }, // GB 2312-80
096: { 0x1b, 0x24, 0x42 }, // JIS X 208-1983
097: { 0x1b, 0x26, 0x40 }, // JIS X 208 1990, 1997
098: { 0x1b, 0x28, 0x42 }, // ASCII
099: { 0x1b, 0x28, 0x48 }, // JIS-Roman
100: { 0x1b, 0x28, 0x49 }, // Half-width katakana
101: { 0x1b, 0x28, 0x4a }, // JIS-Roman
102: { 0x1b, 0x2e, 0x41 }, // ISO 8859-1
103: { 0x1b, 0x2e, 0x46 } // ISO 8859-7
104: };
105:
106: String getName() {
107: return "ISO-2022-JP";
108: }
109:
110: int match(CharsetDetector det) {
111: return match(det.fInputBytes, det.fInputLen,
112: escapeSequences);
113: }
114: }
115:
116: static class CharsetRecog_2022KR extends CharsetRecog_2022 {
117: private byte[][] escapeSequences = { { 0x1b, 0x24, 0x29, 0x43 } };
118:
119: String getName() {
120: return "ISO-2022-KR";
121: }
122:
123: int match(CharsetDetector det) {
124: return match(det.fInputBytes, det.fInputLen,
125: escapeSequences);
126: }
127:
128: }
129:
130: static class CharsetRecog_2022CN extends CharsetRecog_2022 {
131: private byte[][] escapeSequences = {
132: { 0x1b, 0x24, 0x29, 0x41 }, // GB 2312-80
133: { 0x1b, 0x24, 0x29, 0x47 }, // CNS 11643-1992 Plane 1
134: { 0x1b, 0x24, 0x2A, 0x48 }, // CNS 11643-1992 Plane 2
135: { 0x1b, 0x24, 0x29, 0x45 }, // ISO-IR-165
136: { 0x1b, 0x24, 0x2B, 0x49 }, // CNS 11643-1992 Plane 3
137: { 0x1b, 0x24, 0x2B, 0x4A }, // CNS 11643-1992 Plane 4
138: { 0x1b, 0x24, 0x2B, 0x4B }, // CNS 11643-1992 Plane 5
139: { 0x1b, 0x24, 0x2B, 0x4C }, // CNS 11643-1992 Plane 6
140: { 0x1b, 0x24, 0x2B, 0x4D }, // CNS 11643-1992 Plane 7
141: { 0x1b, 0x4e }, // SS2
142: { 0x1b, 0x4f }, // SS3
143: };
144:
145: String getName() {
146: return "ISO-2022-CN";
147: }
148:
149: int match(CharsetDetector det) {
150: return match(det.fInputBytes, det.fInputLen,
151: escapeSequences);
152: }
153: }
154:
155: }
|