01: /**
02: *******************************************************************************
03: * Copyright (C) 2005, International Business Machines Corporation and *
04: * others. All Rights Reserved. *
05: *******************************************************************************
06: */package com.ibm.icu.text;
07:
08: /**
09: * Charset recognizer for UTF-8
10: *
11: * @internal
12: */
13: class CharsetRecog_UTF8 extends CharsetRecognizer {
14:
15: String getName() {
16: return "UTF-8";
17: }
18:
19: /* (non-Javadoc)
20: * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
21: */
22: int match(CharsetDetector det) {
23: boolean hasBOM = false;
24: int numValid = 0;
25: int numInvalid = 0;
26: byte input[] = det.fRawInput;
27: int i;
28: int trailBytes = 0;
29: int confidence;
30:
31: if (det.fRawLength >= 3 && input[0] == 0xef && input[1] == 0xbb
32: & input[2] == 0xbf) {
33: hasBOM = true;
34: }
35:
36: // Scan for multi-byte sequences
37: for (i = 0; i < det.fRawLength; i++) {
38: int b = input[i];
39: if ((b & 0x80) == 0) {
40: continue; // ASCII
41: }
42:
43: // Hi bit on char found. Figure out how long the sequence should be
44: if ((b & 0x0e0) == 0x0c0) {
45: trailBytes = 1;
46: } else if ((b & 0x0f0) == 0x0e0) {
47: trailBytes = 2;
48: } else if ((b & 0x0f8) == 0xf0) {
49: trailBytes = 3;
50: } else {
51: numInvalid++;
52: if (numInvalid > 5) {
53: break;
54: }
55: trailBytes = 0;
56: }
57:
58: // Verify that we've got the right number of trail bytes in the sequence
59: for (;;) {
60: i++;
61: if (i >= det.fRawLength) {
62: break;
63: }
64: b = input[i];
65: if ((b & 0xc0) != 0x080) {
66: numInvalid++;
67: break;
68: }
69: if (--trailBytes == 0) {
70: numValid++;
71: break;
72: }
73: }
74:
75: }
76:
77: // Cook up some sort of confidence score, based on presense of a BOM
78: // and the existence of valid and/or invalid multi-byte sequences.
79: confidence = 0;
80: if (hasBOM && numInvalid == 0) {
81: confidence = 100;
82: } else if (hasBOM && numValid > numInvalid * 10) {
83: confidence = 80;
84: } else if (numValid > 3 && numInvalid == 0) {
85: confidence = 100;
86: } else if (numValid > 0 && numInvalid == 0) {
87: confidence = 80;
88: } else if (numValid == 0 && numInvalid == 0) {
89: // Plain ASCII.
90: confidence = 10;
91: } else if (numValid > numInvalid * 10) {
92: // Probably corruput utf-8 data. Valid sequences aren't likely by chance.
93: confidence = 25;
94: }
95: return confidence;
96: }
97:
98: }
|