/**
 *******************************************************************************
 * Copyright (C) 2005-2006, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                 *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.io.InputStream;
import java.io.Reader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Arrays;

/**
 * <code>CharsetDetector</code> provides a facility for detecting the
 * charset or encoding of character data in an unknown format.
 * The input data can either be from an input stream or an array of bytes.
 * The result of the detection operation is a list of possibly matching
 * charsets, or, for simple use, you can just ask for a Java Reader that
 * will work over the input data.
 * <p/>
 * Character set detection is at best an imprecise operation.  The detection
 * process will attempt to identify the charset that best matches the characteristics
 * of the byte data, but the process is partly statistical in nature, and
 * the results cannot be guaranteed to always be correct.
 * <p/>
 * For best accuracy in charset detection, the input data should be primarily
 * in a single language, and a minimum of a few hundred bytes of plain text
 * in that language is needed.  The detection process will attempt to
 * ignore html or xml style markup that could otherwise obscure the content.
 * <p/>
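 * A typical usage sketch, assuming <code>rawBytes</code> holds the byte data
 * to be examined (the variable name is illustrative only, not part of the API):
 * <pre>
 * CharsetDetector detector = new CharsetDetector();
 * detector.setText(rawBytes);
 * CharsetMatch match = detector.detect();
 * if (match != null) {
 *     String content = match.getString(-1);   // decode the input with the detected charset
 * }
 * </pre>
 *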
 * @draft ICU 3.4
 * @provisional This API might change or be removed in a future release.
 */
public class CharsetDetector {

    //   Question: Should we have getters corresponding to the setters for input text
    //   and declared encoding?

    //   A thought: If we were to create our own type of Java Reader, we could defer
    //   figuring out an actual charset for data that starts out with too much English
    //   only ASCII until the user actually read through to something that didn't look
    //   like 7 bit English.  If nothing else ever appeared, we would never need to
    //   actually choose the "real" charset.  All assuming that the application just
    //   wants the data, and doesn't care about a char set name.

    /**
     * Constructor
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public CharsetDetector() {
    }

    /**
     * Set the declared encoding for charset detection.
     * The declared encoding of an input text is an encoding obtained
     * from an http header or xml declaration or similar source that
     * can be provided as additional information to the charset detector.
     * A match between a declared encoding and a possible detected encoding
     * will raise the quality of that detected encoding by a small delta,
     * and will also appear as a "reason" for the match.
     * <p/>
     * A declared encoding that is incompatible with the input data being
     * analyzed will not be added to the list of possible encodings.
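     * <p/>
     * For example, an encoding taken from an http <code>Content-Type</code>
     * header might be supplied like this (a sketch only; the encoding value
     * shown is illustrative):
     * <pre>
     * CharsetDetector detector = new CharsetDetector();
     * detector.setDeclaredEncoding("ISO-8859-1");   // e.g. from "charset=ISO-8859-1"
     * detector.setText(rawBytes);
     * CharsetMatch match = detector.detect();
     * </pre>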
     *
     * @param encoding The declared encoding
     *
     * @return This CharsetDetector
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public CharsetDetector setDeclaredEncoding(String encoding) {
        fDeclaredEncoding = encoding;
        return this;
    }

    /**
     * Set the input text (byte) data whose charset is to be detected.
     *
     * @param in the input text of unknown encoding
     *
     * @return This CharsetDetector
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public CharsetDetector setText(byte[] in) {
        fRawInput = in;
        fRawLength = in.length;

        MungeInput();

        return this;
    }

    private static final int kBufSize = 8000;

    /**
     * Set the input text (byte) data whose charset is to be detected.
     * <p/>
     * The input stream that supplies the character data must have markSupported()
     * == true; the charset detection process will read a small amount of data,
     * then return the stream to its original position via
     * the InputStream.reset() operation.  The exact amount that will
     * be read depends on the characteristics of the data itself.
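     * <p/>
     * Streams that do not support mark/reset, such as a plain
     * <code>FileInputStream</code>, can be wrapped in a
     * <code>java.io.BufferedInputStream</code> first, as sketched below
     * (the file name is illustrative only):
     * <pre>
     * InputStream in = new BufferedInputStream(new FileInputStream("data.txt"));
     * CharsetDetector detector = new CharsetDetector();
     * detector.setText(in);
     * CharsetMatch match = detector.detect();
     * </pre>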
     *
     * @param in the input text of unknown encoding
     *
     * @return This CharsetDetector
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public CharsetDetector setText(InputStream in) throws IOException {
        fInputStream = in;
        fInputStream.mark(kBufSize);
        fRawInput = new byte[kBufSize];   // Always make a new buffer because the
                                          //   previous one may have come from the caller,
                                          //   in which case we can't touch it.
        fRawLength = 0;
        int remainingLength = kBufSize;
        while (remainingLength > 0) {
            // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
            int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
            if (bytesRead <= 0) {
                break;
            }
            fRawLength += bytesRead;
            remainingLength -= bytesRead;
        }
        fInputStream.reset();

        MungeInput();                     // Strip html markup, collect byte stats.
        return this;
    }

    /**
     * Return the charset that best matches the supplied input data.
     *
     * Note though, that because the detection
     * only looks at the start of the input data,
     * there is a possibility that the returned charset will fail to handle
     * the full set of input data.
     * <p/>
     * <code>null</code> is returned if
     * <ul>
     *   <li>no charset appears to match the data.</li>
     *   <li>no input text has been provided</li>
     * </ul>
     *
     * @return a CharsetMatch object representing the best matching charset, or
     *         <code>null</code> if there are no matches.
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public CharsetMatch detect() {
        //   TODO:  A better implementation would be to copy the detect loop from
        //          detectAll(), and cut it short as soon as a match with a high confidence
        //          is found.  This is something to be done later, after things are otherwise
        //          working.
        CharsetMatch[] matches = detectAll();

        if (matches == null || matches.length == 0) {
            return null;
        }

        return matches[0];
    }

    /**
     * Return an array of all charsets that appear to be plausible
     * matches with the input data.  The array is ordered with the
     * best quality match first.
     * <p/>
     * The returned array will be empty if
     * <ul>
     *   <li>no charset appears to match the data.</li>
     *   <li>no input text has been provided</li>
     * </ul>
     *
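     * A sketch of examining all candidates in order (<code>rawBytes</code> is
     * an assumed placeholder for the input data):
     * <pre>
     * CharsetDetector detector = new CharsetDetector();
     * detector.setText(rawBytes);
     * CharsetMatch[] matches = detector.detectAll();
     * for (int i = 0; i &lt; matches.length; i++) {
     *     // matches[0] is the strongest candidate; later entries are weaker matches.
     *     System.out.println(matches[i].getName());
     * }
     * </pre>
     *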
     * @return An array of CharsetMatch objects representing possibly matching charsets.
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public CharsetMatch[] detectAll() {
        CharsetRecognizer csr;
        int i;
        int detectResults;
        int confidence;
        ArrayList matches = new ArrayList();

        //  Iterate over all possible charsets, remember all that
        //    give a match quality > 0.
        for (i = 0; i < fCSRecognizers.size(); i++) {
            csr = (CharsetRecognizer) fCSRecognizers.get(i);
            detectResults = csr.match(this);
            confidence = detectResults & 0x000000ff;
            if (confidence > 0) {
                CharsetMatch m = new CharsetMatch(this, csr, confidence);
                matches.add(m);
            }
        }
        Collections.sort(matches);      // CharsetMatch compares on confidence
        Collections.reverse(matches);   //  Put best match first.
        CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
        resultArray = (CharsetMatch[]) matches.toArray(resultArray);
        return resultArray;
    }

    /**
     * Autodetect the charset of an inputStream, and return a Java Reader
     * to access the converted input data.
     * <p/>
     * This is a convenience method that is equivalent to
     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
     * <p/>
     * For the input stream that supplies the character data, markSupported()
     * must be true; the charset detection will read a small amount of data,
     * then return the stream to its original position via
     * the InputStream.reset() operation.  The exact amount that will
     * be read depends on the characteristics of the data itself.
     * <p/>
     * <code>null</code> is returned if no charset appears to match the input data.
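     * <p/>
     * For example (a sketch; the file name is illustrative only):
     * <pre>
     * InputStream in = new BufferedInputStream(new FileInputStream("page.html"));
     * Reader reader = new CharsetDetector().getReader(in, null);
     * </pre>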
     *
     * @param in The source of the byte data in the unknown charset.
     *
     * @param declaredEncoding A declared encoding for the data, if available,
     *           or null or an empty string if none is available.
     *
     * @return a Reader for the converted data, or <code>null</code> if no
     *           charset matched or the stream could not be read.
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public Reader getReader(InputStream in, String declaredEncoding) {
        fDeclaredEncoding = declaredEncoding;

        try {
            setText(in);

            CharsetMatch match = detect();

            if (match == null) {
                return null;
            }

            return match.getReader();
        } catch (IOException e) {
            return null;
        }
    }

    /**
     * Autodetect the charset of the given byte data, and return a String
     * containing the converted input data.
     * <p/>
     * This is a convenience method that is equivalent to
     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
     * <p/>
     * <code>null</code> is returned if no charset appears to match the input data.
     *
     * @param in The source of the byte data in the unknown charset.
     *
     * @param declaredEncoding A declared encoding for the data, if available,
     *           or null or an empty string if none is available.
     *
     * @return a String containing the converted data, or <code>null</code> if no
     *           charset matched or the data could not be converted.
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public String getString(byte[] in, String declaredEncoding) {
        fDeclaredEncoding = declaredEncoding;

        try {
            setText(in);

            CharsetMatch match = detect();

            if (match == null) {
                return null;
            }

            return match.getString(-1);
        } catch (IOException e) {
            return null;
        }
    }

    /**
     * Get the names of all charsets that can be recognized by the charset detector.
     *
     * @return an array of the names of all charsets that can be recognized
     *           by the charset detector.
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public static String[] getAllDetectableCharsets() {
        return fCharsetNames;
    }

    /**
     * Test whether or not input filtering is enabled.
     *
     * @return <code>true</code> if input text will be filtered.
     *
     * @see #enableInputFilter
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public boolean inputFilterEnabled() {
        return fStripTags;
    }

    /**
     * Enable filtering of input text. If filtering is enabled,
     * text within angle brackets ("&lt;" and "&gt;") will be removed
     * before detection.
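     * <p/>
     * A sketch of detecting the charset of raw HTML with filtering enabled
     * (<code>htmlBytes</code> is an assumed placeholder for the page's bytes):
     * <pre>
     * CharsetDetector detector = new CharsetDetector();
     * detector.enableInputFilter(true);   // ignore text between angle brackets
     * detector.setText(htmlBytes);
     * CharsetMatch match = detector.detect();
     * </pre>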
     *
     * @param filter <code>true</code> to enable input text filtering.
     *
     * @return The previous setting.
     *
     * @draft ICU 3.4
     * @provisional This API might change or be removed in a future release.
     */
    public boolean enableInputFilter(boolean filter) {
        boolean previous = fStripTags;

        fStripTags = filter;

        return previous;
    }

    /**
     * MungeInput - after getting a set of raw input data to be analyzed, preprocess
     *              it by removing what appears to be html markup.
     *
     * @internal
     */
    private void MungeInput() {
        int srci = 0;
        int dsti = 0;
        byte b;
        boolean inMarkup = false;
        int openTags = 0;
        int badTags = 0;

        //
        //  html / xml markup stripping.
        //    quick and dirty, not 100% accurate, but hopefully good enough, statistically.
        //    discard everything within < brackets >
        //    Count how many total '<' and illegal (nested) '<' occur, so we can make some
        //    guess as to whether the input was actually marked up at all.
        if (fStripTags) {
            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
                b = fRawInput[srci];
                if (b == (byte) '<') {
                    if (inMarkup) {
                        badTags++;
                    }
                    inMarkup = true;
                    openTags++;
                }

                if (!inMarkup) {
                    fInputBytes[dsti++] = b;
                }

                if (b == (byte) '>') {
                    inMarkup = false;
                }
            }

            fInputLen = dsti;
        }

        //
        //  If it looks like this input wasn't marked up, or if it looks like it's
        //    essentially nothing but markup, abandon the markup stripping.
        //    Detection will have to work on the unstripped input.
        //
        if (openTags < 5 || openTags / 5 < badTags
                || (fInputLen < 100 && fRawLength > 600)) {
            int limit = fRawLength;

            if (limit > kBufSize) {
                limit = kBufSize;
            }

            for (srci = 0; srci < limit; srci++) {
                fInputBytes[srci] = fRawInput[srci];
            }
            fInputLen = srci;
        }

        //
        //  Tally up the byte occurrence statistics.
        //    These are available for use by the various detectors.
        //
        Arrays.fill(fByteStats, (short) 0);
        for (srci = 0; srci < fInputLen; srci++) {
            int val = fInputBytes[srci] & 0x00ff;
            fByteStats[val]++;
        }

        fC1Bytes = false;
        for (int i = 0x80; i <= 0x9F; i += 1) {
            if (fByteStats[i] != 0) {
                fC1Bytes = true;
                break;
            }
        }
    }

    /**
     * The following items are accessed by individual CharsetRecognizers during
     *    the recognition process
     *
     * @internal
     */
    byte[] fInputBytes =        // The text to be checked.  Markup will have been
            new byte[kBufSize]; //   removed if appropriate.

    int fInputLen;              // Length of the byte data in fInputBytes.

    short fByteStats[] =        // byte frequency statistics for the input text.
            new short[256];     //   Each entry is the number of occurrences of that
                                //   byte value in the (possibly filtered) input.

    boolean fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input.
            false;

    String fDeclaredEncoding;

    //
    //  Stuff private to CharsetDetector
    //
    byte[] fRawInput;           // Original, untouched input bytes.
                                //   If user gave us a byte array, this is it.
                                //   If user gave us a stream, it's read to a
                                //   buffer here.
    int fRawLength;             // Length of data in fRawInput array.

    InputStream fInputStream;   // User's input stream, or null if the user
                                //   gave us a byte array.

    boolean fStripTags =        // If true, setText() will strip tags from input text.
            false;

    /**
     * List of recognizers for all charsets known to the implementation.
     *
     * @internal
     */
    private static ArrayList fCSRecognizers = createRecognizers();
    private static String[] fCharsetNames;

    /**
     * Create the singleton instances of the CharsetRecognizer classes
     *
     * @internal
     */
    private static ArrayList createRecognizers() {
        ArrayList recognizers = new ArrayList();

        recognizers.add(new CharsetRecog_UTF8());

        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());

        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());

        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());

        //  Create an array of all charset names, as a side effect.
        //  Needed for the getAllDetectableCharsets() API.
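        //  Adjacent recognizers that report the same charset name (for example,
        //  the per-language ISO-8859-1 recognizers added above) collapse to a
        //  single entry, because the de-duplication below only compares each
        //  name with the previous name kept.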
        String[] charsetNames = new String[recognizers.size()];
        int out = 0;

        for (int i = 0; i < recognizers.size(); i++) {
            String name = ((CharsetRecognizer) recognizers.get(i)).getName();

            if (out == 0 || !name.equals(charsetNames[out - 1])) {
                charsetNames[out++] = name;
            }
        }

        fCharsetNames = new String[out];
        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);

        return recognizers;
    }
}