001: /**
002: *******************************************************************************
003: * Copyright (C) 2005-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.text;
007:
008: import java.io.ByteArrayInputStream;
009: import java.io.IOException;
010: import java.io.InputStream;
011: import java.io.InputStreamReader;
012: import java.io.Reader;
013:
014: /**
015: * This class represents a charset that has been identified by a CharsetDetector
016: * as a possible encoding for a set of input data. From an instance of this
017: * class, you can ask for a confidence level in the charset identification,
018: * or for Java Reader or String to access the original byte data in Unicode form.
019: * <p/>
020: * Instances of this class are created only by CharsetDetectors.
021: * <p/>
022: * Note: this class has a natural ordering that is inconsistent with equals.
023: * The natural ordering is based on the match confidence value.
024: *
025: * @draft ICU 3.4
026: * @provisional This API might change or be removed in a future release.
027: */
028: public class CharsetMatch implements Comparable {
029:
030: /**
031: * Create a java.io.Reader for reading the Unicode character data corresponding
032: * to the original byte data supplied to the Charset detect operation.
033: * <p/>
034: * CAUTION: if the source of the byte data was an InputStream, a Reader
035: * can be created for only one matching char set using this method. If more
036: * than one charset needs to be tried, the caller will need to reset
037: * the InputStream and create InputStreamReaders itself, based on the charset name.
038: *
039: * @return the Reader for the Unicode character data.
040: *
041: * @draft ICU 3.4
042: * @provisional This API might change or be removed in a future release.
043: */
044: public Reader getReader() {
045: InputStream inputStream = fInputStream;
046:
047: if (inputStream == null) {
048: inputStream = new ByteArrayInputStream(fRawInput, 0,
049: fRawLength);
050: }
051:
052: try {
053: inputStream.reset();
054: return new InputStreamReader(inputStream, getName());
055: } catch (IOException e) {
056: return null;
057: }
058: }
059:
060: /**
061: * Create a Java String from Unicode character data corresponding
062: * to the original byte data supplied to the Charset detect operation.
063: *
064: * @return a String created from the converted input data.
065: *
066: * @draft ICU 3.4
067: * @provisional This API might change or be removed in a future release.
068: */
069: public String getString() throws java.io.IOException {
070: return getString(-1);
071:
072: }
073:
074: /**
075: * Create a Java String from Unicode character data corresponding
076: * to the original byte data supplied to the Charset detect operation.
077: * The length of the returned string is limited to the specified size;
078: * the string will be trunctated to this length if necessary. A limit value of
079: * zero or less is ignored, and treated as no limit.
080: *
081: * @param maxLength The maximium length of the String to be created when the
082: * source of the data is an input stream, or -1 for
083: * unlimited length.
084: * @return a String created from the converted input data.
085: *
086: * @draft ICU 3.4
087: * @provisional This API might change or be removed in a future release.
088: */
089: public String getString(int maxLength) throws java.io.IOException {
090: String result = null;
091: if (fInputStream != null) {
092: StringBuffer sb = new StringBuffer();
093: char[] buffer = new char[1024];
094: Reader reader = getReader();
095: int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
096: int bytesRead = 0;
097:
098: while ((bytesRead = reader.read(buffer, 0, Math.min(max,
099: 1024))) >= 0) {
100: sb.append(buffer, 0, bytesRead);
101: max -= bytesRead;
102: }
103:
104: reader.close();
105:
106: return sb.toString();
107: } else {
108: result = new String(fRawInput, getName());
109: }
110: return result;
111:
112: }
113:
114: /**
115: * Get an indication of the confidence in the charset detected.
116: * Confidence values range from 0-100, with larger numbers indicating
117: * a better match of the input data to the characteristics of the
118: * charset.
119: *
120: * @return the confidence in the charset match
121: *
122: * @draft ICU 3.4
123: * @provisional This API might change or be removed in a future release.
124: */
125: public int getConfidence() {
126: return fConfidence;
127: }
128:
129: /**
130: * Bit flag indicating the match is based on the the encoding scheme.
131: *
132: * @see #getMatchType
133: * @draft ICU 3.4
134: * @provisional This API might change or be removed in a future release.
135: */
136: static public final int ENCODING_SCHEME = 1;
137:
138: /**
139: * Bit flag indicating the match is based on the presence of a BOM.
140: *
141: * @see #getMatchType
142: * @draft ICU 3.4
143: * @provisional This API might change or be removed in a future release.
144: */
145: static public final int BOM = 2;
146:
147: /**
148: * Bit flag indicating he match is based on the declared encoding.
149: *
150: * @see #getMatchType
151: * @draft ICU 3.4
152: * @provisional This API might change or be removed in a future release.
153: */
154: static public final int DECLARED_ENCODING = 4;
155:
156: /**
157: * Bit flag indicating the match is based on language statistics.
158: *
159: * @see #getMatchType
160: * @draft ICU 3.4
161: * @provisional This API might change or be removed in a future release.
162: */
163: static public final int LANG_STATISTICS = 8;
164:
165: /**
166: * Return flags indicating what it was about the input data
167: * that caused this charset to be considered as a possible match.
168: * The result is a bitfield containing zero or more of the flags
169: * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
170: * A result of zero means no information is available.
171: * <p>
172: * Note: currently, this method always returns zero.
173: * <p>
174: *
175: * @return the type of match found for this charset.
176: *
177: * @draft ICU 3.4
178: * @provisional This API might change or be removed in a future release.
179: */
180: public int getMatchType() {
181: // TODO: create a list of enum-like constants for common combinations of types of matches.
182: return 0;
183: }
184:
185: /**
186: * Get the name of the detected charset.
187: * The name will be one that can be used with other APIs on the
188: * platform that accept charset names. It is the "Canonical name"
189: * as defined by the class java.nio.charset.Charset; for
190: * charsets that are registered with the IANA charset registry,
191: * this is the MIME-preferred registerd name.
192: *
193: * @see java.nio.charset.Charset
194: * @see java.io.InputStreamReader
195: *
196: * @return The name of the charset.
197: *
198: * @draft ICU 3.4
199: * @provisional This API might change or be removed in a future release.
200: */
201: public String getName() {
202: return fRecognizer.getName();
203: }
204:
205: /**
206: * Get the ISO code for the language of the detected charset.
207: *
208: * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
209: *
210: * @draft ICU 3.4
211: * @provisional This API might change or be removed in a future release.
212: */
213: public String getLanguage() {
214: return fRecognizer.getLanguage();
215: }
216:
217: /**
218: * Compare to other CharsetMatch objects.
219: * Comparison is based on the match confidence value, which
220: * allows CharsetDetector.detectAll() to order its results.
221: *
222: * @param o the CharsetMatch object to compare against.
223: * @return a negative integer, zero, or a positive integer as the
224: * confidence level of this CharsetMatch
225: * is less than, equal to, or greater than that of
226: * the argument.
227: * @throws ClassCastException if the argument is not a CharsetMatch.
228: * @draft ICU 3.4
229: * @provisional This API might change or be removed in a future release.
230: */
231: public int compareTo(Object o) {
232: CharsetMatch other = (CharsetMatch) o;
233: int compareResult = 0;
234: if (this .fConfidence > other.fConfidence) {
235: compareResult = 1;
236: } else if (this .fConfidence < other.fConfidence) {
237: compareResult = -1;
238: }
239: return compareResult;
240: }
241:
242: /**
243: * Constructor. Implementation internal
244: *
245: * @internal
246: */
247: CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
248: fRecognizer = rec;
249: fConfidence = conf;
250:
251: // The references to the original aplication input data must be copied out
252: // of the charset recognizer to here, in case the application resets the
253: // recognizer before using this CharsetMatch.
254: if (det.fInputStream == null) {
255: // We only want the existing input byte data if it came straight from the user,
256: // not if is just the head of a stream.
257: fRawInput = det.fRawInput;
258: fRawLength = det.fRawLength;
259: }
260: fInputStream = det.fInputStream;
261: }
262:
263: //
264: // Private Data
265: //
266: private int fConfidence;
267: private CharsetRecognizer fRecognizer;
268: private byte[] fRawInput = null; // Original, untouched input bytes.
269: // If user gave us a byte array, this is it.
270: private int fRawLength; // Length of data in fRawInput array.
271:
272: private InputStream fInputStream = null; // User's input stream, or null if the user
273: // gave us a byte array.
274: }
|