001: //##header
002: /**
003: *******************************************************************************
004: * Copyright (C) 2005-2006, International Business Machines Corporation and *
005: * others. All Rights Reserved. *
006: *******************************************************************************
007: */package com.ibm.icu.dev.test.charsetdet;
008:
009: import java.io.ByteArrayInputStream;
010: import java.io.InputStream;
011: import java.io.Reader;
012: import java.io.UnsupportedEncodingException;
013:
014: import com.ibm.icu.dev.test.TestFmwk;
015: import com.ibm.icu.impl.UTF32;
016: import com.ibm.icu.text.CharsetDetector;
017: import com.ibm.icu.text.CharsetMatch;
018:
019: //#ifdef FOUNDATION
020: //##import com.ibm.icu.impl.Utility;
021: //#endif
022:
023: import javax.xml.parsers.*;
024: import org.w3c.dom.*;
025:
026: /**
027: * @author andy
028: *
029: * TODO To change the template for this generated type comment go to
030: * Window - Preferences - Java - Code Style - Code Templates
031: */
032: public class TestCharsetDetector extends TestFmwk {
033:
034: /**
035: * Constructor
036: */
037: public TestCharsetDetector() {
038: }
039:
040: public static void main(String[] args) {
041: try {
042: TestCharsetDetector test = new TestCharsetDetector();
043: test.run(args);
044: } catch (Exception e) {
045: e.printStackTrace();
046: }
047: }
048:
049: private void CheckAssert(boolean exp) {
050: if (exp == false) {
051: String msg;
052: try {
053: throw new Exception();
054: } catch (Exception e) {
055: //#ifndef FOUNDATION
056: StackTraceElement failPoint = e.getStackTrace()[1];
057: msg = "Test failure in file " + failPoint.getFileName()
058: + " at line " + failPoint.getLineNumber();
059: //#else
060: //## msg = "Test failure " + e.getMessage() ;
061: //#endif
062: }
063: errln(msg);
064: }
065:
066: }
067:
068: private String stringFromReader(Reader reader) {
069: StringBuffer sb = new StringBuffer();
070: char[] buffer = new char[1024];
071: int bytesRead = 0;
072:
073: try {
074: while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) {
075: sb.append(buffer, 0, bytesRead);
076: }
077:
078: return sb.toString();
079: } catch (Exception e) {
080: errln("stringFromReader() failed: " + e.toString());
081: return null;
082: }
083: }
084:
085: private void checkMatch(CharsetDetector det, String testString,
086: String encoding, String language, String id)
087: throws Exception {
088: CharsetMatch m = det.detect();
089: String decoded;
090:
091: if (!m.getName().equals(encoding)) {
092: errln(id + ": encoding detection failure - expected "
093: + encoding + ", got " + m.getName());
094: return;
095: }
096:
097: if (!(language == null || m.getLanguage().equals(language))) {
098: errln(id + ", " + encoding
099: + ": language detection failure - expected "
100: + language + ", got " + m.getLanguage());
101: }
102:
103: if (encoding.startsWith("UTF-32")) {
104: return;
105: }
106:
107: decoded = m.getString();
108:
109: if (!testString.equals(decoded)) {
110: errln(id
111: + ", "
112: + encoding
113: + ": getString() didn't return the original string!");
114: }
115:
116: decoded = stringFromReader(m.getReader());
117:
118: if (!testString.equals(decoded)) {
119: errln(id + ", " + encoding
120: + ": getReader() didn't yield the original string!");
121: }
122: }
123:
124: private void checkEncoding(String testString, String encoding,
125: String id) {
126: String enc = null, lang = null;
127: //#ifndef FOUNDATION
128: String[] split = encoding.split("/");
129: //#else
130: //## String[] split = Utility.split(encoding,'/');
131: //#endif
132:
133: enc = split[0];
134:
135: if (split.length > 1) {
136: lang = split[1];
137: }
138:
139: try {
140: CharsetDetector det = new CharsetDetector();
141: byte[] bytes;
142:
143: if (enc.startsWith("UTF-32")) {
144: UTF32 utf32 = UTF32.getInstance(enc);
145:
146: bytes = utf32.toBytes(testString);
147: } else {
148: String from = enc;
149:
150: while (true) {
151: try {
152: bytes = testString.getBytes(from);
153: } catch (UnsupportedOperationException uoe) {
154: // In some runtimes, the ISO-2022-CN converter
155: // only converts *to* Unicode - we have to use
156: // x-ISO-2022-CN-GB to convert *from* Unicode.
157: if (from.equals("ISO-2022-CN")) {
158: from = "x-ISO-2022-CN-GB";
159: continue;
160: }
161:
162: // Ignore any other converters that can't
163: // convert from Unicode.
164: return;
165: } catch (UnsupportedEncodingException uee) {
166: // Ignore any encodings that this runtime
167: // doesn't support.
168: return;
169: }
170:
171: break;
172: }
173: }
174:
175: det.setText(bytes);
176: checkMatch(det, testString, enc, lang, id);
177:
178: det.setText(new ByteArrayInputStream(bytes));
179: checkMatch(det, testString, enc, lang, id);
180: } catch (Exception e) {
181: errln(id + ": " + e.toString());
182: }
183: }
184:
185: public void TestConstruction() {
186: int i;
187: CharsetDetector det = new CharsetDetector();
188: if (det == null) {
189: errln("Could not construct a charset detector");
190: }
191: String[] charsetNames = CharsetDetector
192: .getAllDetectableCharsets();
193: CheckAssert(charsetNames.length != 0);
194: for (i = 0; i < charsetNames.length; i++) {
195: CheckAssert(charsetNames[i].equals("") == false);
196: // System.out.println("\"" + charsetNames[i] + "\"");
197: }
198: }
199:
200: public void TestInputFilter() throws Exception {
201: String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
202: byte[] bytes = s.getBytes("ISO-8859-1");
203: CharsetDetector det = new CharsetDetector();
204: CharsetMatch m;
205:
206: det.enableInputFilter(true);
207: if (!det.inputFilterEnabled()) {
208: errln("input filter should be enabled");
209: }
210:
211: det.setText(bytes);
212: m = det.detect();
213:
214: if (!m.getLanguage().equals("fr")) {
215: errln("input filter did not strip markup!");
216: }
217:
218: det.enableInputFilter(false);
219: det.setText(bytes);
220: m = det.detect();
221:
222: if (!m.getLanguage().equals("en")) {
223: errln("unfiltered input did not detect as English!");
224: }
225: }
226:
227: public void TestUTF8() throws Exception {
228:
229: String s = "This is a string with some non-ascii characters that will "
230: + "be converted to UTF-8, then shoved through the detection process. "
231: + "\u0391\u0392\u0393\u0394\u0395"
232: + "Sure would be nice if our source could contain Unicode directly!";
233: byte[] bytes = s.getBytes("UTF-8");
234: CharsetDetector det = new CharsetDetector();
235: String retrievedS;
236: Reader reader;
237:
238: retrievedS = det.getString(bytes, "UTF-8");
239: CheckAssert(s.equals(retrievedS));
240:
241: reader = det
242: .getReader(new ByteArrayInputStream(bytes), "UTF-8");
243: CheckAssert(s.equals(stringFromReader(reader)));
244: det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
245: }
246:
247: public void TestUTF16() throws Exception {
248: String source = "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a "
249: + "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
250:
251: byte[] beBytes = source.getBytes("UnicodeBig");
252: byte[] leBytes = source.getBytes("UnicodeLittle");
253: CharsetDetector det = new CharsetDetector();
254: CharsetMatch m;
255:
256: det.setText(beBytes);
257: m = det.detect();
258:
259: if (!m.getName().equals("UTF-16BE")) {
260: errln("Encoding detection failure: expected UTF-16BE, got "
261: + m.getName());
262: }
263:
264: det.setText(leBytes);
265: m = det.detect();
266:
267: if (!m.getName().equals("UTF-16LE")) {
268: errln("Encoding detection failure: expected UTF-16LE, got "
269: + m.getName());
270: }
271:
272: // Jitterbug 4451, for coverage
273: int confidence = m.getConfidence();
274: if (confidence != 100) {
275: errln("Did not get the expected confidence level "
276: + confidence);
277: }
278: int matchType = m.getMatchType();
279: if (matchType != 0) {
280: errln("Did not get the expected matchType level "
281: + matchType);
282: }
283: }
284:
285: public void TestC1Bytes() throws Exception {
286: String sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
287:
288: String sWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";
289:
290: byte[] bISO = sISO.getBytes("ISO-8859-1");
291: byte[] bWindows = sWindows.getBytes("windows-1252");
292:
293: CharsetDetector det = new CharsetDetector();
294: CharsetMatch m;
295:
296: det.setText(bWindows);
297: m = det.detect();
298:
299: if (m.getName() != "windows-1252") {
300: errln("Text with C1 bytes not correctly detected as windows-1252.");
301: return;
302: }
303:
304: det.setText(bISO);
305: m = det.detect();
306:
307: if (m.getName() != "ISO-8859-1") {
308: errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
309: }
310: }
311:
312: public void TestDetection() {
313: //
314: // Open and read the test data file.
315: //
316: //InputStreamReader isr = null;
317:
318: try {
319: InputStream is = TestCharsetDetector.class
320: .getResourceAsStream("CharsetDetectionTests.xml");
321: if (is == null) {
322: errln("Could not open test data file CharsetDetectionTests.xml");
323: return;
324: }
325:
326: //isr = new InputStreamReader(is, "UTF-8");
327:
328: // Set up an xml parser.
329: DocumentBuilderFactory factory = DocumentBuilderFactory
330: .newInstance();
331:
332: factory.setIgnoringComments(true);
333:
334: DocumentBuilder builder = factory.newDocumentBuilder();
335:
336: // Parse the xml content from the test case file.
337: Document doc = builder.parse(is, null);
338: Element root = doc.getDocumentElement();
339:
340: NodeList testCases = root.getElementsByTagName("test-case");
341:
342: // Process each test case
343: for (int n = 0; n < testCases.getLength(); n += 1) {
344: Node testCase = testCases.item(n);
345: NamedNodeMap attrs = testCase.getAttributes();
346: NodeList testData = testCase.getChildNodes();
347: StringBuffer testText = new StringBuffer();
348: String id = attrs.getNamedItem("id").getNodeValue();
349: String encodings = attrs.getNamedItem("encodings")
350: .getNodeValue();
351:
352: // Collect the test case text.
353: for (int t = 0; t < testData.getLength(); t += 1) {
354: Node textNode = testData.item(t);
355:
356: testText.append(textNode.getNodeValue());
357: }
358:
359: // Process test text with each encoding / language pair.
360: String testString = testText.toString();
361: //#ifndef FOUNDATION
362: String[] encodingList = encodings.split(" ");
363: //#else
364: //## String[] encodingList = Utility.split(encodings, ' ');
365: //#endif
366:
367: for (int e = 0; e < encodingList.length; e += 1) {
368: checkEncoding(testString, encodingList[e], id);
369: }
370: }
371:
372: } catch (Exception e) {
373: errln("exception while processing test cases: "
374: + e.toString());
375: }
376: }
377: }
|