Source Code Cross Referenced for TestCharsetDetector.java in  » Internationalization-Localization » icu4j » com » ibm » icu » dev » test » charsetdet » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.dev.test.charsetdet 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        //##header
002:        /**
003:         *******************************************************************************
004:         * Copyright (C) 2005-2006, International Business Machines Corporation and    *
005:         * others. All Rights Reserved.                                                *
006:         *******************************************************************************
007:         */package com.ibm.icu.dev.test.charsetdet;
008:
009:        import java.io.ByteArrayInputStream;
010:        import java.io.InputStream;
011:        import java.io.Reader;
012:        import java.io.UnsupportedEncodingException;
013:
014:        import com.ibm.icu.dev.test.TestFmwk;
015:        import com.ibm.icu.impl.UTF32;
016:        import com.ibm.icu.text.CharsetDetector;
017:        import com.ibm.icu.text.CharsetMatch;
018:
019:        //#ifdef FOUNDATION
020:        //##import com.ibm.icu.impl.Utility;
021:        //#endif
022:
023:        import javax.xml.parsers.*;
024:        import org.w3c.dom.*;
025:
026:        /**
027:         * @author andy
028:         *
029:         * TODO To change the template for this generated type comment go to
030:         * Window - Preferences - Java - Code Style - Code Templates
031:         */
032:        public class TestCharsetDetector extends TestFmwk {
033:
034:            /**
035:             * Constructor
036:             */
037:            public TestCharsetDetector() {
038:            }
039:
040:            public static void main(String[] args) {
041:                try {
042:                    TestCharsetDetector test = new TestCharsetDetector();
043:                    test.run(args);
044:                } catch (Exception e) {
045:                    e.printStackTrace();
046:                }
047:            }
048:
049:            private void CheckAssert(boolean exp) {
050:                if (exp == false) {
051:                    String msg;
052:                    try {
053:                        throw new Exception();
054:                    } catch (Exception e) {
055:                        //#ifndef FOUNDATION
056:                        StackTraceElement failPoint = e.getStackTrace()[1];
057:                        msg = "Test failure in file " + failPoint.getFileName()
058:                                + " at line " + failPoint.getLineNumber();
059:                        //#else
060:                        //##           msg = "Test failure  " + e.getMessage() ;
061:                        //#endif
062:                    }
063:                    errln(msg);
064:                }
065:
066:            }
067:
068:            private String stringFromReader(Reader reader) {
069:                StringBuffer sb = new StringBuffer();
070:                char[] buffer = new char[1024];
071:                int bytesRead = 0;
072:
073:                try {
074:                    while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) {
075:                        sb.append(buffer, 0, bytesRead);
076:                    }
077:
078:                    return sb.toString();
079:                } catch (Exception e) {
080:                    errln("stringFromReader() failed: " + e.toString());
081:                    return null;
082:                }
083:            }
084:
085:            private void checkMatch(CharsetDetector det, String testString,
086:                    String encoding, String language, String id)
087:                    throws Exception {
088:                CharsetMatch m = det.detect();
089:                String decoded;
090:
091:                if (!m.getName().equals(encoding)) {
092:                    errln(id + ": encoding detection failure - expected "
093:                            + encoding + ", got " + m.getName());
094:                    return;
095:                }
096:
097:                if (!(language == null || m.getLanguage().equals(language))) {
098:                    errln(id + ", " + encoding
099:                            + ": language detection failure - expected "
100:                            + language + ", got " + m.getLanguage());
101:                }
102:
103:                if (encoding.startsWith("UTF-32")) {
104:                    return;
105:                }
106:
107:                decoded = m.getString();
108:
109:                if (!testString.equals(decoded)) {
110:                    errln(id
111:                            + ", "
112:                            + encoding
113:                            + ": getString() didn't return the original string!");
114:                }
115:
116:                decoded = stringFromReader(m.getReader());
117:
118:                if (!testString.equals(decoded)) {
119:                    errln(id + ", " + encoding
120:                            + ": getReader() didn't yield the original string!");
121:                }
122:            }
123:
124:            private void checkEncoding(String testString, String encoding,
125:                    String id) {
126:                String enc = null, lang = null;
127:                //#ifndef FOUNDATION
128:                String[] split = encoding.split("/");
129:                //#else
130:                //##        String[] split = Utility.split(encoding,'/');
131:                //#endif
132:
133:                enc = split[0];
134:
135:                if (split.length > 1) {
136:                    lang = split[1];
137:                }
138:
139:                try {
140:                    CharsetDetector det = new CharsetDetector();
141:                    byte[] bytes;
142:
143:                    if (enc.startsWith("UTF-32")) {
144:                        UTF32 utf32 = UTF32.getInstance(enc);
145:
146:                        bytes = utf32.toBytes(testString);
147:                    } else {
148:                        String from = enc;
149:
150:                        while (true) {
151:                            try {
152:                                bytes = testString.getBytes(from);
153:                            } catch (UnsupportedOperationException uoe) {
154:                                // In some runtimes, the ISO-2022-CN converter
155:                                // only converts *to* Unicode - we have to use
156:                                // x-ISO-2022-CN-GB to convert *from* Unicode.
157:                                if (from.equals("ISO-2022-CN")) {
158:                                    from = "x-ISO-2022-CN-GB";
159:                                    continue;
160:                                }
161:
162:                                // Ignore any other converters that can't
163:                                // convert from Unicode.
164:                                return;
165:                            } catch (UnsupportedEncodingException uee) {
166:                                // Ignore any encodings that this runtime
167:                                // doesn't support.
168:                                return;
169:                            }
170:
171:                            break;
172:                        }
173:                    }
174:
175:                    det.setText(bytes);
176:                    checkMatch(det, testString, enc, lang, id);
177:
178:                    det.setText(new ByteArrayInputStream(bytes));
179:                    checkMatch(det, testString, enc, lang, id);
180:                } catch (Exception e) {
181:                    errln(id + ": " + e.toString());
182:                }
183:            }
184:
185:            public void TestConstruction() {
186:                int i;
187:                CharsetDetector det = new CharsetDetector();
188:                if (det == null) {
189:                    errln("Could not construct a charset detector");
190:                }
191:                String[] charsetNames = CharsetDetector
192:                        .getAllDetectableCharsets();
193:                CheckAssert(charsetNames.length != 0);
194:                for (i = 0; i < charsetNames.length; i++) {
195:                    CheckAssert(charsetNames[i].equals("") == false);
196:                    // System.out.println("\"" + charsetNames[i] + "\"");
197:                }
198:            }
199:
200:            public void TestInputFilter() throws Exception {
201:                String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
202:                byte[] bytes = s.getBytes("ISO-8859-1");
203:                CharsetDetector det = new CharsetDetector();
204:                CharsetMatch m;
205:
206:                det.enableInputFilter(true);
207:                if (!det.inputFilterEnabled()) {
208:                    errln("input filter should be enabled");
209:                }
210:
211:                det.setText(bytes);
212:                m = det.detect();
213:
214:                if (!m.getLanguage().equals("fr")) {
215:                    errln("input filter did not strip markup!");
216:                }
217:
218:                det.enableInputFilter(false);
219:                det.setText(bytes);
220:                m = det.detect();
221:
222:                if (!m.getLanguage().equals("en")) {
223:                    errln("unfiltered input did not detect as English!");
224:                }
225:            }
226:
227:            public void TestUTF8() throws Exception {
228:
229:                String s = "This is a string with some non-ascii characters that will "
230:                        + "be converted to UTF-8, then shoved through the detection process.  "
231:                        + "\u0391\u0392\u0393\u0394\u0395"
232:                        + "Sure would be nice if our source could contain Unicode directly!";
233:                byte[] bytes = s.getBytes("UTF-8");
234:                CharsetDetector det = new CharsetDetector();
235:                String retrievedS;
236:                Reader reader;
237:
238:                retrievedS = det.getString(bytes, "UTF-8");
239:                CheckAssert(s.equals(retrievedS));
240:
241:                reader = det
242:                        .getReader(new ByteArrayInputStream(bytes), "UTF-8");
243:                CheckAssert(s.equals(stringFromReader(reader)));
244:                det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
245:            }
246:
247:            public void TestUTF16() throws Exception {
248:                String source = "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a "
249:                        + "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
250:
251:                byte[] beBytes = source.getBytes("UnicodeBig");
252:                byte[] leBytes = source.getBytes("UnicodeLittle");
253:                CharsetDetector det = new CharsetDetector();
254:                CharsetMatch m;
255:
256:                det.setText(beBytes);
257:                m = det.detect();
258:
259:                if (!m.getName().equals("UTF-16BE")) {
260:                    errln("Encoding detection failure: expected UTF-16BE, got "
261:                            + m.getName());
262:                }
263:
264:                det.setText(leBytes);
265:                m = det.detect();
266:
267:                if (!m.getName().equals("UTF-16LE")) {
268:                    errln("Encoding detection failure: expected UTF-16LE, got "
269:                            + m.getName());
270:                }
271:
272:                // Jitterbug 4451, for coverage
273:                int confidence = m.getConfidence();
274:                if (confidence != 100) {
275:                    errln("Did not get the expected confidence level "
276:                            + confidence);
277:                }
278:                int matchType = m.getMatchType();
279:                if (matchType != 0) {
280:                    errln("Did not get the expected matchType level "
281:                            + matchType);
282:                }
283:            }
284:
285:            public void TestC1Bytes() throws Exception {
286:                String sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
287:
288:                String sWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";
289:
290:                byte[] bISO = sISO.getBytes("ISO-8859-1");
291:                byte[] bWindows = sWindows.getBytes("windows-1252");
292:
293:                CharsetDetector det = new CharsetDetector();
294:                CharsetMatch m;
295:
296:                det.setText(bWindows);
297:                m = det.detect();
298:
299:                if (m.getName() != "windows-1252") {
300:                    errln("Text with C1 bytes not correctly detected as windows-1252.");
301:                    return;
302:                }
303:
304:                det.setText(bISO);
305:                m = det.detect();
306:
307:                if (m.getName() != "ISO-8859-1") {
308:                    errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
309:                }
310:            }
311:
312:            public void TestDetection() {
313:                //
314:                //  Open and read the test data file.
315:                //
316:                //InputStreamReader isr = null;
317:
318:                try {
319:                    InputStream is = TestCharsetDetector.class
320:                            .getResourceAsStream("CharsetDetectionTests.xml");
321:                    if (is == null) {
322:                        errln("Could not open test data file CharsetDetectionTests.xml");
323:                        return;
324:                    }
325:
326:                    //isr = new InputStreamReader(is, "UTF-8"); 
327:
328:                    // Set up an xml parser.
329:                    DocumentBuilderFactory factory = DocumentBuilderFactory
330:                            .newInstance();
331:
332:                    factory.setIgnoringComments(true);
333:
334:                    DocumentBuilder builder = factory.newDocumentBuilder();
335:
336:                    // Parse the xml content from the test case file.
337:                    Document doc = builder.parse(is, null);
338:                    Element root = doc.getDocumentElement();
339:
340:                    NodeList testCases = root.getElementsByTagName("test-case");
341:
342:                    // Process each test case
343:                    for (int n = 0; n < testCases.getLength(); n += 1) {
344:                        Node testCase = testCases.item(n);
345:                        NamedNodeMap attrs = testCase.getAttributes();
346:                        NodeList testData = testCase.getChildNodes();
347:                        StringBuffer testText = new StringBuffer();
348:                        String id = attrs.getNamedItem("id").getNodeValue();
349:                        String encodings = attrs.getNamedItem("encodings")
350:                                .getNodeValue();
351:
352:                        // Collect the test case text.
353:                        for (int t = 0; t < testData.getLength(); t += 1) {
354:                            Node textNode = testData.item(t);
355:
356:                            testText.append(textNode.getNodeValue());
357:                        }
358:
359:                        // Process test text with each encoding / language pair.
360:                        String testString = testText.toString();
361:                        //#ifndef FOUNDATION
362:                        String[] encodingList = encodings.split(" ");
363:                        //#else
364:                        //##                String[] encodingList = Utility.split(encodings, ' ');
365:                        //#endif
366:
367:                        for (int e = 0; e < encodingList.length; e += 1) {
368:                            checkEncoding(testString, encodingList[e], id);
369:                        }
370:                    }
371:
372:                } catch (Exception e) {
373:                    errln("exception while processing test cases: "
374:                            + e.toString());
375:                }
376:            }
377:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.