0001: /**
0002: *******************************************************************************
0003: * Copyright (C) 2000-2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */package com.ibm.icu.dev.test.translit;
0007:
0008: import com.ibm.icu.dev.test.*;
0009: import com.ibm.icu.lang.*;
0010: import com.ibm.icu.text.*;
0011: import com.ibm.icu.util.LocaleData;
0012: import com.ibm.icu.util.ULocale;
0013: import com.ibm.icu.impl.Utility;
0014:
0015: import java.io.BufferedWriter;
0016: import java.io.ByteArrayOutputStream;
0017: import java.io.File;
0018: import java.io.FileNotFoundException;
0019: import java.io.FileOutputStream;
0020: import java.io.IOException;
0021: import java.io.OutputStreamWriter;
0022: import java.io.PrintWriter;
0023: import java.io.UnsupportedEncodingException;
0024: import java.util.MissingResourceException;
0025:
0026: /**
0027: * @test
0028: * @summary Round trip test of Transliterator
0029: */
0030: public class RoundTripTest extends TestFmwk {
0031:
// Compile-time switches for this test class. Their consumers are not in
// this part of the file; judging by the names they presumably enable
// additional checks and the printing of transliterator rules -- TODO confirm.
static final boolean EXTRA_TESTS = true;
static final boolean PRINT_RULES = true;
0034:
0035: public static void main(String[] args) throws Exception {
0036: new RoundTripTest().run(args);
0037: }
0038:
0039: /*
0040: public void TestSingle() throws IOException, ParseException {
0041: Transliterator t = Transliterator.getInstance("Latin-Greek");
0042: String s = t.transliterate("\u0101\u0069");
0043: }
0044: */
0045:
0046: /*
0047: Note: Unicode 3.2 added new Hiragana/Katakana characters:
0048:
0049: 3095..3096 ; 3.2 # [2] HIRAGANA LETTER SMALL KA..HIRAGANA LETTER SMALL KE
0050: 309F..30A0 ; 3.2 # [2] HIRAGANA DIGRAPH YORI..KATAKANA-HIRAGANA DOUBLE HYPHEN
0051: 30FF ; 3.2 # KATAKANA DIGRAPH KOTO
0052: 31F0..31FF ; 3.2 # [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
0053:
0054: We will not add them to the rules until they are more supported (e.g. in fonts on Windows)
0055: A bug has been filed to remind us to do this: #1979.
0056: */
0057:
// UnicodeSet patterns shared by the kana round-trip tests below.

// Katakana script plus U+30A1..U+30FA and U+30FC, minus U+30FF and U+31F0..U+31FF.
static String KATAKANA = "[[[:katakana:][\u30A1-\u30FA\u30FC]]-[\u30FF\u31F0-\u31FF]]";
// Hiragana script plus U+3040..U+3094, minus U+3095..U+3096 and U+309F..U+30A0.
static String HIRAGANA = "[[[:hiragana:][\u3040-\u3094]]-[\u3095-\u3096\u309F-\u30A0]]";
// The single code point U+30FC.
static String LENGTH = "[\u30FC]";
// U+FF65..U+FF9D.
static String HALFWIDTH_KATAKANA = "[\uFF65-\uFF9D]";
// U+30FD and U+30FE.
static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
// U+309D and U+309E.
static String HIRAGANA_ITERATION = "[\u309D\u309E]";
0064:
0065: //------------------------------------------------------------------
0066: // AbbreviatedUnicodeSetIterator
0067: //------------------------------------------------------------------
0068:
0069: static class AbbreviatedUnicodeSetIterator extends
0070: UnicodeSetIterator {
0071:
0072: private boolean abbreviated;
0073: private int perRange;
0074:
0075: public AbbreviatedUnicodeSetIterator() {
0076: super ();
0077: abbreviated = false;
0078: }
0079:
0080: public void reset(UnicodeSet newSet) {
0081: reset(newSet, false);
0082: }
0083:
0084: public void reset(UnicodeSet newSet, boolean abb) {
0085: reset(newSet, abb, 100);
0086: }
0087:
0088: public void reset(UnicodeSet newSet, boolean abb, int density) {
0089: super .reset(newSet);
0090: abbreviated = abb;
0091: perRange = newSet.getRangeCount();
0092: if (perRange != 0) {
0093: perRange = density / perRange;
0094: }
0095: }
0096:
0097: protected void loadRange(int myRange) {
0098: super .loadRange(myRange);
0099: if (abbreviated && (endElement > nextElement + perRange)) {
0100: endElement = nextElement + perRange;
0101: }
0102: }
0103: }
0104:
0105: //--------------------------------------------------------------------
0106:
0107: public void showElapsed(long start, String name) {
0108: double dur = (System.currentTimeMillis() - start) / 1000.0;
0109: logln(name + " took " + dur + " seconds");
0110: }
0111:
0112: public void TestKana() throws IOException {
0113: long start = System.currentTimeMillis();
0114: new Test("Katakana-Hiragana").test(KATAKANA, "[" + HIRAGANA
0115: + LENGTH + "]",
0116: "[" + HALFWIDTH_KATAKANA + LENGTH + "]", this ,
0117: new Legal());
0118: showElapsed(start, "TestKana");
0119: }
0120:
0121: public void TestHiragana() throws IOException {
0122: long start = System.currentTimeMillis();
0123: new Test("Latin-Hiragana").test("[a-zA-Z]", HIRAGANA,
0124: HIRAGANA_ITERATION, this , new Legal());
0125: showElapsed(start, "TestHiragana");
0126: }
0127:
0128: public void TestKatakana() throws IOException {
0129: long start = System.currentTimeMillis();
0130: new Test("Latin-Katakana").test("[a-zA-Z]", KATAKANA, "["
0131: + KATAKANA_ITERATION + HALFWIDTH_KATAKANA + "]", this ,
0132: new Legal());
0133: showElapsed(start, "TestKatakana");
0134: }
0135:
0136: public void TestJamo() throws IOException {
0137: long start = System.currentTimeMillis();
0138: new Test("Latin-Jamo").test("[a-zA-Z]",
0139: "[\u1100-\u1112 \u1161-\u1175 \u11A8-\u11C2]", "",
0140: this , new LegalJamo());
0141: showElapsed(start, "TestJamo");
0142: }
0143:
0144: /*
0145: SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
0146: LCount = 19, VCount = 21, TCount = 28,
0147: NCount = VCount * TCount, // 588
0148: SCount = LCount * NCount, // 11172
0149: LLimit = LBase + LCount, // 1113
0150: VLimit = VBase + VCount, // 1176
0151: TLimit = TBase + TCount, // 11C3
0152: SLimit = SBase + SCount; // D7A4
0153: */
0154:
0155: public void TestHangul() throws IOException {
0156: long start = System.currentTimeMillis();
0157: Test t = new Test("Latin-Hangul", 5);
0158: if (getInclusion() < 10)
0159: t.setPairLimit(1000);
0160: t.test("[a-zA-Z]", "[\uAC00-\uD7A4]", "", this , new Legal());
0161: showElapsed(start, "TestHangul");
0162: }
0163:
0164: public void TestHan() throws UnsupportedEncodingException,
0165: FileNotFoundException {
0166: try {
0167: UnicodeSet exemplars = LocaleData.getExemplarSet(
0168: new ULocale("zh"), 0);
0169: // create string with all chars
0170: StringBuffer b = new StringBuffer();
0171: for (UnicodeSetIterator it = new UnicodeSetIterator(
0172: exemplars); it.next();) {
0173: UTF16.append(b, it.codepoint);
0174: }
0175: String source = b.toString();
0176: // transform with Han translit
0177: Transliterator han = Transliterator
0178: .getInstance("Han-Latin");
0179: String target = han.transliterate(source);
0180: // now verify that there are no Han characters left
0181: UnicodeSet allHan = new UnicodeSet("[:han:]");
0182: assertFalse(
0183: "No Han must be left after Han-Latin transliteration",
0184: allHan.containsSome(target));
0185: // check the pinyin translit
0186: Transliterator pn = Transliterator
0187: .getInstance("Latin-NumericPinyin");
0188: String target2 = pn.transliterate(target);
0189: // verify that there are no marks
0190: Transliterator nfc = Transliterator.getInstance("nfc");
0191: String nfced = nfc.transliterate(target2);
0192: UnicodeSet allMarks = new UnicodeSet("[:mark:]");
0193: assertFalse("NumericPinyin must contain no marks", allMarks
0194: .containsSome(nfced));
0195: // verify roundtrip
0196: Transliterator np = pn.getInverse();
0197: String target3 = np.transliterate(target);
0198: boolean roundtripOK = target3.equals(target);
0199: assertTrue("NumericPinyin must roundtrip", roundtripOK);
0200: if (!roundtripOK) {
0201: String filename = "numeric-pinyin.log.txt";
0202: PrintWriter out = new PrintWriter(new BufferedWriter(
0203: new OutputStreamWriter(new FileOutputStream(
0204: filename), "UTF8"), 4 * 1024));
0205: errln("Creating log file "
0206: + new File(filename).getAbsoluteFile());
0207: out.println("Pinyin: " + target);
0208: out.println("Pinyin-Numeric-Pinyin: " + target2);
0209: out.close();
0210: }
0211: } catch (MissingResourceException ex) {
0212: warnln("Could not load the locale data for fetching the exemplar characters.");
0213: }
0214: }
0215:
0216: public void TestSingle() {
0217: Transliterator t = Transliterator.getInstance("Latin-Greek");
0218: t.transliterate("\u0061\u0101\u0069");
0219: }
0220:
0221: String getGreekSet() {
0222: // Time bomb
0223: if (skipIfBeforeICU(3, 6)) {
0224: // We temporarily filter against Unicode 4.1, but we only do this
0225: // before version 3.5.
0226: logln("TestGreek needs to be updated to remove delete the section marked [:Age=4.0:] filter");
0227: } else {
0228: errln("TestGreek needs to be updated to remove delete the [:Age=4.0:] filter ");
0229: }
0230: return
0231: // isICU28() ? "[[\u003B\u00B7[:Greek:]-[\u03D7-\u03EF]]&[:Age=3.2:]]" :
0232: "[\u003B\u00B7[[:Greek:]&[:Letter:]]-[" + "\u1D26-\u1D2A" + // L& [5] GREEK LETTER SMALL CAPITAL GAMMA..GREEK LETTER SMALL CAPITAL PSI
0233: "\u1D5D-\u1D61" + // Lm [5] MODIFIER LETTER SMALL BETA..MODIFIER LETTER SMALL CHI
0234: "\u1D66-\u1D6A" + // L& [5] GREEK SUBSCRIPT SMALL LETTER BETA..GREEK SUBSCRIPT SMALL LETTER CHI
0235: "\u03D7-\u03EF" + // \N{GREEK KAI SYMBOL}..\N{COPTIC SMALL LETTER DEI}
0236: "] & [:Age=4.0:]]";
0237: }
0238:
0239: public void TestGreek() throws IOException {
0240: long start = System.currentTimeMillis();
0241: new Test("Latin-Greek", 50).test("[a-zA-Z]", getGreekSet(),
0242: "[\u00B5\u037A\u03D0-\u03F5\u03F9]", /* roundtrip exclusions */
0243: this , new LegalGreek(true));
0244: showElapsed(start, "TestGreek");
0245: }
0246:
0247: public void TestGreekUNGEGN() throws IOException {
0248: long start = System.currentTimeMillis();
0249: new Test("Latin-Greek/UNGEGN").test("[a-zA-Z]", getGreekSet(),
0250: "[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */
0251: this , new LegalGreek(false));
0252: showElapsed(start, "TestGreekUNGEGN");
0253: }
0254:
0255: public void Testel() throws IOException {
0256: long start = System.currentTimeMillis();
0257: new Test("Latin-el").test("[a-zA-Z]", getGreekSet(),
0258: "[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */
0259: this , new LegalGreek(false));
0260: showElapsed(start, "Testel");
0261: }
0262:
0263: public void TestCyrillic() throws IOException {
0264: long start = System.currentTimeMillis();
0265: new Test("Latin-Cyrillic").test(
0266: "[a-zA-Z\u0110\u0111\u02BA\u02B9]", "[\u0400-\u045F]",
0267: null, this , new Legal());
0268: showElapsed(start, "TestCyrillic");
0269: }
0270:
// UnicodeSet pattern listing the Arabic-block code points that TestArabic
// passes as its second set argument.
static final String ARABIC = "[\u060C\u061B\u061F\u0621\u0627-\u063A\u0641-\u0655\u0660-\u066C\u067E\u0686\u0698\u06A4\u06AD\u06AF\u06CB-\u06CC\u06F0-\u06F9]";
0272:
0273: public void TestArabic() throws IOException {
0274: long start = System.currentTimeMillis();
0275: new Test("Latin-Arabic").test("[a-zA-Z\u02BE\u02BF]", ARABIC,
0276: "[a-zA-Z\u02BE\u02BF\u207F]", null, this , new Legal()); //
0277: showElapsed(start, "TestArabic");
0278: }
0279:
0280: public void TestHebrew() throws IOException {
0281: // Time bomb
0282: if (skipIfBeforeICU(3, 6)) {
0283: // We temporarily filter against Unicode 4.1, but we only do this
0284: // before version 3.5.
0285: logln("TestHebrew needs to be updated to remove delete the section marked [:Age=4.0:] filter");
0286: } else {
0287: errln("TestHebrew needs to be updated to remove delete the [:Age=4.0:] filter ");
0288: }
0289: long start = System.currentTimeMillis();
0290: new Test("Latin-Hebrew").test("[a-zA-Z\u02BC\u02BB]",
0291: "[[[:hebrew:]-[\u05BD\uFB00-\uFBFF]]& [:Age=4.0:]]",
0292: "[\u05F0\u05F1\u05F2]", this , new LegalHebrew());
0293: showElapsed(start, "TestHebrew");
0294: }
0295:
0296: public void TestThai() throws IOException {
0297: long start = System.currentTimeMillis();
0298: if (isICUVersionAtLeast(3, 7)) {
0299: new Test("Latin-Thai")
0300: .test(
0301: "[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02CC]",
0302: "[\u0E01-\u0E3A\u0E40-\u0E5B]",
0303: "[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02B9\u02CC]",
0304: null, this , new LegalThai());
0305: } else {
0306: new Test("Latin-Thai")
0307: .test(
0308: "[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02CC]",
0309: "[\u0E01-\u0E3A\u0E40-\u0E5B]",
0310: "[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02B9\u02CC]",
0311: "[\u0E4F]", this , new LegalThai());
0312: }
0313:
0314: showElapsed(start, "TestThai");
0315: }
0316:
0317: //----------------------------------
0318: // Inter-Indic Tests
0319: //----------------------------------
0320: public static class LegalIndic extends Legal {
0321: UnicodeSet vowelSignSet = new UnicodeSet();
0322:
0323: public LegalIndic() {
0324: vowelSignSet
0325: .addAll(new UnicodeSet(
0326: "[\u0901\u0902\u0903\u0904\u093e-\u094c\u0962\u0963]")); /* Devanagari */
0327: vowelSignSet
0328: .addAll(new UnicodeSet(
0329: "[\u0981\u0982\u0983\u09be-\u09cc\u09e2\u09e3\u09D7]")); /* Bengali */
0330: vowelSignSet
0331: .addAll(new UnicodeSet(
0332: "[\u0a01\u0a02\u0a03\u0a3e-\u0a4c\u0a62\u0a63\u0a70\u0a71]")); /* Gurmukhi */
0333: vowelSignSet.addAll(new UnicodeSet(
0334: "[\u0a81\u0a82\u0a83\u0abe-\u0acc\u0ae2\u0ae3]")); /* Gujarati */
0335: vowelSignSet
0336: .addAll(new UnicodeSet(
0337: "[\u0b01\u0b02\u0b03\u0b3e-\u0b4c\u0b62\u0b63\u0b56\u0b57]")); /* Oriya */
0338: vowelSignSet
0339: .addAll(new UnicodeSet(
0340: "[\u0b81\u0b82\u0b83\u0bbe-\u0bcc\u0be2\u0be3\u0bd7]")); /* Tamil */
0341: vowelSignSet
0342: .addAll(new UnicodeSet(
0343: "[\u0c01\u0c02\u0c03\u0c3e-\u0c4c\u0c62\u0c63\u0c55\u0c56]")); /* Telugu */
0344: vowelSignSet
0345: .addAll(new UnicodeSet(
0346: "[\u0c81\u0c82\u0c83\u0cbe-\u0ccc\u0ce2\u0ce3\u0cd5\u0cd6]")); /* Kannada */
0347: vowelSignSet
0348: .addAll(new UnicodeSet(
0349: "[\u0d01\u0d02\u0d03\u0d3e-\u0d4c\u0d62\u0d63\u0d57]")); /* Malayalam */
0350: }
0351:
0352: String avagraha = "\u093d\u09bd\u0abd\u0b3d\u0cbd";
0353: String nukta = "\u093c\u09bc\u0a3c\u0abc\u0b3c\u0cbc";
0354: String virama = "\u094d\u09cd\u0a4d\u0acd\u0b4d\u0bcd\u0c4d\u0ccd\u0d4d";
0355: String sanskritStressSigns = "\u0951\u0952\u0953\u0954\u097d";
0356: String chandrabindu = "\u0901\u0981\u0A81\u0b01\u0c01";
0357:
0358: public boolean is(String sourceString) {
0359: int cp = sourceString.charAt(0);
0360:
0361: // A vowel sign cannot be the first char
0362: if (vowelSignSet.contains(cp)) {
0363: return false;
0364: } else if (avagraha.indexOf(cp) != -1) {
0365: return false;
0366: } else if (virama.indexOf(cp) != -1) {
0367: return false;
0368: } else if (nukta.indexOf(cp) != -1) {
0369: return false;
0370: } else if (sanskritStressSigns.indexOf(cp) != -1) {
0371: return false;
0372: } else if ((chandrabindu.indexOf(cp) != -1)
0373: && (sourceString.length() > 1 && vowelSignSet
0374: .contains(sourceString.charAt(1)))) {
0375: return false;
0376: }
0377: return true;
0378: }
0379: }
0380:
// UnicodeSet pattern used as the Latin-side set for the Latin<->Indic tests
// (TestDevanagariLatin and the "Latin-*" rows of interIndicArray).
// Shape: [ [explicit list] - [U+E000..U+E080, U+01E2, U+01E3] & [[:latin:][:mark:]] ].
static String latinForIndic = "[['.0-9A-Za-z~\u00C0-\u00C5\u00C7-\u00CF\u00D1-\u00D6\u00D9-\u00DD"
        + "\u00E0-\u00E5\u00E7-\u00EF\u00F1-\u00F6\u00F9-\u00FD\u00FF-\u010F"
        + "\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148"
        + "\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0"
        + "\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u01FB"
        + "\u0200-\u021B\u021E-\u021F\u0226-\u0233\u0294\u0303-\u0304\u0306\u0314-\u0315"
        + "\u0325\u040E\u0419\u0439\u045E\u04C1-\u04C2\u04D0-\u04D1\u04D6-\u04D7"
        + "\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1F01\u1F03\u1F05"
        + "\u1F07\u1F09\u1F0B\u1F0D\u1F0F\u1F11\u1F13\u1F15\u1F19\u1F1B\u1F1D\u1F21"
        + "\u1F23\u1F25\u1F27\u1F29\u1F2B\u1F2D\u1F2F\u1F31\u1F33\u1F35\u1F37\u1F39"
        + "\u1F3B\u1F3D\u1F3F\u1F41\u1F43\u1F45\u1F49\u1F4B\u1F4D\u1F51\u1F53\u1F55"
        + "\u1F57\u1F59\u1F5B\u1F5D\u1F5F\u1F61\u1F63\u1F65\u1F67\u1F69\u1F6B\u1F6D"
        + "\u1F6F\u1F81\u1F83\u1F85\u1F87\u1F89\u1F8B\u1F8D\u1F8F\u1F91\u1F93\u1F95"
        + "\u1F97\u1F99\u1F9B\u1F9D\u1F9F\u1FA1\u1FA3\u1FA5\u1FA7\u1FA9\u1FAB\u1FAD"
        + "\u1FAF-\u1FB1\u1FB8-\u1FB9\u1FD0-\u1FD1\u1FD8-\u1FD9\u1FE0-\u1FE1\u1FE5"
        + "\u1FE8-\u1FE9\u1FEC\u212A-\u212B\uE04D\uE064]"
        + "-[\uE000-\uE080 \u01E2\u01E3]& [[:latin:][:mark:]]]";
0398:
0399: public void TestDevanagariLatin() throws IOException {
0400: long start = System.currentTimeMillis();
0401: if (skipIfBeforeICU(2, 8)) {
0402: new Test("Latin-DEVANAGARI", 50)
0403: .test(
0404: latinForIndic,
0405: "[[:Devanagari:][\u094d][\u0964\u0965] & [:Age=3.2:]]",
0406: "[\u0965]", this , new LegalIndic());
0407:
0408: } else {
0409: if (isICUVersionAtLeast(3, 8)) {
0410: // We temporarily filter against Unicode 4.1, but we only do this
0411: // before version 3.4.
0412: errln("FAIL: TestDevanagariLatin needs to be updated to remove delete the [:Age=4.1:] filter ");
0413: return;
0414: } else {
0415: logln("Warning: TestDevanagariLatin needs to be updated to remove delete the section marked [:Age=4.1:] filter");
0416: }
0417: new Test("Latin-DEVANAGARI", 50)
0418: .test(
0419: latinForIndic,
0420: "[[[:Devanagari:][\u094d][\u0964\u0965]]&[:Age=4.1:]]",
0421: "[\u0965\u0904]", this , new LegalIndic());
0422: }
0423: showElapsed(start, "TestDevanagariLatin");
0424: }
0425:
0426: private static final String[][] interIndicArray = new String[][] {
0427: new String[] {
0428: "BENGALI-DEVANAGARI",
0429: "[:BENGALI:]",
0430: "[:Devanagari:]",
0431: "[\u0904\u0951-\u0954\u0943-\u0949\u094a\u0962\u0963\u090D\u090e\u0911\u0912\u0929\u0933\u0934\u0935\u0950\u0958\u0959\u095a\u095b\u095e\u097d]", /*roundtrip exclusions*/
0432: },
0433: new String[] {
0434: "DEVANAGARI-BENGALI",
0435: "[:Devanagari:]",
0436: "[:BENGALI:]",
0437: "[\u09D7\u090D\u090e\u0911\u0912\u0929\u0933\u0934\u0935\u0950\u0958\u0959\u095a\u095b\u095e\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
0438: },
0439:
0440: new String[] {
0441: "GURMUKHI-DEVANAGARI",
0442: "[:GURMUKHI:]",
0443: "[:Devanagari:]",
0444: "[\u0904\u0902\u0936\u0933\u0951-\u0954\u0902\u0903\u0943-\u0949\u094a\u0962\u0963\u090B\u090C\u090D\u090e\u0911\u0912\u0934\u0937\u093D\u0950\u0960\u0961\u097d]", /*roundtrip exclusions*/
0445: },
0446: new String[] {
0447: "DEVANAGARI-GURMUKHI",
0448: "[:Devanagari:]",
0449: "[:GURMUKHI:]",
0450: "[\u0A02\u0946\u0A5C\u0951-\u0954\u0A70\u0A71\u090B\u090C\u090D\u090e\u0911\u0912\u0934\u0937\u093D\u0950\u0960\u0961\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/
0451: },
0452:
0453: new String[] {
0454: "GUJARATI-DEVANAGARI",
0455: "[:GUJARATI:]",
0456: "[:Devanagari:]",
0457: "[\u0904\u0946\u094A\u0962\u0963\u0951-\u0954\u0961\u090c\u090e\u0912\u097d]", /*roundtrip exclusions*/
0458: },
0459: new String[] { "DEVANAGARI-GUJARATI", "[:Devanagari:]",
0460: "[:GUJARATI:]",
0461: "[\u0951-\u0954\u0961\u090c\u090e\u0912]", /*roundtrip exclusions*/
0462: },
0463:
0464: new String[] {
0465: "ORIYA-DEVANAGARI",
0466: "[:ORIYA:]",
0467: "[:Devanagari:]",
0468: "[\u0904\u0912\u0911\u090D\u090e\u0931\u0943-\u094a\u0962\u0963\u0951-\u0954\u0950\u097d]", /*roundtrip exclusions*/
0469: },
0470: new String[] {
0471: "DEVANAGARI-ORIYA",
0472: "[:Devanagari:]",
0473: "[:ORIYA:]",
0474: "[\u0b5f\u0b56\u0b57\u0b70\u0b71\u0950\u090D\u090e\u0912\u0911\u0931]", /*roundtrip exclusions*/
0475: },
0476:
0477: new String[] {
0478: "Tamil-DEVANAGARI",
0479: "[:tamil:]",
0480: "[:Devanagari:]",
0481: "[\u0901\u0904\u093c\u0943-\u094a\u0951-\u0954\u0962\u0963\u090B\u090C\u090D\u0911\u0916\u0917\u0918\u091B\u091D\u0920\u0921\u0922\u0925\u0926\u0927\u092B\u092C\u092D\u0936\u093d\u0950[\u0958-\u0961]\u097d]", /*roundtrip exclusions*/
0482: },
0483: new String[] { "DEVANAGARI-Tamil", "[:Devanagari:]",
0484: "[:tamil:]", "[\u0bd7]", /*roundtrip exclusions*/
0485: },
0486:
0487: new String[] {
0488: "Telugu-DEVANAGARI",
0489: "[:telugu:]",
0490: "[:Devanagari:]",
0491: "[\u0904\u093c\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/
0492: },
0493: new String[] {
0494: "DEVANAGARI-TELUGU",
0495: "[:Devanagari:]",
0496: "[:TELUGU:]",
0497: "[\u0c55\u0c56\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
0498: },
0499:
0500: new String[] {
0501: "KANNADA-DEVANAGARI",
0502: "[:KANNADA:]",
0503: "[:Devanagari:]",
0504: "[\u0901\u0904\u0946\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/
0505: },
0506: new String[] {
0507: "DEVANAGARI-KANNADA",
0508: "[:Devanagari:]",
0509: "[:KANNADA:]",
0510: "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cde\u0cd5\u0cd6\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
0511: },
0512:
0513: new String[] {
0514: "MALAYALAM-DEVANAGARI",
0515: "[:MALAYALAM:]",
0516: "[:Devanagari:]",
0517: "[\u0901\u0904\u094a\u094b\u094c\u093c\u0950\u0944\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/
0518: },
0519: new String[] {
0520: "DEVANAGARI-MALAYALAM",
0521: "[:Devanagari:]",
0522: "[:MALAYALAM:]",
0523: "[\u0d4c\u0d57\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
0524: },
0525:
0526: new String[] {
0527: "GURMUKHI-BENGALI",
0528: "[:GURMUKHI:]",
0529: "[:BENGALI:]",
0530: "[\u0982\u09b6\u09e2\u09e3\u09c3\u09c4\u09d7\u098B\u098C\u09B7\u09E0\u09E1\u09F0\u09F1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
0531: },
0532: new String[] {
0533: "BENGALI-GURMUKHI",
0534: "[:BENGALI:]",
0535: "[:GURMUKHI:]",
0536: "[\u0A02\u0a5c\u0a47\u0a70\u0a71\u0A33\u0A35\u0A59\u0A5A\u0A5B\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
0537: },
0538:
0539: new String[] {
0540: "GUJARATI-BENGALI",
0541: "[:GUJARATI:]",
0542: "[:BENGALI:]",
0543: "[\u09d7\u09e2\u09e3\u098c\u09e1\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
0544: },
0545: new String[] {
0546: "BENGALI-GUJARATI",
0547: "[:BENGALI:]",
0548: "[:GUJARATI:]",
0549: "[\u0A82\u0a83\u0Ac9\u0Ac5\u0ac7\u0A8D\u0A91\u0AB3\u0AB5\u0ABD\u0AD0]", /*roundtrip exclusions*/
0550: },
0551:
0552: new String[] {
0553: "ORIYA-BENGALI",
0554: "[:ORIYA:]",
0555: "[:BENGALI:]",
0556: "[\u09c4\u09e2\u09e3\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
0557: },
0558: new String[] { "BENGALI-ORIYA", "[:BENGALI:]", "[:ORIYA:]",
0559: "[\u0b35\u0b71\u0b5f\u0b56\u0b33\u0b3d]", /*roundtrip exclusions*/
0560: },
0561:
0562: new String[] {
0563: "Tamil-BENGALI",
0564: "[:tamil:]",
0565: "[:BENGALI:]",
0566: "[\u0981\u09bc\u09c3\u09c4\u09e2\u09e3\u09f0\u09f1\u098B\u098C\u0996\u0997\u0998\u099B\u099D\u09A0\u09A1\u09A2\u09A5\u09A6\u09A7\u09AB\u09AC\u09AD\u09B6\u09DC\u09DD\u09DF\u09E0\u09E1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
0567: },
0568: new String[] {
0569: "BENGALI-Tamil",
0570: "[:BENGALI:]",
0571: "[:tamil:]",
0572: "[\u0bc6\u0bc7\u0bca\u0B8E\u0B92\u0BA9\u0BB1\u0BB3\u0BB4\u0BB5]", /*roundtrip exclusions*/
0573: },
0574:
0575: new String[] {
0576: "Telugu-BENGALI",
0577: "[:telugu:]",
0578: "[:BENGALI:]",
0579: "[\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
0580: },
0581: new String[] {
0582: "BENGALI-TELUGU",
0583: "[:BENGALI:]",
0584: "[:TELUGU:]",
0585: "[\u0c55\u0c56\u0c47\u0c46\u0c4a\u0C0E\u0C12\u0C31\u0C33\u0C35]", /*roundtrip exclusions*/
0586: },
0587:
0588: new String[] {
0589: "KANNADA-BENGALI",
0590: "[:KANNADA:]",
0591: "[:BENGALI:]",
0592: "[\u0981\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
0593: },
0594: new String[] {
0595: "BENGALI-KANNADA",
0596: "[:BENGALI:]",
0597: "[:KANNADA:]",
0598: "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc6\u0cca\u0cd5\u0cd6\u0cc7\u0C8E\u0C92\u0CB1\u0cb3\u0cb5\u0cde]", /*roundtrip exclusions*/
0599: },
0600:
0601: new String[] {
0602: "MALAYALAM-BENGALI",
0603: "[:MALAYALAM:]",
0604: "[:BENGALI:]",
0605: "[\u0981\u09e2\u09e3\u09bc\u09c4\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
0606: },
0607: new String[] { "BENGALI-MALAYALAM", "[:BENGALI:]",
0608: "[:MALAYALAM:]",
0609: "[\u0d46\u0d4a\u0d47\u0d31-\u0d35\u0d0e\u0d12]", /*roundtrip exclusions*/
0610: },
0611:
0612: new String[] {
0613: "GUJARATI-GURMUKHI",
0614: "[:GUJARATI:]",
0615: "[:GURMUKHI:]",
0616: "[\u0A02\u0ab3\u0ab6\u0A70\u0a71\u0a82\u0a83\u0ac3\u0ac4\u0ac5\u0ac9\u0a5c\u0a72\u0a73\u0a74\u0a8b\u0a8d\u0a91\u0abd]", /*roundtrip exclusions*/
0617: },
0618: new String[] {
0619: "GURMUKHI-GUJARATI",
0620: "[:GURMUKHI:]",
0621: "[:GUJARATI:]",
0622: "[\u0a5c\u0A70\u0a71\u0a72\u0a73\u0a74\u0a82\u0a83\u0a8b\u0a8c\u0a8d\u0a91\u0ab3\u0ab6\u0ab7\u0abd\u0ac3\u0ac4\u0ac5\u0ac9\u0ad0\u0ae0\u0ae1]", /*roundtrip exclusions*/
0623: },
0624:
0625: new String[] {
0626: "ORIYA-GURMUKHI",
0627: "[:ORIYA:]",
0628: "[:GURMUKHI:]",
0629: "[\u0A02\u0a5c\u0a21\u0a47\u0a71\u0b02\u0b03\u0b33\u0b36\u0b43\u0b56\u0b57\u0B0B\u0B0C\u0B37\u0B3D\u0B5F\u0B60\u0B61\u0a35\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/
0630: },
0631: new String[] {
0632: "GURMUKHI-ORIYA",
0633: "[:GURMUKHI:]",
0634: "[:ORIYA:]",
0635: "[\u0a71\u0b02\u0b03\u0b33\u0b36\u0b43\u0b56\u0b57\u0B0B\u0B0C\u0B37\u0B3D\u0B5F\u0B60\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/
0636: },
0637:
0638: new String[] {
0639: "TAMIL-GURMUKHI",
0640: "[:TAMIL:]",
0641: "[:GURMUKHI:]",
0642: "[\u0A01\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0a47\u0A16\u0A17\u0A18\u0A1B\u0A1D\u0A20\u0A21\u0A22\u0A25\u0A26\u0A27\u0A2B\u0A2C\u0A2D\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
0643: },
0644: new String[] {
0645: "GURMUKHI-TAMIL",
0646: "[:GURMUKHI:]",
0647: "[:TAMIL:]",
0648: "[\u0b82\u0bc6\u0bca\u0bd7\u0bb7\u0bb3\u0b83\u0B8E\u0B92\u0BA9\u0BB1\u0BB4\u0bb6]", /*roundtrip exclusions*/
0649: },
0650:
0651: new String[] {
0652: "TELUGU-GURMUKHI",
0653: "[:TELUGU:]",
0654: "[:GURMUKHI:]",
0655: "[\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
0656: },
0657: new String[] {
0658: "GURMUKHI-TELUGU",
0659: "[:GURMUKHI:]",
0660: "[:TELUGU:]",
0661: "[\u0c02\u0c03\u0c33\u0c36\u0c44\u0c43\u0c46\u0c4a\u0c56\u0c55\u0C0B\u0C0C\u0C0E\u0C12\u0C31\u0C37\u0C60\u0C61]", /*roundtrip exclusions*/
0662: },
0663: new String[] {
0664: "KANNADA-GURMUKHI",
0665: "[:KANNADA:]",
0666: "[:GURMUKHI:]",
0667: "[\u0A01\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
0668: },
0669: new String[] {
0670: "GURMUKHI-KANNADA",
0671: "[:GURMUKHI:]",
0672: "[:KANNADA:]",
0673: "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0c82\u0c83\u0cb3\u0cb6\u0cc4\u0cc3\u0cc6\u0cca\u0cd5\u0cd6\u0C8B\u0C8C\u0C8E\u0C92\u0CB1\u0CB7\u0cbd\u0CE0\u0CE1\u0cde]", /*roundtrip exclusions*/
0674: },
0675:
0676: new String[] {
0677: "MALAYALAM-GURMUKHI",
0678: "[:MALAYALAM:]",
0679: "[:GURMUKHI:]",
0680: "[\u0A01\u0A02\u0a4b\u0a4c\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
0681: },
0682: new String[] {
0683: "GURMUKHI-MALAYALAM",
0684: "[:GURMUKHI:]",
0685: "[:MALAYALAM:]",
0686: "[\u0d02\u0d03\u0d33\u0d36\u0d43\u0d46\u0d4a\u0d4c\u0d57\u0D0B\u0D0C\u0D0E\u0D12\u0D31\u0D34\u0D37\u0D60\u0D61]", /*roundtrip exclusions*/
0687: },
0688:
0689: new String[] { "GUJARATI-ORIYA", "[:GUJARATI:]",
0690: "[:ORIYA:]",
0691: "[\u0b56\u0b57\u0B0C\u0B5F\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/
0692: },
0693: new String[] {
0694: "ORIYA-GUJARATI",
0695: "[:ORIYA:]",
0696: "[:GUJARATI:]",
0697: "[\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8D\u0A91\u0AB5\u0Ad0]", /*roundtrip exclusions*/
0698: },
0699:
0700: new String[] {
0701: "TAMIL-GUJARATI",
0702: "[:TAMIL:]",
0703: "[:GUJARATI:]",
0704: "[\u0A81\u0a8c\u0abc\u0ac3\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8B\u0A8D\u0A91\u0A96\u0A97\u0A98\u0A9B\u0A9D\u0AA0\u0AA1\u0AA2\u0AA5\u0AA6\u0AA7\u0AAB\u0AAC\u0AAD\u0AB6\u0ABD\u0AD0\u0AE0\u0AE1]", /*roundtrip exclusions*/
0705: },
0706: new String[] {
0707: "GUJARATI-TAMIL",
0708: "[:GUJARATI:]",
0709: "[:TAMIL:]",
0710: "[\u0Bc6\u0Bca\u0Bd7\u0B8E\u0B92\u0BA9\u0BB1\u0BB4]", /*roundtrip exclusions*/
0711: },
0712:
0713: new String[] { "TELUGU-GUJARATI", "[:TELUGU:]",
0714: "[:GUJARATI:]",
0715: "[\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
0716: },
0717: new String[] {
0718: "GUJARATI-TELUGU",
0719: "[:GUJARATI:]",
0720: "[:TELUGU:]",
0721: "[\u0c46\u0c4a\u0c55\u0c56\u0C0C\u0C0E\u0C12\u0C31\u0C61]", /*roundtrip exclusions*/
0722: },
0723:
0724: new String[] {
0725: "KANNADA-GUJARATI",
0726: "[:KANNADA:]",
0727: "[:GUJARATI:]",
0728: "[\u0A81\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
0729: },
0730: new String[] {
0731: "GUJARATI-KANNADA",
0732: "[:GUJARATI:]",
0733: "[:KANNADA:]",
0734: "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc6\u0cca\u0cd5\u0cd6\u0C8C\u0C8E\u0C92\u0CB1\u0CDE\u0CE1]", /*roundtrip exclusions*/
0735: },
0736:
0737: new String[] {
0738: "MALAYALAM-GUJARATI",
0739: "[:MALAYALAM:]",
0740: "[:GUJARATI:]",
0741: "[\u0A81\u0ac4\u0acb\u0acc\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
0742: },
0743: new String[] {
0744: "GUJARATI-MALAYALAM",
0745: "[:GUJARATI:]",
0746: "[:MALAYALAM:]",
0747: "[\u0d46\u0d4a\u0d4c\u0d55\u0d57\u0D0C\u0D0E\u0D12\u0D31\u0D34\u0D61]", /*roundtrip exclusions*/
0748: },
0749:
0750: new String[] {
0751: "TAMIL-ORIYA",
0752: "[:TAMIL:]",
0753: "[:ORIYA:]",
0754: "[\u0B01\u0b3c\u0b43\u0b56\u0B0B\u0B0C\u0B16\u0B17\u0B18\u0B1B\u0B1D\u0B20\u0B21\u0B22\u0B25\u0B26\u0B27\u0B2B\u0B2C\u0B2D\u0B36\u0B3D\u0B5C\u0B5D\u0B5F\u0B60\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/
0755: },
0756: new String[] {
0757: "ORIYA-TAMIL",
0758: "[:ORIYA:]",
0759: "[:TAMIL:]",
0760: "[\u0bc6\u0bca\u0bc7\u0B8E\u0B92\u0BA9\u0BB1\u0BB4\u0BB5]", /*roundtrip exclusions*/
0761: },
0762:
0763: new String[] {
0764: "TELUGU-ORIYA",
0765: "[:TELUGU:]",
0766: "[:ORIYA:]",
0767: "[\u0b3c\u0b57\u0b56\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/
0768: },
0769: new String[] {
0770: "ORIYA-TELUGU",
0771: "[:ORIYA:]",
0772: "[:TELUGU:]",
0773: "[\u0c44\u0c46\u0c4a\u0c55\u0c47\u0C0E\u0C12\u0C31\u0C35]", /*roundtrip exclusions*/
0774: },
0775:
0776: new String[] {
0777: "KANNADA-ORIYA",
0778: "[:KANNADA:]",
0779: "[:ORIYA:]",
0780: "[\u0B01\u0b3c\u0b57\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/
0781: },
0782: new String[] {
0783: "ORIYA-KANNADA",
0784: "[:ORIYA:]",
0785: "[:KANNADA:]",
0786: "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc4\u0cc6\u0cca\u0cd5\u0cc7\u0C8E\u0C92\u0CB1\u0CB5\u0CDE]", /*roundtrip exclusions*/
0787: },
0788:
0789: new String[] {
0790: "MALAYALAM-ORIYA",
0791: "[:MALAYALAM:]",
0792: "[:ORIYA:]",
0793: "[\u0B01\u0b3c\u0b56\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/
0794: },
0795: new String[] {
0796: "ORIYA-MALAYALAM",
0797: "[:ORIYA:]",
0798: "[:MALAYALAM:]",
0799: "[\u0D47\u0D46\u0D4a\u0D0E\u0D12\u0D31\u0D34\u0D35]", /*roundtrip exclusions*/
0800: },
0801:
0802: new String[] { "TELUGU-TAMIL", "[:TELUGU:]", "[:TAMIL:]",
0803: "[\u0bd7\u0ba9\u0bb4]", /*roundtrip exclusions*/
0804: },
0805: new String[] {
0806: "TAMIL-TELUGU",
0807: "[:TAMIL:]",
0808: "[:TELUGU:]",
0809: "[\u0C01\u0c43\u0c44\u0c46\u0c47\u0c55\u0c56\u0c66\u0C0B\u0C0C\u0C16\u0C17\u0C18\u0C1B\u0C1D\u0C20\u0C21\u0C22\u0C25\u0C26\u0C27\u0C2B\u0C2C\u0C2D\u0C36\u0C60\u0C61]", /*roundtrip exclusions*/
0810: },
0811:
0812: new String[] { "KANNADA-TAMIL", "[:KANNADA:]", "[:TAMIL:]",
0813: "[\u0bd7\u0bc6\u0ba9\u0bb4]", /*roundtrip exclusions*/
0814: },
0815: new String[] {
0816: "TAMIL-KANNADA",
0817: "[:TAMIL:]",
0818: "[:KANNADA:]",
0819: "[\u0cc3\u0cc4\u0cc6\u0cc7\u0cd5\u0cd6\u0C8B\u0C8C\u0C96\u0C97\u0C98\u0C9B\u0C9D\u0CA0\u0CA1\u0CA2\u0CA5\u0CA6\u0CA7\u0CAB\u0CAC\u0CAD\u0CB6\u0cbc\u0cbd\u0CDE\u0CE0\u0CE1]", /*roundtrip exclusions*/
0820: },
0821:
0822: new String[] { "MALAYALAM-TAMIL", "[:MALAYALAM:]",
0823: "[:TAMIL:]", "[\u0ba9]", /*roundtrip exclusions*/
0824: },
0825: new String[] {
0826: "TAMIL-MALAYALAM",
0827: "[:TAMIL:]",
0828: "[:MALAYALAM:]",
0829: "[\u0d43\u0d12\u0D0B\u0D0C\u0D16\u0D17\u0D18\u0D1B\u0D1D\u0D20\u0D21\u0D22\u0D25\u0D26\u0D27\u0D2B\u0D2C\u0D2D\u0D36\u0D60\u0D61]", /*roundtrip exclusions*/
0830: },
0831:
0832: new String[] { "KANNADA-TELUGU", "[:KANNADA:]",
0833: "[:TELUGU:]", "[\u0C01\u0c3f\u0c46\u0c48\u0c4a]", /*roundtrip exclusions*/
0834: },
0835: new String[] { "TELUGU-KANNADA", "[:TELUGU:]",
0836: "[:KANNADA:]",
0837: "[\u0cc8\u0cd5\u0cd6\u0CDE\u0cbc\u0cbd]", /*roundtrip exclusions*/
0838: },
0839:
0840: new String[] { "MALAYALAM-TELUGU", "[:MALAYALAM:]",
0841: "[:TELUGU:]",
0842: "[\u0C01\u0c44\u0c4a\u0c4c\u0c4b\u0c55\u0c56]", /*roundtrip exclusions*/
0843: },
0844: new String[] { "TELUGU-MALAYALAM", "[:TELUGU:]",
0845: "[:MALAYALAM:]", "[\u0d4c\u0d57\u0D34]", /*roundtrip exclusions*/
0846: },
0847:
0848: new String[] {
0849: "MALAYALAM-KANNADA",
0850: "[:MALAYALAM:]",
0851: "[:KANNADA:]",
0852: "[\u0cbc\u0cbd\u0cc4\u0cc6\u0cca\u0ccc\u0ccb\u0cd5\u0cd6\u0cDe]", /*roundtrip exclusions*/
0853: },
0854: new String[] { "Latin-Bengali", latinForIndic,
0855: "[[:Bengali:][\u0964\u0965]]",
0856: "[\u0965\u09f0-\u09fa\u09ce]", /*roundtrip exclusions*/
0857: },
0858: new String[] { "Latin-Gurmukhi", latinForIndic,
0859: "[[:Gurmukhi:][\u0964\u0965]]",
0860: "[\u0a01\u0a02\u0965\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/
0861: },
0862: new String[] { "Latin-Gujarati", latinForIndic,
0863: "[[:Gujarati:][\u0964\u0965]]", "[\u0965]", /*roundtrip exclusions*/
0864: },
0865: new String[] { "Latin-Oriya", latinForIndic,
0866: "[[:Oriya:][\u0964\u0965]]", "[\u0965\u0b70]", /*roundtrip exclusions*/
0867: },
0868: new String[] { "Latin-Tamil", latinForIndic, "[:Tamil:]",
0869: null, /*roundtrip exclusions*/
0870: },
0871: new String[] { "Latin-Telugu", latinForIndic, "[:Telugu:]",
0872: null, /*roundtrip exclusions*/
0873: },
0874: new String[] { "Latin-Kannada", latinForIndic,
0875: "[:Kannada:]", null, /*roundtrip exclusions*/
0876: },
0877: new String[] { "Latin-Malayalam", latinForIndic,
0878: "[:Malayalam:]", null, /*roundtrip exclusions*/
0879: }, };
0880:
0881: public void TestInterIndic() throws Exception {
0882: long start = System.currentTimeMillis();
0883: int num = interIndicArray.length;
0884: if (isQuick()) {
0885: logln("Testing only 5 of " + interIndicArray.length
0886: + " Skipping rest (use -e for exhaustive)");
0887: num = 5;
0888: }
0889: if (isICUVersionAtLeast(3, 8)) {
0890: // We temporarily filter against Unicode 4.1, but we only do this
0891: // before version 3.4.
0892: errln("FAIL: TestInterIndic needs to be updated to remove delete the [:Age=4.1:] filter ");
0893: return;
0894: } else {
0895: logln("Warning: TestInterIndic needs to be updated to remove delete the section marked [:Age=4.1:] filter");
0896: }
0897: for (int i = 0; i < num; i++) {
0898: logln("Testing " + interIndicArray[i][0] + " at index " + i);
0899: if (skipIfBeforeICU(2, 8)) {
0900: new Test(interIndicArray[i][0], 50)
0901: .test("[" + interIndicArray[i][1]
0902: + " & [:Age=3.2:]]", "["
0903: + interIndicArray[i][2]
0904: + " & [:Age=3.2:]]",
0905: interIndicArray[i][3], this ,
0906: new LegalIndic());
0907: } else {
0908: /*TODO: uncomment the line below when the transliterator is fixed
0909: new Test(interIndicArray[i][0], 50)
0910: .test(interIndicArray[i][1],
0911: interIndicArray[i][2],
0912: interIndicArray[i][3],
0913: this, new LegalIndic());
0914: */
0915: /* comment lines below when transliterator is fixed */
0916: // start
0917: new Test(interIndicArray[i][0], 50).test("["
0918: + interIndicArray[i][1] + " &[:Age=4.1:]]", "["
0919: + interIndicArray[i][2] + " &[:Age=4.1:]]",
0920: interIndicArray[i][3], this , new LegalIndic());
0921: //end
0922: }
0923:
0924: }
0925: showElapsed(start, "TestInterIndic");
0926: }
0927:
0928: //---------------
0929: // End Indic
0930: //---------------
0931:
0932: public static class Legal {
0933: public boolean is(String sourceString) {
0934: return true;
0935: }
0936: }
0937:
0938: public static class LegalJamo extends Legal {
0939: // any initial must be followed by a medial (or initial)
0940: // any medial must follow an initial (or medial)
0941: // any final must follow a medial (or final)
0942:
0943: public boolean is(String sourceString) {
0944: try {
0945: int t;
0946: String decomp = Normalizer.normalize(sourceString,
0947: Normalizer.NFD);
0948: for (int i = 0; i < decomp.length(); ++i) { // don't worry about surrogates
0949: switch (getType(decomp.charAt(i))) {
0950: case 0:
0951: t = getType(decomp.charAt(i + 1));
0952: if (t != 0 && t != 1)
0953: return false;
0954: break;
0955: case 1:
0956: t = getType(decomp.charAt(i - 1));
0957: if (t != 0 && t != 1)
0958: return false;
0959: break;
0960: case 2:
0961: t = getType(decomp.charAt(i - 1));
0962: if (t != 1 && t != 2)
0963: return false;
0964: break;
0965: }
0966: }
0967: return true;
0968: } catch (StringIndexOutOfBoundsException e) {
0969: return false;
0970: }
0971: }
0972:
0973: public int getType(char c) {
0974: if ('\u1100' <= c && c <= '\u1112')
0975: return 0;
0976: else if ('\u1161' <= c && c <= '\u1175')
0977: return 1;
0978: else if ('\u11A8' <= c && c <= '\u11C2')
0979: return 2;
0980: return -1; // other
0981: }
0982: }
0983:
0984: //static BreakIterator thaiBreak = BreakIterator.getWordInstance(new Locale("th", "TH"));
0985: // anything is legal except word ending with Logical-order-exception
0986: public static class LegalThai extends Legal {
0987: public boolean is(String sourceString) {
0988: if (sourceString.length() == 0)
0989: return true;
0990: char ch = sourceString.charAt(sourceString.length() - 1); // don't worry about surrogates.
0991: if (UCharacter.hasBinaryProperty(ch,
0992: UProperty.LOGICAL_ORDER_EXCEPTION))
0993: return false;
0994:
0995: // disallow anything with a wordbreak between
0996: /*
0997: if (UTF16.countCodePoint(sourceString) <= 1) return true;
0998: thaiBreak.setText(sourceString);
0999: for (int pos = thaiBreak.first(); pos != BreakIterator.DONE; pos = thaiBreak.next()) {
1000: if (pos > 0 && pos < sourceString.length()) {
1001: System.out.println("Skipping " + Utility.escape(sourceString));
1002: return false;
1003: }
1004: }
1005: */
1006: return true;
1007: }
1008: }
1009:
// Anything is legal except that a final-form letter must not be followed by a letter, and a non-final-form letter must be followed by one.
1011: public static class LegalHebrew extends Legal {
1012: static UnicodeSet FINAL = new UnicodeSet(
1013: "[\u05DA\u05DD\u05DF\u05E3\u05E5]");
1014: static UnicodeSet NON_FINAL = new UnicodeSet(
1015: "[\u05DB\u05DE\u05E0\u05E4\u05E6]");
1016: static UnicodeSet LETTER = new UnicodeSet("[:letter:]");
1017:
1018: public boolean is(String sourceString) {
1019: if (sourceString.length() == 0)
1020: return true;
1021: // don't worry about surrogates.
1022: for (int i = 0; i < sourceString.length(); ++i) {
1023: char ch = sourceString.charAt(i);
1024: char next = i + 1 == sourceString.length() ? '\u0000'
1025: : sourceString.charAt(i);
1026: if (FINAL.contains(ch)) {
1027: if (LETTER.contains(next))
1028: return false;
1029: } else if (NON_FINAL.contains(ch)) {
1030: if (!LETTER.contains(next))
1031: return false;
1032: }
1033: }
1034: return true;
1035: }
1036: }
1037:
1038: public static class LegalGreek extends Legal {
1039:
1040: boolean full;
1041:
1042: public LegalGreek(boolean full) {
1043: this .full = full;
1044: }
1045:
1046: static final char IOTA_SUBSCRIPT = '\u0345';
1047: static final UnicodeSet breathing = new UnicodeSet(
1048: "[\\u0313\\u0314']");
1049: static final UnicodeSet validSecondVowel = new UnicodeSet(
1050: "[\\u03C5\\u03B9\\u03A5\\u0399]");
1051:
1052: public static boolean isVowel(char c) {
1053: return "\u03B1\u03B5\u03B7\u03B9\u03BF\u03C5\u03C9\u0391\u0395\u0397\u0399\u039F\u03A5\u03A9"
1054: .indexOf(c) >= 0;
1055: }
1056:
1057: public static boolean isRho(char c) {
1058: return "\u03C1\u03A1".indexOf(c) >= 0;
1059: }
1060:
1061: public boolean is(String sourceString) {
1062: try {
1063: String decomp = Normalizer.normalize(sourceString,
1064: Normalizer.NFD);
1065:
1066: // modern is simpler: don't care about anything but a grave
1067: if (!full) {
1068: //if (sourceString.equals("\u039C\u03C0")) return false;
1069: for (int i = 0; i < decomp.length(); ++i) {
1070: char c = decomp.charAt(i);
1071: // exclude all the accents
1072: if (c == '\u0313' || c == '\u0314'
1073: || c == '\u0300' || c == '\u0302'
1074: || c == '\u0342' || c == '\u0345')
1075: return false;
1076: }
1077: return true;
1078: }
1079:
1080: // Legal full Greek has breathing marks IFF there is a vowel or RHO at the start
1081: // IF it has them, it has exactly one.
1082: // IF it starts with a RHO, then the breathing mark must come before the second letter.
1083: // IF it starts with a vowel, then it must before the third letter.
1084: // it will only come after the second if of the format [vowel] [no iota subscript!] [upsilon or iota]
1085: // Since there are no surrogates in greek, don't worry about them
1086:
1087: boolean firstIsVowel = false;
1088: boolean firstIsRho = false;
1089: boolean noLetterYet = true;
1090: int breathingCount = 0;
1091: int letterCount = 0;
1092: //int breathingPosition = -1;
1093:
1094: for (int i = 0; i < decomp.length(); ++i) {
1095: char c = decomp.charAt(i);
1096: if (UCharacter.isLetter(c)) {
1097: ++letterCount;
1098: if (firstIsVowel
1099: && !validSecondVowel.contains(c)
1100: && breathingCount == 0)
1101: return false;
1102: if (noLetterYet) {
1103: noLetterYet = false;
1104: firstIsVowel = isVowel(c);
1105: firstIsRho = isRho(c);
1106: }
1107: if (firstIsRho && letterCount == 2
1108: && breathingCount == 0)
1109: return false;
1110: }
1111: if (c == IOTA_SUBSCRIPT && firstIsVowel
1112: && breathingCount == 0)
1113: return false;
1114: if (breathing.contains(c)) {
1115: // breathingPosition = i;
1116: ++breathingCount;
1117: }
1118: }
1119:
1120: if (firstIsVowel || firstIsRho)
1121: return breathingCount == 1;
1122: return breathingCount == 0;
1123: } catch (Throwable t) {
1124: System.out.println(t.getClass().getName() + " "
1125: + t.getMessage());
1126: return true;
1127: }
1128: }
1129: }
1130:
1131: static class Test {
1132:
1133: PrintWriter out;
1134:
1135: private String transliteratorID;
1136: private int errorLimit = 500;
1137: private int errorCount = 0;
1138: private int pairLimit = 0x10000;
1139: private int density = 100;
1140: UnicodeSet sourceRange;
1141: UnicodeSet targetRange;
1142: UnicodeSet toSource;
1143: UnicodeSet toTarget;
1144: UnicodeSet roundtripExclusions;
1145:
1146: RoundTripTest log;
1147: Legal legalSource;
1148: UnicodeSet badCharacters;
1149:
1150: /*
1151: * create a test for the given script transliterator.
1152: */
1153: Test(String transliteratorID) {
1154: this (transliteratorID, 100);
1155: }
1156:
1157: Test(String transliteratorID, int dens) {
1158: this .transliteratorID = transliteratorID;
1159: this .density = dens;
1160: }
1161:
1162: public void setErrorLimit(int limit) {
1163: errorLimit = limit;
1164: }
1165:
1166: public void setPairLimit(int limit) {
1167: pairLimit = limit;
1168: }
1169:
1170: // Added to do better equality check.
1171:
1172: public static boolean isSame(String a, String b) {
1173: if (a.equals(b))
1174: return true;
1175: if (a.equalsIgnoreCase(b) && isCamel(a))
1176: return true;
1177: a = Normalizer.normalize(a, Normalizer.NFD);
1178: b = Normalizer.normalize(b, Normalizer.NFD);
1179: if (a.equals(b))
1180: return true;
1181: if (a.equalsIgnoreCase(b) && isCamel(a))
1182: return true;
1183: return false;
1184: }
1185:
1186: /*
1187: public boolean includesSome(UnicodeSet set, String a) {
1188: int cp;
1189: for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) {
1190: cp = UTF16.charAt(a, i);
1191: if (set.contains(cp)) return true;
1192: }
1193: return false;
1194: }
1195: */
1196:
1197: public static boolean isCamel(String a) {
1198: //System.out.println("CamelTest");
1199: // see if string is of the form aB; e.g. lower, then upper or title
1200: int cp;
1201: boolean haveLower = false;
1202: for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) {
1203: cp = UTF16.charAt(a, i);
1204: int t = UCharacter.getType(cp);
1205: //System.out.println("\t" + t + " " + Integer.toString(cp,16) + " " + UCharacter.getName(cp));
1206: switch (t) {
1207: case Character.UPPERCASE_LETTER:
1208: if (haveLower)
1209: return true;
1210: break;
1211: case Character.TITLECASE_LETTER:
1212: if (haveLower)
1213: return true;
1214: // drop through, since second letter is lower.
1215: case Character.LOWERCASE_LETTER:
1216: haveLower = true;
1217: break;
1218: }
1219: }
1220: //System.out.println("FALSE");
1221: return false;
1222: }
1223:
1224: static final UnicodeSet okAnyway = new UnicodeSet(
1225: "[^[:Letter:]]");
1226: static final UnicodeSet neverOk = new UnicodeSet("[:Other:]");
1227:
1228: public void test(String sourceRange, String targetRange,
1229: String roundtripExclusions, RoundTripTest log,
1230: Legal legalSource) throws java.io.IOException {
1231: test(sourceRange, targetRange, sourceRange,
1232: roundtripExclusions, log, legalSource);
1233: }
1234:
1235: /**
1236: * Will test
1237: * that everything in sourceRange maps to targetRange,
1238: * that everything in targetRange maps to backtoSourceRange
1239: * that everything roundtrips from target -> source -> target, except roundtripExceptions
1240: */
1241: public void test(String sourceRange, String targetRange,
1242: String backtoSourceRange, String roundtripExclusions,
1243: RoundTripTest log, Legal legalSource)
1244: throws java.io.IOException {
1245:
1246: this .legalSource = legalSource;
1247: this .sourceRange = new UnicodeSet(sourceRange);
1248: this .sourceRange.removeAll(neverOk);
1249:
1250: this .targetRange = new UnicodeSet(targetRange);
1251: this .targetRange.removeAll(neverOk);
1252:
1253: this .toSource = new UnicodeSet(backtoSourceRange);
1254: this .toSource.addAll(okAnyway);
1255:
1256: this .toTarget = new UnicodeSet(targetRange);
1257: this .toTarget.addAll(okAnyway);
1258:
1259: if (roundtripExclusions != null
1260: && roundtripExclusions.length() > 0) {
1261: this .roundtripExclusions = new UnicodeSet(
1262: roundtripExclusions);
1263: } else {
1264: this .roundtripExclusions = new UnicodeSet(); // empty
1265: }
1266:
1267: this .log = log;
1268:
1269: log.logln(Utility.escape("Source: " + this .sourceRange));
1270: log.logln(Utility.escape("Target: " + this .targetRange));
1271: log.logln(Utility.escape("Exclude: "
1272: + this .roundtripExclusions));
1273: if (log.isQuick())
1274: log.logln("Abbreviated Test");
1275:
1276: badCharacters = new UnicodeSet("[:other:]");
1277:
1278: // make a UTF-8 output file we can read with a browser
1279:
1280: // note: check that every transliterator transliterates the null string correctly!
1281:
1282: // {dlf} reorganize so can run test in protected security environment
1283: // String logFileName = "test_" + transliteratorID.replace('/', '_') + ".html";
1284:
1285: // File lf = new File(logFileName);
1286: // log.logln("Creating log file " + lf.getAbsoluteFile());
1287:
1288: // out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
1289: // new FileOutputStream(logFileName), "UTF8"), 4*1024));
1290:
1291: ByteArrayOutputStream bast = new ByteArrayOutputStream();
1292: out = new PrintWriter(new BufferedWriter(
1293: new OutputStreamWriter(bast, "UTF8"), 4 * 1024));
1294: //out.write('\uFFEF'); // BOM
1295: out
1296: .println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
1297: out.println("<HTML><HEAD>");
1298: out
1299: .println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
1300: out
1301: .println("<BODY bgcolor='#FFFFFF' style='font-family: Arial Unicode MS'>");
1302:
1303: try {
1304: test2();
1305: } catch (TestTruncated e) {
1306: out.println(e.getMessage());
1307: }
1308: out.println("</BODY></HTML>");
1309: out.close();
1310:
1311: if (errorCount > 0) {
1312: try {
1313: String logFileName = "test_"
1314: + transliteratorID.replace('/', '_')
1315: + ".html";
1316: File lf = new File(logFileName);
1317: log.logln("Creating log file "
1318: + lf.getAbsoluteFile());
1319: FileOutputStream fos = new FileOutputStream(lf);
1320: fos.write(bast.toByteArray());
1321: fos.close();
1322: log.errln(transliteratorID
1323: + " errors: "
1324: + errorCount
1325: + (errorCount > errorLimit ? " (at least!)"
1326: : "") + ", see "
1327: + lf.getAbsoluteFile());
1328: } catch (SecurityException e) {
1329: log
1330: .errln(transliteratorID
1331: + " errors: "
1332: + errorCount
1333: + (errorCount > errorLimit ? " (at least!)"
1334: : "")
1335: + ", no log provided due to protected test domain");
1336: }
1337: } else {
1338: log.logln(transliteratorID + " ok");
1339: // new File(logFileName).delete();
1340: }
1341: }
1342:
1343: // ok if at least one is not equal
1344: public boolean checkIrrelevants(Transliterator t,
1345: String irrelevants) {
1346: for (int i = 0; i < irrelevants.length(); ++i) {
1347: char c = irrelevants.charAt(i);
1348: String cs = UTF16.valueOf(c);
1349: String targ = t.transliterate(cs);
1350: if (cs.equals(targ))
1351: return true;
1352: }
1353: return false;
1354: }
1355:
1356: public void test2() {
1357:
1358: Transliterator sourceToTarget = Transliterator
1359: .getInstance(transliteratorID);
1360: Transliterator targetToSource = sourceToTarget.getInverse();
1361: AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
1362: AbbreviatedUnicodeSetIterator usi2 = new AbbreviatedUnicodeSetIterator();
1363:
1364: log
1365: .logln("Checking that at least one irrevant characters is not NFC'ed");
1366: out
1367: .println("<h3>Checking that at least one irrevant characters is not NFC'ed</h3>");
1368:
1369: String irrelevants = "\u2000\u2001\u2126\u212A\u212B\u2329"; // string is from NFC_NO in the UCD
1370:
1371: if (!checkIrrelevants(sourceToTarget, irrelevants)) {
1372: logFails("Source-Target, Must not NFC everything");
1373: }
1374: if (!checkIrrelevants(targetToSource, irrelevants)) {
1375: logFails("Target-Source, irrelevants");
1376: }
1377:
1378: if (EXTRA_TESTS) {
1379: log.logln("Checking that toRules works");
1380: String rules = "";
1381: Transliterator sourceToTarget2;
1382: Transliterator targetToSource2;
1383: try {
1384: rules = sourceToTarget.toRules(false);
1385: sourceToTarget2 = Transliterator.createFromRules(
1386: "s2t2", rules, Transliterator.FORWARD);
1387: if (PRINT_RULES) {
1388: out.println("<h3>Forward Rules:</h3><p>");
1389: out.println(TestUtility.replace(rules, "\n",
1390: "<br>\n"));
1391: out.println("</p>");
1392: }
1393: rules = targetToSource.toRules(false);
1394: targetToSource2 = Transliterator.createFromRules(
1395: "t2s2", rules, Transliterator.FORWARD);
1396: if (PRINT_RULES) {
1397: out.println("<h3>Backward Rules:</h3><p>");
1398: out.println(TestUtility.replace(rules, "\n",
1399: "<br>\n"));
1400: out.println("</p>");
1401: }
1402: } catch (RuntimeException e) {
1403: out.println("<h3>Broken Rules:</h3><p>");
1404: out.println(TestUtility.replace(rules, "\n",
1405: "<br>\n"));
1406: out.println("</p>");
1407: out.flush();
1408: throw e;
1409: }
1410:
1411: out
1412: .println("<h3>Roundtrip Exclusions: "
1413: + new UnicodeSet(roundtripExclusions)
1414: + "</h3>");
1415: out.flush();
1416:
1417: log.logln("Checking that source -> target -> source");
1418: out
1419: .println("<h3>Checking that source -> target -> source</h3>");
1420:
1421: usi.reset(sourceRange);
1422: while (usi.next()) {
1423: int c = usi.codepoint;
1424:
1425: String cs = UTF16.valueOf(c);
1426: String targ = sourceToTarget.transliterate(cs);
1427: String targ2 = sourceToTarget2.transliterate(cs);
1428: if (!targ.equals(targ2)) {
1429: logToRulesFails("Source-Target, toRules", cs,
1430: targ, targ2);
1431: }
1432: }
1433:
1434: log.logln("Checking that target -> source -> target");
1435: out
1436: .println("<h3>Checking that target -> source -> target</h3>");
1437: usi.reset(targetRange);
1438: while (usi.next()) {
1439: int c = usi.codepoint;
1440:
1441: String cs = UTF16.valueOf(c);
1442: String targ = targetToSource.transliterate(cs);
1443: String targ2 = targetToSource2.transliterate(cs);
1444: if (!targ.equals(targ2)) {
1445: logToRulesFails("Target-Source, toRules", cs,
1446: targ, targ2);
1447: }
1448: }
1449: }
1450:
1451: log
1452: .logln("Checking that source characters convert to target - Singles");
1453: out
1454: .println("<h3>Checking that source characters convert to target - Singles</h3>");
1455:
1456: UnicodeSet failSourceTarg = new UnicodeSet();
1457:
1458: /*
1459: for (char c = 0; c < 0xFFFF; ++c) {
1460: if (!sourceRange.contains(c)) continue;
1461: */
1462: usi.reset(sourceRange);
1463: while (usi.next()) {
1464: int c = usi.codepoint;
1465:
1466: String cs = UTF16.valueOf(c);
1467: String targ = sourceToTarget.transliterate(cs);
1468: if (!toTarget.containsAll(targ)
1469: || badCharacters.containsSome(targ)) {
1470: String targD = Normalizer.normalize(targ,
1471: Normalizer.NFD);
1472: if (!toTarget.containsAll(targD)
1473: || badCharacters.containsSome(targD)) {
1474: logWrongScript("Source-Target", cs, targ,
1475: toTarget, badCharacters);
1476: failSourceTarg.add(c);
1477: continue;
1478: }
1479: }
1480:
1481: String cs2 = Normalizer.normalize(cs, Normalizer.NFD);
1482: String targ2 = sourceToTarget.transliterate(cs2);
1483: if (!targ.equals(targ2)) {
1484: logNotCanonical("Source-Target", cs, targ, cs2,
1485: targ2);
1486: }
1487: }
1488:
1489: log
1490: .logln("Checking that source characters convert to target - Doubles");
1491: out
1492: .println("<h3>Checking that source characters convert to target - Doubles</h3>");
1493:
1494: /*
1495: for (char c = 0; c < 0xFFFF; ++c) {
1496: if (TestUtility.isUnassigned(c) ||
1497: !sourceRange.contains(c)) continue;
1498: if (failSourceTarg.get(c)) continue;
1499:
1500: */
1501:
1502: UnicodeSet sourceRangeMinusFailures = new UnicodeSet(
1503: sourceRange);
1504: sourceRangeMinusFailures.removeAll(failSourceTarg);
1505:
1506: boolean quickRt = log.getInclusion() < 10;
1507:
1508: usi.reset(sourceRangeMinusFailures, quickRt, density);
1509:
1510: while (usi.next()) {
1511: int c = usi.codepoint;
1512:
1513: /*
1514: for (char d = 0; d < 0xFFFF; ++d) {
1515: if (TestUtility.isUnassigned(d) ||
1516: !sourceRange.contains(d)) continue;
1517: if (failSourceTarg.get(d)) continue;
1518: */
1519: usi2.reset(sourceRangeMinusFailures, quickRt, density);
1520:
1521: while (usi2.next()) {
1522: int d = usi2.codepoint;
1523:
1524: String cs = UTF16.valueOf(c) + UTF16.valueOf(d);
1525: String targ = sourceToTarget.transliterate(cs);
1526: if (!toTarget.containsAll(targ)
1527: || badCharacters.containsSome(targ)) {
1528: String targD = Normalizer.normalize(targ,
1529: Normalizer.NFD);
1530: if (!toTarget.containsAll(targD)
1531: || badCharacters.containsSome(targD)) {
1532: logWrongScript("Source-Target", cs, targ,
1533: toTarget, badCharacters);
1534: continue;
1535: }
1536: }
1537: String cs2 = Normalizer.normalize(cs,
1538: Normalizer.NFD);
1539: String targ2 = sourceToTarget.transliterate(cs2);
1540: if (!targ.equals(targ2)) {
1541: logNotCanonical("Source-Target", cs, targ, cs2,
1542: targ2);
1543: }
1544: }
1545: }
1546:
1547: log
1548: .logln("Checking that target characters convert to source and back - Singles");
1549: out
1550: .println("<h3>Checking that target characters convert to source and back - Singles</h3>");
1551:
1552: UnicodeSet failTargSource = new UnicodeSet();
1553: UnicodeSet failRound = new UnicodeSet();
1554:
1555: /*for (char c = 0; c < 0xFFFF; ++c) {
1556: if (TestUtility.isUnassigned(c) ||
1557: !targetRange.contains(c)) continue;
1558: */
1559:
1560: usi.reset(targetRange);
1561: while (usi.next()) {
1562: String cs;
1563: int c;
1564: if (usi.codepoint == UnicodeSetIterator.IS_STRING) {
1565: cs = usi.string;
1566: c = UTF16.charAt(cs, 0);
1567: } else {
1568: c = usi.codepoint;
1569: cs = UTF16.valueOf(c);
1570: }
1571:
1572: String targ = targetToSource.transliterate(cs);
1573: String reverse = sourceToTarget.transliterate(targ);
1574:
1575: if (!toSource.containsAll(targ)
1576: || badCharacters.containsSome(targ)) {
1577: String targD = Normalizer.normalize(targ,
1578: Normalizer.NFD);
1579: if (!toSource.containsAll(targD)
1580: || badCharacters.containsSome(targD)) {
1581: /*UnicodeSet temp = */new UnicodeSet()
1582: .addAll(targD);
1583: logWrongScript("Target-Source", cs, targ,
1584: toSource, badCharacters);
1585: failTargSource.add(cs);
1586: continue;
1587: }
1588: }
1589: if (!isSame(cs, reverse)
1590: && !roundtripExclusions.contains(c)
1591: && !roundtripExclusions.contains(cs)) {
1592: logRoundTripFailure(cs, targetToSource.getID(),
1593: targ, sourceToTarget.getID(), reverse);
1594: failRound.add(c);
1595: continue;
1596: }
1597: String targ2 = Normalizer.normalize(targ,
1598: Normalizer.NFD);
1599: String reverse2 = sourceToTarget.transliterate(targ2);
1600: if (!reverse.equals(reverse2)) {
1601: logNotCanonical("Target-Source", targ, reverse,
1602: targ2, reverse2);
1603: }
1604: }
1605:
1606: log
1607: .logln("Checking that target characters convert to source and back - Doubles");
1608: out
1609: .println("<h3>Checking that target characters convert to source and back - Doubles</h3>");
1610: int count = 0;
1611:
1612: UnicodeSet targetRangeMinusFailures = new UnicodeSet(
1613: targetRange);
1614: targetRangeMinusFailures.removeAll(failTargSource);
1615: targetRangeMinusFailures.removeAll(failRound);
1616:
1617: //char[] buf = new char[4]; // maximum we can have with 2 code points
1618: /*
1619: for (char c = 0; c < 0xFFFF; ++c) {
1620: if (TestUtility.isUnassigned(c) ||
1621: !targetRange.contains(c)) continue;
1622: */
1623:
1624: usi.reset(targetRangeMinusFailures, quickRt, density);
1625:
1626: while (usi.next()) {
1627: int c = usi.codepoint;
1628:
1629: if (++count > pairLimit) {
1630: throw new TestTruncated("Test truncated at "
1631: + pairLimit + " x 64k pairs");
1632: }
1633: //log.log(TestUtility.hex(c));
1634:
1635: /*
1636: for (char d = 0; d < 0xFFFF; ++d) {
1637: if (TestUtility.isUnassigned(d) ||
1638: !targetRange.contains(d)) continue;
1639: */
1640: usi2.reset(targetRangeMinusFailures, quickRt, density);
1641:
1642: while (usi2.next()) {
1643: int d = usi2.codepoint;
1644: if (d < 0)
1645: break;
1646:
1647: String cs = UTF16.valueOf(c) + UTF16.valueOf(d);
1648: String targ = targetToSource.transliterate(cs);
1649: String reverse = sourceToTarget.transliterate(targ);
1650:
1651: if (!toSource.containsAll(targ) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/
1652: || badCharacters.containsSome(targ)) {
1653: String targD = Normalizer.normalize(targ,
1654: Normalizer.NFD);
1655: if (!toSource.containsAll(targD) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/
1656: || badCharacters.containsSome(targD)) {
1657: logWrongScript("Target-Source", cs, targ,
1658: toSource, badCharacters);
1659: continue;
1660: }
1661: }
1662: if (!isSame(cs, reverse) /*&& !failRound.contains(c) && !failRound.contains(d)*/
1663: && !roundtripExclusions.contains(c)
1664: && !roundtripExclusions.contains(d)
1665: && !roundtripExclusions.contains(cs)) {
1666: logRoundTripFailure(cs, targetToSource.getID(),
1667: targ, sourceToTarget.getID(), reverse);
1668: continue;
1669: }
1670: String targ2 = Normalizer.normalize(targ,
1671: Normalizer.NFD);
1672: String reverse2 = sourceToTarget
1673: .transliterate(targ2);
1674: if (!reverse.equals(reverse2)) {
1675: logNotCanonical("Target-Source", targ, reverse,
1676: targ2, reverse2);
1677: }
1678: }
1679: }
1680: log.logln("");
1681: }
1682:
1683: final String info(String s) {
1684: StringBuffer result = new StringBuffer();
1685: result.append("\u200E").append(s).append("\u200E (")
1686: .append(TestUtility.hex(s)).append("/");
1687: if (false) { // append age, as a check
1688: int cp = 0;
1689: for (int i = 0; i < s.length(); i += UTF16
1690: .getCharCount(cp)) {
1691: cp = UTF16.charAt(s, i);
1692: if (i > 0)
1693: result.append(", ");
1694: result.append(UCharacter.getAge(cp));
1695: }
1696: }
1697: result.append(")");
1698: return result.toString();
1699: }
1700:
1701: final void logWrongScript(String label, String from, String to,
1702: UnicodeSet shouldContainAll,
1703: UnicodeSet shouldNotContainAny) {
1704: if (++errorCount > errorLimit) {
1705: throw new TestTruncated(
1706: "Test truncated; too many failures");
1707: }
1708: String toD = Normalizer.normalize(to, Normalizer.NFD);
1709: UnicodeSet temp = new UnicodeSet().addAll(toD);
1710: UnicodeSet bad = new UnicodeSet(shouldNotContainAny)
1711: .retainAll(temp).addAll(
1712: new UnicodeSet(temp)
1713: .removeAll(shouldContainAll));
1714:
1715: out.println("<br>Fail " + label + ": " + info(from)
1716: + " => " + info(to) + " " + bad);
1717: }
1718:
1719: final void logNotCanonical(String label, String from,
1720: String to, String fromCan, String toCan) {
1721: if (++errorCount > errorLimit) {
1722: throw new TestTruncated(
1723: "Test truncated; too many failures");
1724: }
1725: out.println("<br>Fail (can.equiv) " + label + ": "
1726: + info(from) + " => " + info(to) + " -- "
1727: + info(fromCan) + " => " + info(toCan) + ")");
1728: }
1729:
1730: final void logFails(String label) {
1731: if (++errorCount > errorLimit) {
1732: throw new TestTruncated(
1733: "Test truncated; too many failures");
1734: }
1735: out.println("<br>Fail (can.equiv)" + label);
1736: }
1737:
1738: final void logToRulesFails(String label, String from,
1739: String to, String toCan) {
1740: if (++errorCount > errorLimit) {
1741: throw new TestTruncated(
1742: "Test truncated; too many failures");
1743: }
1744: out.println("<br>Fail " + label + ": " + info(from)
1745: + " => " + info(to) + ", " + info(toCan));
1746: }
1747:
1748: final void logRoundTripFailure(String from, String toID,
1749: String to, String backID, String back) {
1750: if (!legalSource.is(from))
1751: return; // skip illegals
1752:
1753: if (++errorCount > errorLimit) {
1754: throw new TestTruncated(
1755: "Test truncated; too many failures");
1756: }
1757: out.println("<br>Fail Roundtrip: " + info(from) + " "
1758: + toID + " => " + info(to) + " " + backID + " => "
1759: + info(back));
1760: }
1761:
1762: /*
1763: * Characters to filter for source-target mapping completeness
1764: * Typically is base alphabet, minus extended characters
1765: * Default is ASCII letters for Latin
1766: */
1767: /*
1768: public boolean isSource(char c) {
1769: if (!sourceRange.contains(c)) return false;
1770: return true;
1771: }
1772: */
1773:
1774: /*
1775: * Characters to check for target back to source mapping.
1776: * Typically the same as the target script, plus punctuation
1777: */
1778: /*
1779: public boolean isReceivingSource(char c) {
1780: if (!targetRange.contains(c)) return false;
1781: return true;
1782: }
1783: */
1784: /*
1785: * Characters to filter for target-source mapping
1786: * Typically is base alphabet, minus extended characters
1787: */
1788: /*
1789: public boolean isTarget(char c) {
1790: byte script = TestUtility.getScript(c);
1791: if (script != targetScript) return false;
1792: if (!TestUtility.isLetter(c)) return false;
1793: if (targetRange != null && !targetRange.contains(c)) return false;
1794: return true;
1795: }
1796: */
1797:
1798: /*
1799: * Characters to check for target-source mapping
1800: * Typically the same as the source script, plus punctuation
1801: */
1802: /*
1803: public boolean isReceivingTarget(char c) {
1804: byte script = TestUtility.getScript(c);
1805: return (script == targetScript || script == TestUtility.COMMON_SCRIPT);
1806: }
1807:
1808: final boolean isSource(String s) {
1809: for (int i = 0; i < s.length(); ++i) {
1810: if (!isSource(s.charAt(i))) return false;
1811: }
1812: return true;
1813: }
1814:
1815: final boolean isTarget(String s) {
1816: for (int i = 0; i < s.length(); ++i) {
1817: if (!isTarget(s.charAt(i))) return false;
1818: }
1819: return true;
1820: }
1821:
1822: final boolean isReceivingSource(String s) {
1823: for (int i = 0; i < s.length(); ++i) {
1824: if (!isReceivingSource(s.charAt(i))) return false;
1825: }
1826: return true;
1827: }
1828:
1829: final boolean isReceivingTarget(String s) {
1830: for (int i = 0; i < s.length(); ++i) {
1831: if (!isReceivingTarget(s.charAt(i))) return false;
1832: }
1833: return true;
1834: }
1835: */
1836:
1837: static class TestTruncated extends RuntimeException {
1838: TestTruncated(String msg) {
1839: super (msg);
1840: }
1841: }
1842: }
1843:
1844: // static class TestHangul extends Test {
1845: // TestHangul () {
1846: // super("Jamo-Hangul", TestUtility.JAMO_SCRIPT, TestUtility.HANGUL_SCRIPT);
1847: // }
1848: //
1849: // public boolean isSource(char c) {
1850: // if (0x1113 <= c && c <= 0x1160) return false;
1851: // if (0x1176 <= c && c <= 0x11F9) return false;
1852: // if (0x3131 <= c && c <= 0x318E) return false;
1853: // return super.isSource(c);
1854: // }
1855: // }
1856: }
|