001: //##header
002: /*
003: *******************************************************************************
004: * Copyright (C) 2002-2006, International Business Machines Corporation and *
005: * others. All Rights Reserved. *
006: *******************************************************************************
007: */
008: package com.ibm.icu.dev.test.util;
009:
010: import java.io.BufferedReader;
011: import java.io.IOException;
012:
013: import com.ibm.icu.text.Transliterator;
014:
015: import com.ibm.icu.dev.test.TestUtil;
016:
017: public class TransliteratorUtilities {
018: public static boolean DEBUG = false;
019:
020: public static void registerTransliteratorFromFile(String dir,
021: String id) {
022: try {
023: String filename = id.replace('-', '_') + ".txt";
024: String rules = getFileContents(dir, filename);
025: Transliterator t;
026: int pos = id.indexOf('-');
027: String rid;
028: if (pos < 0) {
029: rid = id + "-Any";
030: id = "Any-" + id;
031: } else {
032: rid = id.substring(pos + 1) + "-"
033: + id.substring(0, pos);
034: }
035: t = Transliterator.createFromRules(id, rules,
036: Transliterator.FORWARD);
037: Transliterator.unregister(id);
038: Transliterator.registerInstance(t);
039:
040: /*String test = "\u049A\u0430\u0437\u0430\u049B";
041: System.out.println(t.transliterate(test));
042: t = Transliterator.getInstance(id);
043: System.out.println(t.transliterate(test));
044: */
045:
046: t = Transliterator.createFromRules(rid, rules,
047: Transliterator.REVERSE);
048: Transliterator.unregister(rid);
049: Transliterator.registerInstance(t);
050: if (DEBUG)
051: System.out.println("Registered new Transliterator: "
052: + id + ", " + rid);
053: } catch (IOException e) {
054: //#ifndef FOUNDATION
055: throw (IllegalArgumentException) new IllegalArgumentException(
056: "Can't open " + dir + ", " + id).initCause(e);
057: //#else
058: //## throw (IllegalArgumentException) new IllegalArgumentException("Can't open " + dir + ", " + id+" "+ e.getMessage());
059: //#endif
060: }
061: }
062:
063: /**
064: *
065: */
066: public static String getFileContents(String dir, String filename)
067: throws IOException {
068: //#ifndef FOUNDATION
069: BufferedReader br = BagFormatter.openUTF8Reader(dir, filename);
070: //#else
071: //## BufferedReader br = TestUtil.openUTF8Reader(dir, filename);
072: //#endif
073: StringBuffer buffer = new StringBuffer();
074: while (true) {
075: String line = br.readLine();
076: if (line == null)
077: break;
078: if (line.length() > 0 && line.charAt(0) == '\uFEFF')
079: line = line.substring(1);
080: buffer.append(line).append("\r\n");
081: }
082: br.close();
083: return buffer.toString();
084:
085: }
086:
087: private static final String BASE_RULES = ":: (hex-any/xml);"
088: + ":: (hex-any/xml10);" + "'<' > '<' ;"
089: + "'<' < '&'[lL][Tt]';' ;" + "'&' > '&' ;"
090: + "'&' < '&'[aA][mM][pP]';' ;" + "'>' < '&'[gG][tT]';' ;"
091: + "'\"' < '&'[qQ][uU][oO][tT]';' ; "
092: + "'' < '&'[aA][pP][oO][sS]';' ; ";
093:
094: private static final String CONTENT_RULES = "'>' > '>' ;";
095:
096: private static final String HTML_RULES = BASE_RULES + CONTENT_RULES
097: + "'\"' > '"' ; ";
098:
099: private static final String HTML_RULES_CONTROLS = HTML_RULES
100: + ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; ";
101:
102: private static final String HTML_RULES_ASCII = HTML_RULES
103: + ":: [[:C:][:^ASCII:]] any-hex/xml ; ";
104:
105: private static final String XML_RULES = HTML_RULES
106: + "'' > ''' ; ";
107:
108: /*
109: The ampersand character (&) and the left angle bracket (<) MUST NOT appear
110:
111: in their literal form, except when used as markup delimiters, or within a
112:
113: comment, a processing instruction, or a CDATA section. If they are needed
114:
115: elsewhere, they MUST be escaped using either numeric character references or
116:
117: the strings "&" and "<" respectively. The right angle bracket (>) MAY
118:
119: be represented using the string ">", and MUST, for compatibility, be
120:
121: escaped using either ">" or a character reference when it appears in the string
122:
123: "]]>" in content, when that string is not marking the end of a CDATA section.
124:
125: In the content of elements, character data is any string of characters which does
126:
127: not contain the start-delimiter of any markup and does not include the
128:
129: CDATA-section-close delimiter, "]]>". In a CDATA section, character data is
130:
131: any string of characters not including the CDATA-section-close delimiter,
132:
133: "]]>".
134:
135: To allow attribute values to contain both single and double quotes, the
136:
137: apostrophe or single-quote character (') MAY be represented as "'", and
138:
139: the double-quote character (") as """.
140:
141:
142: */
143:
144: public static final Transliterator toXML = Transliterator
145: .createFromRules("any-xml", XML_RULES,
146: Transliterator.FORWARD);
147: public static final Transliterator fromXML = Transliterator
148: .createFromRules("xml-any", XML_RULES,
149: Transliterator.REVERSE);
150: public static final Transliterator toHTML = Transliterator
151: .createFromRules("any-html", HTML_RULES,
152: Transliterator.FORWARD);
153: public static final Transliterator toHTMLControl = Transliterator
154: .createFromRules("any-html", HTML_RULES_CONTROLS,
155: Transliterator.FORWARD);
156: public static final Transliterator toHTMLAscii = Transliterator
157: .createFromRules("any-html", HTML_RULES_ASCII,
158: Transliterator.FORWARD);
159: public static final Transliterator fromHTML = Transliterator
160: .createFromRules("html-any", HTML_RULES,
161: Transliterator.REVERSE);
162: }
|