001: /*
002: * Copyright 2004-2008 H2 Group. Licensed under the H2 License, Version 1.0
003: * (license2)
004: * Initial Developer: H2 Group
005: */
006: package org.h2.tools.indexer;
007:
008: import java.util.HashMap;
009:
/**
 * This class replaces HTML entities in text (for example {@code &uuml;}) with
 * the corresponding character and vice versa.
 * <p>
 * The lookup tables are filled once in the static initializer and are only
 * read afterwards. The code deliberately stays at the language level used by
 * the rest of this file (raw collections, StringBuffer).
 */
public class HtmlConverter {

    /** Largest valid Unicode code point. */
    private static final int MAX_CODE_POINT = 0x10ffff;

    /** Maps an entity name (String, for example "amp") to its Character. */
    private static final HashMap charMap = new HashMap();

    /** Maps a Character to its entity name (String); the inverse of charMap. */
    private static final HashMap codeMap = new HashMap();

    /** The known named entities, each in the form "name:decimalCharCode". */
    private static final String[] CHARS = { "quot:34", "amp:38",
            "lt:60", "gt:62", "nbsp:160", "iexcl:161", "cent:162",
            "pound:163", "curren:164", "yen:165", "brvbar:166",
            "sect:167", "uml:168", "copy:169", "ordf:170", "laquo:171",
            "not:172", "shy:173", "reg:174", "macr:175", "deg:176",
            "plusmn:177", "sup2:178", "sup3:179", "acute:180",
            "micro:181", "para:182", "middot:183", "cedil:184",
            "sup1:185", "ordm:186", "raquo:187", "frac14:188",
            "frac12:189", "frac34:190", "iquest:191", "Agrave:192",
            "Aacute:193", "Acirc:194", "Atilde:195", "Auml:196",
            "Aring:197", "AElig:198", "Ccedil:199", "Egrave:200",
            "Eacute:201", "Ecirc:202", "Euml:203", "Igrave:204",
            "Iacute:205", "Icirc:206", "Iuml:207", "ETH:208",
            "Ntilde:209", "Ograve:210", "Oacute:211", "Ocirc:212",
            "Otilde:213", "Ouml:214", "times:215", "Oslash:216",
            "Ugrave:217", "Uacute:218", "Ucirc:219", "Uuml:220",
            "Yacute:221", "THORN:222", "szlig:223", "agrave:224",
            "aacute:225", "acirc:226", "atilde:227", "auml:228",
            "aring:229", "aelig:230", "ccedil:231", "egrave:232",
            "eacute:233", "ecirc:234", "euml:235", "igrave:236",
            "iacute:237", "icirc:238", "iuml:239", "eth:240",
            "ntilde:241", "ograve:242", "oacute:243", "ocirc:244",
            "otilde:245", "ouml:246", "divide:247", "oslash:248",
            "ugrave:249", "uacute:250", "ucirc:251", "uuml:252",
            "yacute:253", "thorn:254", "yuml:255", "Alpha:913",
            "alpha:945", "Beta:914", "beta:946", "Gamma:915",
            "gamma:947", "Delta:916", "delta:948", "Epsilon:917",
            "epsilon:949", "Zeta:918", "zeta:950", "Eta:919",
            "eta:951", "Theta:920", "theta:952", "Iota:921",
            "iota:953", "Kappa:922", "kappa:954", "Lambda:923",
            "lambda:955", "Mu:924", "mu:956", "Nu:925", "nu:957",
            "Xi:926", "xi:958", "Omicron:927", "omicron:959", "Pi:928",
            "pi:960", "Rho:929", "rho:961", "Sigma:931", "sigmaf:962",
            "sigma:963", "Tau:932", "tau:964", "Upsilon:933",
            "upsilon:965", "Phi:934", "phi:966", "Chi:935", "chi:967",
            "Psi:936", "psi:968", "Omega:937", "omega:969",
            "thetasym:977", "upsih:978", "piv:982", "forall:8704",
            "part:8706", "exist:8707", "empty:8709", "nabla:8711",
            "isin:8712", "notin:8713", "ni:8715", "prod:8719",
            "sum:8721", "minus:8722", "lowast:8727", "radic:8730",
            "prop:8733", "infin:8734", "ang:8736", "and:8743",
            "or:8744", "cap:8745", "cup:8746", "int:8747",
            "there4:8756", "sim:8764", "cong:8773", "asymp:8776",
            "ne:8800", "equiv:8801", "le:8804", "ge:8805", "sub:8834",
            "sup:8835", "nsub:8836", "sube:8838", "supe:8839",
            "oplus:8853", "otimes:8855", "perp:8869", "sdot:8901",
            "loz:9674", "lceil:8968", "rceil:8969", "lfloor:8970",
            "rfloor:8971", "lang:9001", "rang:9002", "larr:8592",
            "uarr:8593", "rarr:8594", "darr:8595", "harr:8596",
            "crarr:8629", "lArr:8656", "uArr:8657", "rArr:8658",
            "dArr:8659", "hArr:8660", "bull:8226", "prime:8242",
            "oline:8254", "frasl:8260", "weierp:8472", "image:8465",
            "real:8476", "trade:8482", "euro:8364", "alefsym:8501",
            "spades:9824", "clubs:9827", "hearts:9829", "diams:9830",
            "ensp:8194", "emsp:8195", "thinsp:8201", "zwnj:8204",
            "zwj:8205", "lrm:8206", "rlm:8207", "ndash:8211",
            "mdash:8212", "lsquo:8216", "rsquo:8217", "sbquo:8218",
            "ldquo:8220", "rdquo:8221", "bdquo:8222", "dagger:8224",
            "Dagger:8225", "hellip:8230", "permil:8240", "lsaquo:8249",
            "rsaquo:8250" };

    static {
        // Split every "name:code" token and register it in both directions.
        for (int i = 0; i < CHARS.length; i++) {
            String token = CHARS[i];
            int idx = token.indexOf(':');
            String key = token.substring(0, idx);
            int ch = Integer.parseInt(token.substring(idx + 1));
            Character character = new Character((char) ch);
            charMap.put(key, character);
            codeMap.put(character, key);
        }
    }

    /**
     * Convert plain text to HTML by escaping characters.
     * <p>
     * Characters that have a named entity are written as {@code &name;}.
     * Other characters below 128 are copied unchanged. Everything else is
     * written as a decimal numeric character reference {@code &#n;}. A valid
     * surrogate pair is written as one reference to the code point it
     * encodes, because references to surrogate code points are not valid
     * HTML; an unpaired surrogate is written using its char value.
     *
     * @param s the text to convert (may be null)
     * @return the escaped text; null if the input is null, and the same
     *         object if the input is empty
     */
    public static String convertStringToHtml(String s) {
        if (s == null) {
            return null;
        }
        int length = s.length();
        if (length == 0) {
            return s;
        }
        StringBuffer buff = new StringBuffer(length);
        for (int i = 0; i < length; i++) {
            char ch = s.charAt(i);
            String token = (String) codeMap.get(new Character(ch));
            if (token != null) {
                buff.append('&');
                buff.append(token);
                buff.append(';');
            } else if (ch < 128) {
                buff.append(ch);
            } else {
                int codePoint = ch;
                if (isHighSurrogate(ch) && i + 1 < length
                        && isLowSurrogate(s.charAt(i + 1))) {
                    // combine the pair and consume the low surrogate as well
                    codePoint = 0x10000 + ((ch - 0xd800) << 10)
                            + (s.charAt(i + 1) - 0xdc00);
                    i++;
                }
                buff.append('&');
                buff.append('#');
                buff.append(codePoint);
                buff.append(';');
            }
        }
        return buff.toString();
    }

    /**
     * Convert HTML text to plain text by replacing entities.
     * <p>
     * Supported are the named entities known to this class, decimal
     * references ({@code &#65;}) and hexadecimal references
     * ({@code &#x41;}) up to U+10FFFF. An entity that is unknown or
     * malformed is replaced by {@code ???key???}, where key is the text
     * between the ampersand and the semicolon; the entity is then skipped so
     * its text is not emitted a second time. An ampersand that is not
     * followed by any semicolon is replaced by {@code ???} and the text
     * after it is copied as is.
     *
     * @param html the HTML text (may be null)
     * @return the decoded text; null if the input is null, and the same
     *         object if the input contains no ampersand
     */
    public static String convertHtmlToString(String html) {
        if (html == null) {
            return null;
        }
        // Also covers the empty string: without '&' there is nothing to do.
        if (html.indexOf('&') < 0) {
            return html;
        }
        int length = html.length();
        StringBuffer buff = new StringBuffer(length);
        for (int i = 0; i < length; i++) {
            char ch = html.charAt(i);
            if (ch != '&') {
                buff.append(ch);
                continue;
            }
            int end = html.indexOf(';', i + 1);
            if (end < 0) {
                // unterminated entity: mark it and keep copying the rest
                buff.append("???");
                continue;
            }
            String key = html.substring(i + 1, end);
            int codePoint = resolveEntity(key);
            if (codePoint < 0) {
                buff.append("???");
                buff.append(key);
                buff.append("???");
            } else {
                appendCodePoint(buff, codePoint);
            }
            // in both cases continue after the terminating ';'
            i = end;
        }
        return buff.toString();
    }

    /**
     * Resolve the text between '&' and ';' to a code point.
     *
     * @param key the entity body, for example "amp", "#65" or "#x41"
     * @return the code point, or -1 if the entity is unknown or malformed
     */
    private static int resolveEntity(String key) {
        if (!key.startsWith("#")) {
            Character named = (Character) charMap.get(key);
            if (named == null) {
                return -1;
            }
            return named.charValue();
        }
        int code;
        try {
            if (key.length() > 1
                    && (key.charAt(1) == 'x' || key.charAt(1) == 'X')) {
                code = Integer.parseInt(key.substring(2), 16);
            } else {
                code = Integer.parseInt(key.substring(1));
            }
        } catch (NumberFormatException e) {
            // not a number: reported to the caller as an unknown entity
            return -1;
        }
        if (code < 0 || code > MAX_CODE_POINT) {
            return -1;
        }
        return code;
    }

    /**
     * Append a code point, using a surrogate pair above U+FFFF.
     *
     * @param buff the target buffer
     * @param codePoint the code point (0 to 0x10ffff)
     */
    private static void appendCodePoint(StringBuffer buff, int codePoint) {
        if (codePoint <= 0xffff) {
            buff.append((char) codePoint);
        } else {
            int offset = codePoint - 0x10000;
            buff.append((char) (0xd800 + (offset >> 10)));
            buff.append((char) (0xdc00 + (offset & 0x3ff)));
        }
    }

    /** Check whether the char is in the high surrogate range. */
    private static boolean isHighSurrogate(char ch) {
        return ch >= 0xd800 && ch <= 0xdbff;
    }

    /** Check whether the char is in the low surrogate range. */
    private static boolean isLowSurrogate(char ch) {
        return ch >= 0xdc00 && ch <= 0xdfff;
    }

}
|