0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017:
0018: package org.apache.commons.lang;
0019:
0020: import java.io.IOException;
0021: import java.io.StringWriter;
0022: import java.io.Writer;
0023: import java.util.HashMap;
0024: import java.util.Map;
0025: import java.util.TreeMap;
0026:
0027: /**
0028: * <p>
0029: * Provides HTML and XML entity utilities.
0030: * </p>
0031: *
0032: * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
0033: * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
0034: * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
0035: * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
0036: * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
0037: *
0038: * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
0039: * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
0040: * @since 2.0
0041: * @version $Id: Entities.java 504343 2007-02-06 22:40:12Z bayard $
0042: */
0043: class Entities {
0044:
// Entity name / decimal character code pairs for the four markup-significant
// characters. This table is loaded into the XML, HTML32 and HTML40 entity sets.
private static final String[][] BASIC_ARRAY = { { "quot", "34" }, // " - double-quote
{ "amp", "38" }, // & - ampersand
{ "lt", "60" }, // < - less-than
{ "gt", "62" }, // > - greater-than
};
0050:
// Entity name / decimal character code pair for the apostrophe. In this file it is
// only added to the XML entity set, not to HTML32 or HTML40.
private static final String[][] APOS_ARRAY = { { "apos", "39" }, // XML apostrophe
};
0053:
0054: // package scoped for testing
// Entity name / decimal character code pairs for the ISO 8859-1 (Latin-1) range
// 160-255. Each row is { name, code }; the code is kept as a decimal string and is
// parsed with Integer.parseInt when the table is loaded through addEntities.
static final String[][] ISO8859_1_ARRAY = { { "nbsp", "160" }, // non-breaking space
{ "iexcl", "161" }, // inverted exclamation mark
{ "cent", "162" }, // cent sign
{ "pound", "163" }, // pound sign
{ "curren", "164" }, // currency sign
{ "yen", "165" }, // yen sign = yuan sign
{ "brvbar", "166" }, // broken bar = broken vertical bar
{ "sect", "167" }, // section sign
{ "uml", "168" }, // diaeresis = spacing diaeresis
{ "copy", "169" }, // © - copyright sign
{ "ordf", "170" }, // feminine ordinal indicator
{ "laquo", "171" }, // left-pointing double angle quotation mark = left pointing guillemet
{ "not", "172" }, // not sign
{ "shy", "173" }, // soft hyphen = discretionary hyphen
{ "reg", "174" }, // ® - registered trademark sign
{ "macr", "175" }, // macron = spacing macron = overline = APL overbar
{ "deg", "176" }, // degree sign
{ "plusmn", "177" }, // plus-minus sign = plus-or-minus sign
{ "sup2", "178" }, // superscript two = superscript digit two = squared
{ "sup3", "179" }, // superscript three = superscript digit three = cubed
{ "acute", "180" }, // acute accent = spacing acute
{ "micro", "181" }, // micro sign
{ "para", "182" }, // pilcrow sign = paragraph sign
{ "middot", "183" }, // middle dot = Georgian comma = Greek middle dot
{ "cedil", "184" }, // cedilla = spacing cedilla
{ "sup1", "185" }, // superscript one = superscript digit one
{ "ordm", "186" }, // masculine ordinal indicator
{ "raquo", "187" }, // right-pointing double angle quotation mark = right pointing guillemet
{ "frac14", "188" }, // vulgar fraction one quarter = fraction one quarter
{ "frac12", "189" }, // vulgar fraction one half = fraction one half
{ "frac34", "190" }, // vulgar fraction three quarters = fraction three quarters
{ "iquest", "191" }, // inverted question mark = turned question mark
{ "Agrave", "192" }, // À - uppercase A, grave accent
{ "Aacute", "193" }, // Á - uppercase A, acute accent
{ "Acirc", "194" }, // Â - uppercase A, circumflex accent
{ "Atilde", "195" }, // Ã - uppercase A, tilde
{ "Auml", "196" }, // Ä - uppercase A, umlaut
{ "Aring", "197" }, // Å - uppercase A, ring
{ "AElig", "198" }, // Æ - uppercase AE
{ "Ccedil", "199" }, // Ç - uppercase C, cedilla
{ "Egrave", "200" }, // È - uppercase E, grave accent
{ "Eacute", "201" }, // É - uppercase E, acute accent
{ "Ecirc", "202" }, // Ê - uppercase E, circumflex accent
{ "Euml", "203" }, // Ë - uppercase E, umlaut
{ "Igrave", "204" }, // Ì - uppercase I, grave accent
{ "Iacute", "205" }, // Í - uppercase I, acute accent
{ "Icirc", "206" }, // Î - uppercase I, circumflex accent
{ "Iuml", "207" }, // Ï - uppercase I, umlaut
{ "ETH", "208" }, // Ð - uppercase Eth, Icelandic
{ "Ntilde", "209" }, // Ñ - uppercase N, tilde
{ "Ograve", "210" }, // Ò - uppercase O, grave accent
{ "Oacute", "211" }, // Ó - uppercase O, acute accent
{ "Ocirc", "212" }, // Ô - uppercase O, circumflex accent
{ "Otilde", "213" }, // Õ - uppercase O, tilde
{ "Ouml", "214" }, // Ö - uppercase O, umlaut
{ "times", "215" }, // multiplication sign
{ "Oslash", "216" }, // Ø - uppercase O, slash
{ "Ugrave", "217" }, // Ù - uppercase U, grave accent
{ "Uacute", "218" }, // Ú - uppercase U, acute accent
{ "Ucirc", "219" }, // Û - uppercase U, circumflex accent
{ "Uuml", "220" }, // Ü - uppercase U, umlaut
{ "Yacute", "221" }, // Ý - uppercase Y, acute accent
{ "THORN", "222" }, // Þ - uppercase THORN, Icelandic
{ "szlig", "223" }, // ß - lowercase sharp s, German
{ "agrave", "224" }, // à - lowercase a, grave accent
{ "aacute", "225" }, // á - lowercase a, acute accent
{ "acirc", "226" }, // â - lowercase a, circumflex accent
{ "atilde", "227" }, // ã - lowercase a, tilde
{ "auml", "228" }, // ä - lowercase a, umlaut
{ "aring", "229" }, // å - lowercase a, ring
{ "aelig", "230" }, // æ - lowercase ae
{ "ccedil", "231" }, // ç - lowercase c, cedilla
{ "egrave", "232" }, // è - lowercase e, grave accent
{ "eacute", "233" }, // é - lowercase e, acute accent
{ "ecirc", "234" }, // ê - lowercase e, circumflex accent
{ "euml", "235" }, // ë - lowercase e, umlaut
{ "igrave", "236" }, // ì - lowercase i, grave accent
{ "iacute", "237" }, // í - lowercase i, acute accent
{ "icirc", "238" }, // î - lowercase i, circumflex accent
{ "iuml", "239" }, // ï - lowercase i, umlaut
{ "eth", "240" }, // ð - lowercase eth, Icelandic
{ "ntilde", "241" }, // ñ - lowercase n, tilde
{ "ograve", "242" }, // ò - lowercase o, grave accent
{ "oacute", "243" }, // ó - lowercase o, acute accent
{ "ocirc", "244" }, // ô - lowercase o, circumflex accent
{ "otilde", "245" }, // õ - lowercase o, tilde
{ "ouml", "246" }, // ö - lowercase o, umlaut
{ "divide", "247" }, // division sign
{ "oslash", "248" }, // ø - lowercase o, slash
{ "ugrave", "249" }, // ù - lowercase u, grave accent
{ "uacute", "250" }, // ú - lowercase u, acute accent
{ "ucirc", "251" }, // û - lowercase u, circumflex accent
{ "uuml", "252" }, // ü - lowercase u, umlaut
{ "yacute", "253" }, // ý - lowercase y, acute accent
{ "thorn", "254" }, // þ - lowercase thorn, Icelandic
{ "yuml", "255" }, // ÿ - lowercase y, umlaut
};
0152:
0153: // http://www.w3.org/TR/REC-html40/sgml/entities.html
0154: // package scoped for testing
0155: static final String[][] HTML40_ARRAY = {
0156: // <!-- Latin Extended-B -->
0157: { "fnof", "402" }, // latin small f with hook = function= florin, U+0192 ISOtech -->
0158: // <!-- Greek -->
0159: { "Alpha", "913" }, // greek capital letter alpha, U+0391 -->
0160: { "Beta", "914" }, // greek capital letter beta, U+0392 -->
0161: { "Gamma", "915" }, // greek capital letter gamma,U+0393 ISOgrk3 -->
0162: { "Delta", "916" }, // greek capital letter delta,U+0394 ISOgrk3 -->
0163: { "Epsilon", "917" }, // greek capital letter epsilon, U+0395 -->
0164: { "Zeta", "918" }, // greek capital letter zeta, U+0396 -->
0165: { "Eta", "919" }, // greek capital letter eta, U+0397 -->
0166: { "Theta", "920" }, // greek capital letter theta,U+0398 ISOgrk3 -->
0167: { "Iota", "921" }, // greek capital letter iota, U+0399 -->
0168: { "Kappa", "922" }, // greek capital letter kappa, U+039A -->
0169: { "Lambda", "923" }, // greek capital letter lambda,U+039B ISOgrk3 -->
0170: { "Mu", "924" }, // greek capital letter mu, U+039C -->
0171: { "Nu", "925" }, // greek capital letter nu, U+039D -->
0172: { "Xi", "926" }, // greek capital letter xi, U+039E ISOgrk3 -->
0173: { "Omicron", "927" }, // greek capital letter omicron, U+039F -->
0174: { "Pi", "928" }, // greek capital letter pi, U+03A0 ISOgrk3 -->
0175: { "Rho", "929" }, // greek capital letter rho, U+03A1 -->
0176: // <!-- there is no Sigmaf, and no U+03A2 character either -->
0177: { "Sigma", "931" }, // greek capital letter sigma,U+03A3 ISOgrk3 -->
0178: { "Tau", "932" }, // greek capital letter tau, U+03A4 -->
0179: { "Upsilon", "933" }, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
0180: { "Phi", "934" }, // greek capital letter phi,U+03A6 ISOgrk3 -->
0181: { "Chi", "935" }, // greek capital letter chi, U+03A7 -->
0182: { "Psi", "936" }, // greek capital letter psi,U+03A8 ISOgrk3 -->
0183: { "Omega", "937" }, // greek capital letter omega,U+03A9 ISOgrk3 -->
0184: { "alpha", "945" }, // greek small letter alpha,U+03B1 ISOgrk3 -->
0185: { "beta", "946" }, // greek small letter beta, U+03B2 ISOgrk3 -->
0186: { "gamma", "947" }, // greek small letter gamma,U+03B3 ISOgrk3 -->
0187: { "delta", "948" }, // greek small letter delta,U+03B4 ISOgrk3 -->
0188: { "epsilon", "949" }, // greek small letter epsilon,U+03B5 ISOgrk3 -->
0189: { "zeta", "950" }, // greek small letter zeta, U+03B6 ISOgrk3 -->
0190: { "eta", "951" }, // greek small letter eta, U+03B7 ISOgrk3 -->
0191: { "theta", "952" }, // greek small letter theta,U+03B8 ISOgrk3 -->
0192: { "iota", "953" }, // greek small letter iota, U+03B9 ISOgrk3 -->
0193: { "kappa", "954" }, // greek small letter kappa,U+03BA ISOgrk3 -->
0194: { "lambda", "955" }, // greek small letter lambda,U+03BB ISOgrk3 -->
0195: { "mu", "956" }, // greek small letter mu, U+03BC ISOgrk3 -->
0196: { "nu", "957" }, // greek small letter nu, U+03BD ISOgrk3 -->
0197: { "xi", "958" }, // greek small letter xi, U+03BE ISOgrk3 -->
0198: { "omicron", "959" }, // greek small letter omicron, U+03BF NEW -->
0199: { "pi", "960" }, // greek small letter pi, U+03C0 ISOgrk3 -->
0200: { "rho", "961" }, // greek small letter rho, U+03C1 ISOgrk3 -->
0201: { "sigmaf", "962" }, // greek small letter final sigma,U+03C2 ISOgrk3 -->
0202: { "sigma", "963" }, // greek small letter sigma,U+03C3 ISOgrk3 -->
0203: { "tau", "964" }, // greek small letter tau, U+03C4 ISOgrk3 -->
0204: { "upsilon", "965" }, // greek small letter upsilon,U+03C5 ISOgrk3 -->
0205: { "phi", "966" }, // greek small letter phi, U+03C6 ISOgrk3 -->
0206: { "chi", "967" }, // greek small letter chi, U+03C7 ISOgrk3 -->
0207: { "psi", "968" }, // greek small letter psi, U+03C8 ISOgrk3 -->
0208: { "omega", "969" }, // greek small letter omega,U+03C9 ISOgrk3 -->
0209: { "thetasym", "977" }, // greek small letter theta symbol,U+03D1 NEW -->
0210: { "upsih", "978" }, // greek upsilon with hook symbol,U+03D2 NEW -->
0211: { "piv", "982" }, // greek pi symbol, U+03D6 ISOgrk3 -->
0212: // <!-- General Punctuation -->
0213: { "bull", "8226" }, // bullet = black small circle,U+2022 ISOpub -->
0214: // <!-- bullet is NOT the same as bullet operator, U+2219 -->
0215: { "hellip", "8230" }, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
0216: { "prime", "8242" }, // prime = minutes = feet, U+2032 ISOtech -->
0217: { "Prime", "8243" }, // double prime = seconds = inches,U+2033 ISOtech -->
0218: { "oline", "8254" }, // overline = spacing overscore,U+203E NEW -->
0219: { "frasl", "8260" }, // fraction slash, U+2044 NEW -->
0220: // <!-- Letterlike Symbols -->
0221: { "weierp", "8472" }, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
0222: { "image", "8465" }, // blackletter capital I = imaginary part,U+2111 ISOamso -->
0223: { "real", "8476" }, // blackletter capital R = real part symbol,U+211C ISOamso -->
0224: { "trade", "8482" }, // trade mark sign, U+2122 ISOnum -->
0225: { "alefsym", "8501" }, // alef symbol = first transfinite cardinal,U+2135 NEW -->
0226: // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
0227: // same glyph could be used to depict both characters -->
0228: // <!-- Arrows -->
0229: { "larr", "8592" }, // leftwards arrow, U+2190 ISOnum -->
0230: { "uarr", "8593" }, // upwards arrow, U+2191 ISOnum-->
0231: { "rarr", "8594" }, // rightwards arrow, U+2192 ISOnum -->
0232: { "darr", "8595" }, // downwards arrow, U+2193 ISOnum -->
0233: { "harr", "8596" }, // left right arrow, U+2194 ISOamsa -->
0234: { "crarr", "8629" }, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
0235: { "lArr", "8656" }, // leftwards double arrow, U+21D0 ISOtech -->
0236: // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
0237: // arrow but also does not have any other character for that function.
0238: // So ? lArr canbe used for 'is implied by' as ISOtech suggests -->
0239: { "uArr", "8657" }, // upwards double arrow, U+21D1 ISOamsa -->
0240: { "rArr", "8658" }, // rightwards double arrow,U+21D2 ISOtech -->
0241: // <!-- ISO 10646 does not say this is the 'implies' character but does not
0242: // have another character with this function so ?rArr can be used for
0243: // 'implies' as ISOtech suggests -->
0244: { "dArr", "8659" }, // downwards double arrow, U+21D3 ISOamsa -->
0245: { "hArr", "8660" }, // left right double arrow,U+21D4 ISOamsa -->
0246: // <!-- Mathematical Operators -->
0247: { "forall", "8704" }, // for all, U+2200 ISOtech -->
0248: { "part", "8706" }, // partial differential, U+2202 ISOtech -->
0249: { "exist", "8707" }, // there exists, U+2203 ISOtech -->
0250: { "empty", "8709" }, // empty set = null set = diameter,U+2205 ISOamso -->
0251: { "nabla", "8711" }, // nabla = backward difference,U+2207 ISOtech -->
0252: { "isin", "8712" }, // element of, U+2208 ISOtech -->
0253: { "notin", "8713" }, // not an element of, U+2209 ISOtech -->
0254: { "ni", "8715" }, // contains as member, U+220B ISOtech -->
0255: // <!-- should there be a more memorable name than 'ni'? -->
0256: { "prod", "8719" }, // n-ary product = product sign,U+220F ISOamsb -->
0257: // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
0258: // though the same glyph might be used for both -->
0259: { "sum", "8721" }, // n-ary summation, U+2211 ISOamsb -->
0260: // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
0261: // though the same glyph might be used for both -->
0262: { "minus", "8722" }, // minus sign, U+2212 ISOtech -->
0263: { "lowast", "8727" }, // asterisk operator, U+2217 ISOtech -->
0264: { "radic", "8730" }, // square root = radical sign,U+221A ISOtech -->
0265: { "prop", "8733" }, // proportional to, U+221D ISOtech -->
0266: { "infin", "8734" }, // infinity, U+221E ISOtech -->
0267: { "ang", "8736" }, // angle, U+2220 ISOamso -->
0268: { "and", "8743" }, // logical and = wedge, U+2227 ISOtech -->
0269: { "or", "8744" }, // logical or = vee, U+2228 ISOtech -->
0270: { "cap", "8745" }, // intersection = cap, U+2229 ISOtech -->
0271: { "cup", "8746" }, // union = cup, U+222A ISOtech -->
0272: { "int", "8747" }, // integral, U+222B ISOtech -->
0273: { "there4", "8756" }, // therefore, U+2234 ISOtech -->
0274: { "sim", "8764" }, // tilde operator = varies with = similar to,U+223C ISOtech -->
0275: // <!-- tilde operator is NOT the same character as the tilde, U+007E,although
0276: // the same glyph might be used to represent both -->
0277: { "cong", "8773" }, // approximately equal to, U+2245 ISOtech -->
0278: { "asymp", "8776" }, // almost equal to = asymptotic to,U+2248 ISOamsr -->
0279: { "ne", "8800" }, // not equal to, U+2260 ISOtech -->
0280: { "equiv", "8801" }, // identical to, U+2261 ISOtech -->
0281: { "le", "8804" }, // less-than or equal to, U+2264 ISOtech -->
0282: { "ge", "8805" }, // greater-than or equal to,U+2265 ISOtech -->
0283: { "sub", "8834" }, // subset of, U+2282 ISOtech -->
0284: { "sup", "8835" }, // superset of, U+2283 ISOtech -->
0285: // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
0286: // Symbol font encoding and is not included. Should it be, for symmetry?
0287: // It is in ISOamsn --> <!ENTITY nsub", "8836"},
0288: // not a subset of, U+2284 ISOamsn -->
0289: { "sube", "8838" }, // subset of or equal to, U+2286 ISOtech -->
0290: { "supe", "8839" }, // superset of or equal to,U+2287 ISOtech -->
0291: { "oplus", "8853" }, // circled plus = direct sum,U+2295 ISOamsb -->
0292: { "otimes", "8855" }, // circled times = vector product,U+2297 ISOamsb -->
0293: { "perp", "8869" }, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
0294: { "sdot", "8901" }, // dot operator, U+22C5 ISOamsb -->
0295: // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
0296: // <!-- Miscellaneous Technical -->
0297: { "lceil", "8968" }, // left ceiling = apl upstile,U+2308 ISOamsc -->
0298: { "rceil", "8969" }, // right ceiling, U+2309 ISOamsc -->
0299: { "lfloor", "8970" }, // left floor = apl downstile,U+230A ISOamsc -->
0300: { "rfloor", "8971" }, // right floor, U+230B ISOamsc -->
0301: { "lang", "9001" }, // left-pointing angle bracket = bra,U+2329 ISOtech -->
0302: // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation
0303: // mark' -->
0304: { "rang", "9002" }, // right-pointing angle bracket = ket,U+232A ISOtech -->
0305: // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
0306: // 'single right-pointing angle quotation mark' -->
0307: // <!-- Geometric Shapes -->
0308: { "loz", "9674" }, // lozenge, U+25CA ISOpub -->
0309: // <!-- Miscellaneous Symbols -->
0310: { "spades", "9824" }, // black spade suit, U+2660 ISOpub -->
0311: // <!-- black here seems to mean filled as opposed to hollow -->
0312: { "clubs", "9827" }, // black club suit = shamrock,U+2663 ISOpub -->
0313: { "hearts", "9829" }, // black heart suit = valentine,U+2665 ISOpub -->
0314: { "diams", "9830" }, // black diamond suit, U+2666 ISOpub -->
0315:
0316: // <!-- Latin Extended-A -->
0317: { "OElig", "338" }, // -- latin capital ligature OE,U+0152 ISOlat2 -->
0318: { "oelig", "339" }, // -- latin small ligature oe, U+0153 ISOlat2 -->
0319: // <!-- ligature is a misnomer, this is a separate character in some languages -->
0320: { "Scaron", "352" }, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
0321: { "scaron", "353" }, // -- latin small letter s with caron,U+0161 ISOlat2 -->
0322: { "Yuml", "376" }, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
0323: // <!-- Spacing Modifier Letters -->
0324: { "circ", "710" }, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
0325: { "tilde", "732" }, // small tilde, U+02DC ISOdia -->
0326: // <!-- General Punctuation -->
0327: { "ensp", "8194" }, // en space, U+2002 ISOpub -->
0328: { "emsp", "8195" }, // em space, U+2003 ISOpub -->
0329: { "thinsp", "8201" }, // thin space, U+2009 ISOpub -->
0330: { "zwnj", "8204" }, // zero width non-joiner,U+200C NEW RFC 2070 -->
0331: { "zwj", "8205" }, // zero width joiner, U+200D NEW RFC 2070 -->
0332: { "lrm", "8206" }, // left-to-right mark, U+200E NEW RFC 2070 -->
0333: { "rlm", "8207" }, // right-to-left mark, U+200F NEW RFC 2070 -->
0334: { "ndash", "8211" }, // en dash, U+2013 ISOpub -->
0335: { "mdash", "8212" }, // em dash, U+2014 ISOpub -->
0336: { "lsquo", "8216" }, // left single quotation mark,U+2018 ISOnum -->
0337: { "rsquo", "8217" }, // right single quotation mark,U+2019 ISOnum -->
0338: { "sbquo", "8218" }, // single low-9 quotation mark, U+201A NEW -->
0339: { "ldquo", "8220" }, // left double quotation mark,U+201C ISOnum -->
0340: { "rdquo", "8221" }, // right double quotation mark,U+201D ISOnum -->
0341: { "bdquo", "8222" }, // double low-9 quotation mark, U+201E NEW -->
0342: { "dagger", "8224" }, // dagger, U+2020 ISOpub -->
0343: { "Dagger", "8225" }, // double dagger, U+2021 ISOpub -->
0344: { "permil", "8240" }, // per mille sign, U+2030 ISOtech -->
0345: { "lsaquo", "8249" }, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
0346: // <!-- lsaquo is proposed but not yet ISO standardized -->
0347: { "rsaquo", "8250" }, // single right-pointing angle quotation mark,U+203A ISO proposed -->
0348: // <!-- rsaquo is proposed but not yet ISO standardized -->
0349: { "euro", "8364" }, // -- euro sign, U+20AC NEW -->
0350: };
0351:
/**
 * <p>
 * Entity set for standard XML: the basic markup entities plus <code>apos</code>.
 * </p>
 */
public static final Entities XML;

/**
 * <p>
 * Entity set for HTML 3.2: the basic markup entities plus the ISO 8859-1 entities.
 * </p>
 */
public static final Entities HTML32;

/**
 * <p>
 * Entity set for HTML 4.0: everything loaded by {@link #fillWithHtml40Entities(Entities)}.
 * </p>
 */
public static final Entities HTML40;

// Build each shared instance in a local first and publish it once it is fully populated.
static {
    Entities xmlEntities = new Entities();
    xmlEntities.addEntities(BASIC_ARRAY);
    xmlEntities.addEntities(APOS_ARRAY);
    XML = xmlEntities;

    Entities html32Entities = new Entities();
    html32Entities.addEntities(BASIC_ARRAY);
    html32Entities.addEntities(ISO8859_1_ARRAY);
    HTML32 = html32Entities;

    Entities html40Entities = new Entities();
    fillWithHtml40Entities(html40Entities);
    HTML40 = html40Entities;
}
0389:
0390: /**
0391: * <p>
0392: * Fills the specified entities instance with HTML 40 entities.
0393: * </p>
0394: *
0395: * @param entities
0396: * the instance to be filled.
0397: */
0398: static void fillWithHtml40Entities(Entities entities) {
0399: entities.addEntities(BASIC_ARRAY);
0400: entities.addEntities(ISO8859_1_ARRAY);
0401: entities.addEntities(HTML40_ARRAY);
0402: }
0403:
/**
 * Two-way association between entity names and their integer character codes.
 * The implementations in this file return <code>null</code> from {@link #name(int)} and
 * <code>-1</code> from {@link #value(String)} when no mapping exists.
 */
static interface EntityMap {
/**
 * <p>
 * Registers a name/value pair in this entity map.
 * </p>
 *
 * @param name
 *            the entity name
 * @param value
 *            the entity value (character code)
 */
void add(String name, int value);

/**
 * <p>
 * Looks up the entity name registered for the given value.
 * </p>
 *
 * @param value
 *            the value to locate
 * @return entity name associated with the specified value
 */
String name(int value);

/**
 * <p>
 * Looks up the value registered for the given entity name.
 * </p>
 *
 * @param name
 *            the name to locate
 * @return entity value associated with the specified name
 */
int value(String name);
}
0439:
0440: static class PrimitiveEntityMap implements EntityMap {
0441: private Map mapNameToValue = new HashMap();
0442:
0443: private IntHashMap mapValueToName = new IntHashMap();
0444:
0445: /**
0446: * {@inheritDoc}
0447: */
0448: public void add(String name, int value) {
0449: mapNameToValue.put(name, new Integer(value));
0450: mapValueToName.put(value, name);
0451: }
0452:
0453: /**
0454: * {@inheritDoc}
0455: */
0456: public String name(int value) {
0457: return (String) mapValueToName.get(value);
0458: }
0459:
0460: /**
0461: * {@inheritDoc}
0462: */
0463: public int value(String name) {
0464: Object value = mapNameToValue.get(name);
0465: if (value == null) {
0466: return -1;
0467: }
0468: return ((Integer) value).intValue();
0469: }
0470: }
0471:
0472: static abstract class MapIntMap implements Entities.EntityMap {
0473: protected Map mapNameToValue;
0474:
0475: protected Map mapValueToName;
0476:
0477: /**
0478: * {@inheritDoc}
0479: */
0480: public void add(String name, int value) {
0481: mapNameToValue.put(name, new Integer(value));
0482: mapValueToName.put(new Integer(value), name);
0483: }
0484:
0485: /**
0486: * {@inheritDoc}
0487: */
0488: public String name(int value) {
0489: return (String) mapValueToName.get(new Integer(value));
0490: }
0491:
0492: /**
0493: * {@inheritDoc}
0494: */
0495: public int value(String name) {
0496: Object value = mapNameToValue.get(name);
0497: if (value == null) {
0498: return -1;
0499: }
0500: return ((Integer) value).intValue();
0501: }
0502: }
0503:
0504: static class HashEntityMap extends MapIntMap {
0505: /**
0506: * Constructs a new instance of <code>HashEntityMap</code>.
0507: */
0508: public HashEntityMap() {
0509: mapNameToValue = new HashMap();
0510: mapValueToName = new HashMap();
0511: }
0512: }
0513:
0514: static class TreeEntityMap extends MapIntMap {
0515: /**
0516: * Constructs a new instance of <code>TreeEntityMap</code>.
0517: */
0518: public TreeEntityMap() {
0519: mapNameToValue = new TreeMap();
0520: mapValueToName = new TreeMap();
0521: }
0522: }
0523:
0524: static class LookupEntityMap extends PrimitiveEntityMap {
0525: private String[] lookupTable;
0526:
0527: private int LOOKUP_TABLE_SIZE = 256;
0528:
0529: /**
0530: * {@inheritDoc}
0531: */
0532: public String name(int value) {
0533: if (value < LOOKUP_TABLE_SIZE) {
0534: return lookupTable()[value];
0535: }
0536: return super .name(value);
0537: }
0538:
0539: /**
0540: * <p>
0541: * Returns the lookup table for this entity map. The lookup table is created if it has not been previously.
0542: * </p>
0543: *
0544: * @return the lookup table
0545: */
0546: private String[] lookupTable() {
0547: if (lookupTable == null) {
0548: createLookupTable();
0549: }
0550: return lookupTable;
0551: }
0552:
0553: /**
0554: * <p>
0555: * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.
0556: * </p>
0557: */
0558: private void createLookupTable() {
0559: lookupTable = new String[LOOKUP_TABLE_SIZE];
0560: for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
0561: lookupTable[i] = super .name(i);
0562: }
0563: }
0564: }
0565:
0566: static class ArrayEntityMap implements EntityMap {
0567: protected int growBy = 100;
0568:
0569: protected int size = 0;
0570:
0571: protected String[] names;
0572:
0573: protected int[] values;
0574:
0575: /**
0576: * Constructs a new instance of <code>ArrayEntityMap</code>.
0577: */
0578: public ArrayEntityMap() {
0579: names = new String[growBy];
0580: values = new int[growBy];
0581: }
0582:
0583: /**
0584: * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should
0585: * grow.
0586: *
0587: * @param growBy
0588: * array will be initialized to and will grow by this amount
0589: */
0590: public ArrayEntityMap(int growBy) {
0591: this .growBy = growBy;
0592: names = new String[growBy];
0593: values = new int[growBy];
0594: }
0595:
0596: /**
0597: * {@inheritDoc}
0598: */
0599: public void add(String name, int value) {
0600: ensureCapacity(size + 1);
0601: names[size] = name;
0602: values[size] = value;
0603: size++;
0604: }
0605:
0606: /**
0607: * Verifies the capacity of the entity array, adjusting the size if necessary.
0608: *
0609: * @param capacity
0610: * size the array should be
0611: */
0612: protected void ensureCapacity(int capacity) {
0613: if (capacity > names.length) {
0614: int newSize = Math.max(capacity, size + growBy);
0615: String[] newNames = new String[newSize];
0616: System.arraycopy(names, 0, newNames, 0, size);
0617: names = newNames;
0618: int[] newValues = new int[newSize];
0619: System.arraycopy(values, 0, newValues, 0, size);
0620: values = newValues;
0621: }
0622: }
0623:
0624: /**
0625: * {@inheritDoc}
0626: */
0627: public String name(int value) {
0628: for (int i = 0; i < size; ++i) {
0629: if (values[i] == value) {
0630: return names[i];
0631: }
0632: }
0633: return null;
0634: }
0635:
0636: /**
0637: * {@inheritDoc}
0638: */
0639: public int value(String name) {
0640: for (int i = 0; i < size; ++i) {
0641: if (names[i].equals(name)) {
0642: return values[i];
0643: }
0644: }
0645: return -1;
0646: }
0647: }
0648:
0649: static class BinaryEntityMap extends ArrayEntityMap {
0650:
0651: /**
0652: * Constructs a new instance of <code>BinaryEntityMap</code>.
0653: */
0654: public BinaryEntityMap() {
0655: super ();
0656: }
0657:
0658: /**
0659: * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array
0660: * should grow.
0661: *
0662: * @param growBy
0663: * array will be initialized to and will grow by this amount
0664: */
0665: public BinaryEntityMap(int growBy) {
0666: super (growBy);
0667: }
0668:
0669: /**
0670: * Performs a binary search of the entity array for the specified key. This method is based on code in
0671: * {@link java.util.Arrays}.
0672: *
0673: * @param key
0674: * the key to be found
0675: * @return the index of the entity array matching the specified key
0676: */
0677: private int binarySearch(int key) {
0678: int low = 0;
0679: int high = size - 1;
0680:
0681: while (low <= high) {
0682: int mid = (low + high) >> 1;
0683: int midVal = values[mid];
0684:
0685: if (midVal < key) {
0686: low = mid + 1;
0687: } else if (midVal > key) {
0688: high = mid - 1;
0689: } else {
0690: return mid; // key found
0691: }
0692: }
0693: return -(low + 1); // key not found.
0694: }
0695:
0696: /**
0697: * {@inheritDoc}
0698: */
0699: public void add(String name, int value) {
0700: ensureCapacity(size + 1);
0701: int insertAt = binarySearch(value);
0702: if (insertAt > 0) {
0703: return; // note: this means you can't insert the same value twice
0704: }
0705: insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
0706: System.arraycopy(values, insertAt, values, insertAt + 1,
0707: size - insertAt);
0708: values[insertAt] = value;
0709: System.arraycopy(names, insertAt, names, insertAt + 1, size
0710: - insertAt);
0711: names[insertAt] = name;
0712: size++;
0713: }
0714:
0715: /**
0716: * {@inheritDoc}
0717: */
0718: public String name(int value) {
0719: int index = binarySearch(value);
0720: if (index < 0) {
0721: return null;
0722: }
0723: return names[index];
0724: }
0725: }
0726:
// Backing store for this instance's name/value pairs; addEntity, entityName and entityValue
// all delegate to it. Package scoped (and non-final) for testing.
EntityMap map = new Entities.LookupEntityMap();
0729:
0730: /**
0731: * <p>
0732: * Adds entities to this entity.
0733: * </p>
0734: *
0735: * @param entityArray
0736: * array of entities to be added
0737: */
0738: public void addEntities(String[][] entityArray) {
0739: for (int i = 0; i < entityArray.length; ++i) {
0740: addEntity(entityArray[i][0], Integer
0741: .parseInt(entityArray[i][1]));
0742: }
0743: }
0744:
0745: /**
0746: * <p>
0747: * Add an entity to this entity.
0748: * </p>
0749: *
0750: * @param name
0751: * name of the entity
0752: * @param value
0753: * vale of the entity
0754: */
0755: public void addEntity(String name, int value) {
0756: map.add(name, value);
0757: }
0758:
0759: /**
0760: * <p>
0761: * Returns the name of the entity identified by the specified value.
0762: * </p>
0763: *
0764: * @param value
0765: * the value to locate
0766: * @return entity name associated with the specified value
0767: */
0768: public String entityName(int value) {
0769: return map.name(value);
0770: }
0771:
0772: /**
0773: * <p>
0774: * Returns the value of the entity identified by the specified name.
0775: * </p>
0776: *
0777: * @param name
0778: * the name to locate
0779: * @return entity value associated with the specified name
0780: */
0781: public int entityValue(String name) {
0782: return map.value(name);
0783: }
0784:
0785: /**
0786: * <p>
0787: * Escapes the characters in a <code>String</code>.
0788: * </p>
0789: *
0790: * <p>
0791: * For example, if you have called addEntity("foo", 0xA1), escape("\u00A1") will return
0792: * "&foo;"
0793: * </p>
0794: *
0795: * @param str
0796: * The <code>String</code> to escape.
0797: * @return A new escaped <code>String</code>.
0798: */
0799: public String escape(String str) {
0800: StringWriter stringWriter = createStringWriter(str);
0801: try {
0802: this .escape(stringWriter, str);
0803: } catch (IOException e) {
0804: // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
0805: // throw IOExceptions.
0806: throw new UnhandledException(e);
0807: }
0808: return stringWriter.toString();
0809: }
0810:
0811: /**
0812: * <p>
0813: * Escapes the characters in the <code>String</code> passed and writes the result to the <code>Writer</code>
0814: * passed.
0815: * </p>
0816: *
0817: * @param writer
0818: * The <code>Writer</code> to write the results of the escaping to. Assumed to be a non-null value.
0819: * @param str
0820: * The <code>String</code> to escape. Assumed to be a non-null value.
0821: * @throws IOException
0822: * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
0823: * methods.
0824: *
0825: * @see #escape(String)
0826: * @see Writer
0827: */
0828: public void escape(Writer writer, String str) throws IOException {
0829: int len = str.length();
0830: for (int i = 0; i < len; i++) {
0831: char c = str.charAt(i);
0832: String entityName = this .entityName(c);
0833: if (entityName == null) {
0834: if (c > 0x7F) {
0835: writer.write("&#");
0836: writer.write(Integer.toString(c, 10));
0837: writer.write(';');
0838: } else {
0839: writer.write(c);
0840: }
0841: } else {
0842: writer.write('&');
0843: writer.write(entityName);
0844: writer.write(';');
0845: }
0846: }
0847: }
0848:
0849: /**
0850: * <p>
0851: * Unescapes the entities in a <code>String</code>.
0852: * </p>
0853: *
0854: * <p>
0855: * For example, if you have called addEntity("foo", 0xA1), unescape("&foo;") will return
0856: * "\u00A1"
0857: * </p>
0858: *
0859: * @param str
0860: * The <code>String</code> to escape.
0861: * @return A new escaped <code>String</code>.
0862: */
0863: public String unescape(String str) {
0864: int firstAmp = str.indexOf('&');
0865: if (firstAmp < 0) {
0866: return str;
0867: } else {
0868: StringWriter stringWriter = createStringWriter(str);
0869: try {
0870: this .doUnescape(stringWriter, str, firstAmp);
0871: } catch (IOException e) {
0872: // This should never happen because ALL the StringWriter methods called by #escape(Writer, String)
0873: // do not throw IOExceptions.
0874: throw new UnhandledException(e);
0875: }
0876: return stringWriter.toString();
0877: }
0878: }
0879:
0880: /**
0881: * Make the StringWriter 10% larger than the source String to avoid growing the writer
0882: *
0883: * @param str The source string
0884: * @return A newly created StringWriter
0885: */
0886: private StringWriter createStringWriter(String str) {
0887: return new StringWriter(
0888: (int) (str.length() + (str.length() * 0.1)));
0889: }
0890:
0891: /**
0892: * <p>
0893: * Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
0894: * <code>Writer</code> passed.
0895: * </p>
0896: *
0897: * @param writer
0898: * The <code>Writer</code> to write the results to; assumed to be non-null.
0899: * @param str
0900: * The source <code>String</code> to unescape; assumed to be non-null.
0901: * @throws IOException
0902: * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
0903: * methods.
0904: *
0905: * @see #escape(String)
0906: * @see Writer
0907: */
0908: public void unescape(Writer writer, String str) throws IOException {
0909: int firstAmp = str.indexOf('&');
0910: if (firstAmp < 0) {
0911: writer.write(str);
0912: return;
0913: } else {
0914: doUnescape(writer, str, firstAmp);
0915: }
0916: }
0917:
0918: /**
0919: * Underlying unescape method that allows the optimisation of not starting from the 0 index again.
0920: *
0921: * @param writer
0922: * The <code>Writer</code> to write the results to; assumed to be non-null.
0923: * @param str
0924: * The source <code>String</code> to unescape; assumed to be non-null.
0925: * @param firstAmp
0926: * The <code>int</code> index of the first ampersand in the source String.
0927: * @throws IOException
0928: * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
0929: * methods.
0930: */
0931: private void doUnescape(Writer writer, String str, int firstAmp)
0932: throws IOException {
0933: writer.write(str, 0, firstAmp);
0934: int len = str.length();
0935: for (int i = firstAmp; i < len; i++) {
0936: char c = str.charAt(i);
0937: if (c == '&') {
0938: int nextIdx = i + 1;
0939: int semiColonIdx = str.indexOf(';', nextIdx);
0940: if (semiColonIdx == -1) {
0941: writer.write(c);
0942: continue;
0943: }
0944: int amphersandIdx = str.indexOf('&', i + 1);
0945: if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) {
0946: // Then the text looks like &...&...;
0947: writer.write(c);
0948: continue;
0949: }
0950: String entityContent = str.substring(nextIdx,
0951: semiColonIdx);
0952: int entityValue = -1;
0953: int entityContentLen = entityContent.length();
0954: if (entityContentLen > 0) {
0955: if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
0956: // hexidecimal)
0957: if (entityContentLen > 1) {
0958: char isHexChar = entityContent.charAt(1);
0959: try {
0960: switch (isHexChar) {
0961: case 'X':
0962: case 'x': {
0963: entityValue = Integer.parseInt(
0964: entityContent.substring(2),
0965: 16);
0966: break;
0967: }
0968: default: {
0969: entityValue = Integer.parseInt(
0970: entityContent.substring(1),
0971: 10);
0972: }
0973: }
0974: if (entityValue > 0xFFFF) {
0975: entityValue = -1;
0976: }
0977: } catch (NumberFormatException e) {
0978: entityValue = -1;
0979: }
0980: }
0981: } else { // escaped value content is an entity name
0982: entityValue = this .entityValue(entityContent);
0983: }
0984: }
0985:
0986: if (entityValue == -1) {
0987: writer.write('&');
0988: writer.write(entityContent);
0989: writer.write(';');
0990: } else {
0991: writer.write(entityValue);
0992: }
0993: i = semiColonIdx; // move index up to the semi-colon
0994: } else {
0995: writer.write(c);
0996: }
0997: }
0998: }
0999:
1000: }
|