001: package it.unimi.dsi.mg4j.util.parser;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.Hash;
025: import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
026: import it.unimi.dsi.mg4j.util.MutableString;
027:
028: /** A parsing factory for (X)HTML.
029: *
030: * <p><strong>Warning:</strong> for maximum flexibility, the methods of this factory
031: * do <em>not</em> perform case normalisation. If you are parsing HTML, you are invited
032: * to downcase your names before accessing {@link #getElement(MutableString)}
033: * and {@link #getAttribute(MutableString)}.
034: *
035: * <p>This class is a singleton, and its only instance is accessible using the public field
036: * {@link #INSTANCE}.
037: *
038: * <p>The relationship between this class and {@link Element}/{@link Attribute} is a bit
039: * twisted due to the need to accomodate two features:
040: * <ul>
041: * <li>(X)HTML interned objects must be accessible directly (see, e.g., {@link Element#A});
042: * <li>(X)HTML interned objects must be put into suitable name-to-object maps.
043: * </ul>
044: *
045: * <p>To this purpose, this class exports packagewise some static factory methods that create {@link Element}s and
046: * {@link Attribute}s and register them locally. The static initialisation code in
047: * {@link Element} and {@link Attribute} creates elements such as {@link Element#A} using the abovementioned
048: * factory methods.
049: *
050: * <p>An alternative implementation could use reflection, but I don't see great advantages.
051: * @deprecated Moved to <code>dsiutils</code>.
052: */
053:
054: @Deprecated
055: public class HTMLFactory implements ParsingFactory {
056:
057: private HTMLFactory() {
058: }
059:
060: public static final HTMLFactory INSTANCE = new HTMLFactory();
061:
062: public Element getElement(final MutableString name) {
063: return NAME2ELEMENT.get(name);
064: }
065:
066: public Attribute getAttribute(final MutableString name) {
067: return NAME2ATTRIBUTE.get(name);
068: }
069:
070: public Entity getEntity(final MutableString name) {
071: return NAME2ENTITY.get(name);
072: }
073:
074: /** A (quick) map from entity names to entites. */
075: static final Object2ObjectOpenHashMap<CharSequence, Entity> NAME2ENTITY = new Object2ObjectOpenHashMap<CharSequence, Entity>(
076: Hash.DEFAULT_INITIAL_SIZE, .5f);
077:
078: /** A (quick) map from attribute names to attributes. */
079: static final Object2ObjectOpenHashMap<CharSequence, Attribute> NAME2ATTRIBUTE = new Object2ObjectOpenHashMap<CharSequence, Attribute>(
080: Hash.DEFAULT_INITIAL_SIZE, .5f);
081:
082: /** A (quick) map from element-type names to element types. */
083: static final Object2ObjectOpenHashMap<CharSequence, Element> NAME2ELEMENT = new Object2ObjectOpenHashMap<CharSequence, Element>(
084: Hash.DEFAULT_INITIAL_SIZE, .5f);
085:
086: static Element newElement(final CharSequence name) {
087: final Element element = new Element(name);
088: NAME2ELEMENT.put(element.name, element);
089: return element;
090: }
091:
092: static Element newElement(final CharSequence name,
093: final boolean breaksFlow, final boolean isSimple) {
094: final Element element = new Element(name, breaksFlow, isSimple);
095: NAME2ELEMENT.put(element.name, element);
096: return element;
097: }
098:
099: static Element newElement(final CharSequence name,
100: final boolean breaksFlow, final boolean isSimple,
101: final boolean isImplicit) {
102: final Element element = new Element(name, breaksFlow, isSimple,
103: isImplicit);
104: NAME2ELEMENT.put(element.name, element);
105: return element;
106: }
107:
108: static Attribute newAttribute(final CharSequence name) {
109: final Attribute attribute = new Attribute(name);
110: NAME2ATTRIBUTE.put(attribute.name, attribute);
111: return attribute;
112: }
113:
114: static Entity newEntity(final CharSequence name, final char c) {
115: final Entity entity = new Entity(name, c);
116: NAME2ENTITY.put(entity.name, entity);
117: return entity;
118: }
119:
120: static {
121: NAME2ATTRIBUTE.defaultReturnValue(Attribute.UNKNOWN);
122: NAME2ELEMENT.defaultReturnValue(Element.UNKNOWN);
123:
124: // --- Entity Names -----------------------------------
125:
126: // Latin 1
127: HTMLFactory.newEntity("nbsp", (char) 160);
128: HTMLFactory.newEntity("iexcl", (char) 161);
129: HTMLFactory.newEntity("cent", (char) 162);
130: HTMLFactory.newEntity("pound", (char) 163);
131: HTMLFactory.newEntity("curren", (char) 164);
132: HTMLFactory.newEntity("yen", (char) 165);
133: HTMLFactory.newEntity("brvbar", (char) 166);
134: HTMLFactory.newEntity("sect", (char) 167);
135: HTMLFactory.newEntity("uml", (char) 168);
136: HTMLFactory.newEntity("copy", (char) 169);
137: HTMLFactory.newEntity("ordf", (char) 170);
138: HTMLFactory.newEntity("laquo", (char) 171);
139: HTMLFactory.newEntity("not", (char) 172);
140: HTMLFactory.newEntity("shy", (char) 173);
141: HTMLFactory.newEntity("reg", (char) 174);
142: HTMLFactory.newEntity("macr", (char) 175);
143: HTMLFactory.newEntity("deg", (char) 176);
144: HTMLFactory.newEntity("plusmn", (char) 177);
145: HTMLFactory.newEntity("sup2", (char) 178);
146: HTMLFactory.newEntity("sup3", (char) 179);
147: HTMLFactory.newEntity("acute", (char) 180);
148: HTMLFactory.newEntity("micro", (char) 181);
149: HTMLFactory.newEntity("para", (char) 182);
150: HTMLFactory.newEntity("middot", (char) 183);
151: HTMLFactory.newEntity("cedil", (char) 184);
152: HTMLFactory.newEntity("sup1", (char) 185);
153: HTMLFactory.newEntity("ordm", (char) 186);
154: HTMLFactory.newEntity("raquo", (char) 187);
155: HTMLFactory.newEntity("frac14", (char) 188);
156: HTMLFactory.newEntity("frac12", (char) 189);
157: HTMLFactory.newEntity("frac34", (char) 190);
158: HTMLFactory.newEntity("iquest", (char) 191);
159: HTMLFactory.newEntity("Agrave", (char) 192);
160: HTMLFactory.newEntity("Aacute", (char) 193);
161: HTMLFactory.newEntity("Acirc", (char) 194);
162: HTMLFactory.newEntity("Atilde", (char) 195);
163: HTMLFactory.newEntity("Auml", (char) 196);
164: HTMLFactory.newEntity("Aring", (char) 197);
165: HTMLFactory.newEntity("AElig", (char) 198);
166: HTMLFactory.newEntity("Ccedil", (char) 199);
167: HTMLFactory.newEntity("Egrave", (char) 200);
168: HTMLFactory.newEntity("Eacute", (char) 201);
169: HTMLFactory.newEntity("Ecirc", (char) 202);
170: HTMLFactory.newEntity("Euml", (char) 203);
171: HTMLFactory.newEntity("Igrave", (char) 204);
172: HTMLFactory.newEntity("Iacute", (char) 205);
173: HTMLFactory.newEntity("Icirc", (char) 206);
174: HTMLFactory.newEntity("Iuml", (char) 207);
175: HTMLFactory.newEntity("ETH", (char) 208);
176: HTMLFactory.newEntity("Ntilde", (char) 209);
177: HTMLFactory.newEntity("Ograve", (char) 210);
178: HTMLFactory.newEntity("Oacute", (char) 211);
179: HTMLFactory.newEntity("Ocirc", (char) 212);
180: HTMLFactory.newEntity("Otilde", (char) 213);
181: HTMLFactory.newEntity("Ouml", (char) 214);
182: HTMLFactory.newEntity("times", (char) 215);
183: HTMLFactory.newEntity("Oslash", (char) 216);
184: HTMLFactory.newEntity("Ugrave", (char) 217);
185: HTMLFactory.newEntity("Uacute", (char) 218);
186: HTMLFactory.newEntity("Ucirc", (char) 219);
187: HTMLFactory.newEntity("Uuml", (char) 220);
188: HTMLFactory.newEntity("Yacute", (char) 221);
189: HTMLFactory.newEntity("THORN", (char) 222);
190: HTMLFactory.newEntity("szlig", (char) 223);
191: HTMLFactory.newEntity("agrave", (char) 224);
192: HTMLFactory.newEntity("aacute", (char) 225);
193: HTMLFactory.newEntity("acirc", (char) 226);
194: HTMLFactory.newEntity("atilde", (char) 227);
195: HTMLFactory.newEntity("auml", (char) 228);
196: HTMLFactory.newEntity("aring", (char) 229);
197: HTMLFactory.newEntity("aelig", (char) 230);
198: HTMLFactory.newEntity("ccedil", (char) 231);
199: HTMLFactory.newEntity("egrave", (char) 232);
200: HTMLFactory.newEntity("eacute", (char) 233);
201: HTMLFactory.newEntity("ecirc", (char) 234);
202: HTMLFactory.newEntity("euml", (char) 235);
203: HTMLFactory.newEntity("igrave", (char) 236);
204: HTMLFactory.newEntity("iacute", (char) 237);
205: HTMLFactory.newEntity("icirc", (char) 238);
206: HTMLFactory.newEntity("iuml", (char) 239);
207: HTMLFactory.newEntity("eth", (char) 240);
208: HTMLFactory.newEntity("ntilde", (char) 241);
209: HTMLFactory.newEntity("ograve", (char) 242);
210: HTMLFactory.newEntity("oacute", (char) 243);
211: HTMLFactory.newEntity("ocirc", (char) 244);
212: HTMLFactory.newEntity("otilde", (char) 245);
213: HTMLFactory.newEntity("ouml", (char) 246);
214: HTMLFactory.newEntity("divide", (char) 247);
215: HTMLFactory.newEntity("oslash", (char) 248);
216: HTMLFactory.newEntity("ugrave", (char) 249);
217: HTMLFactory.newEntity("uacute", (char) 250);
218: HTMLFactory.newEntity("ucirc", (char) 251);
219: HTMLFactory.newEntity("uuml", (char) 252);
220: HTMLFactory.newEntity("yacute", (char) 253);
221: HTMLFactory.newEntity("thorn", (char) 254);
222: HTMLFactory.newEntity("yuml", (char) 255);
223:
224: // Special
225: HTMLFactory.newEntity("quot", (char) 34);
226: HTMLFactory.newEntity("apos", (char) 39);
227: HTMLFactory.newEntity("amp", (char) 38);
228: HTMLFactory.newEntity("lt", (char) 60);
229: HTMLFactory.newEntity("gt", (char) 62);
230: HTMLFactory.newEntity("OElig", (char) 338);
231: HTMLFactory.newEntity("oelig", (char) 339);
232: HTMLFactory.newEntity("Scaron", (char) 352);
233: HTMLFactory.newEntity("scaron", (char) 353);
234: HTMLFactory.newEntity("Yuml", (char) 376);
235: HTMLFactory.newEntity("circ", (char) 710);
236: HTMLFactory.newEntity("tilde", (char) 732);
237: HTMLFactory.newEntity("ensp", (char) 8194);
238: HTMLFactory.newEntity("emsp", (char) 8195);
239: HTMLFactory.newEntity("thinsp", (char) 8201);
240: HTMLFactory.newEntity("zwnj", (char) 8204);
241: HTMLFactory.newEntity("zwj", (char) 8205);
242: HTMLFactory.newEntity("lrm", (char) 8206);
243: HTMLFactory.newEntity("rlm", (char) 8207);
244: HTMLFactory.newEntity("ndash", (char) 8211);
245: HTMLFactory.newEntity("mdash", (char) 8212);
246: HTMLFactory.newEntity("lsquo", (char) 8216);
247: HTMLFactory.newEntity("rsquo", (char) 8217);
248: HTMLFactory.newEntity("sbquo", (char) 8218);
249: HTMLFactory.newEntity("ldquo", (char) 8220);
250: HTMLFactory.newEntity("rdquo", (char) 8221);
251: HTMLFactory.newEntity("bdquo", (char) 8222);
252: HTMLFactory.newEntity("dagger", (char) 8224);
253: HTMLFactory.newEntity("Dagger", (char) 8225);
254: HTMLFactory.newEntity("permil", (char) 8240);
255: HTMLFactory.newEntity("lsaquo", (char) 8249);
256: HTMLFactory.newEntity("rsaquo", (char) 8250);
257: HTMLFactory.newEntity("euro", (char) 8364);
258:
259: // Symbols
260: HTMLFactory.newEntity("fnof", (char) 402);
261: HTMLFactory.newEntity("Alpha", (char) 913);
262: HTMLFactory.newEntity("Beta", (char) 914);
263: HTMLFactory.newEntity("Gamma", (char) 915);
264: HTMLFactory.newEntity("Delta", (char) 916);
265: HTMLFactory.newEntity("Epsilon", (char) 917);
266: HTMLFactory.newEntity("Zeta", (char) 918);
267: HTMLFactory.newEntity("Eta", (char) 919);
268: HTMLFactory.newEntity("Theta", (char) 920);
269: HTMLFactory.newEntity("Iota", (char) 921);
270: HTMLFactory.newEntity("Kappa", (char) 922);
271: HTMLFactory.newEntity("Lambda", (char) 923);
272: HTMLFactory.newEntity("Mu", (char) 924);
273: HTMLFactory.newEntity("Nu", (char) 925);
274: HTMLFactory.newEntity("Xi", (char) 926);
275: HTMLFactory.newEntity("Omicron", (char) 927);
276: HTMLFactory.newEntity("Pi", (char) 928);
277: HTMLFactory.newEntity("Rho", (char) 929);
278: HTMLFactory.newEntity("Sigma", (char) 931);
279: HTMLFactory.newEntity("Tau", (char) 932);
280: HTMLFactory.newEntity("Upsilon", (char) 933);
281: HTMLFactory.newEntity("Phi", (char) 934);
282: HTMLFactory.newEntity("Chi", (char) 935);
283: HTMLFactory.newEntity("Psi", (char) 936);
284: HTMLFactory.newEntity("Omega", (char) 937);
285: HTMLFactory.newEntity("alpha", (char) 945);
286: HTMLFactory.newEntity("beta", (char) 946);
287: HTMLFactory.newEntity("gamma", (char) 947);
288: HTMLFactory.newEntity("delta", (char) 948);
289: HTMLFactory.newEntity("epsilon", (char) 949);
290: HTMLFactory.newEntity("zeta", (char) 950);
291: HTMLFactory.newEntity("eta", (char) 951);
292: HTMLFactory.newEntity("theta", (char) 952);
293: HTMLFactory.newEntity("iota", (char) 953);
294: HTMLFactory.newEntity("kappa", (char) 954);
295: HTMLFactory.newEntity("lambda", (char) 955);
296: HTMLFactory.newEntity("mu", (char) 956);
297: HTMLFactory.newEntity("nu", (char) 957);
298: HTMLFactory.newEntity("xi", (char) 958);
299: HTMLFactory.newEntity("omicron", (char) 959);
300: HTMLFactory.newEntity("pi", (char) 960);
301: HTMLFactory.newEntity("rho", (char) 961);
302: HTMLFactory.newEntity("sigmaf", (char) 962);
303: HTMLFactory.newEntity("sigma", (char) 963);
304: HTMLFactory.newEntity("tau", (char) 964);
305: HTMLFactory.newEntity("upsilon", (char) 965);
306: HTMLFactory.newEntity("phi", (char) 966);
307: HTMLFactory.newEntity("chi", (char) 967);
308: HTMLFactory.newEntity("psi", (char) 968);
309: HTMLFactory.newEntity("omega", (char) 969);
310: HTMLFactory.newEntity("thetasym", (char) 977);
311: HTMLFactory.newEntity("upsih", (char) 978);
312: HTMLFactory.newEntity("piv", (char) 982);
313: HTMLFactory.newEntity("bull", (char) 8226);
314: HTMLFactory.newEntity("hellip", (char) 8230);
315: HTMLFactory.newEntity("prime", (char) 8242);
316: HTMLFactory.newEntity("Prime", (char) 8243);
317: HTMLFactory.newEntity("oline", (char) 8254);
318: HTMLFactory.newEntity("frasl", (char) 8260);
319: HTMLFactory.newEntity("weierp", (char) 8472);
320: HTMLFactory.newEntity("image", (char) 8465);
321: HTMLFactory.newEntity("real", (char) 8476);
322: HTMLFactory.newEntity("trade", (char) 8482);
323: HTMLFactory.newEntity("alefsym", (char) 8501);
324: HTMLFactory.newEntity("larr", (char) 8592);
325: HTMLFactory.newEntity("uarr", (char) 8593);
326: HTMLFactory.newEntity("rarr", (char) 8594);
327: HTMLFactory.newEntity("darr", (char) 8595);
328: HTMLFactory.newEntity("harr", (char) 8596);
329: HTMLFactory.newEntity("crarr", (char) 8629);
330: HTMLFactory.newEntity("lArr", (char) 8656);
331: HTMLFactory.newEntity("uArr", (char) 8657);
332: HTMLFactory.newEntity("rArr", (char) 8658);
333: HTMLFactory.newEntity("dArr", (char) 8659);
334: HTMLFactory.newEntity("hArr", (char) 8660);
335: HTMLFactory.newEntity("forall", (char) 8704);
336: HTMLFactory.newEntity("part", (char) 8706);
337: HTMLFactory.newEntity("exist", (char) 8707);
338: HTMLFactory.newEntity("empty", (char) 8709);
339: HTMLFactory.newEntity("nabla", (char) 8711);
340: HTMLFactory.newEntity("isin", (char) 8712);
341: HTMLFactory.newEntity("notin", (char) 8713);
342: HTMLFactory.newEntity("ni", (char) 8715);
343: HTMLFactory.newEntity("prod", (char) 8719);
344: HTMLFactory.newEntity("sum", (char) 8721);
345: HTMLFactory.newEntity("minus", (char) 8722);
346: HTMLFactory.newEntity("lowast", (char) 8727);
347: HTMLFactory.newEntity("radic", (char) 8730);
348: HTMLFactory.newEntity("prop", (char) 8733);
349: HTMLFactory.newEntity("infin", (char) 8734);
350: HTMLFactory.newEntity("ang", (char) 8736);
351: HTMLFactory.newEntity("and", (char) 8743);
352: HTMLFactory.newEntity("or", (char) 8744);
353: HTMLFactory.newEntity("cap", (char) 8745);
354: HTMLFactory.newEntity("cup", (char) 8746);
355: HTMLFactory.newEntity("int", (char) 8747);
356: HTMLFactory.newEntity("there4", (char) 8756);
357: HTMLFactory.newEntity("sim", (char) 8764);
358: HTMLFactory.newEntity("cong", (char) 8773);
359: HTMLFactory.newEntity("asymp", (char) 8776);
360: HTMLFactory.newEntity("ne", (char) 8800);
361: HTMLFactory.newEntity("equiv", (char) 8801);
362: HTMLFactory.newEntity("le", (char) 8804);
363: HTMLFactory.newEntity("ge", (char) 8805);
364: HTMLFactory.newEntity("sub", (char) 8834);
365: HTMLFactory.newEntity("sup", (char) 8835);
366: HTMLFactory.newEntity("nsub", (char) 8836);
367: HTMLFactory.newEntity("sube", (char) 8838);
368: HTMLFactory.newEntity("supe", (char) 8839);
369: HTMLFactory.newEntity("oplus", (char) 8853);
370: HTMLFactory.newEntity("otimes", (char) 8855);
371: HTMLFactory.newEntity("perp", (char) 8869);
372: HTMLFactory.newEntity("sdot", (char) 8901);
373: HTMLFactory.newEntity("lceil", (char) 8968);
374: HTMLFactory.newEntity("rceil", (char) 8969);
375: HTMLFactory.newEntity("lfloor", (char) 8970);
376: HTMLFactory.newEntity("rfloor", (char) 8971);
377: HTMLFactory.newEntity("lang", (char) 9001);
378: HTMLFactory.newEntity("rang", (char) 9002);
379: HTMLFactory.newEntity("loz", (char) 9674);
380: HTMLFactory.newEntity("spades", (char) 9824);
381: HTMLFactory.newEntity("clubs", (char) 9827);
382: HTMLFactory.newEntity("hearts", (char) 9829);
383: HTMLFactory.newEntity("diams", (char) 9830);
384: }
385: }
|