001: package net.sf.saxon.event;
002:
003: import net.sf.saxon.trans.DynamicError;
004: import net.sf.saxon.trans.XPathException;
005:
006: import javax.xml.transform.OutputKeys;
007:
008: /**
009: * This class generates HTML output
010: * @author Michael H. Kay
011: */
012:
013: public class HTMLEmitter extends XMLEmitter {
014:
015: /**
016: * Preferred character representations
017: */
018:
019: private static final int REP_NATIVE = 0;
020: private static final int REP_ENTITY = 1;
021: private static final int REP_DECIMAL = 2;
022: private static final int REP_HEX = 3;
023:
024: private int nonASCIIRepresentation = REP_NATIVE;
025: private int excludedRepresentation = REP_DECIMAL;
026: //private String mediaType = "text/html";
027: private int inScript;
028: private boolean started = false;
029: private String elementName;
030: private short uriCode;
031:
032: /**
033: * Decode preferred representation
034: */
035:
036: private static int representationCode(String rep) {
037: if (rep.equalsIgnoreCase("native"))
038: return REP_NATIVE;
039: if (rep.equalsIgnoreCase("entity"))
040: return REP_ENTITY;
041: if (rep.equalsIgnoreCase("decimal"))
042: return REP_DECIMAL;
043: if (rep.equalsIgnoreCase("hex"))
044: return REP_HEX;
045: return REP_ENTITY;
046: }
047:
048: /**
049: * Table of HTML tags that have no closing tag
050: */
051:
052: static HTMLTagHashSet emptyTags = new HTMLTagHashSet(31);
053:
054: static {
055: setEmptyTag("area");
056: setEmptyTag("base");
057: setEmptyTag("basefont");
058: setEmptyTag("br");
059: setEmptyTag("col");
060: setEmptyTag("frame");
061: setEmptyTag("hr");
062: setEmptyTag("img");
063: setEmptyTag("input");
064: setEmptyTag("isindex");
065: setEmptyTag("link");
066: setEmptyTag("meta");
067: setEmptyTag("param");
068: }
069:
070: private static void setEmptyTag(String tag) {
071: emptyTags.add(tag);
072: }
073:
074: protected static boolean isEmptyTag(String tag) {
075: return emptyTags.contains(tag);
076: }
077:
078: /**
079: * Table of boolean attributes
080: */
081:
082: // we use two HashMaps to avoid unnecessary string concatenations
083: private static HTMLTagHashSet booleanAttributes = new HTMLTagHashSet(
084: 31);
085: private static HTMLTagHashSet booleanCombinations = new HTMLTagHashSet(
086: 53);
087:
088: static {
089: setBooleanAttribute("area", "nohref");
090: setBooleanAttribute("button", "disabled");
091: setBooleanAttribute("dir", "compact");
092: setBooleanAttribute("dl", "compact");
093: setBooleanAttribute("frame", "noresize");
094: setBooleanAttribute("hr", "noshade");
095: setBooleanAttribute("img", "ismap");
096: setBooleanAttribute("input", "checked");
097: setBooleanAttribute("input", "disabled");
098: setBooleanAttribute("input", "readonly");
099: setBooleanAttribute("menu", "compact");
100: setBooleanAttribute("object", "declare");
101: setBooleanAttribute("ol", "compact");
102: setBooleanAttribute("optgroup", "disabled");
103: setBooleanAttribute("option", "selected");
104: setBooleanAttribute("option", "disabled");
105: setBooleanAttribute("script", "defer");
106: setBooleanAttribute("select", "multiple");
107: setBooleanAttribute("select", "disabled");
108: setBooleanAttribute("td", "nowrap");
109: setBooleanAttribute("textarea", "disabled");
110: setBooleanAttribute("textarea", "readonly");
111: setBooleanAttribute("th", "nowrap");
112: setBooleanAttribute("ul", "compact");
113: }
114:
115: private static void setBooleanAttribute(String element,
116: String attribute) {
117: booleanAttributes.add(attribute);
118: booleanCombinations.add(element + '+' + attribute);
119: }
120:
121: private static boolean isBooleanAttribute(String element,
122: String attribute, String value) {
123: if (!attribute.equalsIgnoreCase(value))
124: return false;
125: if (!booleanAttributes.contains(attribute))
126: return false;
127: return booleanCombinations.contains(element + '+' + attribute);
128: }
129:
/**
 * Create an HTML emitter. No state is set up here beyond the field initialisers.
 */

public HTMLEmitter() {
    super();
}
137:
138: /**
139: * Output start of document
140: */
141:
142: public void open() throws XPathException {
143: }
144:
145: protected void openDocument() throws XPathException {
146: if (writer == null) {
147: makeWriter();
148: }
149: if (started)
150: return;
151: started = true;
152: // This method is sometimes called twice, especially during an identity transform
153: // This check stops two DOCTYPE declarations being output.
154:
155: // String mime = outputProperties.getProperty(OutputKeys.MEDIA_TYPE);
156: // if (mime!=null) {
157: // mediaType = mime;
158: // }
159:
160: String byteOrderMark = outputProperties
161: .getProperty(SaxonOutputKeys.BYTE_ORDER_MARK);
162:
163: if ("yes".equals(byteOrderMark)
164: && "UTF-8".equalsIgnoreCase(outputProperties
165: .getProperty(OutputKeys.ENCODING))) {
166: try {
167: writer.write('\uFEFF');
168: } catch (java.io.IOException err) {
169: // Might be an encoding exception; just ignore it
170: }
171: }
172:
173: String systemId = outputProperties
174: .getProperty(OutputKeys.DOCTYPE_SYSTEM);
175: String publicId = outputProperties
176: .getProperty(OutputKeys.DOCTYPE_PUBLIC);
177:
178: if (systemId != null || publicId != null) {
179: writeDocType("html", systemId, publicId);
180: }
181:
182: empty = false;
183: inScript = -1000000;
184:
185: String representation = outputProperties
186: .getProperty(SaxonOutputKeys.CHARACTER_REPRESENTATION);
187: if (representation != null) {
188: String nonASCIIrep;
189: String excludedRep;
190: int semi = representation.indexOf(';');
191: if (semi < 0) {
192: nonASCIIrep = representation;
193: excludedRep = representation;
194: } else {
195: nonASCIIrep = representation.substring(0, semi).trim();
196: excludedRep = representation.substring(semi + 1).trim();
197: }
198: nonASCIIRepresentation = representationCode(nonASCIIrep);
199: excludedRepresentation = representationCode(excludedRep);
200: if (excludedRepresentation == REP_NATIVE) {
201: excludedRepresentation = REP_ENTITY;
202: }
203: }
204:
205: }
206:
207: /**
208: * Output element start tag
209: */
210:
211: public void startElement(int nameCode, int typeCode,
212: int locationId, int properties) throws XPathException {
213:
214: super .startElement(nameCode, typeCode, locationId, properties);
215: uriCode = namePool.getURICode(nameCode);
216: elementName = (String) elementStack.peek();
217:
218: if (uriCode == 0
219: && (elementName.equalsIgnoreCase("script") || elementName
220: .equalsIgnoreCase("style"))) {
221: inScript = 0;
222: }
223: inScript++;
224: }
225:
226: public void startContent() throws XPathException {
227: closeStartTag(); // prevent <xxx/> syntax
228: }
229:
230: /**
231: * Write attribute name=value pair. Overrides the XML behaviour if the name and value
232: * are the same (we assume this is a boolean attribute to be minimised), or if the value is
233: * a URL.
234: */
235:
236: protected void writeAttribute(int elCode, String attname,
237: CharSequence value, int properties) throws XPathException {
238: try {
239: if (uriCode == 0) {
240: if (isBooleanAttribute(elementName, attname, value
241: .toString())) {
242: writer.write(attname);
243: return;
244: }
245: }
246: super .writeAttribute(elCode, attname, value, properties);
247: } catch (java.io.IOException err) {
248: throw new DynamicError(err);
249: }
250: }
251:
252: /**
253: * Escape characters. Overrides the XML behaviour
254: */
255:
256: protected void writeEscape(final CharSequence chars,
257: final boolean inAttribute) throws java.io.IOException,
258: XPathException {
259:
260: int segstart = 0;
261: final boolean[] specialChars = (inAttribute ? specialInAtt
262: : specialInText);
263: boolean disabled = false;
264:
265: while (segstart < chars.length()) {
266: int i = segstart;
267:
268: // find a maximal sequence of "ordinary" characters
269:
270: while (i < chars.length()
271: && (chars.charAt(i) < 127 ? !specialChars[chars
272: .charAt(i)]
273: : (characterSet.inCharset(chars.charAt(i)) ? nonASCIIRepresentation == REP_NATIVE
274: && chars.charAt(i) > 160
275: : false))) {
276: i++;
277: }
278:
279: // if this was the whole string, output the string and quit
280:
281: if (i == chars.length()) {
282: if (segstart == 0) {
283: writeCharSequence(chars);
284: } else {
285: writeCharSequence(chars.subSequence(segstart, i));
286: }
287: return;
288: }
289:
290: // otherwise, output this sequence and continue
291: if (i > segstart) {
292: writeCharSequence(chars.subSequence(segstart, i));
293: }
294:
295: final char c = chars.charAt(i);
296:
297: if (c == 0) {
298: // used to switch escaping on and off
299: disabled = !disabled;
300: } else if (disabled) {
301: writer.write(c);
302: } else if (c <= 127) {
303:
304: // handle a special ASCII character
305:
306: if (inAttribute) {
307: if (c == '<') {
308: writer.write('<'); // not escaped
309: } else if (c == '>') {
310: writer.write(">"); // recommended for older browsers
311: } else if (c == '&') {
312: if (i + 1 < chars.length()
313: && chars.charAt(i + 1) == '{') {
314: writer.write('&'); // not escaped if followed by '{'
315: } else {
316: writer.write("&");
317: }
318: } else if (c == '\"') {
319: writer.write(""");
320: } else if (c == '\n') {
321: writer.write("
");
322: }
323: } else {
324: if (c == '<') {
325: writer.write("<");
326: } else if (c == '>') {
327: writer.write(">"); // changed to allow for "]]>"
328: } else if (c == '&') {
329: writer.write("&");
330: }
331: }
332:
333: } else if (c == 160) {
334: // always output NBSP as an entity reference
335: writer.write(" ");
336:
337: } else if (c >= 127 && c < 160) {
338: // these control characters are illegal in HTML
339: DynamicError err = new DynamicError(
340: "Illegal HTML character: decimal " + (int) c);
341: err.setErrorCode("SERE0014");
342: throw err;
343:
344: } else if (c >= 55296 && c <= 56319) { //handle surrogate pair
345:
346: //A surrogate pair is two consecutive Unicode characters. The first
347: //is in the range D800 to DBFF, the second is in the range DC00 to DFFF.
348: //To compute the numeric value of the character corresponding to a surrogate
349: //pair, use this formula (all numbers are hex):
350: //(FirstChar - D800) * 400 + (SecondChar - DC00) + 10000
351:
352: // we'll trust the data to be sound
353: int charval = (((int) c - 55296) * 1024)
354: + ((int) chars.charAt(i + 1) - 56320) + 65536;
355: outputCharacterReference(charval);
356: i++;
357:
358: } else if (characterSet.inCharset(c)) {
359: switch (nonASCIIRepresentation) {
360: case REP_NATIVE:
361: writer.write(c);
362: break;
363: case REP_ENTITY:
364: if (c > 160 && c <= 255) {
365:
366: // if chararacter in iso-8859-1, use an entity reference
367:
368: writer.write('&');
369: writer.write(latin1Entities[(int) c - 160]);
370: writer.write(';');
371: break;
372: }
373: // else fall through
374: case REP_DECIMAL:
375: preferHex = false;
376: outputCharacterReference(c);
377: break;
378: case REP_HEX:
379: preferHex = true;
380: // fall through
381: default:
382: outputCharacterReference(c);
383: break;
384: }
385:
386: } else { // output numeric character reference
387: preferHex = (excludedRepresentation == REP_HEX);
388: outputCharacterReference((int) c);
389: }
390:
391: segstart = ++i;
392: }
393:
394: }
395:
396: /**
397: * Output an element end tag.
398: */
399:
400: public void endElement() throws XPathException {
401: String name = (String) elementStack.peek();
402: inScript--;
403: if (inScript == 0) {
404: inScript = -1000000;
405: }
406:
407: if (isEmptyTag(name) && uriCode == 0) {
408: // no end tag required
409: elementStack.pop();
410: } else {
411: super .endElement();
412: }
413:
414: }
415:
416: /**
417: * Character data.
418: */
419:
420: public void characters(CharSequence chars, int locationId,
421: int properties) throws XPathException {
422: int options = properties;
423: if (inScript > 0) {
424: options |= ReceiverOptions.DISABLE_ESCAPING;
425: }
426: super .characters(chars, locationId, options);
427: }
428:
429: /**
430: * Handle a processing instruction.
431: */
432:
433: public void processingInstruction(String target, CharSequence data,
434: int locationId, int properties) throws XPathException {
435: if (empty) {
436: openDocument();
437: }
438: for (int i = 0; i < data.length(); i++) {
439: if (data.charAt(i) == '>') {
440: DynamicError err = new DynamicError(
441: "A processing instruction in HTML must not contain a > character");
442: err.setErrorCode("SERE0015");
443: throw err;
444: }
445: }
446: try {
447: writer.write("<?");
448: writer.write(target);
449: writer.write(' ');
450: writeCharSequence(data);
451: writer.write('>');
452: } catch (java.io.IOException err) {
453: throw new DynamicError(err);
454: }
455: }
456:
457: private static final String[] latin1Entities = {
458:
459: "nbsp", // " " -- no-break space = non-breaking space,
460: // U+00A0 ISOnum -->
461: "iexcl", // "¡" -- inverted exclamation mark, U+00A1 ISOnum -->
462: "cent", // "¢" -- cent sign, U+00A2 ISOnum -->
463: "pound", // "£" -- pound sign, U+00A3 ISOnum -->
464: "curren", // "¤" -- currency sign, U+00A4 ISOnum -->
465: "yen", // "¥" -- yen sign = yuan sign, U+00A5 ISOnum -->
466: "brvbar", // "¦" -- broken bar = broken vertical bar,
467: // U+00A6 ISOnum -->
468: "sect", // "§" -- section sign, U+00A7 ISOnum -->
469: "uml", // "¨" -- diaeresis = spacing diaeresis,
470: // U+00A8 ISOdia -->
471: "copy", // "©" -- copyright sign, U+00A9 ISOnum -->
472: "ordf", // "ª" -- feminine ordinal indicator, U+00AA ISOnum -->
473: "laquo", // "«" -- left-pointing double angle quotation mark
474: // = left pointing guillemet, U+00AB ISOnum -->
475: "not", // "¬" -- not sign, U+00AC ISOnum -->
476: "shy", // "­" -- soft hyphen = discretionary hyphen,
477: // U+00AD ISOnum -->
478: "reg", // "®" -- registered sign = registered trade mark sign,
479: // U+00AE ISOnum -->
480: "macr", // "¯" -- macron = spacing macron = overline
481: // = APL overbar, U+00AF ISOdia -->
482: "deg", // "°" -- degree sign, U+00B0 ISOnum -->
483: "plusmn", // "±" -- plus-minus sign = plus-or-minus sign,
484: // U+00B1 ISOnum -->
485: "sup2", // "²" -- superscript two = superscript digit two
486: // = squared, U+00B2 ISOnum -->
487: "sup3", // "³" -- superscript three = superscript digit three
488: // = cubed, U+00B3 ISOnum -->
489: "acute", // "´" -- acute accent = spacing acute,
490: // U+00B4 ISOdia -->
491: "micro", // "µ" -- micro sign, U+00B5 ISOnum -->
492: "para", // "¶" -- pilcrow sign = paragraph sign,
493: // U+00B6 ISOnum -->
494: "middot", // "·" -- middle dot = Georgian comma
495: // = Greek middle dot, U+00B7 ISOnum -->
496: "cedil", // "¸" -- cedilla = spacing cedilla, U+00B8 ISOdia -->
497: "sup1", // "¹" -- superscript one = superscript digit one,
498: // U+00B9 ISOnum -->
499: "ordm", // "º" -- masculine ordinal indicator,
500: // U+00BA ISOnum -->
501: "raquo", // "»" -- right-pointing double angle quotation mark
502: // = right pointing guillemet, U+00BB ISOnum -->
503: "frac14", // "¼" -- vulgar fraction one quarter
504: // = fraction one quarter, U+00BC ISOnum -->
505: "frac12", // "½" -- vulgar fraction one half
506: // = fraction one half, U+00BD ISOnum -->
507: "frac34", // "¾" -- vulgar fraction three quarters
508: // = fraction three quarters, U+00BE ISOnum -->
509: "iquest", // "¿" -- inverted question mark
510: // = turned question mark, U+00BF ISOnum -->
511: "Agrave", // "À" -- latin capital letter A with grave
512: // = latin capital letter A grave,
513: // U+00C0 ISOlat1 -->
514: "Aacute", // "Á" -- latin capital letter A with acute,
515: // U+00C1 ISOlat1 -->
516: "Acirc", // "Â" -- latin capital letter A with circumflex,
517: // U+00C2 ISOlat1 -->
518: "Atilde", // "Ã" -- latin capital letter A with tilde,
519: // U+00C3 ISOlat1 -->
520: "Auml", // "Ä" -- latin capital letter A with diaeresis,
521: // U+00C4 ISOlat1 -->
522: "Aring", // "Å" -- latin capital letter A with ring above
523: // = latin capital letter A ring,
524: // U+00C5 ISOlat1 -->
525: "AElig", // "Æ" -- latin capital letter AE
526: // = latin capital ligature AE,
527: // U+00C6 ISOlat1 -->
528: "Ccedil", // "Ç" -- latin capital letter C with cedilla,
529: // U+00C7 ISOlat1 -->
530: "Egrave", // "È" -- latin capital letter E with grave,
531: // U+00C8 ISOlat1 -->
532: "Eacute", // "É" -- latin capital letter E with acute,
533: // U+00C9 ISOlat1 -->
534: "Ecirc", // "Ê" -- latin capital letter E with circumflex,
535: // U+00CA ISOlat1 -->
536: "Euml", // "Ë" -- latin capital letter E with diaeresis,
537: // U+00CB ISOlat1 -->
538: "Igrave", // "Ì" -- latin capital letter I with grave,
539: // U+00CC ISOlat1 -->
540: "Iacute", // "Í" -- latin capital letter I with acute,
541: // U+00CD ISOlat1 -->
542: "Icirc", // "Î" -- latin capital letter I with circumflex,
543: // U+00CE ISOlat1 -->
544: "Iuml", // "Ï" -- latin capital letter I with diaeresis,
545: // U+00CF ISOlat1 -->
546: "ETH", // "Ð" -- latin capital letter ETH, U+00D0 ISOlat1 -->
547: "Ntilde", // "Ñ" -- latin capital letter N with tilde,
548: // U+00D1 ISOlat1 -->
549: "Ograve", // "Ò" -- latin capital letter O with grave,
550: // U+00D2 ISOlat1 -->
551: "Oacute", // "Ó" -- latin capital letter O with acute,
552: // U+00D3 ISOlat1 -->
553: "Ocirc", // "Ô" -- latin capital letter O with circumflex,
554: // U+00D4 ISOlat1 -->
555: "Otilde", // "Õ" -- latin capital letter O with tilde,
556: // U+00D5 ISOlat1 -->
557: "Ouml", // "Ö" -- latin capital letter O with diaeresis,
558: // U+00D6 ISOlat1 -->
559: "times", // "×" -- multiplication sign, U+00D7 ISOnum -->
560: "Oslash", // "Ø" -- latin capital letter O with stroke
561: // = latin capital letter O slash,
562: // U+00D8 ISOlat1 -->
563: "Ugrave", // "Ù" -- latin capital letter U with grave,
564: // U+00D9 ISOlat1 -->
565: "Uacute", // "Ú" -- latin capital letter U with acute,
566: // U+00DA ISOlat1 -->
567: "Ucirc", // "Û" -- latin capital letter U with circumflex,
568: // U+00DB ISOlat1 -->
569: "Uuml", // "Ü" -- latin capital letter U with diaeresis,
570: // U+00DC ISOlat1 -->
571: "Yacute", // "Ý" -- latin capital letter Y with acute,
572: // U+00DD ISOlat1 -->
573: "THORN", // "Þ" -- latin capital letter THORN,
574: // U+00DE ISOlat1 -->
575: "szlig", // "ß" -- latin small letter sharp s = ess-zed,
576: // U+00DF ISOlat1 -->
577: "agrave", // "à" -- latin small letter a with grave
578: // = latin small letter a grave,
579: // U+00E0 ISOlat1 -->
580: "aacute", // "á" -- latin small letter a with acute,
581: // U+00E1 ISOlat1 -->
582: "acirc", // "â" -- latin small letter a with circumflex,
583: // U+00E2 ISOlat1 -->
584: "atilde", // "ã" -- latin small letter a with tilde,
585: // U+00E3 ISOlat1 -->
586: "auml", // "ä" -- latin small letter a with diaeresis,
587: // U+00E4 ISOlat1 -->
588: "aring", // "å" -- latin small letter a with ring above
589: // = latin small letter a ring,
590: // U+00E5 ISOlat1 -->
591: "aelig", // "æ" -- latin small letter ae
592: // = latin small ligature ae, U+00E6 ISOlat1 -->
593: "ccedil", // "ç" -- latin small letter c with cedilla,
594: // U+00E7 ISOlat1 -->
595: "egrave", // "è" -- latin small letter e with grave,
596: // U+00E8 ISOlat1 -->
597: "eacute", // "é" -- latin small letter e with acute,
598: // U+00E9 ISOlat1 -->
599: "ecirc", // "ê" -- latin small letter e with circumflex,
600: // U+00EA ISOlat1 -->
601: "euml", // "ë" -- latin small letter e with diaeresis,
602: // U+00EB ISOlat1 -->
603: "igrave", // "ì" -- latin small letter i with grave,
604: // U+00EC ISOlat1 -->
605: "iacute", // "í" -- latin small letter i with acute,
606: // U+00ED ISOlat1 -->
607: "icirc", // "î" -- latin small letter i with circumflex,
608: // U+00EE ISOlat1 -->
609: "iuml", // "ï" -- latin small letter i with diaeresis,
610: // U+00EF ISOlat1 -->
611: "eth", // "ð" -- latin small letter eth, U+00F0 ISOlat1 -->
612: "ntilde", // "ñ" -- latin small letter n with tilde,
613: // U+00F1 ISOlat1 -->
614: "ograve", // "ò" -- latin small letter o with grave,
615: // U+00F2 ISOlat1 -->
616: "oacute", // "ó" -- latin small letter o with acute,
617: // U+00F3 ISOlat1 -->
618: "ocirc", // "ô" -- latin small letter o with circumflex,
619: // U+00F4 ISOlat1 -->
620: "otilde", // "õ" -- latin small letter o with tilde,
621: // U+00F5 ISOlat1 -->
622: "ouml", // "ö" -- latin small letter o with diaeresis,
623: // U+00F6 ISOlat1 -->
624: "divide", // "÷" -- division sign, U+00F7 ISOnum -->
625: "oslash", // "ø" -- latin small letter o with stroke,
626: // = latin small letter o slash,
627: // U+00F8 ISOlat1 -->
628: "ugrave", // "ù" -- latin small letter u with grave,
629: // U+00F9 ISOlat1 -->
630: "uacute", // "ú" -- latin small letter u with acute,
631: // U+00FA ISOlat1 -->
632: "ucirc", // "û" -- latin small letter u with circumflex,
633: // U+00FB ISOlat1 -->
634: "uuml", // "ü" -- latin small letter u with diaeresis,
635: // U+00FC ISOlat1 -->
636: "yacute", // "ý" -- latin small letter y with acute,
637: // U+00FD ISOlat1 -->
638: "thorn", // "þ" -- latin small letter thorn,
639: // U+00FE ISOlat1 -->
640: "yuml" // "ÿ" -- latin small letter y with diaeresis,
641: // U+00FF ISOlat1 -->
642: };
643:
644: }
645:
646: //
647: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
648: // you may not use this file except in compliance with the License. You may obtain a copy of the
649: // License at http://www.mozilla.org/MPL/
650: //
651: // Software distributed under the License is distributed on an "AS IS" basis,
652: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
653: // See the License for the specific language governing rights and limitations under the License.
654: //
655: // The Original Code is: all this file.
656: //
657: // The Initial Developer of the Original Code is Michael H. Kay.
658: //
659: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
660: //
661: // Contributor(s): none.
662: //
|