001: /*
002: * Copyright (c) 1998-2008 Caucho Technology -- all rights reserved
003: *
004: * This file is part of Resin(R) Open Source
005: *
006: * Each copy or derived work must preserve the copyright notice and this
007: * notice unmodified.
008: *
009: * Resin Open Source is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU General Public License as published by
011: * the Free Software Foundation; either version 2 of the License, or
012: * (at your option) any later version.
013: *
014: * Resin Open Source is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
017: * of NON-INFRINGEMENT. See the GNU General Public License for more
018: * details.
019: *
020: * You should have received a copy of the GNU General Public License
021: * along with Resin Open Source; if not, write to the
022: * Free SoftwareFoundation, Inc.
023: * 59 Temple Place, Suite 330
024: * Boston, MA 02111-1307 USA
025: *
026: * @author Scott Ferguson
027: */
028:
029: package com.caucho.xml;
030:
031: import com.caucho.util.CharBuffer;
032: import com.caucho.util.CharCursor;
033: import com.caucho.util.CharScanner;
034: import com.caucho.util.IntMap;
035: import com.caucho.util.StringCharCursor;
036:
037: import org.w3c.dom.Element;
038:
039: import java.io.IOException;
040:
041: /**
042: * Policy for parsing an HTML file.
043: */
044: class HtmlPolicy extends Policy {
045: static final int DOCUMENT = 1;
046: static final int COMMENT = DOCUMENT + 1;
047: static final int TEXT = COMMENT + 1;
048: static final int JSP = TEXT + 1;
049: static final int WHITESPACE = JSP + 1;
050:
051: static final int HTML = WHITESPACE + 1;
052: static final int HEAD = HTML + 1;
053: static final int TITLE = HEAD + 1;
054: static final int ISINDEX = TITLE + 1;
055: static final int BASE = ISINDEX + 1;
056: static final int SCRIPT = BASE + 1;
057: static final int STYLE = SCRIPT + 1;
058: static final int META = STYLE + 1;
059: static final int LINK = META + 1;
060: static final int OBJECT = LINK + 1;
061:
062: static final int BODY = OBJECT + 1;
063:
064: static final int BASEFONT = BODY + 1;
065: static final int BR = BASEFONT + 1;
066: static final int AREA = BR + 1;
067: static final int IMG = AREA + 1;
068: static final int PARAM = IMG + 1;
069: static final int HR = PARAM + 1;
070: static final int INPUT = HR + 1;
071:
072: static final int P = INPUT + 1;
073: static final int DT = P + 1;
074: static final int DD = DT + 1;
075: static final int LI = DD + 1;
076: static final int OPTION = LI + 1;
077:
078: static final int TABLE = OPTION + 1;
079: static final int CAPTION = TABLE + 1;
080: static final int THEAD = CAPTION + 1;
081: static final int TFOOT = THEAD + 1;
082: static final int COL = TFOOT + 1;
083: static final int COLGROUP = COL + 1;
084: static final int TBODY = COLGROUP + 1;
085: static final int TR = TBODY + 1;
086: static final int TD = TR + 1;
087: static final int TH = TD + 1;
088:
089: static final int FRAME = TH + 1;
090: static final int FRAMESET = FRAME + 1;
091:
092: static final int BLOCK = FRAMESET + 1;
093: static final int INLINE = BLOCK + 1;
094:
095: static IntMap names;
096: static IntMap cbNames;
097:
098: static QName htmlName = new QName(null, "html", null);
099: static QName headName = new QName(null, "head", null);
100: static QName bodyName = new QName(null, "body", null);
101:
102: boolean toLower = true;
103: boolean isJsp = false;
104: boolean autoHtml = false;
105: boolean hasBody = false;
106: boolean autoHead = false;
107:
108: CharBuffer cb = new CharBuffer();
109:
110: public void init() {
111: toLower = true;
112: isJsp = false;
113: autoHtml = false;
114: hasBody = false;
115: autoHead = false;
116: }
117:
118: /**
119: * When true, HTML parsing normalizes HTML tags to lower case.
120: */
121: public void setToLower(boolean toLower) {
122: this .toLower = toLower;
123: }
124:
125: /**
126: * When true, treat text before HTML specially.
127: */
128: public void setJsp(boolean isJsp) {
129: this .isJsp = isJsp;
130: }
131:
132: /**
133: * Return the normalized name.
134: *
135: * @param tag the raw name in the XML file.
136: *
137: * @return the normalized name.
138: */
139: QName getName(CharBuffer tag) {
140: if (!toLower)
141: return super .getName(tag);
142:
143: cb.clear();
144: cb.append(tag);
145: cb.toLowerCase();
146:
147: int name = cbNames.get(cb);
148:
149: if (name >= 0)
150: return super .getName(cb);
151: else
152: return super .getName(tag);
153: }
154:
155: QName getAttributeName(CharBuffer eltName, CharBuffer source) {
156: if (!toLower)
157: return super .getName(source);
158:
159: cb.clear();
160: cb.append(eltName);
161: cb.toLowerCase();
162: int name = cbNames.get(cb);
163:
164: if (name < 0)
165: return super .getName(source);
166: else {
167: source.toLowerCase();
168: return super .getName(source);
169: }
170: }
171:
172: /**
173: * Returns the appropriate action when opening a HTML tag.
174: *
175: * @param parser the XML parser
176: * @param node the parent node
177: * @param next the next child
178: * @return the action code
179: */
180: int openAction(XmlParser parser, QName node, QName next)
181: throws XmlParseException {
182: String nodeName = node == null ? "#document" : node.getName();
183: String nextName = next.getName();
184:
185: int nextCode = names.get(nextName);
186:
187: switch (names.get(nodeName)) {
188: case DOCUMENT:
189: switch (nextCode) {
190: case HTML:
191: return PUSH;
192:
193: case COMMENT:
194: return PUSH;
195:
196: case HEAD:
197: case TITLE:
198: case ISINDEX:
199: case BASE:
200: case SCRIPT:
201: case STYLE:
202: case META:
203: case LINK:
204: case OBJECT:
205: opt = htmlName;
206: return PUSH_OPT;
207:
208: case WHITESPACE:
209: return IGNORE;
210:
211: case JSP:
212: return PUSH;
213:
214: default:
215: if (autoHtml)
216: return PUSH;
217:
218: autoHtml = true;
219: opt = htmlName;
220: return PUSH_OPT;
221: }
222:
223: case HTML:
224: switch (nextCode) {
225: case HTML:
226: return ERROR;
227:
228: case HEAD:
229: case COMMENT:
230: case FRAMESET:
231: return PUSH;
232:
233: case BODY:
234: hasBody = true;
235: return PUSH;
236:
237: case TITLE:
238: case ISINDEX:
239: case BASE:
240: case SCRIPT:
241: case STYLE:
242: case META:
243: case LINK:
244: case OBJECT:
245: opt = headName;
246: autoHead = true;
247: return PUSH_OPT;
248:
249: case WHITESPACE:
250: return PUSH;
251:
252: case JSP:
253: return PUSH;
254:
255: default:
256: if (hasBody)
257: return PUSH;
258:
259: hasBody = true;
260: opt = bodyName;
261: return PUSH_OPT;
262: }
263:
264: case HEAD:
265: switch (nextCode) {
266: case META:
267: // checkMetaEncoding((Element) next);
268: return PUSH_EMPTY;
269:
270: case LINK:
271: case ISINDEX:
272: case BASE:
273: return PUSH_EMPTY;
274:
275: case SCRIPT:
276: case STYLE:
277: return PUSH_VERBATIM;
278:
279: case TITLE:
280: case OBJECT:
281: return PUSH;
282:
283: case WHITESPACE:
284: return PUSH;
285:
286: case JSP:
287: case TEXT:
288: if (autoHead)
289: return POP;
290: else
291: return PUSH;
292:
293: default:
294: return POP;
295: }
296:
297: case LI:
298: switch (nextCode) {
299: case LI:
300: return POP;
301:
302: case BASEFONT:
303: case BR:
304: case AREA:
305: case LINK:
306: case IMG:
307: case PARAM:
308: case HR:
309: case INPUT:
310: case COL:
311: case FRAME:
312: case ISINDEX:
313: case BASE:
314: case META:
315: return PUSH_EMPTY;
316:
317: case SCRIPT:
318: case STYLE:
319: return PUSH_VERBATIM;
320:
321: default:
322: return PUSH;
323: }
324:
325: case OPTION:
326: switch (nextCode) {
327: case WHITESPACE:
328: case TEXT:
329: return PUSH;
330:
331: default:
332: return POP;
333: }
334:
335: case DD:
336: switch (nextCode) {
337: case DD:
338: case DT:
339: return POP;
340:
341: case BASEFONT:
342: case BR:
343: case AREA:
344: case LINK:
345: case IMG:
346: case PARAM:
347: case HR:
348: case INPUT:
349: case COL:
350: case FRAME:
351: case ISINDEX:
352: case BASE:
353: case META:
354: return PUSH_EMPTY;
355:
356: case SCRIPT:
357: case STYLE:
358: return PUSH_VERBATIM;
359:
360: default:
361: return PUSH;
362: }
363:
364: case THEAD:
365: case TFOOT:
366: case COLGROUP:
367: switch (nextCode) {
368: case THEAD:
369: case TFOOT:
370: case TBODY:
371: case COLGROUP:
372: case COL:
373: return POP;
374:
375: case BASEFONT:
376: case BR:
377: case AREA:
378: case LINK:
379: case IMG:
380: case PARAM:
381: case HR:
382: case INPUT:
383: case FRAME:
384: case ISINDEX:
385: case BASE:
386: case META:
387: return PUSH_EMPTY;
388:
389: case SCRIPT:
390: case STYLE:
391: return PUSH_VERBATIM;
392:
393: default:
394: return PUSH;
395: }
396:
397: case TR:
398: switch (nextCode) {
399: case THEAD:
400: case TFOOT:
401: case TBODY:
402: case COLGROUP:
403: case COL:
404: case TR:
405: return POP;
406:
407: case BASEFONT:
408: case BR:
409: case AREA:
410: case LINK:
411: case IMG:
412: case PARAM:
413: case HR:
414: case INPUT:
415: case FRAME:
416: case ISINDEX:
417: case BASE:
418: case META:
419: return PUSH_EMPTY;
420:
421: case TD:
422: case TH:
423: return PUSH;
424:
425: case SCRIPT:
426: case STYLE:
427: return PUSH_VERBATIM;
428:
429: default:
430: return PUSH;
431: }
432:
433: case TD:
434: case TH:
435: switch (nextCode) {
436: case THEAD:
437: case TFOOT:
438: case TBODY:
439: case COLGROUP:
440: case COL:
441: case TR:
442: case TD:
443: case TH:
444: return POP;
445:
446: case BASEFONT:
447: case BR:
448: case AREA:
449: case LINK:
450: case IMG:
451: case PARAM:
452: case HR:
453: case INPUT:
454: case FRAME:
455: case ISINDEX:
456: case BASE:
457: case META:
458: return PUSH_EMPTY;
459:
460: case SCRIPT:
461: case STYLE:
462: return PUSH_VERBATIM;
463:
464: default:
465: return PUSH;
466: }
467:
468: case P:
469: case DT:
470: switch (nextCode) {
471: case BLOCK:
472: case P:
473: case TABLE:
474: case CAPTION:
475: case THEAD:
476: case TFOOT:
477: case COLGROUP:
478: case TBODY:
479: case TR:
480: case TD:
481: case TH:
482: case DT:
483: case LI:
484: return POP;
485:
486: case BASEFONT:
487: case BR:
488: case AREA:
489: case LINK:
490: case IMG:
491: case PARAM:
492: case HR:
493: case INPUT:
494: case COL:
495: case FRAME:
496: case ISINDEX:
497: case BASE:
498: case META:
499: return PUSH_EMPTY;
500:
501: case SCRIPT:
502: case STYLE:
503: return PUSH_VERBATIM;
504:
505: default:
506: return PUSH;
507: }
508:
509: case TABLE:
510: switch (nextCode) {
511: case CAPTION:
512: case THEAD:
513: case TFOOT:
514: case COL:
515: case COLGROUP:
516: case TBODY:
517: case TR:
518: return PUSH;
519:
520: case SCRIPT:
521: case STYLE:
522: return PUSH_VERBATIM;
523:
524: default:
525: /*
526: opt = "tr";
527: return PUSH_OPT;
528: */
529: return PUSH;
530: }
531:
532: default:
533: switch (nextCode) {
534: case BASEFONT:
535: case BR:
536: case AREA:
537: case LINK:
538: case IMG:
539: case PARAM:
540: case HR:
541: case INPUT:
542: case COL:
543: case FRAME:
544: case ISINDEX:
545: case BASE:
546: case META:
547: return PUSH_EMPTY;
548:
549: case SCRIPT:
550: case STYLE:
551: return PUSH_VERBATIM;
552:
553: default:
554: return PUSH;
555: }
556: }
557: }
558:
559: private static CharScanner charsetScanner = new CharScanner(" \t=;");
560:
561: private void checkMetaEncoding(Element elt) {
562: String http = elt.getAttribute("http-equiv");
563: String content = elt.getAttribute("content");
564: if (http.equals("") || content.equals("")
565: || !http.equalsIgnoreCase("content-type"))
566: return;
567:
568: CharCursor cursor = new StringCharCursor(content);
569: charsetScanner.scan(cursor);
570: charsetScanner.skip(cursor);
571: CharBuffer buf = CharBuffer.allocate();
572: while (cursor.current() != cursor.DONE) {
573: buf.clear();
574: charsetScanner.scan(cursor, buf);
575: if (buf.toString().equalsIgnoreCase("charset")) {
576: charsetScanner.skip(cursor);
577: buf.clear();
578: charsetScanner.scan(cursor, buf);
579: if (buf.length() > 0) {
580: try {
581: is.setEncoding(buf.close());
582: } catch (IOException e) {
583: }
584: return;
585: }
586: }
587: }
588: }
589:
590: int elementCloseAction(XmlParser parser, QName node, String tagEnd)
591: throws XmlParseException {
592: String nodeName = node.getName();
593: if (nodeName.equals(tagEnd))
594: return POP;
595:
596: if (nodeName == "#document" && tagEnd.equals("")) {
597: /*
598: Document doc = (Document) node;
599:
600: // If JSP, move any text into the body element
601: if (isJsp && doc.getDocumentElement() == null &&
602: node.getFirstChild() instanceof Text) {
603: Element html = doc.createElement("html");
604: doc.appendChild(html);
605: Element body = doc.createElement("body");
606: html.appendChild(body);
607: Node child;
608: while ((child = doc.getFirstChild()) instanceof Text ||
609: child instanceof Comment) {
610: body.appendChild(child);
611: }
612: }
613: */
614: return POP;
615: }
616: switch (names.get(tagEnd)) {
617: case BASEFONT:
618: case BR:
619: case AREA:
620: case LINK:
621: case IMG:
622: case PARAM:
623: case HR:
624: case INPUT:
625: case COL:
626: case FRAME:
627: case ISINDEX:
628: case BASE:
629: case META:
630: String errorTagEnd;
631: if (tagEnd.equals(""))
632: errorTagEnd = L.l("end of file");
633: else
634: errorTagEnd = "`<" + tagEnd + ">'";
635:
636: throw parser.error(L.l("{0} expects to be empty",
637: errorTagEnd));
638: }
639:
640: switch (names.get(nodeName)) {
641: case BODY:
642: case P:
643: case DT:
644: case DD:
645: case LI:
646: case OPTION:
647: case THEAD:
648: case TFOOT:
649: case TBODY:
650: case COLGROUP:
651: case TR:
652: case TH:
653: case TD:
654: return POP_AND_LOOP;
655:
656: case HTML:
657: case HEAD:
658: // If JSP and missing a body, move any text into the body element
659: /*
660: if (isJsp && node.getLastChild() instanceof Text) {
661: Node child;
662:
663: for (child = node.getLastChild();
664: child != null;
665: child = child.getPreviousSibling()) {
666: if (child.getNodeName().equals("body"))
667: return POP_AND_LOOP;
668: }
669:
670: Document doc = node.getOwnerDocument();
671: Element body = doc.createElement("body");
672:
673: while ((child = node.getLastChild()) instanceof Text ||
674: child instanceof Comment) {
675: body.insertBefore(child, body.getFirstChild());
676: }
677:
678: doc.getDocumentElement().appendChild(body);
679: }
680: */
681: return POP_AND_LOOP;
682:
683: default:
684:
685: if (forgiving) {
686: /*
687: Node parent = node;
688: for (; parent != null; parent = parent.getParentNode()) {
689: if (parent.getNodeName().equals(tagEnd))
690: return POP_AND_LOOP;
691: }
692: return IGNORE;
693: */
694: return POP_AND_LOOP;
695: }
696:
697: String errorTagEnd;
698: if (tagEnd.equals(""))
699: errorTagEnd = L.l("end of file");
700: else
701: errorTagEnd = "`</" + tagEnd + ">'";
702:
703: String expect;
704: if (nodeName.equals("#document")) {
705: throw parser.error(L.l("expected {0} at {1}", L
706: .l("end of document"), errorTagEnd));
707: } else
708: expect = "`</" + nodeName + ">'";
709:
710: throw parser.error(L.l("expected {0} at {1} (open at {2})",
711: expect, errorTagEnd, "" + parser.getNodeLine()));
712: }
713: }
714:
715: private static void addName(String name, int code) {
716: names.put(name, code);
717: cbNames.put(new CharBuffer(name), code);
718:
719: String upper = name.toUpperCase();
720: names.put(upper, code);
721: cbNames.put(new CharBuffer(upper), code);
722: }
723:
724: static {
725: names = new IntMap();
726: cbNames = new IntMap();
727:
728: addName("#document", DOCUMENT);
729: addName("#comment", COMMENT);
730: addName("#text", TEXT);
731: addName("#jsp", JSP);
732: addName("#whitespace", WHITESPACE);
733: addName("html", HTML);
734:
735: addName("head", HEAD);
736: addName("title", TITLE);
737: addName("isindex", ISINDEX);
738: addName("base", BASE);
739: addName("script", SCRIPT);
740: addName("style", STYLE);
741: addName("meta", META);
742: addName("link", LINK);
743: addName("object", OBJECT);
744:
745: addName("body", BODY);
746:
747: addName("basefont", BASEFONT);
748: addName("br", BR);
749: addName("area", AREA);
750: addName("link", LINK);
751: addName("img", IMG);
752: addName("param", PARAM);
753: addName("hr", HR);
754: addName("input", INPUT);
755: addName("frame", FRAME);
756:
757: addName("p", P);
758: addName("dt", DT);
759: addName("dd", DD);
760: addName("li", LI);
761: addName("option", OPTION);
762:
763: addName("table", TABLE);
764: addName("caption", CAPTION);
765: addName("thead", THEAD);
766: addName("tfoot", TFOOT);
767: addName("col", COL);
768: addName("colgroup", COLGROUP);
769: addName("tbody", TBODY);
770: addName("tr", TR);
771: addName("th", TH);
772: addName("td", TD);
773:
774: addName("h1", BLOCK);
775: addName("h2", BLOCK);
776: addName("h3", BLOCK);
777: addName("h4", BLOCK);
778: addName("h5", BLOCK);
779: addName("h6", BLOCK);
780: addName("ul", BLOCK);
781: addName("ol", BLOCK);
782: addName("dir", BLOCK);
783: addName("menu", BLOCK);
784: addName("pre", BLOCK);
785: addName("dl", BLOCK);
786: addName("div", BLOCK);
787: addName("center", BLOCK);
788: addName("noscript", BLOCK);
789: addName("noframes", BLOCK);
790: addName("blockquote", BLOCK);
791: addName("form", BLOCK);
792: addName("fieldset", BLOCK);
793: addName("address", BLOCK);
794:
795: addName("tt", INLINE);
796: addName("i", INLINE);
797: addName("b", INLINE);
798: addName("u", INLINE);
799: addName("s", INLINE);
800: addName("strike", INLINE);
801: addName("big", INLINE);
802: addName("small", INLINE);
803:
804: addName("em", INLINE);
805: addName("strong", INLINE);
806: addName("dfn", INLINE);
807: addName("code", INLINE);
808: addName("samp", INLINE);
809: addName("kbd", INLINE);
810: addName("var", INLINE);
811: addName("cite", INLINE);
812: addName("abbr", INLINE);
813: addName("acronym", INLINE);
814: addName("font", INLINE);
815: addName("iframe", INLINE);
816: addName("applet", INLINE);
817: addName("ins", INLINE);
818: addName("del", INLINE);
819:
820: addName("a", INLINE);
821: addName("map", INLINE);
822: addName("q", INLINE);
823: addName("sub", INLINE);
824: addName("sup", INLINE);
825: addName("span", INLINE);
826: addName("bdo", INLINE);
827:
828: addName("select", INLINE);
829: addName("textarea", INLINE);
830: addName("label", INLINE);
831: addName("optgroup", INLINE);
832: addName("button", INLINE);
833: addName("legend", INLINE);
834: addName("frameset", FRAMESET);
835:
836: // CDATA -- STYLE, SCRIPT
837: }
838: }
|