0001: /*
0002: * Copyright 2004 Outerthought bvba and Schaubroeck nv
0003: *
0004: * Licensed under the Apache License, Version 2.0 (the "License");
0005: * you may not use this file except in compliance with the License.
0006: * You may obtain a copy of the License at
0007: *
0008: * http://www.apache.org/licenses/LICENSE-2.0
0009: *
0010: * Unless required by applicable law or agreed to in writing, software
0011: * distributed under the License is distributed on an "AS IS" BASIS,
0012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0013: * See the License for the specific language governing permissions and
0014: * limitations under the License.
0015: */
0016: package org.outerj.daisy.htmlcleaner;
0017:
0018: import org.xml.sax.SAXException;
0019: import org.xml.sax.Attributes;
0020: import org.xml.sax.ContentHandler;
0021: import org.xml.sax.helpers.AttributesImpl;
0022: import org.outerj.daisy.xmlutil.SaxBuffer;
0023: import org.outerj.daisy.xmlutil.ForwardingContentHandler;
0024:
0025: import java.util.*;
0026:
0027: /**
0028: * Works on HTML input to clean it out to a
0029: * limited subset of HTML, mostly focussing on structural/semantic
0030: * elements (actually, what should be kept is configurable).
0031: *
0032: * <p>The input events should be in no namespace and contain
0033: * the html and body tags. All elements and attributes should already
0034: * be lowercased.</p>
0035: *
0036: * <p>All elements and attributes that are not explicitly allowed
0037: * in the configuration will be dropped (but
0038: * their character content will remain).</p>
0039: *
0040: * <p>Span and div elements are treated specially. They will only be
0041: * kept if their class attribute has one of the allowed values, specified
0042: * in the configuration of this component. Span elements that contain
0043: * a style attribute specifying bold and/or italic styling will
0044: * be converted to the equivalent strong/em tags.</p>
0045: *
0046: */
0047: class HtmlRepairer {
0048: private HtmlCleanerTemplate template;
0049: /**
0050: * Hardcoded set of elements that can be removed if they contain
0051: * no character data or only other wipeable elements. Usually used
0052: * for inline elements.
0053: */
0054: private static Set<String> wipeableEmptyElements;
0055: static {
0056: wipeableEmptyElements = new HashSet<String>();
0057: wipeableEmptyElements.add("strong");
0058: wipeableEmptyElements.add("em");
0059: wipeableEmptyElements.add("sub");
0060: wipeableEmptyElements.add("sup");
0061: wipeableEmptyElements.add("a");
0062: wipeableEmptyElements.add("tt");
0063: wipeableEmptyElements.add("ul");
0064: wipeableEmptyElements.add("del");
0065: wipeableEmptyElements.add("span");
0066: }
0067: private static final char[] NEWLINE = new char[] { '\n' };
0068: private static Set<String> contentBlockElements;
0069: static {
0070: contentBlockElements = new HashSet<String>();
0071: contentBlockElements.add("p");
0072: contentBlockElements.add("h1");
0073: contentBlockElements.add("h2");
0074: contentBlockElements.add("h3");
0075: contentBlockElements.add("h4");
0076: contentBlockElements.add("h5");
0077: contentBlockElements.add("blockquote");
0078: contentBlockElements.add("li");
0079: }
0080: private static Set<String> needsCleanupOfEndBrs;
0081: static {
0082: needsCleanupOfEndBrs = new HashSet<String>();
0083: needsCleanupOfEndBrs.add("th");
0084: needsCleanupOfEndBrs.add("td");
0085: needsCleanupOfEndBrs.add("li");
0086: }
0087:
/**
 * Creates a repairer that cleans according to the given template.
 *
 * @param template the configuration consulted by all cleaning steps
 */
public HtmlRepairer(HtmlCleanerTemplate template) {
    this.template = template;
}
0091:
0092: /**
0093: * Cleans the HTML stored in the SaxBuffer.
0094: *
0095: * @param buffer should only contain following types of events: start/endElement, start/endDocument, characters
0096: * @param contentHandler where the outcome will be send to
0097: */
0098: public void clean(SaxBuffer buffer, ContentHandler contentHandler)
0099: throws SAXException {
0100: Cleaner cleaner = new Cleaner(buffer, contentHandler);
0101: cleaner.clean();
0102: }
0103:
0104: private class Cleaner {
0105: private ContentHandler finalContentHandler;
0106: private SaxBuffer input;
0107:
0108: public Cleaner(SaxBuffer input, ContentHandler contentHandler) {
0109: this .input = input;
0110: this .finalContentHandler = contentHandler;
0111: }
0112:
0113: private void clean() throws SAXException {
0114: // cleaning happens in multiple stages to make the logic simpler.
0115: // The different stages are implemented in pull-style, by reading
0116: // events from a SaxBuffer instance.
0117:
0118: CleaningPipe pipe = new CleaningPipe();
0119: pipe.addCleaningStep(new ElementCleanup());
0120: pipe.addCleaningStep(new IntroducedParas());
0121: pipe.addCleaningStep(new StructuralCleanup());
0122: // do structural cleanup a second time, since it might have introduced new elements which need to be checked again
0123: pipe.addCleaningStep(new StructuralCleanup());
0124: pipe.addCleaningStep(new CleanupBrsAndEmptyContentBlocks());
0125: pipe.addCleaningStep(new CleanupWipeableEmptyElements());
0126: // do content block cleanup a second time, since cleanup of empty inline elements might have left empty content blocks
0127: pipe.addCleaningStep(new CleanupBrsAndEmptyContentBlocks());
0128: pipe.addCleaningStep(new TranslateBeeaarsInPees());
0129: pipe.addCleaningStep(new CleanupNewlineAtEndOfPre());
0130:
0131: pipe.execute(input.getBits(), finalContentHandler);
0132: }
0133: }
0134:
0135: /**
0136: * <ul>
0137: * <li>Makes sure all content is contained inside html/body
0138: * <li>Drops unallowed elements
0139: * <li>Does element translations (ie b into strong)
0140: * <li>Only outputs non-namespaced elements
0141: * </ul>
0142: */
0143: private class ElementCleanup implements CleaningStep {
0144:
0145: public void perform(List<SaxBuffer.SaxBit> bits,
0146: OutputHandler output) throws SAXException {
0147: Stack<XMLizable> endElements = new Stack<XMLizable>();
0148: boolean preSupported = template.descriptors
0149: .containsKey("pre");
0150:
0151: int i = 0;
0152: while (i < bits.size()) {
0153: Object bit = bits.get(i);
0154: if (bit instanceof SaxBuffer.StartElement) {
0155: SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
0156: if (!startElement.namespaceURI.equals("")) {
0157: // namespaced elements are dropped
0158: endElements.add(new EndElementInfo());
0159: } else {
0160: if (startElement.localName.equals("span")) {
0161: // two possibilities:
0162: // * has only class attribute with recognized value
0163: // * has style with certain recognized effects (bold/italic) -> translate to semantic correct tag.
0164: String classAttr = startElement.attrs
0165: .getValue("class");
0166: if (classAttr != null) {
0167: if (template.allowedSpanClasses
0168: .contains(classAttr)) {
0169: // make new attributes element to make sure there are no other attributes on the element
0170: AttributesImpl attrs = new AttributesImpl();
0171: attrs
0172: .addAttribute("", "class",
0173: "class", "CDATA",
0174: classAttr);
0175: output.startElement("span", attrs);
0176: endElements
0177: .push(new EndElementInfo(
0178: "span"));
0179: } else {
0180: // span element is dropped
0181: endElements
0182: .push(new EndElementInfo());
0183: }
0184: } else {
0185: String styleAttr = startElement.attrs
0186: .getValue("style");
0187: if (styleAttr != null) {
0188: StringTokenizer styleAttrTokenizer = new StringTokenizer(
0189: styleAttr, ";");
0190: boolean hasBold = false;
0191: boolean hasItalic = false;
0192: while (styleAttrTokenizer
0193: .hasMoreTokens()) {
0194: String styleToken = styleAttrTokenizer
0195: .nextToken();
0196: int colonPos = styleToken
0197: .indexOf(':');
0198: if (colonPos != -1) {
0199: String name = styleToken
0200: .substring(0,
0201: colonPos)
0202: .trim()
0203: .toLowerCase();
0204: String value = styleToken
0205: .substring(
0206: colonPos + 1)
0207: .trim()
0208: .toLowerCase();
0209: if (name
0210: .equals("font-weight")
0211: && value
0212: .equals("bold")) {
0213: hasBold = true;
0214: } else if (name
0215: .equals("font-style")
0216: && value
0217: .equals("italic")) {
0218: hasItalic = true;
0219: }
0220: }
0221: }
0222:
0223: MultiEndElementInfo endElement = new MultiEndElementInfo();
0224: if (hasBold) {
0225: output.startElement("strong",
0226: new AttributesImpl());
0227: endElement
0228: .add(new EndElementInfo(
0229: "strong"));
0230: }
0231: if (hasItalic) {
0232: output.startElement("em",
0233: new AttributesImpl());
0234: endElement
0235: .add(new EndElementInfo(
0236: "em"));
0237: }
0238: endElements.push(endElement);
0239: } else {
0240: endElements
0241: .push(new EndElementInfo());
0242: }
0243: }
0244: } else if (startElement.localName.equals("div")) {
0245: String classAttr = startElement.attrs
0246: .getValue("class");
0247: if (classAttr != null
0248: && template.allowedDivClasses
0249: .contains(classAttr)) {
0250: AttributesImpl attrs = new AttributesImpl();
0251: attrs.addAttribute("", "class",
0252: "class", "CDATA", classAttr);
0253: output.startElement("div", attrs);
0254: endElements.push(new EndElementInfo(
0255: "div"));
0256: } else if (classAttr != null
0257: && template.dropDivClasses
0258: .contains(classAttr)) {
0259: /** Skip over the content of the div element. */
0260: int openElementCounter = 0;
0261: while (true) {
0262: i++;
0263: if (i >= bits.size())
0264: throw new SAXException(
0265: "Reached end of input without encountering matching close div tag.");
0266:
0267: Object nextBit = bits.get(i);
0268: if (nextBit instanceof SaxBuffer.StartElement) {
0269: openElementCounter++;
0270: } else if (nextBit instanceof SaxBuffer.EndElement) {
0271: if (openElementCounter == 0) {
0272: break;
0273: }
0274: openElementCounter--;
0275: }
0276: }
0277: } else {
0278: // unallowed class, drop div element
0279: endElements.push(new EndElementInfo());
0280: }
0281: } else if (startElement.localName.equals("p")) {
0282: String classAttr = startElement.attrs
0283: .getValue("class");
0284: if (classAttr != null
0285: && template.allowedParaClasses
0286: .contains(classAttr)) {
0287: output
0288: .startElement(
0289: "p",
0290: getAllowedAttributes(startElement));
0291: endElements
0292: .push(new EndElementInfo("p"));
0293: } else {
0294: AttributesImpl attrs = getAllowedAttributes(startElement);
0295: int classPos = attrs.getIndex("class");
0296: if (classPos != -1)
0297: attrs.removeAttribute(classPos);
0298: output.startElement("p", attrs);
0299: endElements
0300: .push(new EndElementInfo("p"));
0301: }
0302: } else if (startElement.localName.equals("pre")
0303: && preSupported) {
0304: String classAttr = startElement.attrs
0305: .getValue("class");
0306: if (classAttr != null
0307: && template.allowedPreClasses
0308: .contains(classAttr)) {
0309: output
0310: .startElement(
0311: "pre",
0312: getAllowedAttributes(startElement));
0313: endElements.push(new EndElementInfo(
0314: "pre"));
0315: } else {
0316: AttributesImpl attrs = getAllowedAttributes(startElement);
0317: int classPos = attrs.getIndex("class");
0318: if (classPos != -1)
0319: attrs.removeAttribute(classPos);
0320: output.startElement("pre", attrs);
0321: endElements.push(new EndElementInfo(
0322: "pre"));
0323: }
0324: } else if (startElement.localName.equals("b")) {
0325: // translate to <strong>
0326: output.startElement("strong",
0327: new AttributesImpl());
0328: endElements.push(new EndElementInfo(
0329: "strong"));
0330: } else if (startElement.localName.equals("i")) {
0331: // translate to <em>
0332: output.startElement("em",
0333: new AttributesImpl());
0334: endElements.push(new EndElementInfo("em"));
0335: } else if (startElement.localName
0336: .equals("strike")) {
0337: // translate to <del>
0338: output.startElement("del",
0339: new AttributesImpl());
0340: endElements.push(new EndElementInfo("del"));
0341: } else if (startElement.localName
0342: .equals("html")) {
0343: if (output.openElements.size() != 0)
0344: throw new SAXException(
0345: "html element can only appear as root element.");
0346:
0347: output.startElement(startElement.localName,
0348: new AttributesImpl());
0349: endElements.push(new EndElementInfo(
0350: startElement.localName));
0351:
0352: // fast forward to body element
0353: while (true) {
0354: i++;
0355: if (i >= bits.size())
0356: throw new SAXException(
0357: "Reached end of input without encountering opening body tag.");
0358:
0359: Object nextBit = bits.get(i);
0360: if (nextBit instanceof SaxBuffer.StartElement
0361: && ((SaxBuffer.StartElement) nextBit).localName
0362: .equals("body")
0363: && ((SaxBuffer.StartElement) nextBit).namespaceURI
0364: .equals("")) {
0365: i--;
0366: break;
0367: }
0368: }
0369:
0370: } else if (startElement.localName
0371: .equals("body")) {
0372: if (output.openElements.size() != 1)
0373: throw new SAXException(
0374: "body element can only appear as child of html element");
0375:
0376: if (!output.openElements.get(0).getName()
0377: .equals("html"))
0378: throw new SAXException(
0379: "body element can only appear as child of html element");
0380:
0381: output.startElement("body",
0382: new AttributesImpl());
0383: endElements
0384: .push(new EndElementInfo("body"));
0385: } else if (startElement.localName.equals("img")
0386: && template.descriptors
0387: .containsKey("img")) {
0388: AttributesImpl attrs = getAllowedAttributes(startElement);
0389: if (template.imgAlternateSrcAttr != null) {
0390: String altSrc = startElement.attrs
0391: .getValue(template.imgAlternateSrcAttr);
0392: if (altSrc != null
0393: && !altSrc.equals("")) {
0394: int hrefIndex = attrs
0395: .getIndex("src");
0396: if (hrefIndex != -1)
0397: attrs.setValue(hrefIndex,
0398: altSrc);
0399: else
0400: attrs.addAttribute("", "src",
0401: "src", "CDATA", altSrc);
0402: }
0403: }
0404: output.startElement(startElement.localName,
0405: attrs);
0406: endElements.push(new EndElementInfo(
0407: startElement.localName));
0408: } else if (startElement.localName.equals("a")
0409: && template.descriptors
0410: .containsKey("a")) {
0411: AttributesImpl attrs = getAllowedAttributes(startElement);
0412: if (template.linkAlternateHrefAttr != null) {
0413: String altHref = startElement.attrs
0414: .getValue(template.linkAlternateHrefAttr);
0415: if (altHref != null
0416: && !altHref.equals("")) {
0417: int hrefIndex = attrs
0418: .getIndex("href");
0419: if (hrefIndex != -1)
0420: attrs.setValue(hrefIndex,
0421: altHref);
0422: else
0423: attrs.addAttribute("", "href",
0424: "href", "CDATA",
0425: altHref);
0426: }
0427: }
0428: output.startElement(startElement.localName,
0429: attrs);
0430: endElements.push(new EndElementInfo(
0431: startElement.localName));
0432: } else if (startElement.localName.equals("td")
0433: || startElement.localName.equals("th")) {
0434: AttributesImpl attrs = getAllowedAttributes(startElement);
0435:
0436: // remove dummy rowspan and colspan attributes
0437: String rowspan = attrs.getValue("rowspan");
0438: if (rowspan != null && rowspan.equals("1")) {
0439: attrs.removeAttribute(attrs
0440: .getIndex("rowspan"));
0441: }
0442: String colspan = attrs.getValue("colspan");
0443: if (colspan != null && colspan.equals("1")) {
0444: attrs.removeAttribute(attrs
0445: .getIndex("colspan"));
0446: }
0447:
0448: output.startElement(startElement.localName,
0449: attrs);
0450: endElements.push(new EndElementInfo(
0451: startElement.localName));
0452: } else if ((startElement.localName
0453: .equals("style") || startElement.localName
0454: .equals("script"))
0455: && !template.descriptors
0456: .containsKey(startElement.localName)) {
0457: // skip over the content
0458: int endPos = searchEndElement(bits, i);
0459: if (endPos == -1)
0460: throw new SAXException(
0461: "Abnormal situation which should never occur: didn't find closing tag for "
0462: + startElement.localName);
0463: i = endPos;
0464: } else if (template.descriptors
0465: .containsKey(startElement.localName)) {
0466: output.startElement(startElement.localName,
0467: getAllowedAttributes(startElement));
0468: endElements.push(new EndElementInfo(
0469: startElement.localName));
0470: } else {
0471: // skip element
0472: endElements.push(new EndElementInfo());
0473: }
0474: }
0475: } else if (bit instanceof SaxBuffer.EndElement) {
0476: XMLizable endElement = endElements.pop();
0477: endElement.toSAX(output);
0478: } else if (bit instanceof SaxBuffer.Characters) {
0479: ((SaxBuffer.Characters) bit).send(output);
0480: } else if (bit instanceof SaxBuffer.StartDocument) {
0481: output.startDocument();
0482: } else if (bit instanceof SaxBuffer.EndDocument) {
0483: output.endDocument();
0484: // don't do any events after endDocument
0485: return;
0486: }
0487: i++;
0488: }
0489: }
0490: }
0491:
0492: private AttributesImpl getAllowedAttributes(
0493: SaxBuffer.StartElement startElement) {
0494: // limit attributes to the allowed attributes
0495: String[] allowedAttributes = template.descriptors.get(
0496: startElement.localName).getAttributeNames();
0497: AttributesImpl attrs = new AttributesImpl();
0498: for (String allowedAttribute : allowedAttributes) {
0499: String value = startElement.attrs
0500: .getValue(allowedAttribute);
0501: if (value != null) {
0502: attrs.addAttribute("", allowedAttribute,
0503: allowedAttribute, "CDATA", value);
0504: }
0505: }
0506: return attrs;
0507: }
0508:
0509: /**
0510: * Puts p tags around all characters or elements that are child of html/body
0511: * but are not allowed there.
0512: */
0513: private class IntroducedParas implements CleaningStep {
0514:
0515: public void perform(List<SaxBuffer.SaxBit> bits,
0516: OutputHandler output) throws SAXException {
0517: Stack<XMLizable> endElements = new Stack<XMLizable>();
0518: Stack<Integer> introducedParas = new Stack<Integer>();
0519: ElementDescriptor bodyDescriptor = template.descriptors
0520: .get("body");
0521: ElementDescriptor tdDescriptor = template.descriptors
0522: .get("td");
0523: ElementDescriptor thDescriptor = template.descriptors
0524: .get("th");
0525: ElementDescriptor paraDescriptor = template.descriptors
0526: .get("p");
0527: ElementDescriptor blockQuoteDescriptor = template.descriptors
0528: .get("blockquote");
0529:
0530: int i = -1;
0531: while (i < bits.size()) {
0532: i++;
0533: Object bit = bits.get(i);
0534: if (bit instanceof SaxBuffer.StartElement) {
0535: SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
0536:
0537: if (!introducedParas.empty()
0538: && introducedParas.peek() == 0
0539: && !paraDescriptor
0540: .childAllowed(startElement.localName)) {
0541: output.endElement("p");
0542: introducedParas.pop();
0543: } else if (output.openElements.size() > 1) {
0544: StartElementInfo parentInfo = output.openElements
0545: .get(output.openElements.size() - 1);
0546: String parentName = parentInfo.getName();
0547: boolean startPara = (parentName.equals("body")
0548: && !bodyDescriptor
0549: .childAllowed(startElement.localName) && paraDescriptor
0550: .childAllowed(startElement.localName))
0551: || (parentName.equals("td")
0552: && !tdDescriptor
0553: .childAllowed(startElement.localName) && paraDescriptor
0554: .childAllowed(startElement.localName))
0555: || (parentName.equals("th")
0556: && !thDescriptor
0557: .childAllowed(startElement.localName) && paraDescriptor
0558: .childAllowed(startElement.localName))
0559: || (parentName.equals("blockquote")
0560: && !blockQuoteDescriptor
0561: .childAllowed(startElement.localName) && paraDescriptor
0562: .childAllowed(startElement.localName));
0563:
0564: if (startPara) {
0565: output.startElement("p",
0566: new AttributesImpl());
0567: introducedParas.push(0);
0568: }
0569: }
0570:
0571: if (!introducedParas.empty()) {
0572: introducedParas.push(introducedParas.pop() + 1);
0573: }
0574:
0575: output.startElement(startElement.localName,
0576: startElement.attrs);
0577: endElements.push(new EndElementInfo(
0578: startElement.localName));
0579:
0580: } else if (bit instanceof SaxBuffer.EndElement) {
0581: if (!introducedParas.empty()
0582: && introducedParas.peek() == 0) {
0583: output.endElement("p");
0584: introducedParas.pop();
0585: }
0586:
0587: XMLizable endElement = endElements.pop();
0588: endElement.toSAX(output);
0589:
0590: if (!introducedParas.empty()) {
0591: introducedParas.push(introducedParas.pop() - 1);
0592: }
0593: } else if (bit instanceof SaxBuffer.Characters) {
0594: if (output.openElements.size() > 1) {
0595: StartElementInfo parentInfo = output.openElements
0596: .get(output.openElements.size() - 1);
0597: String parentName = parentInfo.getName();
0598: boolean startPara = parentName.equals("body")
0599: || parentName.equals("td")
0600: || parentName.equals("th")
0601: || parentName.equals("blockquote");
0602: if (startPara) {
0603: output.startElement("p",
0604: new AttributesImpl());
0605: introducedParas.push(0);
0606: }
0607: }
0608: ((SaxBuffer.Characters) bit).send(output);
0609: } else if (bit instanceof SaxBuffer.StartDocument) {
0610: output.startDocument();
0611: } else if (bit instanceof SaxBuffer.EndDocument) {
0612: output.endDocument();
0613: // don't do any events after endDocument
0614: return;
0615: }
0616: }
0617: }
0618: }
0619:
0620: /**
0621: * Performs structural corrections, so that the end result is
0622: * limited to what XHTML1 allows (or at least close to it).
0623: */
0624: private class StructuralCleanup implements CleaningStep {
0625:
0626: public void perform(List<SaxBuffer.SaxBit> bits,
0627: OutputHandler output) throws SAXException {
0628: Stack<XMLizable> endElements = new Stack<XMLizable>();
0629: Map additionalEnds = new IdentityHashMap();
0630:
0631: int i = -1;
0632: while (i < bits.size()) {
0633: i++;
0634: Object bit = bits.get(i);
0635: if (bit instanceof SaxBuffer.StartElement) {
0636: SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
0637:
0638: ElementDescriptor descriptor = template.descriptors
0639: .get(startElement.localName);
0640: if (descriptor == null)
0641: throw new SAXException(
0642: "Missing ElementDescriptor for tagname "
0643: + startElement.localName);
0644:
0645: // check if this element can occur inside its parent
0646: if (output.openElements.size() > 0) {
0647: String parentElementName = (output.openElements
0648: .get(output.openElements.size() - 1))
0649: .getName();
0650: ElementDescriptor parentDescriptor = template.descriptors
0651: .get(parentElementName);
0652:
0653: boolean allowed = parentDescriptor
0654: .childAllowed(startElement.localName);
0655:
0656: // if it's allowed, let's get it done and over with
0657: if (allowed) {
0658: output.startElement(startElement.localName,
0659: startElement.attrs);
0660:
0661: XMLizable endElementInfo = new EndElementInfo(
0662: startElement.localName);
0663: EndElementInfo extraEndElementInfo = (EndElementInfo) additionalEnds
0664: .remove(startElement);
0665: if (extraEndElementInfo != null) {
0666: MultiEndElementInfo multiEndElementInfo = new MultiEndElementInfo();
0667: multiEndElementInfo
0668: .add(extraEndElementInfo);
0669: multiEndElementInfo
0670: .add((EndElementInfo) endElementInfo);
0671: endElementInfo = multiEndElementInfo;
0672: }
0673: endElements.push(endElementInfo);
0674: continue;
0675: }
0676:
0677: // not allowed -> search for first parent where it is allowed
0678: int firstGoodAncestor = -1;
0679: for (int k = output.openElements.size() - 2; k >= 0; k--) {
0680: String ancestorElementName = (output.openElements
0681: .get(k)).getName();
0682: ElementDescriptor ancestorDescriptor = template.descriptors
0683: .get(ancestorElementName);
0684: if (ancestorDescriptor
0685: .childAllowed(startElement.localName)) {
0686: firstGoodAncestor = k;
0687: break;
0688: }
0689: }
0690:
0691: if (firstGoodAncestor != -1) {
0692: // upon end of the problem element, we'll need to re-open the tags,
0693: // collect this info now while we still have it
0694: MultiEndElementInfo endElementInfo = new MultiEndElementInfo();
0695: for (int k = output.openElements.size() - 1; k > firstGoodAncestor; k--) {
0696: endElementInfo.add(output.openElements
0697: .get(k));
0698: }
0699: endElementInfo.add(new EndElementInfo(
0700: startElement.localName));
0701:
0702: // close open elements to get to the allowed ancestor
0703: for (int k = output.openElements.size() - 1; k > firstGoodAncestor; k--) {
0704: output.endElement(output.openElements
0705: .get(k).getName());
0706: }
0707:
0708: // start the problem element
0709: output.startElement(startElement.localName,
0710: startElement.attrs);
0711:
0712: endElements.push(endElementInfo);
0713: } else {
0714: if (startElement.localName.equals("li")) {
0715: // automatically introduce an ul
0716: output.startElement("ul",
0717: new AttributesImpl());
0718: output.startElement(
0719: startElement.localName,
0720: startElement.attrs);
0721:
0722: // wrap all sibling li's into one big ul
0723: SaxBuffer.StartElement sibling = null;
0724: int nextSiblingPos = searchSibling(
0725: bits, i);
0726: while (nextSiblingPos != -1
0727: && ((SaxBuffer.StartElement) bits
0728: .get(nextSiblingPos)).localName
0729: .equals("li")) {
0730: sibling = (SaxBuffer.StartElement) bits
0731: .get(nextSiblingPos);
0732: nextSiblingPos = searchSibling(
0733: bits, nextSiblingPos);
0734: }
0735:
0736: if (sibling != null) {
0737: endElements
0738: .push(new EndElementInfo(
0739: startElement.localName));
0740: // the ul should be closed after the last li
0741: additionalEnds.put(sibling,
0742: new EndElementInfo("ul"));
0743: } else {
0744: MultiEndElementInfo endElementInfo = new MultiEndElementInfo();
0745: endElementInfo
0746: .add(new EndElementInfo(
0747: "ul"));
0748: endElementInfo
0749: .add(new EndElementInfo(
0750: startElement.localName));
0751: endElements.push(endElementInfo);
0752: }
0753: } else if (startElement.localName
0754: .equals("br")) {
0755: // throw away the br
0756: int endPos = searchEndElement(bits, i);
0757: if (endPos == -1)
0758: throw new SAXException(
0759: "Abnormal situation which should never occur: didn't find end of br.");
0760: i = endPos;
0761: } else {
0762: throw new SAXException(
0763: "Element \""
0764: + startElement.localName
0765: + "\" is disallowed at its current location, and could not automatically fix this.");
0766: }
0767: }
0768:
0769: } else {
0770: output.startElement(startElement.localName,
0771: startElement.attrs);
0772: endElements.push(new EndElementInfo(
0773: startElement.localName));
0774: }
0775:
0776: } else if (bit instanceof SaxBuffer.EndElement) {
0777: XMLizable endElement = endElements.pop();
0778: endElement.toSAX(output);
0779: } else if (bit instanceof SaxBuffer.Characters) {
0780: ((SaxBuffer.Characters) bit).send(output);
0781: } else if (bit instanceof SaxBuffer.StartDocument) {
0782: output.startDocument();
0783: } else if (bit instanceof SaxBuffer.EndDocument) {
0784: output.endDocument();
0785: // don't do any events after endDocument
0786: return;
0787: }
0788: }
0789: }
0790: }
0791:
0792: /**
0793: * Removes p's, headers containing only whitespace or br's, changes sequences
0794: * of more then two br's into a new paragraph, drops br's at start or
0795: * end of p, headers.
0796: */
0797: private class CleanupBrsAndEmptyContentBlocks implements
0798: CleaningStep {
0799:
0800: public void perform(List<SaxBuffer.SaxBit> bits,
0801: OutputHandler output) throws SAXException {
0802: Stack<XMLizable> endElements = new Stack<XMLizable>();
0803:
0804: int i = -1;
0805: while (i < bits.size()) {
0806: i++;
0807: Object bit = bits.get(i);
0808: if (bit instanceof SaxBuffer.StartElement) {
0809: SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
0810:
0811: boolean contentBlockElement = contentBlockElements
0812: .contains(startElement.localName);
0813: if (contentBlockElement
0814: || startElement.localName.equals("td")
0815: || startElement.localName.equals("th")) {
0816: // starting a new p, td, ...: search if this element contains anything non-whitespace non-br
0817: int elementNesting = 0;
0818: int z = i;
0819: boolean reachedEndElement = false;
0820: while (true) {
0821: z++;
0822: Object bit2 = bits.get(z);
0823: if (bit2 instanceof SaxBuffer.Characters
0824: && isWhitespace((SaxBuffer.Characters) bit2)) {
0825: // continue loop
0826: } else if (bit2 instanceof SaxBuffer.StartElement
0827: && ((SaxBuffer.StartElement) bit2).localName
0828: .equals("br")) {
0829: elementNesting++;
0830: } else if (bit2 instanceof SaxBuffer.EndElement
0831: && ((SaxBuffer.EndElement) bit2).localName
0832: .equals("br")) {
0833: elementNesting--;
0834: } else if (bit2 instanceof SaxBuffer.EndElement
0835: && elementNesting == 0) {
0836: reachedEndElement = true;
0837: break;
0838: } else {
0839: break;
0840: }
0841: }
0842:
0843: if (reachedEndElement) {
0844: if (contentBlockElement) {
0845: // skip over this element
0846: i = z;
0847: continue;
0848: } else {
0849: output.startElement(
0850: startElement.localName,
0851: startElement.attrs);
0852: endElements.push(new EndElementInfo(
0853: startElement.localName));
0854: // skip content of this element
0855: i = z - 1;
0856: continue;
0857: }
0858: } else {
0859: if (contentBlockElement) {
0860: // skip over initial br's or whitespace at start of content block
0861: i = z - 1;
0862: } else {
0863: // nothing to do
0864: }
0865: }
0866:
0867: } else if (startElement.localName.equals("br")) {
0868: // search for a parent content block element
0869: int firstContentBlockAncestor = -1;
0870: for (int k = output.openElements.size() - 1; k >= 0; k--) {
0871: StartElementInfo startElementInfo = output.openElements
0872: .get(k);
0873: if (contentBlockElements
0874: .contains(startElementInfo
0875: .getName())) {
0876: firstContentBlockAncestor = k;
0877: break;
0878: }
0879: }
0880:
0881: // if we are inside a content block ...
0882: if (firstContentBlockAncestor != -1) {
0883: // count number of br's following this
0884: int z = i;
0885: int brCount = 1;
0886: boolean continueSearch = true;
0887: while (continueSearch) {
0888: z++;
0889: Object bit2 = bits.get(z);
0890: if (bit2 instanceof SaxBuffer.EndElement) {
0891: String name = ((SaxBuffer.EndElement) bit2).localName;
0892: if (!name.equals("br")) {
0893: continueSearch = false;
0894: }
0895: } else if (bit2 instanceof SaxBuffer.StartElement
0896: && ((SaxBuffer.StartElement) bit2).localName
0897: .equals("br")) {
0898: brCount++;
0899: continueSearch = true;
0900: } else if (bit2 instanceof SaxBuffer.Characters
0901: && isWhitespace((SaxBuffer.Characters) bit2)) {
0902: continueSearch = true;
0903: } else {
0904: continueSearch = false;
0905: }
0906: }
0907:
0908: // if all the next bits till the first closing content block tag are either end elements or whitespace,
0909: // then drop the br's.
0910: boolean beforeEndContentBlock = false;
0911: for (int t = z; t < bits.size(); t++) {
0912: if (bits.get(t) instanceof SaxBuffer.EndElement) {
0913: SaxBuffer.EndElement endEl = (SaxBuffer.EndElement) bits
0914: .get(t);
0915: if (contentBlockElements
0916: .contains(endEl.localName)) {
0917: beforeEndContentBlock = true;
0918: break;
0919: }
0920: // other end element events: continue searching
0921: } else if (bits.get(t) instanceof SaxBuffer.Characters
0922: && isWhitespace((SaxBuffer.Characters) bits
0923: .get(t))) {
0924: // whitespace: continue searching
0925: } else {
0926: // everything else: stop
0927: break;
0928: }
0929: }
0930: if (beforeEndContentBlock) {
0931: i = z - 1;
0932: continue;
0933: }
0934:
0935: if (brCount >= 2) {
0936: // drop the br's, close content block element, open content block element
0937: i = z - 1; // z is positioned on the first non-br, non-whitespace element following the br's
0938:
0939: List<StartElementInfo> elementsToRestart = new ArrayList<StartElementInfo>();
0940: for (int k = output.openElements.size() - 1; k >= firstContentBlockAncestor; k--) {
0941: elementsToRestart
0942: .add(output.openElements
0943: .get(k));
0944: }
0945:
0946: for (int k = output.openElements.size() - 1; k >= firstContentBlockAncestor; k--) {
0947: output
0948: .endElement(output.openElements
0949: .get(k).getName());
0950: }
0951:
0952: for (int k = elementsToRestart.size() - 1; k >= 0; k--) {
0953: StartElementInfo startElementInfo = elementsToRestart
0954: .get(k);
0955: output
0956: .startElement(
0957: startElementInfo
0958: .getName(),
0959: startElementInfo
0960: .getAttrs());
0961: }
0962: continue;
0963: }
0964: } else if (startElement.localName.equals("br")
0965: && output.openElements.size() > 1
0966: && needsCleanupOfEndBrs
0967: .contains(output.openElements
0968: .get(
0969: output.openElements
0970: .size() - 1)
0971: .getName())) {
0972: // this is useful to remove <br>s inside <td>s or <br>s at the end of <li>s like mozilla does
0973: String elementName = output.openElements
0974: .get(output.openElements.size() - 1)
0975: .getName();
0976:
0977: boolean nextIsEndOfElement = false;
0978: int r = i + 1;
0979: for (; r < bits.size(); r++) {
0980: Object nextBit = bits.get(r);
0981: if (nextBit instanceof SaxBuffer.EndElement) {
0982: SaxBuffer.EndElement endEl = (SaxBuffer.EndElement) nextBit;
0983: if (endEl.localName.equals("br")) {
0984: continue;
0985: } else if (endEl.localName
0986: .equals(elementName)) {
0987: nextIsEndOfElement = true;
0988: break;
0989: } else {
0990: break;
0991: }
0992: } else if (nextBit instanceof SaxBuffer.Characters
0993: && isWhitespace((SaxBuffer.Characters) nextBit)) {
0994: // do nothing
0995: } else {
0996: break;
0997: }
0998: }
0999:
1000: if (nextIsEndOfElement) {
1001: i = r - 1;
1002: continue;
1003: }
1004: }
1005: }
1006:
1007: output.startElement(startElement.localName,
1008: startElement.attrs);
1009: endElements.push(new EndElementInfo(
1010: startElement.localName));
1011:
1012: } else if (bit instanceof SaxBuffer.EndElement) {
1013: XMLizable endElement = endElements.pop();
1014: endElement.toSAX(output);
1015: } else if (bit instanceof SaxBuffer.Characters) {
1016: ((SaxBuffer.Characters) bit).send(output);
1017: } else if (bit instanceof SaxBuffer.StartDocument) {
1018: output.startDocument();
1019: } else if (bit instanceof SaxBuffer.EndDocument) {
1020: output.endDocument();
1021: // don't do any events after endDocument
1022: return;
1023: }
1024: }
1025: }
1026: }
1027:
1028: private class CleanupWipeableEmptyElements implements CleaningStep {
1029:
1030: public void perform(List<SaxBuffer.SaxBit> bits,
1031: OutputHandler output) throws SAXException {
1032: Stack<XMLizable> endElements = new Stack<XMLizable>();
1033:
1034: int i = -1;
1035: while (i < bits.size()) {
1036: i++;
1037: Object bit = bits.get(i);
1038: if (bit instanceof SaxBuffer.StartElement) {
1039: SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
1040: if (wipeableEmptyElements
1041: .contains(startElement.localName)) {
1042: boolean hasWhitespace = false;
1043: boolean reachedEndElement = false;
1044: int elementNesting = 0;
1045: int k = i;
1046: while (true) {
1047: k++;
1048: Object nextBit = bits.get(k);
1049: if (nextBit instanceof SaxBuffer.StartElement
1050: && wipeableEmptyElements
1051: .contains(((SaxBuffer.StartElement) nextBit).localName)) {
1052: elementNesting++;
1053: } else if (nextBit instanceof SaxBuffer.Characters
1054: && isWhitespace((SaxBuffer.Characters) nextBit)) {
1055: hasWhitespace = true;
1056: } else if (nextBit instanceof SaxBuffer.EndElement
1057: && elementNesting > 0) {
1058: elementNesting--;
1059: } else if (nextBit instanceof SaxBuffer.EndElement
1060: && elementNesting == 0) {
1061: reachedEndElement = true;
1062: break;
1063: } else {
1064: break;
1065: }
1066: }
1067:
1068: if (reachedEndElement) {
1069: // skip the elements
1070: i = k;
1071: // if the wipeable elements contained whitespace, generate a whitespace character
1072: if (hasWhitespace)
1073: output.characters(new char[] { ' ' },
1074: 0, 1);
1075: continue;
1076: }
1077: }
1078:
1079: output.startElement(startElement.localName,
1080: startElement.attrs);
1081: endElements.push(new EndElementInfo(
1082: startElement.localName));
1083:
1084: } else if (bit instanceof SaxBuffer.EndElement) {
1085: XMLizable endElement = endElements.pop();
1086: endElement.toSAX(output);
1087: } else if (bit instanceof SaxBuffer.Characters) {
1088: ((SaxBuffer.Characters) bit).send(output);
1089: } else if (bit instanceof SaxBuffer.StartDocument) {
1090: output.startDocument();
1091: } else if (bit instanceof SaxBuffer.EndDocument) {
1092: output.endDocument();
1093: // don't do any events after endDocument
1094: return;
1095: }
1096: }
1097: }
1098: }
1099:
1100: /**
1101: * Changes br elements inside pre elements into newline character events.
1102: */
1103: private class TranslateBeeaarsInPees implements CleaningStep {
1104:
1105: public void perform(List<SaxBuffer.SaxBit> bits,
1106: OutputHandler output) throws SAXException {
1107: int preLevel = 0;
1108: int i = -1;
1109: while (i < bits.size()) {
1110: i++;
1111: Object bit = bits.get(i);
1112: if (bit instanceof SaxBuffer.StartElement) {
1113: SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
1114: if (startElement.localName.equals("pre")) {
1115: preLevel++;
1116: } else if (preLevel > 0
1117: && startElement.localName.equals("br")) {
1118: // normally an opening br should be immediatelly followed by the closing br,
1119: // so let us restrict us to that case
1120: Object nextBit = bits.get(i + 1);
1121: if (nextBit instanceof SaxBuffer.EndElement
1122: && ((SaxBuffer.EndElement) nextBit).localName
1123: .equals("br")) {
1124: // replace this br by a newline
1125: output.characters(NEWLINE, 0, 1);
1126: i++;
1127: continue;
1128: }
1129: }
1130:
1131: output.startElement(startElement.localName,
1132: startElement.attrs);
1133:
1134: } else if (bit instanceof SaxBuffer.EndElement) {
1135: SaxBuffer.EndElement endElement = (SaxBuffer.EndElement) bit;
1136: if (endElement.localName.equals("pre")) {
1137: preLevel--;
1138: }
1139: output.endElement(endElement.localName);
1140: } else if (bit instanceof SaxBuffer.Characters) {
1141: ((SaxBuffer.Characters) bit).send(output);
1142: } else if (bit instanceof SaxBuffer.StartDocument) {
1143: output.startDocument();
1144: } else if (bit instanceof SaxBuffer.EndDocument) {
1145: output.endDocument();
1146: // don't do any events after endDocument
1147: return;
1148: }
1149: }
1150: }
1151: }
1152:
1153: /**
1154: * Removes a "\n" if it occurs right before a closing pre tag. Such a newline has
1155: * no meaning. This is often inserted by Firefox, and causes layout troubles in
1156: * Internet Explorer (subsequent block elements are extra indented, though they
1157: * shift left again once you start typing in them).
1158: */
1159: private class CleanupNewlineAtEndOfPre implements CleaningStep {
1160:
1161: public void perform(List<SaxBuffer.SaxBit> bits,
1162: OutputHandler output) throws SAXException {
1163: int i = 0;
1164: while (i < bits.size()) {
1165: SaxBuffer.SaxBit bit = bits.get(i);
1166: if (bit instanceof SaxBuffer.Characters
1167: && i < bits.size() - 1
1168: && bits.get(i + 1) instanceof SaxBuffer.EndElement
1169: && ((SaxBuffer.EndElement) bits.get(i + 1)).localName
1170: .equals("pre")) {
1171: SaxBuffer.Characters characters = (SaxBuffer.Characters) bit;
1172: char[] ch = characters.ch;
1173: if (ch.length > 0
1174: && ch[ch.length - 1] == '\n'
1175: && (ch.length <= 1 || ch[ch.length - 2] != '\n')) {
1176: output.characters(characters.ch, 0,
1177: characters.ch.length - 1);
1178: } else {
1179: characters.send(output);
1180: }
1181: } else {
1182: bit.send(output);
1183: }
1184: i++;
1185: }
1186: }
1187: }
1188:
1189: private boolean isWhitespace(SaxBuffer.Characters characters) {
1190: for (char ch : characters.ch) {
1191: if (!(Character.isWhitespace(ch) || ch == (char) 160)) // 160 is
1192: return false;
1193: }
1194: return true;
1195: }
1196:
1197: /**
1198: *
1199: * @param bits SaxBuffer bits
1200: * @param index the index of the current start element, of which we want to find the sibling
1201: */
1202: private int searchSibling(List bits, int index) {
1203: int nesting = 0;
1204: boolean passedEndElement = false;
1205: for (int i = index + 1; i < bits.size(); i++) {
1206: Object bit = bits.get(i);
1207: if (bit instanceof SaxBuffer.StartElement) {
1208: if (passedEndElement)
1209: return i;
1210: else
1211: nesting++;
1212: } else if (bit instanceof SaxBuffer.EndElement) {
1213: if (nesting == 0)
1214: passedEndElement = true;
1215: else
1216: nesting--;
1217: }
1218: }
1219: return -1;
1220: }
1221:
1222: /**
1223: *
1224: * @param bits SaxBuffer bits
1225: * @param index the index of the current start element, of which we want to find the end element
1226: */
1227: private int searchEndElement(List bits, int index) {
1228: int nesting = 0;
1229: for (int i = index + 1; i < bits.size(); i++) {
1230: Object bit = bits.get(i);
1231: if (bit instanceof SaxBuffer.StartElement) {
1232: nesting++;
1233: } else if (bit instanceof SaxBuffer.EndElement) {
1234: if (nesting == 0)
1235: return i;
1236: }
1237: }
1238: return -1;
1239: }
1240:
1241: private static class CleaningPipe {
1242: private List<CleaningStep> steps = new ArrayList<CleaningStep>();
1243:
1244: void addCleaningStep(CleaningStep step) {
1245: steps.add(step);
1246: }
1247:
1248: /**
1249: *
1250: * @param result where the result of the last cleanup step should be sent to
1251: */
1252: void execute(List<SaxBuffer.SaxBit> startBits,
1253: ContentHandler result) throws SAXException {
1254: List<SaxBuffer.SaxBit> bits = startBits;
1255: SaxBuffer buffer;
1256: for (int i = 0; i < steps.size(); i++) {
1257: buffer = new SaxBuffer();
1258: OutputHandler handler = new OutputHandler(i == steps
1259: .size() - 1 ? result : buffer);
1260: steps.get(i).perform(bits, handler);
1261: bits = buffer.getBits();
1262: }
1263: }
1264: }
1265:
1266: private interface CleaningStep {
1267: void perform(List<SaxBuffer.SaxBit> bits, OutputHandler output)
1268: throws SAXException;
1269: }
1270:
1271: private static class OutputHandler extends ForwardingContentHandler {
1272: private List<StartElementInfo> openElements = new ArrayList<StartElementInfo>();
1273:
1274: public OutputHandler(ContentHandler consumer) {
1275: super (consumer);
1276: }
1277:
1278: public void startElement(String name, Attributes attrs)
1279: throws SAXException {
1280: super .startElement("", name, name, attrs);
1281: openElements.add(new StartElementInfo(name, attrs));
1282: }
1283:
1284: public void endElement(String name) throws SAXException {
1285: super .endElement("", name, name);
1286: String removed = openElements.remove(
1287: openElements.size() - 1).getName();
1288: if (!removed.equals(name)) {
1289: throw new SAXException("The close tag \"" + name
1290: + "\" did not match the open tag \"" + removed
1291: + "\".");
1292: }
1293: }
1294:
1295: }
1296:
1297: private static class StartElementInfo implements XMLizable {
1298: private final String name;
1299: private final Attributes attrs;
1300:
1301: public StartElementInfo(String name, Attributes attrs) {
1302: this .name = name;
1303: this .attrs = attrs;
1304: }
1305:
1306: public String getName() {
1307: return name;
1308: }
1309:
1310: public Attributes getAttrs() {
1311: return attrs;
1312: }
1313:
1314: public void toSAX(OutputHandler contentHandler)
1315: throws SAXException {
1316: contentHandler.startElement(name, attrs);
1317: }
1318: }
1319:
1320: private static class EndElementInfo implements XMLizable {
1321: private final boolean skip;
1322: private final String localName;
1323:
1324: public EndElementInfo() {
1325: this .skip = true;
1326: this .localName = null;
1327: }
1328:
1329: public EndElementInfo(String localName) {
1330: this .skip = false;
1331: this .localName = localName;
1332: }
1333:
1334: public void toSAX(OutputHandler contentHandler)
1335: throws SAXException {
1336: if (!skip) {
1337: contentHandler.endElement(localName);
1338: }
1339: }
1340: }
1341:
1342: private final class MultiEndElementInfo implements XMLizable {
1343: private List<XMLizable> tags = new ArrayList<XMLizable>(2);
1344:
1345: public void add(EndElementInfo endElement) {
1346: this .tags.add(endElement);
1347: }
1348:
1349: public void add(StartElementInfo endElement) {
1350: this .tags.add(endElement);
1351: }
1352:
1353: public void toSAX(OutputHandler contentHandler)
1354: throws SAXException {
1355: for (int i = tags.size() - 1; i >= 0; i--) {
1356: XMLizable tag = tags.get(i);
1357: tag.toSAX(contentHandler);
1358: }
1359: }
1360: }
1361:
1362: interface XMLizable {
1363: public void toSAX(OutputHandler contentHandler)
1364: throws SAXException;
1365: }
1366: }
|