Source Code Cross Referenced for HtmlRepairer.java in  » Content-Management-System » daisy » org » outerj » daisy » htmlcleaner » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Content Management System » daisy » org.outerj.daisy.htmlcleaner 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /*
0002:         * Copyright 2004 Outerthought bvba and Schaubroeck nv
0003:         *
0004:         * Licensed under the Apache License, Version 2.0 (the "License");
0005:         * you may not use this file except in compliance with the License.
0006:         * You may obtain a copy of the License at
0007:         *
0008:         *     http://www.apache.org/licenses/LICENSE-2.0
0009:         *
0010:         * Unless required by applicable law or agreed to in writing, software
0011:         * distributed under the License is distributed on an "AS IS" BASIS,
0012:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0013:         * See the License for the specific language governing permissions and
0014:         * limitations under the License.
0015:         */
0016:        package org.outerj.daisy.htmlcleaner;
0017:
0018:        import org.xml.sax.SAXException;
0019:        import org.xml.sax.Attributes;
0020:        import org.xml.sax.ContentHandler;
0021:        import org.xml.sax.helpers.AttributesImpl;
0022:        import org.outerj.daisy.xmlutil.SaxBuffer;
0023:        import org.outerj.daisy.xmlutil.ForwardingContentHandler;
0024:
0025:        import java.util.*;
0026:
0027:        /**
0028:         * Works on HTML input to clean it out to a
0029:         * limited subset of HTML, mostly focussing on structural/semantic
0030:         * elements (actually, what should be kept is configurable).
0031:         *
0032:         * <p>The input events should be in no namespace and contain
0033:         * the html and body tags. All elements and attributes should already
0034:         * be lowercased.</p>
0035:         *
0036:         * <p>All elements and attributes that are not explicitly allowed
0037:         * in the configuration will be dropped (but
0038:         * their character content will remain).</p>
0039:         *
0040:         * <p>Span and div elements are treated specially. They will only be
0041:         * kept if their class attribute has one of the allowed values, specified
0042:         * in the configuration of this component. Span elements that contain
0043:         * a style attribute specifying bold and/or italic styling will
0044:         * be converted to the equivalent strong/em tags.</p>
0045:         *
0046:         */
0047:        class HtmlRepairer {
0048:            private HtmlCleanerTemplate template;
0049:            /**
0050:             * Fixed list of element names that can be removed when they hold
0051:             * no character data, or nothing but other wipeable elements.
0052:             * Usually these are inline elements.
0053:             */
0054:            private static Set<String> wipeableEmptyElements = new HashSet<String>();
0055:            static {
0056:                Collections.addAll(wipeableEmptyElements,
0057:                        "strong",
0058:                        "em",
0059:                        "sub",
0060:                        "sup",
0061:                        "a",
0062:                        "tt",
0063:                        "ul",
0064:                        "del",
0065:                        "span");
0066:            }
0067:            /** A one-character array holding a single line feed. */
0068:            private static final char[] NEWLINE = { '\n' };
0069:            /** Content block element names: p, h1 to h5, blockquote and li. */
0070:            private static Set<String> contentBlockElements = new HashSet<String>();
0071:            static {
0072:                String[] blockNames = {
0073:                        "p", "h1", "h2", "h3", "h4", "h5",
0074:                        "blockquote", "li"
0075:                };
0076:                for (String blockName : blockNames) {
0077:                    contentBlockElements.add(blockName);
0078:                }
0079:            }
0080:            /** Holds th, td and li; judging by the name, elements whose trailing brs get cleaned up. */
0081:            private static Set<String> needsCleanupOfEndBrs = new HashSet<String>();
0082:            static {
0083:                for (String cellOrItem : new String[] { "th", "td", "li" }) {
0084:                    needsCleanupOfEndBrs.add(cellOrItem);
0085:                }
0086:            }
0087:
0088:            public HtmlRepairer(final HtmlCleanerTemplate template) {
0089:                this.template = template; // the cleaning steps read their configuration from it
0090:            }
0091:
0092:            /**
0093:             * Runs the cleaning stages over the HTML events held in the SaxBuffer.
0094:             *
0095:             * @param buffer should only contain following types of events: start/endElement, start/endDocument, characters
0096:             * @param contentHandler where the outcome will be sent to
0097:             * @throws SAXException propagated from the cleaning stages
0098:             */
0099:            public void clean(SaxBuffer buffer, ContentHandler contentHandler)
0100:                    throws SAXException {
0101:                new Cleaner(buffer, contentHandler).clean();
0102:            }
0103:
0104:            /** Pairs an input buffer with its destination handler and runs the cleaning pipe over it. */
0105:            private class Cleaner {
0106:                private SaxBuffer input;
0107:                private ContentHandler finalContentHandler;
0108:
0109:                public Cleaner(SaxBuffer input, ContentHandler contentHandler) {
0110:                    this.finalContentHandler = contentHandler;
0111:                    this.input = input;
0112:                }
0113:
0114:                /** Cleans in several stages to keep the logic simple; each stage works pull-style, reading events from a SaxBuffer. */
0115:                private void clean() throws SAXException {
0116:                    CleaningPipe stages = new CleaningPipe();
0117:                    stages.addCleaningStep(new ElementCleanup());
0118:                    stages.addCleaningStep(new IntroducedParas());
0119:                    // structural cleanup is registered twice: the first pass might
0120:                    // introduce new elements, which need to be checked again
0121:                    stages.addCleaningStep(new StructuralCleanup());
0122:                    stages.addCleaningStep(new StructuralCleanup());
0123:                    // content block cleanup is registered twice as well: wiping empty
0124:                    // inline elements in between might leave empty content blocks
0125:                    stages.addCleaningStep(new CleanupBrsAndEmptyContentBlocks());
0126:                    stages.addCleaningStep(new CleanupWipeableEmptyElements());
0127:                    stages.addCleaningStep(new CleanupBrsAndEmptyContentBlocks());
0128:                    stages.addCleaningStep(new TranslateBeeaarsInPees());
0129:                    stages.addCleaningStep(new CleanupNewlineAtEndOfPre());
0130:
0131:                    stages.execute(input.getBits(), finalContentHandler);
0132:                }
0133:            }
0134:
0135:            /**
0136:             * <ul>
0137:             *  <li>Makes sure all content is contained inside html/body
0138:             *  <li>Drops unallowed elements
0139:             *  <li>Does element translations (i.e. b into strong)
0140:             *  <li>Only outputs non-namespaced elements
0141:             * </ul>
0142:             */
0143:            private class ElementCleanup implements  CleaningStep {
0144:
0145:                public void perform(List<SaxBuffer.SaxBit> bits,
0146:                        OutputHandler output) throws SAXException {
0147:                    Stack<XMLizable> endElements = new Stack<XMLizable>();
0148:                    boolean preSupported = template.descriptors
0149:                            .containsKey("pre");
0150:
0151:                    int i = 0;
0152:                    while (i < bits.size()) {
0153:                        Object bit = bits.get(i);
0154:                        if (bit instanceof  SaxBuffer.StartElement) {
0155:                            SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
0156:                            if (!startElement.namespaceURI.equals("")) {
0157:                                // namespaced elements are dropped
0158:                                endElements.add(new EndElementInfo());
0159:                            } else {
0160:                                if (startElement.localName.equals("span")) {
0161:                                    // two possibilities:
0162:                                    //  * has only class attribute with recognized value
0163:                                    //  * has style with certain recognized effects (bold/italic) -> translate to semantic correct tag.
0164:                                    String classAttr = startElement.attrs
0165:                                            .getValue("class");
0166:                                    if (classAttr != null) {
0167:                                        if (template.allowedSpanClasses
0168:                                                .contains(classAttr)) {
0169:                                            // make new attributes element to make sure there are no other attributes on the element
0170:                                            AttributesImpl attrs = new AttributesImpl();
0171:                                            attrs
0172:                                                    .addAttribute("", "class",
0173:                                                            "class", "CDATA",
0174:                                                            classAttr);
0175:                                            output.startElement("span", attrs);
0176:                                            endElements
0177:                                                    .push(new EndElementInfo(
0178:                                                            "span"));
0179:                                        } else {
0180:                                            // span element is dropped
0181:                                            endElements
0182:                                                    .push(new EndElementInfo());
0183:                                        }
0184:                                    } else {
0185:                                        String styleAttr = startElement.attrs
0186:                                                .getValue("style");
0187:                                        if (styleAttr != null) {
0188:                                            StringTokenizer styleAttrTokenizer = new StringTokenizer(
0189:                                                    styleAttr, ";");
0190:                                            boolean hasBold = false;
0191:                                            boolean hasItalic = false;
0192:                                            while (styleAttrTokenizer
0193:                                                    .hasMoreTokens()) {
0194:                                                String styleToken = styleAttrTokenizer
0195:                                                        .nextToken();
0196:                                                int colonPos = styleToken
0197:                                                        .indexOf(':');
0198:                                                if (colonPos != -1) {
0199:                                                    String name = styleToken
0200:                                                            .substring(0,
0201:                                                                    colonPos)
0202:                                                            .trim()
0203:                                                            .toLowerCase();
0204:                                                    String value = styleToken
0205:                                                            .substring(
0206:                                                                    colonPos + 1)
0207:                                                            .trim()
0208:                                                            .toLowerCase();
0209:                                                    if (name
0210:                                                            .equals("font-weight")
0211:                                                            && value
0212:                                                                    .equals("bold")) {
0213:                                                        hasBold = true;
0214:                                                    } else if (name
0215:                                                            .equals("font-style")
0216:                                                            && value
0217:                                                                    .equals("italic")) {
0218:                                                        hasItalic = true;
0219:                                                    }
0220:                                                }
0221:                                            }
0222:
0223:                                            MultiEndElementInfo endElement = new MultiEndElementInfo();
0224:                                            if (hasBold) {
0225:                                                output.startElement("strong",
0226:                                                        new AttributesImpl());
0227:                                                endElement
0228:                                                        .add(new EndElementInfo(
0229:                                                                "strong"));
0230:                                            }
0231:                                            if (hasItalic) {
0232:                                                output.startElement("em",
0233:                                                        new AttributesImpl());
0234:                                                endElement
0235:                                                        .add(new EndElementInfo(
0236:                                                                "em"));
0237:                                            }
0238:                                            endElements.push(endElement);
0239:                                        } else {
0240:                                            endElements
0241:                                                    .push(new EndElementInfo());
0242:                                        }
0243:                                    }
0244:                                } else if (startElement.localName.equals("div")) {
0245:                                    String classAttr = startElement.attrs
0246:                                            .getValue("class");
0247:                                    if (classAttr != null
0248:                                            && template.allowedDivClasses
0249:                                                    .contains(classAttr)) {
0250:                                        AttributesImpl attrs = new AttributesImpl();
0251:                                        attrs.addAttribute("", "class",
0252:                                                "class", "CDATA", classAttr);
0253:                                        output.startElement("div", attrs);
0254:                                        endElements.push(new EndElementInfo(
0255:                                                "div"));
0256:                                    } else if (classAttr != null
0257:                                            && template.dropDivClasses
0258:                                                    .contains(classAttr)) {
0259:                                        /** Skip over the content of the div element. */
0260:                                        int openElementCounter = 0;
0261:                                        while (true) {
0262:                                            i++;
0263:                                            if (i >= bits.size())
0264:                                                throw new SAXException(
0265:                                                        "Reached end of input without encountering matching close div tag.");
0266:
0267:                                            Object nextBit = bits.get(i);
0268:                                            if (nextBit instanceof  SaxBuffer.StartElement) {
0269:                                                openElementCounter++;
0270:                                            } else if (nextBit instanceof  SaxBuffer.EndElement) {
0271:                                                if (openElementCounter == 0) {
0272:                                                    break;
0273:                                                }
0274:                                                openElementCounter--;
0275:                                            }
0276:                                        }
0277:                                    } else {
0278:                                        // unallowed class, drop div element
0279:                                        endElements.push(new EndElementInfo());
0280:                                    }
0281:                                } else if (startElement.localName.equals("p")) {
0282:                                    String classAttr = startElement.attrs
0283:                                            .getValue("class");
0284:                                    if (classAttr != null
0285:                                            && template.allowedParaClasses
0286:                                                    .contains(classAttr)) {
0287:                                        output
0288:                                                .startElement(
0289:                                                        "p",
0290:                                                        getAllowedAttributes(startElement));
0291:                                        endElements
0292:                                                .push(new EndElementInfo("p"));
0293:                                    } else {
0294:                                        AttributesImpl attrs = getAllowedAttributes(startElement);
0295:                                        int classPos = attrs.getIndex("class");
0296:                                        if (classPos != -1)
0297:                                            attrs.removeAttribute(classPos);
0298:                                        output.startElement("p", attrs);
0299:                                        endElements
0300:                                                .push(new EndElementInfo("p"));
0301:                                    }
0302:                                } else if (startElement.localName.equals("pre")
0303:                                        && preSupported) {
0304:                                    String classAttr = startElement.attrs
0305:                                            .getValue("class");
0306:                                    if (classAttr != null
0307:                                            && template.allowedPreClasses
0308:                                                    .contains(classAttr)) {
0309:                                        output
0310:                                                .startElement(
0311:                                                        "pre",
0312:                                                        getAllowedAttributes(startElement));
0313:                                        endElements.push(new EndElementInfo(
0314:                                                "pre"));
0315:                                    } else {
0316:                                        AttributesImpl attrs = getAllowedAttributes(startElement);
0317:                                        int classPos = attrs.getIndex("class");
0318:                                        if (classPos != -1)
0319:                                            attrs.removeAttribute(classPos);
0320:                                        output.startElement("pre", attrs);
0321:                                        endElements.push(new EndElementInfo(
0322:                                                "pre"));
0323:                                    }
0324:                                } else if (startElement.localName.equals("b")) {
0325:                                    // translate to <strong>
0326:                                    output.startElement("strong",
0327:                                            new AttributesImpl());
0328:                                    endElements.push(new EndElementInfo(
0329:                                            "strong"));
0330:                                } else if (startElement.localName.equals("i")) {
0331:                                    // translate to <em>
0332:                                    output.startElement("em",
0333:                                            new AttributesImpl());
0334:                                    endElements.push(new EndElementInfo("em"));
0335:                                } else if (startElement.localName
0336:                                        .equals("strike")) {
0337:                                    // translate to <del>
0338:                                    output.startElement("del",
0339:                                            new AttributesImpl());
0340:                                    endElements.push(new EndElementInfo("del"));
0341:                                } else if (startElement.localName
0342:                                        .equals("html")) {
0343:                                    if (output.openElements.size() != 0)
0344:                                        throw new SAXException(
0345:                                                "html element can only appear as root element.");
0346:
0347:                                    output.startElement(startElement.localName,
0348:                                            new AttributesImpl());
0349:                                    endElements.push(new EndElementInfo(
0350:                                            startElement.localName));
0351:
0352:                                    // fast forward to body element
0353:                                    while (true) {
0354:                                        i++;
0355:                                        if (i >= bits.size())
0356:                                            throw new SAXException(
0357:                                                    "Reached end of input without encountering opening body tag.");
0358:
0359:                                        Object nextBit = bits.get(i);
0360:                                        if (nextBit instanceof  SaxBuffer.StartElement
0361:                                                && ((SaxBuffer.StartElement) nextBit).localName
0362:                                                        .equals("body")
0363:                                                && ((SaxBuffer.StartElement) nextBit).namespaceURI
0364:                                                        .equals("")) {
0365:                                            i--;
0366:                                            break;
0367:                                        }
0368:                                    }
0369:
0370:                                } else if (startElement.localName
0371:                                        .equals("body")) {
0372:                                    if (output.openElements.size() != 1)
0373:                                        throw new SAXException(
0374:                                                "body element can only appear as child of html element");
0375:
0376:                                    if (!output.openElements.get(0).getName()
0377:                                            .equals("html"))
0378:                                        throw new SAXException(
0379:                                                "body element can only appear as child of html element");
0380:
0381:                                    output.startElement("body",
0382:                                            new AttributesImpl());
0383:                                    endElements
0384:                                            .push(new EndElementInfo("body"));
0385:                                } else if (startElement.localName.equals("img")
0386:                                        && template.descriptors
0387:                                                .containsKey("img")) {
0388:                                    AttributesImpl attrs = getAllowedAttributes(startElement);
0389:                                    if (template.imgAlternateSrcAttr != null) {
0390:                                        String altSrc = startElement.attrs
0391:                                                .getValue(template.imgAlternateSrcAttr);
0392:                                        if (altSrc != null
0393:                                                && !altSrc.equals("")) {
0394:                                            int hrefIndex = attrs
0395:                                                    .getIndex("src");
0396:                                            if (hrefIndex != -1)
0397:                                                attrs.setValue(hrefIndex,
0398:                                                        altSrc);
0399:                                            else
0400:                                                attrs.addAttribute("", "src",
0401:                                                        "src", "CDATA", altSrc);
0402:                                        }
0403:                                    }
0404:                                    output.startElement(startElement.localName,
0405:                                            attrs);
0406:                                    endElements.push(new EndElementInfo(
0407:                                            startElement.localName));
0408:                                } else if (startElement.localName.equals("a")
0409:                                        && template.descriptors
0410:                                                .containsKey("a")) {
0411:                                    AttributesImpl attrs = getAllowedAttributes(startElement);
0412:                                    if (template.linkAlternateHrefAttr != null) {
0413:                                        String altHref = startElement.attrs
0414:                                                .getValue(template.linkAlternateHrefAttr);
0415:                                        if (altHref != null
0416:                                                && !altHref.equals("")) {
0417:                                            int hrefIndex = attrs
0418:                                                    .getIndex("href");
0419:                                            if (hrefIndex != -1)
0420:                                                attrs.setValue(hrefIndex,
0421:                                                        altHref);
0422:                                            else
0423:                                                attrs.addAttribute("", "href",
0424:                                                        "href", "CDATA",
0425:                                                        altHref);
0426:                                        }
0427:                                    }
0428:                                    output.startElement(startElement.localName,
0429:                                            attrs);
0430:                                    endElements.push(new EndElementInfo(
0431:                                            startElement.localName));
0432:                                } else if (startElement.localName.equals("td")
0433:                                        || startElement.localName.equals("th")) {
0434:                                    AttributesImpl attrs = getAllowedAttributes(startElement);
0435:
0436:                                    // remove dummy rowspan and colspan attributes
0437:                                    String rowspan = attrs.getValue("rowspan");
0438:                                    if (rowspan != null && rowspan.equals("1")) {
0439:                                        attrs.removeAttribute(attrs
0440:                                                .getIndex("rowspan"));
0441:                                    }
0442:                                    String colspan = attrs.getValue("colspan");
0443:                                    if (colspan != null && colspan.equals("1")) {
0444:                                        attrs.removeAttribute(attrs
0445:                                                .getIndex("colspan"));
0446:                                    }
0447:
0448:                                    output.startElement(startElement.localName,
0449:                                            attrs);
0450:                                    endElements.push(new EndElementInfo(
0451:                                            startElement.localName));
0452:                                } else if ((startElement.localName
0453:                                        .equals("style") || startElement.localName
0454:                                        .equals("script"))
0455:                                        && !template.descriptors
0456:                                                .containsKey(startElement.localName)) {
0457:                                    // skip over the content
0458:                                    int endPos = searchEndElement(bits, i);
0459:                                    if (endPos == -1)
0460:                                        throw new SAXException(
0461:                                                "Abnormal situation which should never occur: didn't find closing tag for "
0462:                                                        + startElement.localName);
0463:                                    i = endPos;
0464:                                } else if (template.descriptors
0465:                                        .containsKey(startElement.localName)) {
0466:                                    output.startElement(startElement.localName,
0467:                                            getAllowedAttributes(startElement));
0468:                                    endElements.push(new EndElementInfo(
0469:                                            startElement.localName));
0470:                                } else {
0471:                                    // skip element
0472:                                    endElements.push(new EndElementInfo());
0473:                                }
0474:                            }
0475:                        } else if (bit instanceof  SaxBuffer.EndElement) {
0476:                            XMLizable endElement = endElements.pop();
0477:                            endElement.toSAX(output);
0478:                        } else if (bit instanceof  SaxBuffer.Characters) {
0479:                            ((SaxBuffer.Characters) bit).send(output);
0480:                        } else if (bit instanceof  SaxBuffer.StartDocument) {
0481:                            output.startDocument();
0482:                        } else if (bit instanceof  SaxBuffer.EndDocument) {
0483:                            output.endDocument();
0484:                            // don't do any events after endDocument
0485:                            return;
0486:                        }
0487:                        i++;
0488:                    }
0489:                }
0490:            }
0491:
0492:            private AttributesImpl getAllowedAttributes(
0493:                    SaxBuffer.StartElement startElement) {
0494:                // limit attributes to the allowed attributes
0495:                String[] allowedAttributes = template.descriptors.get(
0496:                        startElement.localName).getAttributeNames();
0497:                AttributesImpl attrs = new AttributesImpl();
0498:                for (String allowedAttribute : allowedAttributes) {
0499:                    String value = startElement.attrs
0500:                            .getValue(allowedAttribute);
0501:                    if (value != null) {
0502:                        attrs.addAttribute("", allowedAttribute,
0503:                                allowedAttribute, "CDATA", value);
0504:                    }
0505:                }
0506:                return attrs;
0507:            }
0508:
0509:            /**
0510:             * Puts p tags around all characters or elements that are child of html/body
0511:             * but are not allowed there.
0512:             */
0513:            private class IntroducedParas implements  CleaningStep {
0514:
0515:                public void perform(List<SaxBuffer.SaxBit> bits,
0516:                        OutputHandler output) throws SAXException {
0517:                    Stack<XMLizable> endElements = new Stack<XMLizable>();
0518:                    Stack<Integer> introducedParas = new Stack<Integer>();
0519:                    ElementDescriptor bodyDescriptor = template.descriptors
0520:                            .get("body");
0521:                    ElementDescriptor tdDescriptor = template.descriptors
0522:                            .get("td");
0523:                    ElementDescriptor thDescriptor = template.descriptors
0524:                            .get("th");
0525:                    ElementDescriptor paraDescriptor = template.descriptors
0526:                            .get("p");
0527:                    ElementDescriptor blockQuoteDescriptor = template.descriptors
0528:                            .get("blockquote");
0529:
0530:                    int i = -1;
0531:                    while (i < bits.size()) {
0532:                        i++;
0533:                        Object bit = bits.get(i);
0534:                        if (bit instanceof  SaxBuffer.StartElement) {
0535:                            SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
0536:
0537:                            if (!introducedParas.empty()
0538:                                    && introducedParas.peek() == 0
0539:                                    && !paraDescriptor
0540:                                            .childAllowed(startElement.localName)) {
0541:                                output.endElement("p");
0542:                                introducedParas.pop();
0543:                            } else if (output.openElements.size() > 1) {
0544:                                StartElementInfo parentInfo = output.openElements
0545:                                        .get(output.openElements.size() - 1);
0546:                                String parentName = parentInfo.getName();
0547:                                boolean startPara = (parentName.equals("body")
0548:                                        && !bodyDescriptor
0549:                                                .childAllowed(startElement.localName) && paraDescriptor
0550:                                        .childAllowed(startElement.localName))
0551:                                        || (parentName.equals("td")
0552:                                                && !tdDescriptor
0553:                                                        .childAllowed(startElement.localName) && paraDescriptor
0554:                                                .childAllowed(startElement.localName))
0555:                                        || (parentName.equals("th")
0556:                                                && !thDescriptor
0557:                                                        .childAllowed(startElement.localName) && paraDescriptor
0558:                                                .childAllowed(startElement.localName))
0559:                                        || (parentName.equals("blockquote")
0560:                                                && !blockQuoteDescriptor
0561:                                                        .childAllowed(startElement.localName) && paraDescriptor
0562:                                                .childAllowed(startElement.localName));
0563:
0564:                                if (startPara) {
0565:                                    output.startElement("p",
0566:                                            new AttributesImpl());
0567:                                    introducedParas.push(0);
0568:                                }
0569:                            }
0570:
0571:                            if (!introducedParas.empty()) {
0572:                                introducedParas.push(introducedParas.pop() + 1);
0573:                            }
0574:
0575:                            output.startElement(startElement.localName,
0576:                                    startElement.attrs);
0577:                            endElements.push(new EndElementInfo(
0578:                                    startElement.localName));
0579:
0580:                        } else if (bit instanceof  SaxBuffer.EndElement) {
0581:                            if (!introducedParas.empty()
0582:                                    && introducedParas.peek() == 0) {
0583:                                output.endElement("p");
0584:                                introducedParas.pop();
0585:                            }
0586:
0587:                            XMLizable endElement = endElements.pop();
0588:                            endElement.toSAX(output);
0589:
0590:                            if (!introducedParas.empty()) {
0591:                                introducedParas.push(introducedParas.pop() - 1);
0592:                            }
0593:                        } else if (bit instanceof  SaxBuffer.Characters) {
0594:                            if (output.openElements.size() > 1) {
0595:                                StartElementInfo parentInfo = output.openElements
0596:                                        .get(output.openElements.size() - 1);
0597:                                String parentName = parentInfo.getName();
0598:                                boolean startPara = parentName.equals("body")
0599:                                        || parentName.equals("td")
0600:                                        || parentName.equals("th")
0601:                                        || parentName.equals("blockquote");
0602:                                if (startPara) {
0603:                                    output.startElement("p",
0604:                                            new AttributesImpl());
0605:                                    introducedParas.push(0);
0606:                                }
0607:                            }
0608:                            ((SaxBuffer.Characters) bit).send(output);
0609:                        } else if (bit instanceof  SaxBuffer.StartDocument) {
0610:                            output.startDocument();
0611:                        } else if (bit instanceof  SaxBuffer.EndDocument) {
0612:                            output.endDocument();
0613:                            // don't do any events after endDocument
0614:                            return;
0615:                        }
0616:                    }
0617:                }
0618:            }
0619:
0620:            /**
0621:             * Performs structural corrections, so that the end result is
0622:             * limited to what XHTML1 allows (or at least close to it).
0623:             */
0624:            private class StructuralCleanup implements  CleaningStep {
0625:
0626:                public void perform(List<SaxBuffer.SaxBit> bits,
0627:                        OutputHandler output) throws SAXException {
0628:                    Stack<XMLizable> endElements = new Stack<XMLizable>();
0629:                    Map additionalEnds = new IdentityHashMap();
0630:
0631:                    int i = -1;
0632:                    while (i < bits.size()) {
0633:                        i++;
0634:                        Object bit = bits.get(i);
0635:                        if (bit instanceof  SaxBuffer.StartElement) {
0636:                            SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
0637:
0638:                            ElementDescriptor descriptor = template.descriptors
0639:                                    .get(startElement.localName);
0640:                            if (descriptor == null)
0641:                                throw new SAXException(
0642:                                        "Missing ElementDescriptor for tagname "
0643:                                                + startElement.localName);
0644:
0645:                            // check if this element can occur inside its parent
0646:                            if (output.openElements.size() > 0) {
0647:                                String parentElementName = (output.openElements
0648:                                        .get(output.openElements.size() - 1))
0649:                                        .getName();
0650:                                ElementDescriptor parentDescriptor = template.descriptors
0651:                                        .get(parentElementName);
0652:
0653:                                boolean allowed = parentDescriptor
0654:                                        .childAllowed(startElement.localName);
0655:
0656:                                // if it's allowed, let's get it done and over with
0657:                                if (allowed) {
0658:                                    output.startElement(startElement.localName,
0659:                                            startElement.attrs);
0660:
0661:                                    XMLizable endElementInfo = new EndElementInfo(
0662:                                            startElement.localName);
0663:                                    EndElementInfo extraEndElementInfo = (EndElementInfo) additionalEnds
0664:                                            .remove(startElement);
0665:                                    if (extraEndElementInfo != null) {
0666:                                        MultiEndElementInfo multiEndElementInfo = new MultiEndElementInfo();
0667:                                        multiEndElementInfo
0668:                                                .add(extraEndElementInfo);
0669:                                        multiEndElementInfo
0670:                                                .add((EndElementInfo) endElementInfo);
0671:                                        endElementInfo = multiEndElementInfo;
0672:                                    }
0673:                                    endElements.push(endElementInfo);
0674:                                    continue;
0675:                                }
0676:
0677:                                // not allowed -> search for first parent where it is allowed
0678:                                int firstGoodAncestor = -1;
0679:                                for (int k = output.openElements.size() - 2; k >= 0; k--) {
0680:                                    String ancestorElementName = (output.openElements
0681:                                            .get(k)).getName();
0682:                                    ElementDescriptor ancestorDescriptor = template.descriptors
0683:                                            .get(ancestorElementName);
0684:                                    if (ancestorDescriptor
0685:                                            .childAllowed(startElement.localName)) {
0686:                                        firstGoodAncestor = k;
0687:                                        break;
0688:                                    }
0689:                                }
0690:
0691:                                if (firstGoodAncestor != -1) {
0692:                                    // upon end of the problem element, we'll need to re-open the tags,
0693:                                    // collect this info now while we still have it
0694:                                    MultiEndElementInfo endElementInfo = new MultiEndElementInfo();
0695:                                    for (int k = output.openElements.size() - 1; k > firstGoodAncestor; k--) {
0696:                                        endElementInfo.add(output.openElements
0697:                                                .get(k));
0698:                                    }
0699:                                    endElementInfo.add(new EndElementInfo(
0700:                                            startElement.localName));
0701:
0702:                                    // close open elements to get to the allowed ancestor
0703:                                    for (int k = output.openElements.size() - 1; k > firstGoodAncestor; k--) {
0704:                                        output.endElement(output.openElements
0705:                                                .get(k).getName());
0706:                                    }
0707:
0708:                                    // start the problem element
0709:                                    output.startElement(startElement.localName,
0710:                                            startElement.attrs);
0711:
0712:                                    endElements.push(endElementInfo);
0713:                                } else {
0714:                                    if (startElement.localName.equals("li")) {
0715:                                        // automatically introduce an ul
0716:                                        output.startElement("ul",
0717:                                                new AttributesImpl());
0718:                                        output.startElement(
0719:                                                startElement.localName,
0720:                                                startElement.attrs);
0721:
0722:                                        // wrap all sibling li's into one big ul
0723:                                        SaxBuffer.StartElement sibling = null;
0724:                                        int nextSiblingPos = searchSibling(
0725:                                                bits, i);
0726:                                        while (nextSiblingPos != -1
0727:                                                && ((SaxBuffer.StartElement) bits
0728:                                                        .get(nextSiblingPos)).localName
0729:                                                        .equals("li")) {
0730:                                            sibling = (SaxBuffer.StartElement) bits
0731:                                                    .get(nextSiblingPos);
0732:                                            nextSiblingPos = searchSibling(
0733:                                                    bits, nextSiblingPos);
0734:                                        }
0735:
0736:                                        if (sibling != null) {
0737:                                            endElements
0738:                                                    .push(new EndElementInfo(
0739:                                                            startElement.localName));
0740:                                            // the ul should be closed after the last li
0741:                                            additionalEnds.put(sibling,
0742:                                                    new EndElementInfo("ul"));
0743:                                        } else {
0744:                                            MultiEndElementInfo endElementInfo = new MultiEndElementInfo();
0745:                                            endElementInfo
0746:                                                    .add(new EndElementInfo(
0747:                                                            "ul"));
0748:                                            endElementInfo
0749:                                                    .add(new EndElementInfo(
0750:                                                            startElement.localName));
0751:                                            endElements.push(endElementInfo);
0752:                                        }
0753:                                    } else if (startElement.localName
0754:                                            .equals("br")) {
0755:                                        // throw away the br
0756:                                        int endPos = searchEndElement(bits, i);
0757:                                        if (endPos == -1)
0758:                                            throw new SAXException(
0759:                                                    "Abnormal situation which should never occur: didn't find end of br.");
0760:                                        i = endPos;
0761:                                    } else {
0762:                                        throw new SAXException(
0763:                                                "Element \""
0764:                                                        + startElement.localName
0765:                                                        + "\" is disallowed at its current location, and could not automatically fix this.");
0766:                                    }
0767:                                }
0768:
0769:                            } else {
0770:                                output.startElement(startElement.localName,
0771:                                        startElement.attrs);
0772:                                endElements.push(new EndElementInfo(
0773:                                        startElement.localName));
0774:                            }
0775:
0776:                        } else if (bit instanceof  SaxBuffer.EndElement) {
0777:                            XMLizable endElement = endElements.pop();
0778:                            endElement.toSAX(output);
0779:                        } else if (bit instanceof  SaxBuffer.Characters) {
0780:                            ((SaxBuffer.Characters) bit).send(output);
0781:                        } else if (bit instanceof  SaxBuffer.StartDocument) {
0782:                            output.startDocument();
0783:                        } else if (bit instanceof  SaxBuffer.EndDocument) {
0784:                            output.endDocument();
0785:                            // don't do any events after endDocument
0786:                            return;
0787:                        }
0788:                    }
0789:                }
0790:            }
0791:
0792:            /**
0793:             * Removes p's, headers containing only whitespace or br's, changes sequences
0794:             * of two or more br's into a new paragraph, drops br's at start or
0795:             * end of p, headers.
0796:             */
0797:            private class CleanupBrsAndEmptyContentBlocks implements 
0798:                    CleaningStep {
0799:
0800:                public void perform(List<SaxBuffer.SaxBit> bits,
0801:                        OutputHandler output) throws SAXException {
0802:                    Stack<XMLizable> endElements = new Stack<XMLizable>();
0803:
0804:                    int i = -1;
0805:                    while (i < bits.size()) {
0806:                        i++;
0807:                        Object bit = bits.get(i);
0808:                        if (bit instanceof  SaxBuffer.StartElement) {
0809:                            SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
0810:
0811:                            boolean contentBlockElement = contentBlockElements
0812:                                    .contains(startElement.localName);
0813:                            if (contentBlockElement
0814:                                    || startElement.localName.equals("td")
0815:                                    || startElement.localName.equals("th")) {
0816:                                // starting a new p, td, ...: search if this element contains anything non-whitespace non-br
0817:                                int elementNesting = 0;
0818:                                int z = i;
0819:                                boolean reachedEndElement = false;
0820:                                while (true) {
0821:                                    z++;
0822:                                    Object bit2 = bits.get(z);
0823:                                    if (bit2 instanceof  SaxBuffer.Characters
0824:                                            && isWhitespace((SaxBuffer.Characters) bit2)) {
0825:                                        // continue loop
0826:                                    } else if (bit2 instanceof  SaxBuffer.StartElement
0827:                                            && ((SaxBuffer.StartElement) bit2).localName
0828:                                                    .equals("br")) {
0829:                                        elementNesting++;
0830:                                    } else if (bit2 instanceof  SaxBuffer.EndElement
0831:                                            && ((SaxBuffer.EndElement) bit2).localName
0832:                                                    .equals("br")) {
0833:                                        elementNesting--;
0834:                                    } else if (bit2 instanceof  SaxBuffer.EndElement
0835:                                            && elementNesting == 0) {
0836:                                        reachedEndElement = true;
0837:                                        break;
0838:                                    } else {
0839:                                        break;
0840:                                    }
0841:                                }
0842:
0843:                                if (reachedEndElement) {
0844:                                    if (contentBlockElement) {
0845:                                        // skip over this element
0846:                                        i = z;
0847:                                        continue;
0848:                                    } else {
0849:                                        output.startElement(
0850:                                                startElement.localName,
0851:                                                startElement.attrs);
0852:                                        endElements.push(new EndElementInfo(
0853:                                                startElement.localName));
0854:                                        // skip content of this element
0855:                                        i = z - 1;
0856:                                        continue;
0857:                                    }
0858:                                } else {
0859:                                    if (contentBlockElement) {
0860:                                        // skip over initial br's or whitespace at start of content block
0861:                                        i = z - 1;
0862:                                    } else {
0863:                                        // nothing to do
0864:                                    }
0865:                                }
0866:
0867:                            } else if (startElement.localName.equals("br")) {
0868:                                // search for a parent content block element
0869:                                int firstContentBlockAncestor = -1;
0870:                                for (int k = output.openElements.size() - 1; k >= 0; k--) {
0871:                                    StartElementInfo startElementInfo = output.openElements
0872:                                            .get(k);
0873:                                    if (contentBlockElements
0874:                                            .contains(startElementInfo
0875:                                                    .getName())) {
0876:                                        firstContentBlockAncestor = k;
0877:                                        break;
0878:                                    }
0879:                                }
0880:
0881:                                // if we are inside a content block ...
0882:                                if (firstContentBlockAncestor != -1) {
0883:                                    // count number of br's following this
0884:                                    int z = i;
0885:                                    int brCount = 1;
0886:                                    boolean continueSearch = true;
0887:                                    while (continueSearch) {
0888:                                        z++;
0889:                                        Object bit2 = bits.get(z);
0890:                                        if (bit2 instanceof  SaxBuffer.EndElement) {
0891:                                            String name = ((SaxBuffer.EndElement) bit2).localName;
0892:                                            if (!name.equals("br")) {
0893:                                                continueSearch = false;
0894:                                            }
0895:                                        } else if (bit2 instanceof  SaxBuffer.StartElement
0896:                                                && ((SaxBuffer.StartElement) bit2).localName
0897:                                                        .equals("br")) {
0898:                                            brCount++;
0899:                                            continueSearch = true;
0900:                                        } else if (bit2 instanceof  SaxBuffer.Characters
0901:                                                && isWhitespace((SaxBuffer.Characters) bit2)) {
0902:                                            continueSearch = true;
0903:                                        } else {
0904:                                            continueSearch = false;
0905:                                        }
0906:                                    }
0907:
0908:                                    // if all the next bits till the first closing content block tag are either end elements or whitespace,
0909:                                    // then drop the br's.
0910:                                    boolean beforeEndContentBlock = false;
0911:                                    for (int t = z; t < bits.size(); t++) {
0912:                                        if (bits.get(t) instanceof  SaxBuffer.EndElement) {
0913:                                            SaxBuffer.EndElement endEl = (SaxBuffer.EndElement) bits
0914:                                                    .get(t);
0915:                                            if (contentBlockElements
0916:                                                    .contains(endEl.localName)) {
0917:                                                beforeEndContentBlock = true;
0918:                                                break;
0919:                                            }
0920:                                            // other end element events: continue searching
0921:                                        } else if (bits.get(t) instanceof  SaxBuffer.Characters
0922:                                                && isWhitespace((SaxBuffer.Characters) bits
0923:                                                        .get(t))) {
0924:                                            // whitespace: continue searching
0925:                                        } else {
0926:                                            // everything else: stop
0927:                                            break;
0928:                                        }
0929:                                    }
0930:                                    if (beforeEndContentBlock) {
0931:                                        i = z - 1;
0932:                                        continue;
0933:                                    }
0934:
0935:                                    if (brCount >= 2) {
0936:                                        // drop the br's, close content block element, open content block element
0937:                                        i = z - 1; // z is positioned on the first non-br, non-whitespace element following the br's
0938:
0939:                                        List<StartElementInfo> elementsToRestart = new ArrayList<StartElementInfo>();
0940:                                        for (int k = output.openElements.size() - 1; k >= firstContentBlockAncestor; k--) {
0941:                                            elementsToRestart
0942:                                                    .add(output.openElements
0943:                                                            .get(k));
0944:                                        }
0945:
0946:                                        for (int k = output.openElements.size() - 1; k >= firstContentBlockAncestor; k--) {
0947:                                            output
0948:                                                    .endElement(output.openElements
0949:                                                            .get(k).getName());
0950:                                        }
0951:
0952:                                        for (int k = elementsToRestart.size() - 1; k >= 0; k--) {
0953:                                            StartElementInfo startElementInfo = elementsToRestart
0954:                                                    .get(k);
0955:                                            output
0956:                                                    .startElement(
0957:                                                            startElementInfo
0958:                                                                    .getName(),
0959:                                                            startElementInfo
0960:                                                                    .getAttrs());
0961:                                        }
0962:                                        continue;
0963:                                    }
0964:                                } else if (startElement.localName.equals("br")
0965:                                        && output.openElements.size() > 1
0966:                                        && needsCleanupOfEndBrs
0967:                                                .contains(output.openElements
0968:                                                        .get(
0969:                                                                output.openElements
0970:                                                                        .size() - 1)
0971:                                                        .getName())) {
0972:                                    // this is useful to remove <br>s inside <td>s or <br>s at the end of <li>s like mozilla does
0973:                                    String elementName = output.openElements
0974:                                            .get(output.openElements.size() - 1)
0975:                                            .getName();
0976:
0977:                                    boolean nextIsEndOfElement = false;
0978:                                    int r = i + 1;
0979:                                    for (; r < bits.size(); r++) {
0980:                                        Object nextBit = bits.get(r);
0981:                                        if (nextBit instanceof  SaxBuffer.EndElement) {
0982:                                            SaxBuffer.EndElement endEl = (SaxBuffer.EndElement) nextBit;
0983:                                            if (endEl.localName.equals("br")) {
0984:                                                continue;
0985:                                            } else if (endEl.localName
0986:                                                    .equals(elementName)) {
0987:                                                nextIsEndOfElement = true;
0988:                                                break;
0989:                                            } else {
0990:                                                break;
0991:                                            }
0992:                                        } else if (nextBit instanceof  SaxBuffer.Characters
0993:                                                && isWhitespace((SaxBuffer.Characters) nextBit)) {
0994:                                            // do nothing
0995:                                        } else {
0996:                                            break;
0997:                                        }
0998:                                    }
0999:
1000:                                    if (nextIsEndOfElement) {
1001:                                        i = r - 1;
1002:                                        continue;
1003:                                    }
1004:                                }
1005:                            }
1006:
1007:                            output.startElement(startElement.localName,
1008:                                    startElement.attrs);
1009:                            endElements.push(new EndElementInfo(
1010:                                    startElement.localName));
1011:
1012:                        } else if (bit instanceof  SaxBuffer.EndElement) {
1013:                            XMLizable endElement = endElements.pop();
1014:                            endElement.toSAX(output);
1015:                        } else if (bit instanceof  SaxBuffer.Characters) {
1016:                            ((SaxBuffer.Characters) bit).send(output);
1017:                        } else if (bit instanceof  SaxBuffer.StartDocument) {
1018:                            output.startDocument();
1019:                        } else if (bit instanceof  SaxBuffer.EndDocument) {
1020:                            output.endDocument();
1021:                            // don't do any events after endDocument
1022:                            return;
1023:                        }
1024:                    }
1025:                }
1026:            }
1027:
1028:            private class CleanupWipeableEmptyElements implements  CleaningStep {
1029:
1030:                public void perform(List<SaxBuffer.SaxBit> bits,
1031:                        OutputHandler output) throws SAXException {
1032:                    Stack<XMLizable> endElements = new Stack<XMLizable>();
1033:
1034:                    int i = -1;
1035:                    while (i < bits.size()) {
1036:                        i++;
1037:                        Object bit = bits.get(i);
1038:                        if (bit instanceof  SaxBuffer.StartElement) {
1039:                            SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
1040:                            if (wipeableEmptyElements
1041:                                    .contains(startElement.localName)) {
1042:                                boolean hasWhitespace = false;
1043:                                boolean reachedEndElement = false;
1044:                                int elementNesting = 0;
1045:                                int k = i;
1046:                                while (true) {
1047:                                    k++;
1048:                                    Object nextBit = bits.get(k);
1049:                                    if (nextBit instanceof  SaxBuffer.StartElement
1050:                                            && wipeableEmptyElements
1051:                                                    .contains(((SaxBuffer.StartElement) nextBit).localName)) {
1052:                                        elementNesting++;
1053:                                    } else if (nextBit instanceof  SaxBuffer.Characters
1054:                                            && isWhitespace((SaxBuffer.Characters) nextBit)) {
1055:                                        hasWhitespace = true;
1056:                                    } else if (nextBit instanceof  SaxBuffer.EndElement
1057:                                            && elementNesting > 0) {
1058:                                        elementNesting--;
1059:                                    } else if (nextBit instanceof  SaxBuffer.EndElement
1060:                                            && elementNesting == 0) {
1061:                                        reachedEndElement = true;
1062:                                        break;
1063:                                    } else {
1064:                                        break;
1065:                                    }
1066:                                }
1067:
1068:                                if (reachedEndElement) {
1069:                                    // skip the elements
1070:                                    i = k;
1071:                                    // if the wipeable elements contained whitespace, generate a whitespace character
1072:                                    if (hasWhitespace)
1073:                                        output.characters(new char[] { ' ' },
1074:                                                0, 1);
1075:                                    continue;
1076:                                }
1077:                            }
1078:
1079:                            output.startElement(startElement.localName,
1080:                                    startElement.attrs);
1081:                            endElements.push(new EndElementInfo(
1082:                                    startElement.localName));
1083:
1084:                        } else if (bit instanceof  SaxBuffer.EndElement) {
1085:                            XMLizable endElement = endElements.pop();
1086:                            endElement.toSAX(output);
1087:                        } else if (bit instanceof  SaxBuffer.Characters) {
1088:                            ((SaxBuffer.Characters) bit).send(output);
1089:                        } else if (bit instanceof  SaxBuffer.StartDocument) {
1090:                            output.startDocument();
1091:                        } else if (bit instanceof  SaxBuffer.EndDocument) {
1092:                            output.endDocument();
1093:                            // don't do any events after endDocument
1094:                            return;
1095:                        }
1096:                    }
1097:                }
1098:            }
1099:
1100:            /**
1101:             * Changes br elements inside pre elements into newline character events.
1102:             */
1103:            private class TranslateBeeaarsInPees implements  CleaningStep {
1104:
1105:                public void perform(List<SaxBuffer.SaxBit> bits,
1106:                        OutputHandler output) throws SAXException {
1107:                    int preLevel = 0;
1108:                    int i = -1;
1109:                    while (i < bits.size()) {
1110:                        i++;
1111:                        Object bit = bits.get(i);
1112:                        if (bit instanceof  SaxBuffer.StartElement) {
1113:                            SaxBuffer.StartElement startElement = (SaxBuffer.StartElement) bit;
1114:                            if (startElement.localName.equals("pre")) {
1115:                                preLevel++;
1116:                            } else if (preLevel > 0
1117:                                    && startElement.localName.equals("br")) {
1118:                                // normally an opening br should be immediatelly followed by the closing br,
1119:                                // so let us restrict us to that case
1120:                                Object nextBit = bits.get(i + 1);
1121:                                if (nextBit instanceof  SaxBuffer.EndElement
1122:                                        && ((SaxBuffer.EndElement) nextBit).localName
1123:                                                .equals("br")) {
1124:                                    // replace this br by a newline
1125:                                    output.characters(NEWLINE, 0, 1);
1126:                                    i++;
1127:                                    continue;
1128:                                }
1129:                            }
1130:
1131:                            output.startElement(startElement.localName,
1132:                                    startElement.attrs);
1133:
1134:                        } else if (bit instanceof  SaxBuffer.EndElement) {
1135:                            SaxBuffer.EndElement endElement = (SaxBuffer.EndElement) bit;
1136:                            if (endElement.localName.equals("pre")) {
1137:                                preLevel--;
1138:                            }
1139:                            output.endElement(endElement.localName);
1140:                        } else if (bit instanceof  SaxBuffer.Characters) {
1141:                            ((SaxBuffer.Characters) bit).send(output);
1142:                        } else if (bit instanceof  SaxBuffer.StartDocument) {
1143:                            output.startDocument();
1144:                        } else if (bit instanceof  SaxBuffer.EndDocument) {
1145:                            output.endDocument();
1146:                            // don't do any events after endDocument
1147:                            return;
1148:                        }
1149:                    }
1150:                }
1151:            }
1152:
1153:            /**
1154:             * Removes a "\n" if it occurs right before a closing pre tag. Such a newline has
1155:             * no meaning. This is often inserted by Firefox, and causes layout troubles in
1156:             * Internet Explorer (subsequent block elements are extra indented, though they
1157:             * shift left again once you start typing in them).
1158:             */
1159:            private class CleanupNewlineAtEndOfPre implements  CleaningStep {
1160:
1161:                public void perform(List<SaxBuffer.SaxBit> bits,
1162:                        OutputHandler output) throws SAXException {
1163:                    int i = 0;
1164:                    while (i < bits.size()) {
1165:                        SaxBuffer.SaxBit bit = bits.get(i);
1166:                        if (bit instanceof  SaxBuffer.Characters
1167:                                && i < bits.size() - 1
1168:                                && bits.get(i + 1) instanceof  SaxBuffer.EndElement
1169:                                && ((SaxBuffer.EndElement) bits.get(i + 1)).localName
1170:                                        .equals("pre")) {
1171:                            SaxBuffer.Characters characters = (SaxBuffer.Characters) bit;
1172:                            char[] ch = characters.ch;
1173:                            if (ch.length > 0
1174:                                    && ch[ch.length - 1] == '\n'
1175:                                    && (ch.length <= 1 || ch[ch.length - 2] != '\n')) {
1176:                                output.characters(characters.ch, 0,
1177:                                        characters.ch.length - 1);
1178:                            } else {
1179:                                characters.send(output);
1180:                            }
1181:                        } else {
1182:                            bit.send(output);
1183:                        }
1184:                        i++;
1185:                    }
1186:                }
1187:            }
1188:
1189:            private boolean isWhitespace(SaxBuffer.Characters characters) {
1190:                for (char ch : characters.ch) {
1191:                    if (!(Character.isWhitespace(ch) || ch == (char) 160)) // 160 is &nbsp;
1192:                        return false;
1193:                }
1194:                return true;
1195:            }
1196:
1197:            /**
1198:             *
1199:             * @param bits SaxBuffer bits
1200:             * @param index the index of the current start element, of which we want to find the sibling
1201:             */
1202:            private int searchSibling(List bits, int index) {
1203:                int nesting = 0;
1204:                boolean passedEndElement = false;
1205:                for (int i = index + 1; i < bits.size(); i++) {
1206:                    Object bit = bits.get(i);
1207:                    if (bit instanceof  SaxBuffer.StartElement) {
1208:                        if (passedEndElement)
1209:                            return i;
1210:                        else
1211:                            nesting++;
1212:                    } else if (bit instanceof  SaxBuffer.EndElement) {
1213:                        if (nesting == 0)
1214:                            passedEndElement = true;
1215:                        else
1216:                            nesting--;
1217:                    }
1218:                }
1219:                return -1;
1220:            }
1221:
1222:            /**
1223:             *
1224:             * @param bits SaxBuffer bits
1225:             * @param index the index of the current start element, of which we want to find the end element
1226:             */
1227:            private int searchEndElement(List bits, int index) {
1228:                int nesting = 0;
1229:                for (int i = index + 1; i < bits.size(); i++) {
1230:                    Object bit = bits.get(i);
1231:                    if (bit instanceof  SaxBuffer.StartElement) {
1232:                        nesting++;
1233:                    } else if (bit instanceof  SaxBuffer.EndElement) {
1234:                        if (nesting == 0)
1235:                            return i;
1236:                    }
1237:                }
1238:                return -1;
1239:            }
1240:
1241:            private static class CleaningPipe {
1242:                private List<CleaningStep> steps = new ArrayList<CleaningStep>();
1243:
1244:                void addCleaningStep(CleaningStep step) {
1245:                    steps.add(step);
1246:                }
1247:
1248:                /**
1249:                 *
1250:                 * @param result where the result of the last cleanup step should be sent to
1251:                 */
1252:                void execute(List<SaxBuffer.SaxBit> startBits,
1253:                        ContentHandler result) throws SAXException {
1254:                    List<SaxBuffer.SaxBit> bits = startBits;
1255:                    SaxBuffer buffer;
1256:                    for (int i = 0; i < steps.size(); i++) {
1257:                        buffer = new SaxBuffer();
1258:                        OutputHandler handler = new OutputHandler(i == steps
1259:                                .size() - 1 ? result : buffer);
1260:                        steps.get(i).perform(bits, handler);
1261:                        bits = buffer.getBits();
1262:                    }
1263:                }
1264:            }
1265:
1266:            private interface CleaningStep {
1267:                void perform(List<SaxBuffer.SaxBit> bits, OutputHandler output)
1268:                        throws SAXException;
1269:            }
1270:
1271:            private static class OutputHandler extends ForwardingContentHandler {
1272:                private List<StartElementInfo> openElements = new ArrayList<StartElementInfo>();
1273:
1274:                public OutputHandler(ContentHandler consumer) {
1275:                    super (consumer);
1276:                }
1277:
1278:                public void startElement(String name, Attributes attrs)
1279:                        throws SAXException {
1280:                    super .startElement("", name, name, attrs);
1281:                    openElements.add(new StartElementInfo(name, attrs));
1282:                }
1283:
1284:                public void endElement(String name) throws SAXException {
1285:                    super .endElement("", name, name);
1286:                    String removed = openElements.remove(
1287:                            openElements.size() - 1).getName();
1288:                    if (!removed.equals(name)) {
1289:                        throw new SAXException("The close tag \"" + name
1290:                                + "\" did not match the open tag \"" + removed
1291:                                + "\".");
1292:                    }
1293:                }
1294:
1295:            }
1296:
1297:            private static class StartElementInfo implements  XMLizable {
1298:                private final String name;
1299:                private final Attributes attrs;
1300:
1301:                public StartElementInfo(String name, Attributes attrs) {
1302:                    this .name = name;
1303:                    this .attrs = attrs;
1304:                }
1305:
1306:                public String getName() {
1307:                    return name;
1308:                }
1309:
1310:                public Attributes getAttrs() {
1311:                    return attrs;
1312:                }
1313:
1314:                public void toSAX(OutputHandler contentHandler)
1315:                        throws SAXException {
1316:                    contentHandler.startElement(name, attrs);
1317:                }
1318:            }
1319:
1320:            private static class EndElementInfo implements  XMLizable {
1321:                private final boolean skip;
1322:                private final String localName;
1323:
1324:                public EndElementInfo() {
1325:                    this .skip = true;
1326:                    this .localName = null;
1327:                }
1328:
1329:                public EndElementInfo(String localName) {
1330:                    this .skip = false;
1331:                    this .localName = localName;
1332:                }
1333:
1334:                public void toSAX(OutputHandler contentHandler)
1335:                        throws SAXException {
1336:                    if (!skip) {
1337:                        contentHandler.endElement(localName);
1338:                    }
1339:                }
1340:            }
1341:
1342:            private final class MultiEndElementInfo implements  XMLizable {
1343:                private List<XMLizable> tags = new ArrayList<XMLizable>(2);
1344:
1345:                public void add(EndElementInfo endElement) {
1346:                    this .tags.add(endElement);
1347:                }
1348:
1349:                public void add(StartElementInfo endElement) {
1350:                    this .tags.add(endElement);
1351:                }
1352:
1353:                public void toSAX(OutputHandler contentHandler)
1354:                        throws SAXException {
1355:                    for (int i = tags.size() - 1; i >= 0; i--) {
1356:                        XMLizable tag = tags.get(i);
1357:                        tag.toSAX(contentHandler);
1358:                    }
1359:                }
1360:            }
1361:
1362:            interface XMLizable {
1363:                public void toSAX(OutputHandler contentHandler)
1364:                        throws SAXException;
1365:            }
1366:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.