0001: /*
0002: * Copyright 2002-2008 Andy Clark
0003: *
0004: * Licensed under the Apache License, Version 2.0 (the "License");
0005: * you may not use this file except in compliance with the License.
0006: * You may obtain a copy of the License at
0007: *
0008: * http://www.apache.org/licenses/LICENSE-2.0
0009: *
0010: * Unless required by applicable law or agreed to in writing, software
0011: * distributed under the License is distributed on an "AS IS" BASIS,
0012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0013: * See the License for the specific language governing permissions and
0014: * limitations under the License.
0015: */
0016:
0017: package org.cyberneko.html;
0018:
0019: import java.lang.reflect.InvocationTargetException;
0020: import java.lang.reflect.Method;
0021:
0022: import org.apache.xerces.util.XMLAttributesImpl;
0023: import org.apache.xerces.xni.Augmentations;
0024: import org.apache.xerces.xni.NamespaceContext;
0025: import org.apache.xerces.xni.QName;
0026: import org.apache.xerces.xni.XMLAttributes;
0027: import org.apache.xerces.xni.XMLDocumentHandler;
0028: import org.apache.xerces.xni.XMLLocator;
0029: import org.apache.xerces.xni.XMLResourceIdentifier;
0030: import org.apache.xerces.xni.XMLString;
0031: import org.apache.xerces.xni.XNIException;
0032: import org.apache.xerces.xni.parser.XMLComponentManager;
0033: import org.apache.xerces.xni.parser.XMLConfigurationException;
0034: import org.apache.xerces.xni.parser.XMLDocumentFilter;
0035: import org.apache.xerces.xni.parser.XMLDocumentSource;
0036:
0037: /**
0038: * Balances tags in an HTML document. This component receives document events
0039: * and tries to correct many common mistakes that human (and computer) HTML
0040: * document authors make. This tag balancer can:
0041: * <ul>
0042: * <li>add missing parent elements;
0043: * <li>automatically close elements with optional end tags; and
0044: * <li>handle mis-matched inline element tags.
0045: * </ul>
0046: * <p>
0047: * This component recognizes the following features:
0048: * <ul>
0049: * <li>http://cyberneko.org/html/features/augmentations
0050: * <li>http://cyberneko.org/html/features/report-errors
0051: * <li>http://cyberneko.org/html/features/balance-tags/document-fragment
0052: * <li>http://cyberneko.org/html/features/balance-tags/ignore-outside-content
0053: * </ul>
0054: * <p>
0055: * This component recognizes the following properties:
0056: * <ul>
0057: * <li>http://cyberneko.org/html/properties/names/elems
0058: * <li>http://cyberneko.org/html/properties/names/attrs
0059: * <li>http://cyberneko.org/html/properties/error-reporter
0060: * </ul>
0061: *
0062: * @see HTMLElements
0063: *
0064: * @author Andy Clark
0065: *
0066: * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $
0067: */
0068: public class HTMLTagBalancer implements XMLDocumentFilter,
0069: HTMLComponent {
0070:
0071: //
0072: // Constants
0073: //
0074:
0075: // features
0076:
0077: /** Namespaces. */
0078: protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
0079:
0080: /** Include infoset augmentations. */
0081: protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
0082:
0083: /** Report errors. */
0084: protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
0085:
0086: /** Document fragment balancing only (deprecated). */
0087: protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment";
0088:
0089: /** Document fragment balancing only. */
0090: protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment";
0091:
0092: /** Ignore outside content. */
0093: protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content";
0094:
0095: /** Recognized features. */
0096: private static final String[] RECOGNIZED_FEATURES = { NAMESPACES,
0097: AUGMENTATIONS, REPORT_ERRORS, DOCUMENT_FRAGMENT_DEPRECATED,
0098: DOCUMENT_FRAGMENT, IGNORE_OUTSIDE_CONTENT, };
0099:
0100: /** Recognized features defaults. */
0101: private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
0102: null, null, null, null, Boolean.FALSE, Boolean.FALSE, };
0103:
0104: // properties
0105:
0106: /** Modify HTML element names: { "upper", "lower", "default" }. */
0107: protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
0108:
0109: /** Modify HTML attribute names: { "upper", "lower", "default" }. */
0110: protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
0111:
0112: /** Error reporter. */
0113: protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
0114:
0115: /** Recognized properties. */
0116: private static final String[] RECOGNIZED_PROPERTIES = {
0117: NAMES_ELEMS, NAMES_ATTRS, ERROR_REPORTER, };
0118:
0119: /** Recognized properties defaults. */
0120: private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
0121: null, null, null, };
0122:
0123: // modify HTML names
0124:
0125: /** Don't modify HTML names. */
0126: protected static final short NAMES_NO_CHANGE = 0;
0127:
0128: /** Match HTML element names. */
0129: protected static final short NAMES_MATCH = 0;
0130:
0131: /** Uppercase HTML names. */
0132: protected static final short NAMES_UPPERCASE = 1;
0133:
0134: /** Lowercase HTML names. */
0135: protected static final short NAMES_LOWERCASE = 2;
0136:
0137: // static vars
0138:
0139: /** Synthesized event info item. */
0140: protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();
0141:
0142: //
0143: // Data
0144: //
0145:
0146: // features
0147:
0148: /** Namespaces. */
0149: protected boolean fNamespaces;
0150:
0151: /** Include infoset augmentations. */
0152: protected boolean fAugmentations;
0153:
0154: /** Report errors. */
0155: protected boolean fReportErrors;
0156:
0157: /** Document fragment balancing only. */
0158: protected boolean fDocumentFragment;
0159:
0160: /** Ignore outside content. */
0161: protected boolean fIgnoreOutsideContent;
0162:
0163: // properties
0164:
0165: /** Modify HTML element names. */
0166: protected short fNamesElems;
0167:
0168: /** Modify HTML attribute names. */
0169: protected short fNamesAttrs;
0170:
0171: /** Error reporter. */
0172: protected HTMLErrorReporter fErrorReporter;
0173:
0174: // connections
0175:
0176: /** The document source. */
0177: protected XMLDocumentSource fDocumentSource;
0178:
0179: /** The document handler. */
0180: protected XMLDocumentHandler fDocumentHandler;
0181:
0182: // state
0183:
0184: /** The element stack. */
0185: protected final InfoStack fElementStack = new InfoStack();
0186:
0187: /** The inline stack. */
0188: protected final InfoStack fInlineStack = new InfoStack();
0189:
0190: /** True if seen anything. Important for xml declaration. */
0191: protected boolean fSeenAnything;
0192:
0193: /** True if a doctype declaration has been seen. */
0194: protected boolean fSeenDoctype;
0195:
0196: /** True if root element has been seen. */
0197: protected boolean fSeenRootElement;
0198:
0199: /**
0200: * True if seen the end of the document element. In other words,
0201: * this variable is set to false <em>until</em> the end </HTML>
0202: * tag is seen (or synthesized). This is used to ensure that
0203: * extraneous events after the end of the document element do not
0204: * make the document stream ill-formed.
0205: */
0206: protected boolean fSeenRootElementEnd;
0207:
0208: /** True if seen &lt;head&gt; element. */
0209: protected boolean fSeenHeadElement;
0210:
0211: /** True if seen &lt;body&gt; element. */
0212: protected boolean fSeenBodyElement;
0213:
0214: /** True if a form is in the stack (allow to discard opening of nested forms) */
0215: protected boolean fOpenedForm;
0216:
0217: // temp vars
0218:
0219: /** A qualified name. */
0220: private final QName fQName = new QName();
0221:
0222: /** Empty attributes. */
0223: private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl();
0224:
0225: /** Augmentations. */
0226: private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
0227:
0228: protected HTMLTagBalancingListener tagBalancingListener;
0229:
0230: //
0231: // HTMLComponent methods
0232: //
0233:
0234: /** Returns the default state for a feature. */
0235: public Boolean getFeatureDefault(String featureId) {
0236: int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length
0237: : 0;
0238: for (int i = 0; i < length; i++) {
0239: if (RECOGNIZED_FEATURES[i].equals(featureId)) {
0240: return RECOGNIZED_FEATURES_DEFAULTS[i];
0241: }
0242: }
0243: return null;
0244: } // getFeatureDefault(String):Boolean
0245:
0246: /** Returns the default state for a property. */
0247: public Object getPropertyDefault(String propertyId) {
0248: int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length
0249: : 0;
0250: for (int i = 0; i < length; i++) {
0251: if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
0252: return RECOGNIZED_PROPERTIES_DEFAULTS[i];
0253: }
0254: }
0255: return null;
0256: } // getPropertyDefault(String):Object
0257:
0258: //
0259: // XMLComponent methods
0260: //
0261:
0262: /** Returns recognized features. */
0263: public String[] getRecognizedFeatures() {
0264: return RECOGNIZED_FEATURES;
0265: } // getRecognizedFeatures():String[]
0266:
0267: /** Returns recognized properties. */
0268: public String[] getRecognizedProperties() {
0269: return RECOGNIZED_PROPERTIES;
0270: } // getRecognizedProperties():String[]
0271:
0272: /** Resets the component. */
0273: public void reset(XMLComponentManager manager)
0274: throws XMLConfigurationException {
0275:
0276: // get features
0277: fNamespaces = manager.getFeature(NAMESPACES);
0278: fAugmentations = manager.getFeature(AUGMENTATIONS);
0279: fReportErrors = manager.getFeature(REPORT_ERRORS);
0280: fDocumentFragment = manager.getFeature(DOCUMENT_FRAGMENT)
0281: || manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED);
0282: fIgnoreOutsideContent = manager
0283: .getFeature(IGNORE_OUTSIDE_CONTENT);
0284:
0285: // get properties
0286: fNamesElems = getNamesValue(String.valueOf(manager
0287: .getProperty(NAMES_ELEMS)));
0288: fNamesAttrs = getNamesValue(String.valueOf(manager
0289: .getProperty(NAMES_ATTRS)));
0290: fErrorReporter = (HTMLErrorReporter) manager
0291: .getProperty(ERROR_REPORTER);
0292:
0293: } // reset(XMLComponentManager)
0294:
0295: /** Sets a feature. */
0296: public void setFeature(String featureId, boolean state)
0297: throws XMLConfigurationException {
0298:
0299: if (featureId.equals(AUGMENTATIONS)) {
0300: fAugmentations = state;
0301: return;
0302: }
0303: if (featureId.equals(REPORT_ERRORS)) {
0304: fReportErrors = state;
0305: return;
0306: }
0307: if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) {
0308: fIgnoreOutsideContent = state;
0309: return;
0310: }
0311:
0312: } // setFeature(String,boolean)
0313:
0314: /** Sets a property. */
0315: public void setProperty(String propertyId, Object value)
0316: throws XMLConfigurationException {
0317:
0318: if (propertyId.equals(NAMES_ELEMS)) {
0319: fNamesElems = getNamesValue(String.valueOf(value));
0320: return;
0321: }
0322:
0323: if (propertyId.equals(NAMES_ATTRS)) {
0324: fNamesAttrs = getNamesValue(String.valueOf(value));
0325: return;
0326: }
0327:
0328: } // setProperty(String,Object)
0329:
0330: //
0331: // XMLDocumentSource methods
0332: //
0333:
0334: /** Sets the document handler. */
0335: public void setDocumentHandler(XMLDocumentHandler handler) {
0336: fDocumentHandler = handler;
0337: } // setDocumentHandler(XMLDocumentHandler)
0338:
0339: // @since Xerces 2.1.0
0340:
0341: /** Returns the document handler. */
0342: public XMLDocumentHandler getDocumentHandler() {
0343: return fDocumentHandler;
0344: } // getDocumentHandler():XMLDocumentHandler
0345:
0346: //
0347: // XMLDocumentHandler methods
0348: //
0349:
0350: // since Xerces-J 2.2.0
0351:
0352: /** Start document. */
0353: public void startDocument(XMLLocator locator, String encoding,
0354: NamespaceContext nscontext, Augmentations augs)
0355: throws XNIException {
0356:
0357: // reset state
0358: fElementStack.top = 0;
0359: fSeenAnything = false;
0360: fSeenDoctype = false;
0361: fSeenRootElement = false;
0362: fSeenRootElementEnd = false;
0363: fSeenHeadElement = false;
0364: fSeenBodyElement = false;
0365:
0366: // pass on event
0367: if (fDocumentHandler != null) {
0368: try {
0369: // NOTE: Hack to allow the default filter to work with
0370: // old and new versions of the XNI document handler
0371: // interface. -Ac
0372: Class cls = fDocumentHandler.getClass();
0373: Class[] types = { XMLLocator.class, String.class,
0374: NamespaceContext.class, Augmentations.class };
0375: Method method = cls.getMethod("startDocument", types);
0376: Object[] params = { locator, encoding, nscontext, augs };
0377: method.invoke(fDocumentHandler, params);
0378: } catch (IllegalAccessException e) {
0379: throw new XNIException(e);
0380: } catch (InvocationTargetException e) {
0381: throw new XNIException(e);
0382: } catch (NoSuchMethodException e) {
0383: try {
0384: // NOTE: Hack to allow the default filter to work with
0385: // old and new versions of the XNI document handler
0386: // interface. -Ac
0387: Class cls = fDocumentHandler.getClass();
0388: Class[] types = { XMLLocator.class, String.class,
0389: Augmentations.class };
0390: Method method = cls.getMethod("startDocument",
0391: types);
0392: Object[] params = { locator, encoding, augs };
0393: method.invoke(fDocumentHandler, params);
0394: } catch (IllegalAccessException ex) {
0395: // NOTE: Should never reach here!
0396: throw new XNIException(ex);
0397: } catch (InvocationTargetException ex) {
0398: // NOTE: Should never reach here!
0399: throw new XNIException(ex);
0400: } catch (NoSuchMethodException ex) {
0401: // NOTE: Should never reach here!
0402: throw new XNIException(ex);
0403: }
0404: }
0405: }
0406:
0407: } // startDocument(XMLLocator,String,Augmentations)
0408:
0409: // old methods
0410:
0411: /** XML declaration. */
0412: public void xmlDecl(String version, String encoding,
0413: String standalone, Augmentations augs) throws XNIException {
0414: if (!fSeenAnything && fDocumentHandler != null) {
0415: fDocumentHandler.xmlDecl(version, encoding, standalone,
0416: augs);
0417: }
0418: } // xmlDecl(String,String,String,Augmentations)
0419:
0420: /** Doctype declaration. */
0421: public void doctypeDecl(String rootElementName, String publicId,
0422: String systemId, Augmentations augs) throws XNIException {
0423: fSeenAnything = true;
0424: if (fReportErrors) {
0425: if (fSeenRootElement) {
0426: fErrorReporter.reportError("HTML2010", null);
0427: } else if (fSeenDoctype) {
0428: fErrorReporter.reportError("HTML2011", null);
0429: }
0430: }
0431: if (!fSeenRootElement && !fSeenDoctype) {
0432: fSeenDoctype = true;
0433: if (fDocumentHandler != null) {
0434: fDocumentHandler.doctypeDecl(rootElementName, publicId,
0435: systemId, augs);
0436: }
0437: }
0438: } // doctypeDecl(String,String,String,Augmentations)
0439:
0440: /** End document. */
0441: public void endDocument(Augmentations augs) throws XNIException {
0442:
0443: // handle empty document
0444: if (!fSeenRootElement && !fDocumentFragment) {
0445: if (fReportErrors) {
0446: fErrorReporter.reportError("HTML2000", null);
0447: }
0448: String ename = modifyName("html", fNamesElems);
0449: fQName.setValues(null, ename, ename, null);
0450: if (fDocumentHandler != null) {
0451: callStartElement(fQName, emptyAttributes(),
0452: synthesizedAugs());
0453: callEndElement(fQName, synthesizedAugs());
0454: }
0455: }
0456:
0457: // pop all remaining elements
0458: else {
0459: int length = fElementStack.top;
0460: for (int i = 0; i < length; i++) {
0461: Info info = fElementStack.pop();
0462: if (fReportErrors) {
0463: String ename = info.qname.rawname;
0464: fErrorReporter.reportWarning("HTML2001",
0465: new Object[] { ename });
0466: }
0467: if (fDocumentHandler != null) {
0468: callEndElement(info.qname, synthesizedAugs());
0469: }
0470: }
0471: }
0472:
0473: // call handler
0474: if (fDocumentHandler != null) {
0475: fDocumentHandler.endDocument(augs);
0476: }
0477:
0478: } // endDocument(Augmentations)
0479:
0480: /** Comment. */
0481: public void comment(XMLString text, Augmentations augs)
0482: throws XNIException {
0483: fSeenAnything = true;
0484: if (fDocumentHandler != null) {
0485: fDocumentHandler.comment(text, augs);
0486: }
0487: } // comment(XMLString,Augmentations)
0488:
0489: /** Processing instruction. */
0490: public void processingInstruction(String target, XMLString data,
0491: Augmentations augs) throws XNIException {
0492: fSeenAnything = true;
0493: if (fDocumentHandler != null) {
0494: fDocumentHandler.processingInstruction(target, data, augs);
0495: }
0496: } // processingInstruction(String,XMLString,Augmentations)
0497:
0498: /** Start element. */
0499: public void startElement(final QName elem, XMLAttributes attrs,
0500: final Augmentations augs) throws XNIException {
0501: fSeenAnything = true;
0502:
0503: // check for end of document
0504: if (fSeenRootElementEnd) {
0505: notifyDiscardedStartElement(elem, attrs, augs);
0506: return;
0507: }
0508:
0509: // get element information
0510: HTMLElements.Element element = getElement(elem.rawname);
0511:
0512: // ignore multiple html, head, body elements
0513: if (fSeenRootElement && element.code == HTMLElements.HTML) {
0514: notifyDiscardedStartElement(elem, attrs, augs);
0515: return;
0516: }
0517: if (element.code == HTMLElements.HEAD) {
0518: if (fSeenHeadElement) {
0519: notifyDiscardedStartElement(elem, attrs, augs);
0520: return;
0521: }
0522: fSeenHeadElement = true;
0523: } else if (element.code == HTMLElements.BODY) {
0524: if (fSeenBodyElement) {
0525: notifyDiscardedStartElement(elem, attrs, augs);
0526: return;
0527: }
0528: fSeenBodyElement = true;
0529: } else if (element.code == HTMLElements.FORM) {
0530: if (fOpenedForm) {
0531: notifyDiscardedStartElement(elem, attrs, augs);
0532: return;
0533: }
0534: fOpenedForm = true;
0535: }
0536:
0537: // check proper parent
0538: if (element.parent != null) {
0539: if (!fSeenRootElement && !fDocumentFragment) {
0540: String pname = element.parent[0].name;
0541: pname = modifyName(pname, fNamesElems);
0542: if (fReportErrors) {
0543: String ename = elem.rawname;
0544: fErrorReporter.reportWarning("HTML2002",
0545: new Object[] { ename, pname });
0546: }
0547: QName qname = new QName(null, pname, pname, null);
0548: startElement(qname, null, synthesizedAugs());
0549: } else {
0550: HTMLElements.Element pelement = element.parent[0];
0551: if (pelement.code != HTMLElements.HEAD
0552: || (!fSeenBodyElement && !fDocumentFragment)) {
0553: int depth = getParentDepth(element.parent,
0554: element.bounds);
0555: if (depth == -1) {
0556: String pname = pelement.name;
0557: pname = modifyName(pname, fNamesElems);
0558: int pdepth = getParentDepth(pelement.parent,
0559: pelement.bounds);
0560: if (pdepth != -1) {
0561: QName qname = new QName(null, pname, pname,
0562: null);
0563: if (fReportErrors) {
0564: String ename = elem.rawname;
0565: fErrorReporter.reportWarning(
0566: "HTML2004", new Object[] {
0567: ename, pname });
0568: }
0569: startElement(qname, null, synthesizedAugs());
0570: }
0571: }
0572: }
0573: }
0574: }
0575:
0576: // if block element, save immediate parent inline elements
0577: int depth = 0;
0578: if (element.flags == 0) {
0579: int length = fElementStack.top;
0580: fInlineStack.top = 0;
0581: for (int i = length - 1; i >= 0; i--) {
0582: Info info = fElementStack.data[i];
0583: if (!info.element.isInline()) {
0584: break;
0585: }
0586: fInlineStack.push(info);
0587: endElement(info.qname, synthesizedAugs());
0588: }
0589: depth = fInlineStack.top;
0590: }
0591:
0592: // close previous elements
0593: if (element.closes != null) {
0594: int length = fElementStack.top;
0595: for (int i = length - 1; i >= 0; i--) {
0596: Info info = fElementStack.data[i];
0597:
0598: // does it close the element we're looking at?
0599: if (element.closes(info.element.code)) {
0600: if (fReportErrors) {
0601: String ename = elem.rawname;
0602: String iname = info.qname.rawname;
0603: fErrorReporter.reportWarning("HTML2005",
0604: new Object[] { ename, iname });
0605: }
0606: for (int j = length - 1; j >= i; j--) {
0607: info = fElementStack.pop();
0608: if (fDocumentHandler != null) {
0609: // PATCH: Marc-André Morissette
0610: callEndElement(info.qname,
0611: synthesizedAugs());
0612: }
0613: }
0614: length = i;
0615: continue;
0616: }
0617:
0618: // should we stop searching?
0619: boolean container = info.element.isContainer();
0620: boolean parent = false;
0621: if (!container) {
0622: for (int j = 0; j < element.parent.length; j++) {
0623: parent = parent
0624: || info.element.code == element.parent[j].code;
0625: }
0626: }
0627: if (container || parent) {
0628: break;
0629: }
0630: }
0631: }
0632:
0633: // call handler
0634: fSeenRootElement = true;
0635: if (element != null && element.isEmpty()) {
0636: if (attrs == null) {
0637: attrs = emptyAttributes();
0638: }
0639: if (fDocumentHandler != null) {
0640: fDocumentHandler.emptyElement(elem, attrs, augs);
0641: }
0642: } else {
0643: boolean inline = element != null && element.isInline();
0644: fElementStack.push(new Info(element, elem, inline ? attrs
0645: : null));
0646: if (attrs == null) {
0647: attrs = emptyAttributes();
0648: }
0649: if (fDocumentHandler != null) {
0650: callStartElement(elem, attrs, augs);
0651: }
0652: }
0653:
0654: // re-open inline elements
0655: for (int i = 0; i < depth; i++) {
0656: Info info = fInlineStack.pop();
0657: startElement(info.qname, info.attributes, synthesizedAugs());
0658: }
0659:
0660: } // startElement(QName,XMLAttributes,Augmentations)
0661:
0662: /** Empty element. */
0663: public void emptyElement(QName elem, XMLAttributes attrs,
0664: Augmentations augs) throws XNIException {
0665: startElement(elem, attrs, augs);
0666: endElement(elem, augs);
0667: } // emptyElement(QName,XMLAttributes,Augmentations)
0668:
0669: /** Start entity. */
0670: public void startGeneralEntity(String name,
0671: XMLResourceIdentifier id, String encoding,
0672: Augmentations augs) throws XNIException {
0673: fSeenAnything = true;
0674:
0675: // check for end of document
0676: if (fSeenRootElementEnd) {
0677: return;
0678: }
0679:
0680: // insert body, if needed
0681: if (!fDocumentFragment) {
0682: boolean insertBody = !fSeenRootElement;
0683: if (!insertBody) {
0684: Info info = fElementStack.peek();
0685: if (info.element.code == HTMLElements.HEAD
0686: || info.element.code == HTMLElements.HTML) {
0687: String hname = modifyName("head", fNamesElems);
0688: String bname = modifyName("body", fNamesElems);
0689: if (fReportErrors) {
0690: fErrorReporter.reportWarning("HTML2009",
0691: new Object[] { hname, bname });
0692: }
0693: fQName.setValues(null, hname, hname, null);
0694: endElement(fQName, synthesizedAugs());
0695: insertBody = true;
0696: }
0697: }
0698: if (insertBody) {
0699: String ename = modifyName("body", fNamesElems);
0700: fQName.setValues(null, ename, ename, null);
0701: if (fReportErrors) {
0702: fErrorReporter.reportWarning("HTML2006",
0703: new Object[] { ename });
0704: }
0705: startElement(fQName, null, synthesizedAugs());
0706: }
0707: }
0708:
0709: // call handler
0710: if (fDocumentHandler != null) {
0711: fDocumentHandler.startGeneralEntity(name, id, encoding,
0712: augs);
0713: }
0714:
0715: } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
0716:
0717: /** Text declaration. */
0718: public void textDecl(String version, String encoding,
0719: Augmentations augs) throws XNIException {
0720: fSeenAnything = true;
0721:
0722: // check for end of document
0723: if (fSeenRootElementEnd) {
0724: return;
0725: }
0726:
0727: // call handler
0728: if (fDocumentHandler != null) {
0729: fDocumentHandler.textDecl(version, encoding, augs);
0730: }
0731:
0732: } // textDecl(String,String,Augmentations)
0733:
0734: /** End entity. */
0735: public void endGeneralEntity(String name, Augmentations augs)
0736: throws XNIException {
0737:
0738: // check for end of document
0739: if (fSeenRootElementEnd) {
0740: return;
0741: }
0742:
0743: // call handler
0744: if (fDocumentHandler != null) {
0745: fDocumentHandler.endGeneralEntity(name, augs);
0746: }
0747:
0748: } // endGeneralEntity(String,Augmentations)
0749:
0750: /** Start CDATA section. */
0751: public void startCDATA(Augmentations augs) throws XNIException {
0752: fSeenAnything = true;
0753:
0754: // check for end of document
0755: if (fSeenRootElementEnd) {
0756: return;
0757: }
0758:
0759: // call handler
0760: if (fDocumentHandler != null) {
0761: fDocumentHandler.startCDATA(augs);
0762: }
0763:
0764: } // startCDATA(Augmentations)
0765:
0766: /** End CDATA section. */
0767: public void endCDATA(Augmentations augs) throws XNIException {
0768:
0769: // check for end of document
0770: if (fSeenRootElementEnd) {
0771: return;
0772: }
0773:
0774: // call handler
0775: if (fDocumentHandler != null) {
0776: fDocumentHandler.endCDATA(augs);
0777: }
0778:
0779: } // endCDATA(Augmentations)
0780:
0781: /** Characters. */
0782: public void characters(XMLString text, Augmentations augs)
0783: throws XNIException {
0784:
0785: // check for end of document
0786: if (fSeenRootElementEnd) {
0787: return;
0788: }
0789:
0790: // is this text whitespace?
0791: boolean whitespace = true;
0792: for (int i = 0; i < text.length; i++) {
0793: if (!Character.isWhitespace(text.ch[text.offset + i])) {
0794: whitespace = false;
0795: break;
0796: }
0797: }
0798:
0799: if (!fDocumentFragment) {
0800: // handle bare characters
0801: if (!fSeenRootElement) {
0802: if (whitespace) {
0803: return;
0804: }
0805: String ename = modifyName("body", fNamesElems);
0806: fQName.setValues(null, ename, ename, null);
0807: if (fReportErrors) {
0808: fErrorReporter.reportWarning("HTML2006",
0809: new Object[] { ename });
0810: }
0811: startElement(fQName, null, synthesizedAugs());
0812: }
0813:
0814: // handle character content in head
0815: // NOTE: This fequently happens when the document looks like:
0816: // <title>Title</title>
0817: // And here's some text.
0818: else if (!whitespace) {
0819: Info info = fElementStack.peek();
0820: if (info.element.code == HTMLElements.HEAD
0821: || info.element.code == HTMLElements.HTML) {
0822: String hname = modifyName("head", fNamesElems);
0823: String bname = modifyName("body", fNamesElems);
0824: if (fReportErrors) {
0825: fErrorReporter.reportWarning("HTML2009",
0826: new Object[] { hname, bname });
0827: }
0828: fQName.setValues(null, hname, hname, null);
0829: endElement(fQName, synthesizedAugs());
0830: fQName.setValues(null, bname, bname, null);
0831: startElement(fQName, null, synthesizedAugs());
0832: }
0833: }
0834: }
0835:
0836: // call handler
0837: if (fDocumentHandler != null) {
0838: fDocumentHandler.characters(text, augs);
0839: }
0840:
0841: } // characters(XMLString,Augmentations)
0842:
0843: /** Ignorable whitespace. */
0844: public void ignorableWhitespace(XMLString text, Augmentations augs)
0845: throws XNIException {
0846: characters(text, augs);
0847: } // ignorableWhitespace(XMLString,Augmentations)
0848:
0849: /** End element. */
0850: public void endElement(final QName element, final Augmentations augs)
0851: throws XNIException {
0852: // is there anything to do?
0853: if (fSeenRootElementEnd) {
0854: notifyDiscardedEndElement(element, augs);
0855: return;
0856: }
0857:
0858: // get element information
0859: HTMLElements.Element elem = getElement(element.rawname);
0860:
0861: // do we ignore outside content?
0862: if (!fIgnoreOutsideContent
0863: && (elem.code == HTMLElements.BODY || elem.code == HTMLElements.HTML)) {
0864: notifyDiscardedEndElement(element, augs);
0865: return;
0866: }
0867:
0868: // check for end of document
0869: if (elem.code == HTMLElements.HTML) {
0870: fSeenRootElementEnd = true;
0871: } else if (elem.code == HTMLElements.FORM) {
0872: fOpenedForm = false;
0873: }
0874:
0875: // empty element
0876: int depth = getElementDepth(elem);
0877: if (depth == -1) {
0878: if (elem.code == HTMLElements.P) {
0879: startElement(element, emptyAttributes(),
0880: synthesizedAugs());
0881: endElement(element, augs);
0882: } else if (!elem.isEmpty()) {
0883: notifyDiscardedEndElement(element, augs);
0884: }
0885: return;
0886: }
0887:
0888: // find unbalanced inline elements
0889: if (depth > 1 && elem.isInline()) {
0890: int size = fElementStack.top;
0891: fInlineStack.top = 0;
0892: for (int i = 0; i < depth - 1; i++) {
0893: Info info = fElementStack.data[size - i - 1];
0894: HTMLElements.Element pelem = info.element;
0895: if (pelem.isInline()) {
0896: // NOTE: I don't have to make a copy of the info because
0897: // it will just be popped off of the element stack
0898: // as soon as we close it, anyway.
0899: fInlineStack.push(info);
0900: }
0901: }
0902: }
0903:
0904: // close children up to appropriate element
0905: for (int i = 0; i < depth; i++) {
0906: Info info = fElementStack.pop();
0907: if (fReportErrors && i < depth - 1) {
0908: String ename = modifyName(element.rawname, fNamesElems);
0909: String iname = info.qname.rawname;
0910: fErrorReporter.reportWarning("HTML2007", new Object[] {
0911: ename, iname });
0912: }
0913: if (fDocumentHandler != null) {
0914: // PATCH: Marc-André Morissette
0915: callEndElement(info.qname,
0916: i < depth - 1 ? synthesizedAugs() : augs);
0917: }
0918: }
0919:
0920: // re-open inline elements
0921: if (depth > 1) {
0922: int size = fInlineStack.top;
0923: for (int i = 0; i < size; i++) {
0924: Info info = (Info) fInlineStack.pop();
0925: XMLAttributes attributes = info.attributes;
0926: if (fReportErrors) {
0927: String iname = info.qname.rawname;
0928: fErrorReporter.reportWarning("HTML2008",
0929: new Object[] { iname });
0930: }
0931: startElement(info.qname, attributes, synthesizedAugs());
0932: }
0933: }
0934:
0935: } // endElement(QName,Augmentations)
0936:
0937: // @since Xerces 2.1.0
0938:
0939: /** Sets the document source. */
0940: public void setDocumentSource(XMLDocumentSource source) {
0941: fDocumentSource = source;
0942: } // setDocumentSource(XMLDocumentSource)
0943:
0944: /** Returns the document source. */
0945: public XMLDocumentSource getDocumentSource() {
0946: return fDocumentSource;
0947: } // getDocumentSource():XMLDocumentSource
0948:
0949: // removed since Xerces-J 2.3.0
0950:
0951: /** Start document. */
0952: public void startDocument(XMLLocator locator, String encoding,
0953: Augmentations augs) throws XNIException {
0954: startDocument(locator, encoding, null, augs);
0955: } // startDocument(XMLLocator,String,Augmentations)
0956:
0957: /** Start prefix mapping. */
0958: public void startPrefixMapping(String prefix, String uri,
0959: Augmentations augs) throws XNIException {
0960:
0961: // check for end of document
0962: if (fSeenRootElementEnd) {
0963: return;
0964: }
0965:
0966: // call handler
0967: if (fDocumentHandler != null) {
0968: Class cls = fDocumentHandler.getClass();
0969: Class[] types = { String.class, String.class,
0970: Augmentations.class };
0971: try {
0972: Method method = cls.getMethod("startPrefixMapping",
0973: types);
0974: Object[] args = { prefix, uri, augs };
0975: method.invoke(fDocumentHandler, args);
0976: } catch (NoSuchMethodException e) {
0977: // ignore
0978: } catch (IllegalAccessException e) {
0979: // ignore
0980: } catch (InvocationTargetException e) {
0981: // ignore
0982: }
0983: }
0984:
0985: } // startPrefixMapping(String,String,Augmentations)
0986:
0987: /** End prefix mapping. */
0988: public void endPrefixMapping(String prefix, Augmentations augs)
0989: throws XNIException {
0990:
0991: // check for end of document
0992: if (fSeenRootElementEnd) {
0993: return;
0994: }
0995:
0996: // call handler
0997: if (fDocumentHandler != null) {
0998: Class cls = fDocumentHandler.getClass();
0999: Class[] types = { String.class, Augmentations.class };
1000: try {
1001: Method method = cls
1002: .getMethod("endPrefixMapping", types);
1003: Object[] args = { prefix, augs };
1004: method.invoke(fDocumentHandler, args);
1005: } catch (NoSuchMethodException e) {
1006: // ignore
1007: } catch (IllegalAccessException e) {
1008: // ignore
1009: } catch (InvocationTargetException e) {
1010: // ignore
1011: }
1012: }
1013:
1014: } // endPrefixMapping(String,Augmentations)
1015:
1016: //
1017: // Protected methods
1018: //
1019:
1020: /** Returns an HTML element. */
1021: protected HTMLElements.Element getElement(String name) {
1022: if (fNamespaces) {
1023: int index = name.indexOf(':');
1024: if (index != -1) {
1025: name = name.substring(index + 1);
1026: }
1027: }
1028: return HTMLElements.getElement(name);
1029: } // getElement(String):HTMLElements.Element
1030:
1031: /** Call document handler start element. */
1032: protected final void callStartElement(QName element,
1033: XMLAttributes attrs, Augmentations augs)
1034: throws XNIException {
1035: fDocumentHandler.startElement(element, attrs, augs);
1036: } // callStartElement(QName,XMLAttributes,Augmentations)
1037:
1038: /** Call document handler end element. */
1039: protected final void callEndElement(QName element,
1040: Augmentations augs) throws XNIException {
1041: fDocumentHandler.endElement(element, augs);
1042: } // callEndElement(QName,Augmentations)
1043:
1044: /**
1045: * Returns the depth of the open tag associated with the specified
1046: * element name or -1 if no matching element is found.
1047: *
1048: * @param element The element.
1049: */
1050: protected final int getElementDepth(HTMLElements.Element element) {
1051: final boolean container = element.isContainer();
1052: int depth = -1;
1053: for (int i = fElementStack.top - 1; i >= 0; i--) {
1054: Info info = fElementStack.data[i];
1055: if (info.element.code == element.code) {
1056: depth = fElementStack.top - i;
1057: break;
1058: }
1059: if (!container && info.element.isBlock()) {
1060: break;
1061: }
1062: }
1063: return depth;
1064: } // getElementDepth(HTMLElements.Element)
1065:
1066: /**
1067: * Returns the depth of the open tag associated with the specified
1068: * element parent names or -1 if no matching element is found.
1069: *
1070: * @param parents The parent elements.
1071: */
1072: protected int getParentDepth(HTMLElements.Element[] parents,
1073: short bounds) {
1074: if (parents != null) {
1075: for (int i = fElementStack.top - 1; i >= 0; i--) {
1076: Info info = fElementStack.data[i];
1077: if (info.element.code == bounds) {
1078: break;
1079: }
1080: for (int j = 0; j < parents.length; j++) {
1081: if (info.element.code == parents[j].code) {
1082: return fElementStack.top - i;
1083: }
1084: }
1085: }
1086: }
1087: return -1;
1088: } // getParentDepth(HTMLElements.Element[],short):int
1089:
1090: /** Returns a set of empty attributes. */
1091: protected final XMLAttributes emptyAttributes() {
1092: fEmptyAttrs.removeAllAttributes();
1093: return fEmptyAttrs;
1094: } // emptyAttributes():XMLAttributes
1095:
1096: /** Returns an augmentations object with a synthesized item added. */
1097: protected final Augmentations synthesizedAugs() {
1098: HTMLAugmentations augs = null;
1099: if (fAugmentations) {
1100: augs = fInfosetAugs;
1101: augs.removeAllItems();
1102: augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
1103: }
1104: return augs;
1105: } // synthesizedAugs():Augmentations
1106:
1107: //
1108: // Protected static methods
1109: //
1110:
1111: /** Modifies the given name based on the specified mode. */
1112: protected static final String modifyName(String name, short mode) {
1113: switch (mode) {
1114: case NAMES_UPPERCASE:
1115: return name.toUpperCase();
1116: case NAMES_LOWERCASE:
1117: return name.toLowerCase();
1118: }
1119: return name;
1120: } // modifyName(String,short):String
1121:
1122: /**
1123: * Converts HTML names string value to constant value.
1124: *
1125: * @see #NAMES_NO_CHANGE
1126: * @see #NAMES_LOWERCASE
1127: * @see #NAMES_UPPERCASE
1128: */
1129: protected static final short getNamesValue(String value) {
1130: if (value.equals("lower")) {
1131: return NAMES_LOWERCASE;
1132: }
1133: if (value.equals("upper")) {
1134: return NAMES_UPPERCASE;
1135: }
1136: return NAMES_NO_CHANGE;
1137: } // getNamesValue(String):short
1138:
1139: //
1140: // Classes
1141: //
1142:
1143: /**
1144: * Element info for each start element. This information is used when
1145: * closing unbalanced inline elements. For example:
1146: * <pre>
1147: * <i>unbalanced <b>HTML</i> content</b>
1148: * </pre>
1149: * <p>
1150: * It seems that it is a waste of processing and memory to copy the
1151: * attributes for every start element even if there are no unbalanced
1152: * inline elements in the document. However, if the attributes are
1153: * <em>not</em> saved, then important attributes such as style
1154: * information would be lost.
1155: *
1156: * @author Andy Clark
1157: */
1158: public static class Info {
1159:
1160: //
1161: // Data
1162: //
1163:
1164: /** The element. */
1165: public HTMLElements.Element element;
1166:
1167: /** The element qualified name. */
1168: public QName qname;
1169:
1170: /** The element attributes. */
1171: public XMLAttributes attributes;
1172:
1173: //
1174: // Constructors
1175: //
1176:
1177: /**
1178: * Creates an element information object.
1179: * <p>
1180: * <strong>Note:</strong>
1181: * This constructor makes a copy of the element information.
1182: *
1183: * @param element The element qualified name.
1184: */
1185: public Info(HTMLElements.Element element, QName qname) {
1186: this (element, qname, null);
1187: } // <init>(HTMLElements.Element,QName)
1188:
1189: /**
1190: * Creates an element information object.
1191: * <p>
1192: * <strong>Note:</strong>
1193: * This constructor makes a copy of the element information.
1194: *
1195: * @param element The element qualified name.
1196: * @param attributes The element attributes.
1197: */
1198: public Info(HTMLElements.Element element, QName qname,
1199: XMLAttributes attributes) {
1200: this .element = element;
1201: this .qname = new QName(qname);
1202: if (attributes != null) {
1203: int length = attributes.getLength();
1204: if (length > 0) {
1205: QName aqname = new QName();
1206: XMLAttributes newattrs = new XMLAttributesImpl();
1207: for (int i = 0; i < length; i++) {
1208: attributes.getName(i, aqname);
1209: String type = attributes.getType(i);
1210: String value = attributes.getValue(i);
1211: String nonNormalizedValue = attributes
1212: .getNonNormalizedValue(i);
1213: boolean specified = attributes.isSpecified(i);
1214: newattrs.addAttribute(aqname, type, value);
1215: newattrs.setNonNormalizedValue(i,
1216: nonNormalizedValue);
1217: newattrs.setSpecified(i, specified);
1218: }
1219: this .attributes = newattrs;
1220: }
1221: }
1222: } // <init>(HTMLElements.Element,QName,XMLAttributes)
1223:
1224: } // class Info
1225:
1226: /** Unsynchronized stack of element information. */
1227: public static class InfoStack {
1228:
1229: //
1230: // Data
1231: //
1232:
1233: /** The top of the stack. */
1234: public int top;
1235:
1236: /** The stack data. */
1237: public Info[] data = new Info[10];
1238:
1239: //
1240: // Public methods
1241: //
1242:
1243: /** Pushes element information onto the stack. */
1244: public void push(Info info) {
1245: if (top == data.length) {
1246: Info[] newarray = new Info[top + 10];
1247: System.arraycopy(data, 0, newarray, 0, top);
1248: data = newarray;
1249: }
1250: data[top++] = info;
1251: } // push(Info)
1252:
1253: /** Peeks at the top of the stack. */
1254: public Info peek() {
1255: return data[top - 1];
1256: } // peek():Info
1257:
1258: /** Pops the top item off of the stack. */
1259: public Info pop() {
1260: return data[--top];
1261: } // pop():Info
1262:
1263: } // class InfoStack
1264:
1265: void setTagBalancingListener(
1266: final HTMLTagBalancingListener tagBalancingListener) {
1267: this .tagBalancingListener = tagBalancingListener;
1268: }
1269:
1270: /**
1271: * Notifies the tagBalancingListener (if any) of an ignored start element
1272: */
1273: private void notifyDiscardedStartElement(final QName elem,
1274: final XMLAttributes attrs, final Augmentations augs) {
1275: if (tagBalancingListener != null)
1276: tagBalancingListener.ignoredStartElement(elem, attrs, augs);
1277: }
1278:
1279: /**
1280: * Notifies the tagBalancingListener (if any) of an ignored end element
1281: */
1282: private void notifyDiscardedEndElement(final QName element,
1283: final Augmentations augs) {
1284: if (tagBalancingListener != null)
1285: tagBalancingListener.ignoredEndElement(element, augs);
1286: }
1287:
1288: } // class HTMLTagBalancer
|