0001: /*
0002: * @(#)Tidy.java 1.11 2000/08/16
0003: *
0004: */
0005:
0006: /*
0007: HTML parser and pretty printer
0008:
0009: Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0010: Institute of Technology, Institut National de Recherche en
0011: Informatique et en Automatique, Keio University). All Rights
0012: Reserved.
0013:
0014: Contributing Author(s):
0015:
0016: Dave Raggett <dsr@w3.org>
0017: Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0018:
0019: The contributing author(s) would like to thank all those who
0020: helped with testing, bug fixes, and patience. This wouldn't
0021: have been possible without all of you.
0022:
0023: COPYRIGHT NOTICE:
0024:
0025: This software and documentation is provided "as is," and
0026: the copyright holders and contributing author(s) make no
0027: representations or warranties, express or implied, including
0028: but not limited to, warranties of merchantability or fitness
0029: for any particular purpose or that the use of the software or
0030: documentation will not infringe any third party patents,
0031: copyrights, trademarks or other rights.
0032:
0033: The copyright holders and contributing author(s) will not be
0034: liable for any direct, indirect, special or consequential damages
0035: arising out of any use of the software or documentation, even if
0036: advised of the possibility of such damage.
0037:
0038: Permission is hereby granted to use, copy, modify, and distribute
0039: this source code, or portions hereof, documentation and executables,
0040: for any purpose, without fee, subject to the following restrictions:
0041:
0042: 1. The origin of this source code must not be misrepresented.
0043: 2. Altered versions must be plainly marked as such and must
0044: not be misrepresented as being the original source.
0045: 3. This Copyright notice may not be removed or altered from any
0046: source or altered source distribution.
0047:
0048: The copyright holders and contributing author(s) specifically
0049: permit, without fee, and encourage the use of this source code
0050: as a component for supporting the Hypertext Markup Language in
0051: commercial products. If you use this source code in a product,
0052: acknowledgment is not required but would be appreciated.
0053: */
0054:
0055: package org.w3c.tidy;
0056:
0057: import java.io.PrintWriter;
0058: import java.io.FileWriter;
0059: import java.io.InputStream;
0060: import java.io.FileInputStream;
0061: import java.io.OutputStream;
0062: import java.io.FileOutputStream;
0063: import java.util.Properties;
0064:
0065: import java.io.IOException;
0066: import java.io.FileNotFoundException;
0067:
0068: // BEGIN RAVE MODIFICATIONS
0069: import org.w3c.dom.Attr;
0070: import org.w3c.dom.CharacterData;
0071: import org.w3c.dom.NamedNodeMap;
0072: import org.w3c.dom.NodeList;
0073:
0074: // END RAVE MODIFICATIONS
0075:
0076: /**
0077: *
0078: * <p>HTML parser and pretty printer</p>
0079: *
0080: * <p>
0081: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
0082: * See Tidy.java for the copyright notice.
0083: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
0084: * HTML Tidy Release 4 Aug 2000</a>
0085: * </p>
0086: *
0087: * <p>
0088: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0089: * Institute of Technology, Institut National de Recherche en
0090: * Informatique et en Automatique, Keio University). All Rights
0091: * Reserved.
0092: * </p>
0093: *
0094: * <p>
0095: * Contributing Author(s):<br>
0096: * <a href="mailto:dsr@w3.org">Dave Raggett</a><br>
0097: * <a href="mailto:ac.quick@sympatico.ca">Andy Quick</a> (translation to Java)
0098: * </p>
0099: *
0100: * <p>
0101: * The contributing author(s) would like to thank all those who
0102: * helped with testing, bug fixes, and patience. This wouldn't
0103: * have been possible without all of you.
0104: * </p>
0105: *
0106: * <p>
0107: * COPYRIGHT NOTICE:<br>
0108: *
0109: * This software and documentation is provided "as is," and
0110: * the copyright holders and contributing author(s) make no
0111: * representations or warranties, express or implied, including
0112: * but not limited to, warranties of merchantability or fitness
0113: * for any particular purpose or that the use of the software or
0114: * documentation will not infringe any third party patents,
0115: * copyrights, trademarks or other rights.
0116: * </p>
0117: *
0118: * <p>
0119: * The copyright holders and contributing author(s) will not be
0120: * liable for any direct, indirect, special or consequential damages
0121: * arising out of any use of the software or documentation, even if
0122: * advised of the possibility of such damage.
0123: * </p>
0124: *
0125: * <p>
0126: * Permission is hereby granted to use, copy, modify, and distribute
0127: * this source code, or portions hereof, documentation and executables,
0128: * for any purpose, without fee, subject to the following restrictions:
0129: * </p>
0130: *
0131: * <p>
0132: * <ol>
0133: * <li>The origin of this source code must not be misrepresented.</li>
0134: * <li>Altered versions must be plainly marked as such and must
0135: * not be misrepresented as being the original source.</li>
0136: * <li>This Copyright notice may not be removed or altered from any
0137: * source or altered source distribution.</li>
0138: * </ol>
0139: * </p>
0140: *
0141: * <p>
0142: * The copyright holders and contributing author(s) specifically
0143: * permit, without fee, and encourage the use of this source code
0144: * as a component for supporting the Hypertext Markup Language in
0145: * commercial products. If you use this source code in a product,
0146: * acknowledgment is not required but would be appreciated.
0147: * </p>
0148: *
0149: * @author Dave Raggett <dsr@w3.org>
0150: * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0151: * @version 1.0, 1999/05/22
0152: * @version 1.0.1, 1999/05/29
0153: * @version 1.1, 1999/06/18 Java Bean
0154: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
0155: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
0156: * @version 1.4, 1999/09/04 DOM support
0157: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
0158: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
0159: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
0160: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
0161: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
0162: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
0163: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
0164: *
0165: */
0166:
0167: public class Tidy implements java.io.Serializable {
0168:
0169: static final long serialVersionUID = -2794371560623987718L;
0170:
0171: private boolean initialized = false;
0172: private PrintWriter errout = null; /* error output stream */
0173: private PrintWriter stderr = null;
0174: private Configuration configuration = null;
0175: private String inputStreamName = "InputStream";
0176: private int parseErrors = 0;
0177: private int parseWarnings = 0;
0178:
/**
 * Creates a Tidy instance and runs the first-time initialization
 * performed by the private init() method.
 */
public Tidy() {
    this.init();
}
0182:
0183: public Configuration getConfiguration() {
0184: return configuration;
0185: }
0186:
0187: public PrintWriter getStderr() {
0188: return stderr;
0189: }
0190:
0191: /**
0192: * ParseErrors - the number of errors that occurred in the most
0193: * recent parse operation
0194: */
0195:
0196: public int getParseErrors() {
0197: return parseErrors;
0198: }
0199:
0200: /**
0201: * ParseWarnings - the number of warnings that occurred in the most
0202: * recent parse operation
0203: */
0204:
0205: public int getParseWarnings() {
0206: return parseWarnings;
0207: }
0208:
0209: /**
0210: * Errout - the error output stream
0211: */
0212:
0213: public PrintWriter getErrout() {
0214: return errout;
0215: }
0216:
0217: public void setErrout(PrintWriter errout) {
0218: this .errout = errout;
0219: }
0220:
0221: /**
0222: * Spaces - default indentation
0223: * @see org.w3c.tidy.Configuration#spaces
0224: */
0225:
0226: public void setSpaces(int spaces) {
0227: configuration.spaces = spaces;
0228: }
0229:
0230: public int getSpaces() {
0231: return configuration.spaces;
0232: }
0233:
0234: /**
0235: * Wraplen - default wrap margin
0236: * @see org.w3c.tidy.Configuration#wraplen
0237: */
0238:
0239: public void setWraplen(int wraplen) {
0240: configuration.wraplen = wraplen;
0241: }
0242:
0243: public int getWraplen() {
0244: return configuration.wraplen;
0245: }
0246:
0247: /**
0248: * CharEncoding
0249: * @see org.w3c.tidy.Configuration#CharEncoding
0250: */
0251:
0252: public void setCharEncoding(int charencoding) {
0253: configuration.CharEncoding = charencoding;
0254: }
0255:
0256: public int getCharEncoding() {
0257: return configuration.CharEncoding;
0258: }
0259:
0260: /**
0261: * Tabsize
0262: * @see org.w3c.tidy.Configuration#tabsize
0263: */
0264:
0265: public void setTabsize(int tabsize) {
0266: configuration.tabsize = tabsize;
0267: }
0268:
0269: public int getTabsize() {
0270: return configuration.tabsize;
0271: }
0272:
0273: /**
0274: * Errfile - file name to write errors to
0275: * @see org.w3c.tidy.Configuration#errfile
0276: */
0277:
0278: public void setErrfile(String errfile) {
0279: configuration.errfile = errfile;
0280: }
0281:
0282: public String getErrfile() {
0283: return configuration.errfile;
0284: }
0285:
0286: /**
0287: * Writeback - if true then output tidied markup
0288: * NOTE: this property is ignored when parsing from an InputStream.
0289: * @see org.w3c.tidy.Configuration#writeback
0290: */
0291:
0292: public void setWriteback(boolean writeback) {
0293: configuration.writeback = writeback;
0294: }
0295:
0296: public boolean getWriteback() {
0297: return configuration.writeback;
0298: }
0299:
0300: /**
0301: * OnlyErrors - if true normal output is suppressed
0302: * @see org.w3c.tidy.Configuration#OnlyErrors
0303: */
0304:
0305: public void setOnlyErrors(boolean OnlyErrors) {
0306: configuration.OnlyErrors = OnlyErrors;
0307: }
0308:
0309: public boolean getOnlyErrors() {
0310: return configuration.OnlyErrors;
0311: }
0312:
0313: /**
0314: * ShowWarnings - however errors are always shown
0315: * @see org.w3c.tidy.Configuration#ShowWarnings
0316: */
0317:
0318: public void setShowWarnings(boolean ShowWarnings) {
0319: configuration.ShowWarnings = ShowWarnings;
0320: }
0321:
0322: public boolean getShowWarnings() {
0323: return configuration.ShowWarnings;
0324: }
0325:
0326: /**
0327: * Quiet - no 'Parsing X', guessed DTD or summary
0328: * @see org.w3c.tidy.Configuration#Quiet
0329: */
0330:
0331: public void setQuiet(boolean Quiet) {
0332: configuration.Quiet = Quiet;
0333: }
0334:
0335: public boolean getQuiet() {
0336: return configuration.Quiet;
0337: }
0338:
0339: /**
0340: * IndentContent - indent content of appropriate tags
0341: * @see org.w3c.tidy.Configuration#IndentContent
0342: */
0343:
0344: public void setIndentContent(boolean IndentContent) {
0345: configuration.IndentContent = IndentContent;
0346: }
0347:
0348: public boolean getIndentContent() {
0349: return configuration.IndentContent;
0350: }
0351:
0352: /**
0353: * SmartIndent - does text/block level content effect indentation
0354: * @see org.w3c.tidy.Configuration#SmartIndent
0355: */
0356:
0357: public void setSmartIndent(boolean SmartIndent) {
0358: configuration.SmartIndent = SmartIndent;
0359: }
0360:
0361: public boolean getSmartIndent() {
0362: return configuration.SmartIndent;
0363: }
0364:
0365: /**
0366: * HideEndTags - suppress optional end tags
0367: * @see org.w3c.tidy.Configuration#HideEndTags
0368: */
0369:
0370: public void setHideEndTags(boolean HideEndTags) {
0371: configuration.HideEndTags = HideEndTags;
0372: }
0373:
0374: public boolean getHideEndTags() {
0375: return configuration.HideEndTags;
0376: }
0377:
0378: /**
0379: * XmlTags - treat input as XML
0380: * @see org.w3c.tidy.Configuration#XmlTags
0381: */
0382:
0383: public void setXmlTags(boolean XmlTags) {
0384: configuration.XmlTags = XmlTags;
0385: }
0386:
0387: public boolean getXmlTags() {
0388: return configuration.XmlTags;
0389: }
0390:
0391: /**
0392: * XmlOut - create output as XML
0393: * @see org.w3c.tidy.Configuration#XmlOut
0394: */
0395:
0396: public void setXmlOut(boolean XmlOut) {
0397: configuration.XmlOut = XmlOut;
0398: }
0399:
0400: public boolean getXmlOut() {
0401: return configuration.XmlOut;
0402: }
0403:
0404: /**
0405: * XHTML - output extensible HTML
0406: * @see org.w3c.tidy.Configuration#xHTML
0407: */
0408:
0409: public void setXHTML(boolean xHTML) {
0410: configuration.xHTML = xHTML;
0411: }
0412:
0413: public boolean getXHTML() {
0414: return configuration.xHTML;
0415: }
0416:
0417: /**
0418: * RawOut - avoid mapping values > 127 to entities
0419: * @see org.w3c.tidy.Configuration#RawOut
0420: */
0421:
0422: public void setRawOut(boolean RawOut) {
0423: configuration.RawOut = RawOut;
0424: }
0425:
0426: public boolean getRawOut() {
0427: return configuration.RawOut;
0428: }
0429:
0430: /**
0431: * UpperCaseTags - output tags in upper not lower case
0432: * @see org.w3c.tidy.Configuration#UpperCaseTags
0433: */
0434:
0435: public void setUpperCaseTags(boolean UpperCaseTags) {
0436: configuration.UpperCaseTags = UpperCaseTags;
0437: }
0438:
0439: public boolean getUpperCaseTags() {
0440: return configuration.UpperCaseTags;
0441: }
0442:
0443: /**
0444: * UpperCaseAttrs - output attributes in upper not lower case
0445: * @see org.w3c.tidy.Configuration#UpperCaseAttrs
0446: */
0447:
0448: public void setUpperCaseAttrs(boolean UpperCaseAttrs) {
0449: configuration.UpperCaseAttrs = UpperCaseAttrs;
0450: }
0451:
0452: public boolean getUpperCaseAttrs() {
0453: return configuration.UpperCaseAttrs;
0454: }
0455:
0456: /**
0457: * MakeClean - remove presentational clutter
0458: * @see org.w3c.tidy.Configuration#MakeClean
0459: */
0460:
0461: public void setMakeClean(boolean MakeClean) {
0462: configuration.MakeClean = MakeClean;
0463: }
0464:
0465: public boolean getMakeClean() {
0466: return configuration.MakeClean;
0467: }
0468:
0469: /**
0470: * BreakBeforeBR - o/p newline before <br> or not?
0471: * @see org.w3c.tidy.Configuration#BreakBeforeBR
0472: */
0473:
0474: public void setBreakBeforeBR(boolean BreakBeforeBR) {
0475: configuration.BreakBeforeBR = BreakBeforeBR;
0476: }
0477:
0478: public boolean getBreakBeforeBR() {
0479: return configuration.BreakBeforeBR;
0480: }
0481:
0482: /**
0483: * BurstSlides - create slides on each h2 element
0484: * @see org.w3c.tidy.Configuration#BurstSlides
0485: */
0486:
0487: public void setBurstSlides(boolean BurstSlides) {
0488: configuration.BurstSlides = BurstSlides;
0489: }
0490:
0491: public boolean getBurstSlides() {
0492: return configuration.BurstSlides;
0493: }
0494:
0495: /**
0496: * NumEntities - use numeric entities
0497: * @see org.w3c.tidy.Configuration#NumEntities
0498: */
0499:
0500: public void setNumEntities(boolean NumEntities) {
0501: configuration.NumEntities = NumEntities;
0502: }
0503:
0504: public boolean getNumEntities() {
0505: return configuration.NumEntities;
0506: }
0507:
0508: /**
0509: * QuoteMarks - output " marks as &quot;
0510: * @see org.w3c.tidy.Configuration#QuoteMarks
0511: */
0512:
0513: public void setQuoteMarks(boolean QuoteMarks) {
0514: configuration.QuoteMarks = QuoteMarks;
0515: }
0516:
0517: public boolean getQuoteMarks() {
0518: return configuration.QuoteMarks;
0519: }
0520:
0521: /**
0522: * QuoteNbsp - output non-breaking space as entity
0523: * @see org.w3c.tidy.Configuration#QuoteNbsp
0524: */
0525:
0526: public void setQuoteNbsp(boolean QuoteNbsp) {
0527: configuration.QuoteNbsp = QuoteNbsp;
0528: }
0529:
0530: public boolean getQuoteNbsp() {
0531: return configuration.QuoteNbsp;
0532: }
0533:
0534: /**
0535: * QuoteAmpersand - output naked ampersand as &
0536: * @see org.w3c.tidy.Configuration#QuoteAmpersand
0537: */
0538:
0539: public void setQuoteAmpersand(boolean QuoteAmpersand) {
0540: configuration.QuoteAmpersand = QuoteAmpersand;
0541: }
0542:
0543: public boolean getQuoteAmpersand() {
0544: return configuration.QuoteAmpersand;
0545: }
0546:
0547: /**
0548: * WrapAttVals - wrap within attribute values
0549: * @see org.w3c.tidy.Configuration#WrapAttVals
0550: */
0551:
0552: public void setWrapAttVals(boolean WrapAttVals) {
0553: configuration.WrapAttVals = WrapAttVals;
0554: }
0555:
0556: public boolean getWrapAttVals() {
0557: return configuration.WrapAttVals;
0558: }
0559:
0560: /**
0561: * WrapScriptlets - wrap within JavaScript string literals
0562: * @see org.w3c.tidy.Configuration#WrapScriptlets
0563: */
0564:
0565: public void setWrapScriptlets(boolean WrapScriptlets) {
0566: configuration.WrapScriptlets = WrapScriptlets;
0567: }
0568:
0569: public boolean getWrapScriptlets() {
0570: return configuration.WrapScriptlets;
0571: }
0572:
0573: /**
0574: * WrapSection - wrap within <![ ... ]> section tags
0575: * @see org.w3c.tidy.Configuration#WrapSection
0576: */
0577:
0578: public void setWrapSection(boolean WrapSection) {
0579: configuration.WrapSection = WrapSection;
0580: }
0581:
0582: public boolean getWrapSection() {
0583: return configuration.WrapSection;
0584: }
0585:
0586: /**
0587: * AltText - default text for alt attribute
0588: * @see org.w3c.tidy.Configuration#altText
0589: */
0590:
0591: public void setAltText(String altText) {
0592: configuration.altText = altText;
0593: }
0594:
0595: public String getAltText() {
0596: return configuration.altText;
0597: }
0598:
0599: /**
0600: * Slidestyle - style sheet for slides
0601: * @see org.w3c.tidy.Configuration#slidestyle
0602: */
0603:
0604: public void setSlidestyle(String slidestyle) {
0605: configuration.slidestyle = slidestyle;
0606: }
0607:
0608: public String getSlidestyle() {
0609: return configuration.slidestyle;
0610: }
0611:
0612: /**
0613: * XmlPi - add <?xml?> for XML docs
0614: * @see org.w3c.tidy.Configuration#XmlPi
0615: */
0616:
0617: public void setXmlPi(boolean XmlPi) {
0618: configuration.XmlPi = XmlPi;
0619: }
0620:
0621: public boolean getXmlPi() {
0622: return configuration.XmlPi;
0623: }
0624:
0625: /**
0626: * DropFontTags - discard presentation tags
0627: * @see org.w3c.tidy.Configuration#DropFontTags
0628: */
0629:
0630: public void setDropFontTags(boolean DropFontTags) {
0631: configuration.DropFontTags = DropFontTags;
0632: }
0633:
0634: public boolean getDropFontTags() {
0635: return configuration.DropFontTags;
0636: }
0637:
0638: /**
0639: * DropEmptyParas - discard empty p elements
0640: * @see org.w3c.tidy.Configuration#DropEmptyParas
0641: */
0642:
0643: public void setDropEmptyParas(boolean DropEmptyParas) {
0644: configuration.DropEmptyParas = DropEmptyParas;
0645: }
0646:
0647: public boolean getDropEmptyParas() {
0648: return configuration.DropEmptyParas;
0649: }
0650:
0651: /**
0652: * FixComments - fix comments with adjacent hyphens
0653: * @see org.w3c.tidy.Configuration#FixComments
0654: */
0655:
0656: public void setFixComments(boolean FixComments) {
0657: configuration.FixComments = FixComments;
0658: }
0659:
0660: public boolean getFixComments() {
0661: return configuration.FixComments;
0662: }
0663:
0664: /**
0665: * WrapAsp - wrap within ASP pseudo elements
0666: * @see org.w3c.tidy.Configuration#WrapAsp
0667: */
0668:
0669: public void setWrapAsp(boolean WrapAsp) {
0670: configuration.WrapAsp = WrapAsp;
0671: }
0672:
0673: public boolean getWrapAsp() {
0674: return configuration.WrapAsp;
0675: }
0676:
0677: /**
0678: * WrapJste - wrap within JSTE pseudo elements
0679: * @see org.w3c.tidy.Configuration#WrapJste
0680: */
0681:
0682: public void setWrapJste(boolean WrapJste) {
0683: configuration.WrapJste = WrapJste;
0684: }
0685:
0686: public boolean getWrapJste() {
0687: return configuration.WrapJste;
0688: }
0689:
0690: /**
0691: * WrapPhp - wrap within PHP pseudo elements
0692: * @see org.w3c.tidy.Configuration#WrapPhp
0693: */
0694:
0695: public void setWrapPhp(boolean WrapPhp) {
0696: configuration.WrapPhp = WrapPhp;
0697: }
0698:
0699: public boolean getWrapPhp() {
0700: return configuration.WrapPhp;
0701: }
0702:
0703: /**
0704: * FixBackslash - fix URLs by replacing \ with /
0705: * @see org.w3c.tidy.Configuration#FixBackslash
0706: */
0707:
0708: public void setFixBackslash(boolean FixBackslash) {
0709: configuration.FixBackslash = FixBackslash;
0710: }
0711:
0712: public boolean getFixBackslash() {
0713: return configuration.FixBackslash;
0714: }
0715:
0716: /**
0717: * IndentAttributes - newline+indent before each attribute
0718: * @see org.w3c.tidy.Configuration#IndentAttributes
0719: */
0720:
0721: public void setIndentAttributes(boolean IndentAttributes) {
0722: configuration.IndentAttributes = IndentAttributes;
0723: }
0724:
0725: public boolean getIndentAttributes() {
0726: return configuration.IndentAttributes;
0727: }
0728:
0729: /**
0730: * DocType - user specified doctype
0731: * omit | auto | strict | loose | <i>fpi</i>
0732: * where the <i>fpi</i> is a string similar to
0733: * "-//ACME//DTD HTML 3.14159//EN"
0734: * Note: for <i>fpi</i> include the double-quotes in the string.
0735: * @see org.w3c.tidy.Configuration#docTypeStr
0736: * @see org.w3c.tidy.Configuration#docTypeMode
0737: */
0738:
0739: public void setDocType(String doctype) {
0740: if (doctype != null)
0741: configuration.docTypeStr = configuration.parseDocType(
0742: doctype, "doctype");
0743: }
0744:
0745: public String getDocType() {
0746: String result = null;
0747: switch (configuration.docTypeMode) {
0748: case Configuration.DOCTYPE_OMIT:
0749: result = "omit";
0750: break;
0751: case Configuration.DOCTYPE_AUTO:
0752: result = "auto";
0753: break;
0754: case Configuration.DOCTYPE_STRICT:
0755: result = "strict";
0756: break;
0757: case Configuration.DOCTYPE_LOOSE:
0758: result = "loose";
0759: break;
0760: case Configuration.DOCTYPE_USER:
0761: result = configuration.docTypeStr;
0762: break;
0763: }
0764: return result;
0765: }
0766:
0767: /**
0768: * LogicalEmphasis - replace i by em and b by strong
0769: * @see org.w3c.tidy.Configuration#LogicalEmphasis
0770: */
0771:
0772: public void setLogicalEmphasis(boolean LogicalEmphasis) {
0773: configuration.LogicalEmphasis = LogicalEmphasis;
0774: }
0775:
0776: public boolean getLogicalEmphasis() {
0777: return configuration.LogicalEmphasis;
0778: }
0779:
0780: /**
0781: * XmlPIs - if set to true PIs must end with ?>
0782: * @see org.w3c.tidy.Configuration#XmlPIs
0783: */
0784:
0785: public void setXmlPIs(boolean XmlPIs) {
0786: configuration.XmlPIs = XmlPIs;
0787: }
0788:
0789: public boolean getXmlPIs() {
0790: return configuration.XmlPIs;
0791: }
0792:
0793: /**
0794: * EncloseText - if true text at body is wrapped in <p>'s
0795: * @see org.w3c.tidy.Configuration#EncloseBodyText
0796: */
0797:
0798: public void setEncloseText(boolean EncloseText) {
0799: configuration.EncloseBodyText = EncloseText;
0800: }
0801:
0802: public boolean getEncloseText() {
0803: return configuration.EncloseBodyText;
0804: }
0805:
0806: /**
0807: * EncloseBlockText - if true text in blocks is wrapped in <p>'s
0808: * @see org.w3c.tidy.Configuration#EncloseBlockText
0809: */
0810:
0811: public void setEncloseBlockText(boolean EncloseBlockText) {
0812: configuration.EncloseBlockText = EncloseBlockText;
0813: }
0814:
0815: public boolean getEncloseBlockText() {
0816: return configuration.EncloseBlockText;
0817: }
0818:
0819: /**
0820: * KeepFileTimes - if true last modified time is preserved<br>
0821: * <b>this is NOT supported at this time.</b>
0822: * @see org.w3c.tidy.Configuration#KeepFileTimes
0823: */
0824:
0825: public void setKeepFileTimes(boolean KeepFileTimes) {
0826: configuration.KeepFileTimes = KeepFileTimes;
0827: }
0828:
0829: public boolean getKeepFileTimes() {
0830: return configuration.KeepFileTimes;
0831: }
0832:
0833: /**
0834: * Word2000 - draconian cleaning for Word2000
0835: * @see org.w3c.tidy.Configuration#Word2000
0836: */
0837:
0838: public void setWord2000(boolean Word2000) {
0839: configuration.Word2000 = Word2000;
0840: }
0841:
0842: public boolean getWord2000() {
0843: return configuration.Word2000;
0844: }
0845:
0846: /**
0847: * TidyMark - add meta element indicating tidied doc
0848: * @see org.w3c.tidy.Configuration#TidyMark
0849: */
0850:
0851: public void setTidyMark(boolean TidyMark) {
0852: configuration.TidyMark = TidyMark;
0853: }
0854:
0855: public boolean getTidyMark() {
0856: return configuration.TidyMark;
0857: }
0858:
0859: /**
0860: * XmlSpace - if set to yes adds xml:space attr as needed
0861: * @see org.w3c.tidy.Configuration#XmlSpace
0862: */
0863:
0864: public void setXmlSpace(boolean XmlSpace) {
0865: configuration.XmlSpace = XmlSpace;
0866: }
0867:
0868: public boolean getXmlSpace() {
0869: return configuration.XmlSpace;
0870: }
0871:
0872: /**
0873: * Emacs - if true format error output for GNU Emacs
0874: * @see org.w3c.tidy.Configuration#Emacs
0875: */
0876:
0877: public void setEmacs(boolean Emacs) {
0878: configuration.Emacs = Emacs;
0879: }
0880:
0881: public boolean getEmacs() {
0882: return configuration.Emacs;
0883: }
0884:
0885: /**
0886: * LiteralAttribs - if true attributes may use newlines
0887: * @see org.w3c.tidy.Configuration#LiteralAttribs
0888: */
0889:
0890: public void setLiteralAttribs(boolean LiteralAttribs) {
0891: configuration.LiteralAttribs = LiteralAttribs;
0892: }
0893:
0894: public boolean getLiteralAttribs() {
0895: return configuration.LiteralAttribs;
0896: }
0897:
0898: /**
0899: * InputStreamName - the name of the input stream (printed in the
0900: * header information).
0901: */
0902: public void setInputStreamName(String name) {
0903: if (name != null)
0904: inputStreamName = name;
0905: }
0906:
0907: public String getInputStreamName() {
0908: return inputStreamName;
0909: }
0910:
0911: /**
0912: * Sets the configuration from a configuration file.
0913: */
0914:
0915: public void setConfigurationFromFile(String filename) {
0916: configuration.parseFile(filename);
0917: }
0918:
0919: /**
0920: * Sets the configuration from a properties object.
0921: */
0922:
0923: public void setConfigurationFromProps(Properties props) {
0924: configuration.addProps(props);
0925: }
0926:
0927: /**
0928: * first time initialization which should
0929: * precede reading the command line
0930: */
0931:
0932: private void init() {
0933: configuration = new Configuration();
0934: if (configuration == null)
0935: return;
0936:
0937: AttributeTable at = AttributeTable.getDefaultAttributeTable();
0938: if (at == null)
0939: return;
0940: TagTable tt = new TagTable();
0941: if (tt == null)
0942: return;
0943: tt.setConfiguration(configuration);
0944: configuration.tt = tt;
0945: EntityTable et = EntityTable.getDefaultEntityTable();
0946: if (et == null)
0947: return;
0948:
0949: /* Unnecessary - same initial values in Configuration
0950: Configuration.XmlTags = false;
0951: Configuration.XmlOut = false;
0952: Configuration.HideEndTags = false;
0953: Configuration.UpperCaseTags = false;
0954: Configuration.MakeClean = false;
0955: Configuration.writeback = false;
0956: Configuration.OnlyErrors = false;
0957: */
0958:
0959: configuration.errfile = null;
0960: stderr = new PrintWriter(System.err, true);
0961: errout = stderr;
0962: initialized = true;
0963: }
0964:
0965: /**
0966: * Parses InputStream in and returns the root Node.
0967: * If out is non-null, pretty prints to OutputStream out.
0968: */
0969:
0970: public Node parse(InputStream in, OutputStream out) {
0971: Node document = null;
0972:
0973: try {
0974: document = parse(in, null, out);
0975: } catch (FileNotFoundException fnfe) {
0976: } catch (IOException e) {
0977: }
0978:
0979: return document;
0980: }
0981:
0982: /**
0983: * Internal routine that actually does the parsing. The caller
0984: * can pass either an InputStream or file name. If both are passed,
0985: * the file name is preferred.
0986: */
0987:
0988: private Node parse(InputStream in, String file, OutputStream out)
0989: throws FileNotFoundException, IOException {
0990: Lexer lexer;
0991: Node document = null;
0992: Node doctype;
0993: Out o = new OutImpl(); /* normal output stream */
0994: PPrint pprint;
0995:
0996: if (!initialized)
0997: return null;
0998:
0999: if (errout == null)
1000: return null;
1001:
1002: parseErrors = 0;
1003: parseWarnings = 0;
1004:
1005: /* ensure config is self-consistent */
1006: configuration.adjust();
1007:
1008: if (file != null) {
1009: in = new FileInputStream(file);
1010: inputStreamName = file;
1011: } else if (in == null) {
1012: in = System.in;
1013: inputStreamName = "stdin";
1014: }
1015:
1016: if (in != null) {
1017: lexer = new Lexer(new StreamInImpl(in,
1018: configuration.CharEncoding, configuration.tabsize),
1019: configuration);
1020: lexer.errout = errout;
1021:
1022: /*
1023: store pointer to lexer in input stream
1024: to allow character encoding errors to be
1025: reported
1026: */
1027: lexer.in.lexer = lexer;
1028:
1029: /* Tidy doesn't alter the doctype for generic XML docs */
1030: if (configuration.XmlTags)
1031: document = ParserImpl.parseXMLDocument(lexer);
1032: else {
1033: lexer.warnings = 0;
1034: if (!configuration.Quiet)
1035: Report.helloMessage(errout, Report.RELEASE_DATE,
1036: inputStreamName);
1037:
1038: document = ParserImpl.parseDocument(lexer);
1039:
1040: if (!document.checkNodeIntegrity()) {
1041: Report.badTree(errout);
1042: return null;
1043: }
1044:
1045: Clean cleaner = new Clean(configuration.tt);
1046:
1047: /* simplifies <b><b> ... </b> ...</b> etc. */
1048: cleaner.nestedEmphasis(document);
1049:
1050: /* cleans up <dir>indented text</dir> etc. */
1051: cleaner.list2BQ(document);
1052: cleaner.bQ2Div(document);
1053:
1054: /* replaces i by em and b by strong */
1055: if (configuration.LogicalEmphasis)
1056: cleaner.emFromI(document);
1057:
1058: if (configuration.Word2000
1059: && cleaner.isWord2000(document,
1060: configuration.tt)) {
1061: /* prune Word2000's <![if ...]> ... <![endif]> */
1062: cleaner.dropSections(lexer, document);
1063:
1064: /* drop style & class attributes and empty p, span elements */
1065: cleaner.cleanWord2000(lexer, document);
1066: }
1067:
1068: /* replaces presentational markup by style rules */
1069: if (configuration.MakeClean
1070: || configuration.DropFontTags)
1071: cleaner.cleanTree(lexer, document);
1072:
1073: if (!document.checkNodeIntegrity()) {
1074: Report.badTree(errout);
1075: return null;
1076: }
1077: doctype = document.findDocType();
1078: if (document.content != null) {
1079: if (configuration.xHTML)
1080: lexer.setXHTMLDocType(document);
1081: else
1082: lexer.fixDocType(document);
1083:
1084: if (configuration.TidyMark)
1085: lexer.addGenerator(document);
1086: }
1087:
1088: /* ensure presence of initial <?XML version="1.0"?> */
1089: if (configuration.XmlOut && configuration.XmlPi)
1090: lexer.fixXMLPI(document);
1091:
1092: if (!configuration.Quiet && document.content != null) {
1093: Report.reportVersion(errout, lexer,
1094: inputStreamName, doctype);
1095: Report.reportNumWarnings(errout, lexer);
1096: }
1097: }
1098:
1099: parseWarnings = lexer.warnings;
1100: parseErrors = lexer.errors;
1101:
1102: // Try to close the InputStream but only if if we created it.
1103:
1104: if ((file != null) && (in != System.in)) {
1105: try {
1106: in.close();
1107: } catch (IOException e) {
1108: }
1109: }
1110:
1111: if (lexer.errors > 0)
1112: Report.needsAuthorIntervention(errout);
1113:
1114: o.state = StreamIn.FSM_ASCII;
1115: o.encoding = configuration.CharEncoding;
1116:
1117: if (!configuration.OnlyErrors && lexer.errors == 0) {
1118: if (configuration.BurstSlides) {
1119: Node body;
1120:
1121: body = null;
1122: /*
1123: remove doctype to avoid potential clash with
1124: markup introduced when bursting into slides
1125: */
1126: /* discard the document type */
1127: doctype = document.findDocType();
1128:
1129: if (doctype != null)
1130: Node.discardElement(doctype);
1131:
1132: /* slides use transitional features */
1133: lexer.versions |= Dict.VERS_HTML40_LOOSE;
1134:
1135: /* and patch up doctype to match */
1136: if (configuration.xHTML)
1137: lexer.setXHTMLDocType(document);
1138: else
1139: lexer.fixDocType(document);
1140:
1141: /* find the body element which may be implicit */
1142: body = document.findBody(configuration.tt);
1143:
1144: if (body != null) {
1145: pprint = new PPrint(configuration);
1146: Report.reportNumberOfSlides(errout, pprint
1147: .countSlides(body));
1148: pprint.createSlides(lexer, document);
1149: } else
1150: Report.missingBody(errout);
1151: } else if (configuration.writeback && (file != null)) {
1152: try {
1153: pprint = new PPrint(configuration);
1154: o.out = new FileOutputStream(file);
1155:
1156: if (configuration.XmlTags)
1157: pprint.printXMLTree(o, (short) 0, 0, lexer,
1158: document);
1159: else
1160: pprint.printTree(o, (short) 0, 0, lexer,
1161: document);
1162:
1163: pprint.flushLine(o, 0);
1164: o.out.close();
1165: } catch (IOException e) {
1166: errout.println(file + e.toString());
1167: }
1168: } else if (out != null) {
1169: pprint = new PPrint(configuration);
1170: o.out = out;
1171:
1172: if (configuration.XmlTags)
1173: pprint.printXMLTree(o, (short) 0, 0, lexer,
1174: document);
1175: else
1176: pprint.printTree(o, (short) 0, 0, lexer,
1177: document);
1178:
1179: pprint.flushLine(o, 0);
1180: }
1181:
1182: }
1183:
1184: Report.errorSummary(lexer);
1185: }
1186: return document;
1187: }
1188:
1189: /**
1190: * Parses InputStream in and returns a DOM Document node.
1191: * If out is non-null, pretty prints to OutputStream out.
1192: */
1193:
1194: public org.w3c.dom.Document parseDOM(InputStream in,
1195: OutputStream out) {
1196: Node document = parse(in, out);
1197: if (document != null)
1198: return (org.w3c.dom.Document) document.getAdapter();
1199: else
1200: return null;
1201: }
1202:
1203: /**
1204: * Creates an empty DOM Document.
1205: */
1206:
1207: public static org.w3c.dom.Document createEmptyDocument() {
1208: Node document = new Node(Node.RootNode, new byte[0], 0, 0);
1209: Node node = new Node(Node.StartTag, new byte[0], 0, 0, "html",
1210: new TagTable());
1211: if (document != null && node != null) {
1212: Node.insertNodeAtStart(document, node);
1213: return (org.w3c.dom.Document) document.getAdapter();
1214: } else {
1215: return null;
1216: }
1217: }
1218:
1219: /**
1220: * Pretty-prints a DOM Document.
1221: */
1222:
1223: public void pprint(org.w3c.dom.Document doc, OutputStream out) {
1224: Out o = new OutImpl();
1225: PPrint pprint;
1226: Node document;
1227:
1228: if (!(doc instanceof DOMDocumentImpl)) {
1229: return;
1230: }
1231: document = ((DOMDocumentImpl) doc).adaptee;
1232:
1233: o.state = StreamIn.FSM_ASCII;
1234: o.encoding = configuration.CharEncoding;
1235:
1236: if (out != null) {
1237: pprint = new PPrint(configuration);
1238: o.out = out;
1239:
1240: if (configuration.XmlTags)
1241: pprint.printXMLTree(o, (short) 0, 0, null, document);
1242: else
1243: pprint.printTree(o, (short) 0, 0, null, document);
1244:
1245: pprint.flushLine(o, 0);
1246: }
1247: }
1248:
1249: /**
1250: * Command line interface to parser and pretty printer.
1251: */
1252:
1253: public static void main(String[] argv) {
1254: int totalerrors = 0;
1255: int totalwarnings = 0;
1256: String file;
1257: InputStream in;
1258: String prog = "Tidy";
1259: Node document;
1260: Node doctype;
1261: Lexer lexer;
1262: String s;
1263: Out out = new OutImpl(); /* normal output stream */
1264: PPrint pprint;
1265: int argc = argv.length + 1;
1266: int argIndex = 0;
1267: Tidy tidy;
1268: Configuration configuration;
1269: String arg;
1270: String current_errorfile = "stderr";
1271:
1272: tidy = new Tidy();
1273: configuration = tidy.getConfiguration();
1274:
1275: /* read command line */
1276:
1277: while (argc > 0) {
1278: if (argc > 1 && argv[argIndex].startsWith("-")) {
1279: /* support -foo and --foo */
1280: arg = argv[argIndex].substring(1);
1281:
1282: if (arg.length() > 0 && arg.charAt(0) == '-')
1283: arg = arg.substring(1);
1284:
1285: if (arg.equals("xml"))
1286: configuration.XmlTags = true;
1287: else if (arg.equals("asxml") || arg.equals("asxhtml"))
1288: configuration.xHTML = true;
1289: else if (arg.equals("indent")) {
1290: configuration.IndentContent = true;
1291: configuration.SmartIndent = true;
1292: } else if (arg.equals("omit"))
1293: configuration.HideEndTags = true;
1294: else if (arg.equals("upper"))
1295: configuration.UpperCaseTags = true;
1296: else if (arg.equals("clean"))
1297: configuration.MakeClean = true;
1298: else if (arg.equals("raw"))
1299: configuration.CharEncoding = Configuration.RAW;
1300: else if (arg.equals("ascii"))
1301: configuration.CharEncoding = Configuration.ASCII;
1302: else if (arg.equals("latin1"))
1303: configuration.CharEncoding = Configuration.LATIN1;
1304: else if (arg.equals("utf8"))
1305: configuration.CharEncoding = Configuration.UTF8;
1306: else if (arg.equals("iso2022"))
1307: configuration.CharEncoding = Configuration.ISO2022;
1308: else if (arg.equals("mac"))
1309: configuration.CharEncoding = Configuration.MACROMAN;
1310: else if (arg.equals("numeric"))
1311: configuration.NumEntities = true;
1312: else if (arg.equals("modify"))
1313: configuration.writeback = true;
1314: else if (arg.equals("change")) /* obsolete */
1315: configuration.writeback = true;
1316: else if (arg.equals("update")) /* obsolete */
1317: configuration.writeback = true;
1318: else if (arg.equals("errors"))
1319: configuration.OnlyErrors = true;
1320: else if (arg.equals("quiet"))
1321: configuration.Quiet = true;
1322: else if (arg.equals("slides"))
1323: configuration.BurstSlides = true;
1324: else if (arg.equals("help")
1325: || argv[argIndex].charAt(1) == '?'
1326: || argv[argIndex].charAt(1) == 'h') {
1327: Report.helpText(new PrintWriter(System.out, true),
1328: prog);
1329: System.exit(1);
1330: } else if (arg.equals("config")) {
1331: if (argc >= 3) {
1332: configuration.parseFile(argv[argIndex + 1]);
1333: --argc;
1334: ++argIndex;
1335: }
1336: } else if (argv[argIndex].equals("-file")
1337: || argv[argIndex].equals("--file")
1338: || argv[argIndex].equals("-f")) {
1339: if (argc >= 3) {
1340: configuration.errfile = argv[argIndex + 1];
1341: --argc;
1342: ++argIndex;
1343: }
1344: } else if (argv[argIndex].equals("-wrap")
1345: || argv[argIndex].equals("--wrap")
1346: || argv[argIndex].equals("-w")) {
1347: if (argc >= 3) {
1348: configuration.wraplen = Integer
1349: .parseInt(argv[argIndex + 1]);
1350: --argc;
1351: ++argIndex;
1352: }
1353: } else if (argv[argIndex].equals("-version")
1354: || argv[argIndex].equals("--version")
1355: || argv[argIndex].equals("-v")) {
1356: Report.showVersion(tidy.getErrout());
1357: System.exit(0);
1358: } else {
1359: s = argv[argIndex];
1360:
1361: for (int i = 1; i < s.length(); i++) {
1362: if (s.charAt(i) == 'i') {
1363: configuration.IndentContent = true;
1364: configuration.SmartIndent = true;
1365: } else if (s.charAt(i) == 'o')
1366: configuration.HideEndTags = true;
1367: else if (s.charAt(i) == 'u')
1368: configuration.UpperCaseTags = true;
1369: else if (s.charAt(i) == 'c')
1370: configuration.MakeClean = true;
1371: else if (s.charAt(i) == 'n')
1372: configuration.NumEntities = true;
1373: else if (s.charAt(i) == 'm')
1374: configuration.writeback = true;
1375: else if (s.charAt(i) == 'e')
1376: configuration.OnlyErrors = true;
1377: else if (s.charAt(i) == 'q')
1378: configuration.Quiet = true;
1379: else
1380: Report.unknownOption(tidy.getErrout(), s
1381: .charAt(i));
1382: }
1383: }
1384:
1385: --argc;
1386: ++argIndex;
1387: continue;
1388: }
1389:
1390: /* ensure config is self-consistent */
1391: configuration.adjust();
1392:
1393: /* user specified error file */
1394: if (configuration.errfile != null) {
1395: /* is it same as the currently opened file? */
1396: if (!configuration.errfile.equals(current_errorfile)) {
1397: /* no so close previous error file */
1398:
1399: if (tidy.getErrout() != tidy.getStderr())
1400: tidy.getErrout().close();
1401:
1402: /* and try to open the new error file */
1403: try {
1404: tidy.setErrout(new PrintWriter(new FileWriter(
1405: configuration.errfile), true));
1406: current_errorfile = configuration.errfile;
1407: } catch (IOException e) {
1408: /* can't be opened so fall back to stderr */
1409: current_errorfile = "stderr";
1410: tidy.setErrout(tidy.getStderr());
1411: }
1412: }
1413: }
1414:
1415: if (argc > 1) {
1416: file = argv[argIndex];
1417: } else {
1418: file = "stdin";
1419: }
1420:
1421: try {
1422: document = tidy.parse(null, file, System.out);
1423: totalwarnings += tidy.parseWarnings;
1424: totalerrors += tidy.parseErrors;
1425: } catch (FileNotFoundException fnfe) {
1426: Report.unknownFile(tidy.getErrout(), prog, file);
1427: } catch (IOException ioe) {
1428: Report.unknownFile(tidy.getErrout(), prog, file);
1429: }
1430:
1431: --argc;
1432: ++argIndex;
1433:
1434: if (argc <= 1)
1435: break;
1436: }
1437:
1438: if (totalerrors + totalwarnings > 0)
1439: Report.generalInfo(tidy.getErrout());
1440:
1441: if (tidy.getErrout() != tidy.getStderr())
1442: tidy.getErrout().close();
1443:
1444: /* return status can be used by scripts */
1445:
1446: if (totalerrors > 0)
1447: System.exit(2);
1448:
1449: if (totalwarnings > 0)
1450: System.exit(1);
1451:
1452: /* 0 signifies all is ok */
1453: System.exit(0);
1454: }
1455:
1456: // BEGIN RAVE MODIFICATIONS
/**
 * Placeholder text that stands in for an ampersand while a document
 * passes through the parser. EntityWrapperInputStream emits it in
 * place of every '&' it reads, EntityWrapperOutputStream turns it back
 * into an ampersand on output, and cleanEntities does the same for
 * DOM text and attribute values.
 */
1457: static final String replacement = "%leaveentitiesalone%";
1458:
1459: /** Wraps an input stream, and "escapes" entities such that
1460: * JTidy doesn't see them (and doesn't mess with them). The
1461: * corresponding EntityWrapperOutputStream will undo its effects.
1462: * */
1463: public static class EntityWrapperInputStream extends InputStream {
1464: public EntityWrapperInputStream(InputStream inputStream) {
1465: this .inputStream = inputStream;
1466: }
1467:
1468: public int read() throws IOException {
1469: if (buffer != null) {
1470: if (++bufferPosition >= buffer.length()) {
1471: buffer = null;
1472: bufferPosition = -1;
1473: } else {
1474: return buffer.charAt(bufferPosition);
1475: }
1476: }
1477:
1478: int result = inputStream.read();
1479: if (result == '&') {
1480: buffer = replacement;
1481: result = read();
1482: }
1483:
1484: return result;
1485: }
1486:
1487: public int read(byte[] b) throws IOException {
1488: return read(b, 0, b.length);
1489: }
1490:
1491: public int read(byte[] b, int offset, int length)
1492: throws IOException {
1493: int c;
1494: int i = -1;
1495: while (++i < length) {
1496: c = read();
1497: if (c == -1) {
1498: return i == 0 ? -1 : i;
1499: }
1500: b[offset + i] = (byte) c;
1501: }
1502:
1503: return i;
1504: }
1505:
1506: public void close() throws IOException {
1507: inputStream.close();
1508: }
1509:
1510: private String buffer;
1511: private InputStream inputStream;
1512: private int bufferPosition = -1;
1513: }
1514:
1515: /** Wraps an output stream, and translates escaped entities back
1516: * into proper entities
1517: * */
1518: public static class EntityWrapperOutputStream extends OutputStream {
1519: public EntityWrapperOutputStream(OutputStream outputStream,
1520: boolean jspx) {
1521: this .outputStream = outputStream;
1522: this .jspx = jspx;
1523: }
1524:
1525: public void close() throws IOException {
1526: outputStream.close();
1527: }
1528:
1529: public void flush() throws IOException {
1530: outputStream.flush();
1531: }
1532:
1533: public void write(int b) throws IOException {
1534: if (sb.length() != 0) {
1535: sb.append((char) b);
1536: String s = sb.toString(); // UGH! super inefficient
1537: if (s.equals(replacement)) {
1538: outputStream.write('&');
1539: if (jspx) {
1540: outputStream.write('a');
1541: outputStream.write('m');
1542: outputStream.write('p');
1543: outputStream.write(';');
1544: }
1545: sb.setLength(0);
1546: } else if (!replacement.startsWith(s)) {
1547: outputStream.write(s.getBytes());
1548: sb.setLength(0);
1549: }
1550: } else if (b == (int) replacement.charAt(0)) {
1551: sb.append((char) b);
1552: } else {
1553: outputStream.write(b);
1554: }
1555: }
1556:
1557: private boolean jspx;
1558: private StringBuffer sb = new StringBuffer();
1559: private OutputStream outputStream;
1560: }
1561:
1562: /** When parseDOM is called, there's no output stream to fix the
1563: * nodes. This method achieves that.
1564: */
1565: public static void cleanEntities(org.w3c.dom.Node node,
1566: boolean convertHtmlToJspx) {
1567: if (node instanceof CharacterData) {
1568: CharacterData text = (CharacterData) node;
1569: while (true) {
1570: String s = text.getData();
1571: if (s.indexOf(replacement) == -1) {
1572: break;
1573: }
1574: // Don't change text ampersands within text nodes because those
1575: // will get expanded anyway by the dom serializer
1576: //s = s.replaceAll(replacement, convertHtmlToJspx ? "&" : "&");
1577: s = s.replaceAll(replacement, "&");
1578: text.setData(s);
1579: }
1580: }
1581: NamedNodeMap nmn = node.getAttributes();
1582: if (nmn != null) {
1583: for (int j = 0, siz = nmn.getLength(); j < siz; j++) {
1584: org.w3c.dom.Node item = nmn.item(j);
1585: if (item instanceof org.w3c.dom.Attr) {
1586: org.w3c.dom.Attr attr = (org.w3c.dom.Attr) item;
1587: String s = attr.getValue();
1588: if (s.indexOf(replacement) == -1) {
1589: continue;
1590: }
1591: s = s.replaceAll(replacement,
1592: convertHtmlToJspx ? "&" : "&");
1593: s = expand(s);
1594: // XXX It would be nice if I could find a way to set
1595: // the node value of the Node in such a way that it
1596: // preserve entities. Can I add text nodes and entity
1597: // nodes?? That would kick ass!
1598: attr.setValue(s);
1599: }
1600: }
1601: }
1602: NodeList nl = node.getChildNodes();
1603: for (int i = 0, n = nl.getLength(); i < n; i++) {
1604: org.w3c.dom.Node child = nl.item(i);
1605: cleanEntities(child, convertHtmlToJspx);
1606: }
1607: }
1608:
1609: /** Expand entities one level in the given source string.
1610: * Copied from insync.markup. This method has the Sun copyright.
1611: */
1612: private static String expand(String unexpanded) {
1613: if (unexpanded.indexOf('&') == -1) { // todo: keep index and copy up to it below
1614: return unexpanded;
1615: }
1616: int n = unexpanded.length();
1617: int nm1 = n - 1;
1618:
1619: // IMPORTANT NOTE: Keeps this code in sync with getJspxOffset below!
1620:
1621: StringBuffer sb = new StringBuffer(n);
1622: for (int i = 0; i < n; i++) {
1623: char c = unexpanded.charAt(i);
1624: if (c == '&' && i < nm1) {
1625: // Locate entity
1626: int begin = i + 1;
1627: int end = begin;
1628: while (end < n && unexpanded.charAt(end) != ';'
1629: && (end - begin <= 10)) { // longest entity is 8 chars
1630: end++;
1631: }
1632: if (end == n || unexpanded.charAt(end) != ';') {
1633: // Error - just spit out a &
1634: sb.append('&');
1635: continue;
1636: }
1637: String entity = unexpanded.substring(begin, end);
1638:
1639: //NB60 Talk to Peter Zavadsky
1640: int e = com.sun.org.apache.xml.internal.serialize.HTMLdtd
1641: .charFromName(entity);
1642: if (e == -1) {
1643: sb.append('&'); // browsers show the &
1644: continue;
1645: } else {
1646: sb.append((char) e);
1647: i = end;
1648: }
1649: } else {
1650: sb.append(c);
1651: }
1652: }
1653: return sb.toString();
1654: }
1655:
1656: // END RAVE MODIFICATIONS
1657:
1658: }
|