0001: /*
0002: * @(#)Tidy.java 1.11 2000/08/16
0003: *
0004: */
0005:
0006: /*
0007: HTML parser and pretty printer
0008:
0009: Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0010: Institute of Technology, Institut National de Recherche en
0011: Informatique et en Automatique, Keio University). All Rights
0012: Reserved.
0013:
0014: Contributing Author(s):
0015:
0016: Dave Raggett <dsr@w3.org>
0017: Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0018:
0019: The contributing author(s) would like to thank all those who
0020: helped with testing, bug fixes, and patience. This wouldn't
0021: have been possible without all of you.
0022:
0023: COPYRIGHT NOTICE:
0024:
0025: This software and documentation is provided "as is," and
0026: the copyright holders and contributing author(s) make no
0027: representations or warranties, express or implied, including
0028: but not limited to, warranties of merchantability or fitness
0029: for any particular purpose or that the use of the software or
0030: documentation will not infringe any third party patents,
0031: copyrights, trademarks or other rights.
0032:
0033: The copyright holders and contributing author(s) will not be
0034: liable for any direct, indirect, special or consequential damages
0035: arising out of any use of the software or documentation, even if
0036: advised of the possibility of such damage.
0037:
0038: Permission is hereby granted to use, copy, modify, and distribute
0039: this source code, or portions hereof, documentation and executables,
0040: for any purpose, without fee, subject to the following restrictions:
0041:
0042: 1. The origin of this source code must not be misrepresented.
0043: 2. Altered versions must be plainly marked as such and must
0044: not be misrepresented as being the original source.
0045: 3. This Copyright notice may not be removed or altered from any
0046: source or altered source distribution.
0047:
0048: The copyright holders and contributing author(s) specifically
0049: permit, without fee, and encourage the use of this source code
0050: as a component for supporting the Hypertext Markup Language in
0051: commercial products. If you use this source code in a product,
0052: acknowledgment is not required but would be appreciated.
0053: */
0054:
0055: package org.w3c.tidy;
0056:
0057: import java.io.PrintWriter;
0058: import java.io.FileWriter;
0059: import java.io.InputStream;
0060: import java.io.FileInputStream;
0061: import java.io.OutputStream;
0062: import java.io.FileOutputStream;
0063: import java.util.Properties;
0064:
0065: import java.io.IOException;
0066: import java.io.FileNotFoundException;
0067:
0068: /**
0069: *
0070: * <p>HTML parser and pretty printer</p>
0071: *
0072: * <p>
0073: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
0074: * See Tidy.java for the copyright notice.
0075: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
0076: * HTML Tidy Release 4 Aug 2000</a>
0077: * </p>
0078: *
0079: * <p>
0080: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0081: * Institute of Technology, Institut National de Recherche en
0082: * Informatique et en Automatique, Keio University). All Rights
0083: * Reserved.
0084: * </p>
0085: *
0086: * <p>
0087: * Contributing Author(s):<br>
0088: * <a href="mailto:dsr@w3.org">Dave Raggett</a><br>
0089: * <a href="mailto:ac.quick@sympatico.ca">Andy Quick</a> (translation to Java)
0090: * </p>
0091: *
0092: * <p>
0093: * The contributing author(s) would like to thank all those who
0094: * helped with testing, bug fixes, and patience. This wouldn't
0095: * have been possible without all of you.
0096: * </p>
0097: *
0098: * <p>
0099: * COPYRIGHT NOTICE:<br>
0100: *
0101: * This software and documentation is provided "as is," and
0102: * the copyright holders and contributing author(s) make no
0103: * representations or warranties, express or implied, including
0104: * but not limited to, warranties of merchantability or fitness
0105: * for any particular purpose or that the use of the software or
0106: * documentation will not infringe any third party patents,
0107: * copyrights, trademarks or other rights.
0108: * </p>
0109: *
0110: * <p>
0111: * The copyright holders and contributing author(s) will not be
0112: * liable for any direct, indirect, special or consequential damages
0113: * arising out of any use of the software or documentation, even if
0114: * advised of the possibility of such damage.
0115: * </p>
0116: *
0117: * <p>
0118: * Permission is hereby granted to use, copy, modify, and distribute
0119: * this source code, or portions hereof, documentation and executables,
0120: * for any purpose, without fee, subject to the following restrictions:
0121: * </p>
0122: *
0123: * <p>
0124: * <ol>
0125: * <li>The origin of this source code must not be misrepresented.</li>
0126: * <li>Altered versions must be plainly marked as such and must
0127: * not be misrepresented as being the original source.</li>
0128: * <li>This Copyright notice may not be removed or altered from any
0129: * source or altered source distribution.</li>
0130: * </ol>
0131: * </p>
0132: *
0133: * <p>
0134: * The copyright holders and contributing author(s) specifically
0135: * permit, without fee, and encourage the use of this source code
0136: * as a component for supporting the Hypertext Markup Language in
0137: * commercial products. If you use this source code in a product,
0138: * acknowledgment is not required but would be appreciated.
0139: * </p>
0140: *
* @author Dave Raggett &lt;dsr@w3.org&gt;
* @author Andy Quick &lt;ac.quick@sympatico.ca&gt; (translation to Java)
0143: * @version 1.0, 1999/05/22
0144: * @version 1.0.1, 1999/05/29
0145: * @version 1.1, 1999/06/18 Java Bean
0146: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
0147: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
0148: * @version 1.4, 1999/09/04 DOM support
0149: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
0150: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
0151: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
0152: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
0153: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
0154: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
0155: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
0156: *
0157: */
0158:
0159: public class Tidy implements java.io.Serializable {
0160:
0161: static final long serialVersionUID = -2794371560623987718L;
0162:
0163: private boolean initialized = false;
0164: private PrintWriter errout = null; /* error output stream */
0165: private PrintWriter stderr = null;
0166: private Configuration configuration = null;
0167: private String inputStreamName = "InputStream";
0168: private int parseErrors = 0;
0169: private int parseWarnings = 0;
0170:
/**
 * Creates a Tidy instance and performs the first-time initialization
 * by delegating to init().
 */
public Tidy() {
    this.init();
}
0174:
0175: public Configuration getConfiguration() {
0176: return configuration;
0177: }
0178:
0179: public PrintWriter getStderr() {
0180: return stderr;
0181: }
0182:
0183: /**
0184: * ParseErrors - the number of errors that occurred in the most
0185: * recent parse operation
0186: */
0187:
0188: public int getParseErrors() {
0189: return parseErrors;
0190: }
0191:
0192: /**
0193: * ParseWarnings - the number of warnings that occurred in the most
0194: * recent parse operation
0195: */
0196:
0197: public int getParseWarnings() {
0198: return parseWarnings;
0199: }
0200:
0201: /**
0202: * Errout - the error output stream
0203: */
0204:
0205: public PrintWriter getErrout() {
0206: return errout;
0207: }
0208:
0209: public void setErrout(PrintWriter errout) {
0210: this .errout = errout;
0211: }
0212:
0213: /**
0214: * Spaces - default indentation
0215: * @see org.w3c.tidy.Configuration#spaces
0216: */
0217:
0218: public void setSpaces(int spaces) {
0219: configuration.spaces = spaces;
0220: }
0221:
0222: public int getSpaces() {
0223: return configuration.spaces;
0224: }
0225:
0226: /**
0227: * Wraplen - default wrap margin
0228: * @see org.w3c.tidy.Configuration#wraplen
0229: */
0230:
0231: public void setWraplen(int wraplen) {
0232: configuration.wraplen = wraplen;
0233: }
0234:
0235: public int getWraplen() {
0236: return configuration.wraplen;
0237: }
0238:
0239: /**
0240: * CharEncoding
0241: * @see org.w3c.tidy.Configuration#CharEncoding
0242: */
0243:
0244: public void setCharEncoding(int charencoding) {
0245: configuration.CharEncoding = charencoding;
0246: }
0247:
0248: public int getCharEncoding() {
0249: return configuration.CharEncoding;
0250: }
0251:
0252: /**
0253: * Tabsize
0254: * @see org.w3c.tidy.Configuration#tabsize
0255: */
0256:
0257: public void setTabsize(int tabsize) {
0258: configuration.tabsize = tabsize;
0259: }
0260:
0261: public int getTabsize() {
0262: return configuration.tabsize;
0263: }
0264:
0265: /**
0266: * Errfile - file name to write errors to
0267: * @see org.w3c.tidy.Configuration#errfile
0268: */
0269:
0270: public void setErrfile(String errfile) {
0271: configuration.errfile = errfile;
0272: }
0273:
0274: public String getErrfile() {
0275: return configuration.errfile;
0276: }
0277:
0278: /**
0279: * Writeback - if true then output tidied markup
0280: * NOTE: this property is ignored when parsing from an InputStream.
0281: * @see org.w3c.tidy.Configuration#writeback
0282: */
0283:
0284: public void setWriteback(boolean writeback) {
0285: configuration.writeback = writeback;
0286: }
0287:
0288: public boolean getWriteback() {
0289: return configuration.writeback;
0290: }
0291:
0292: /**
0293: * OnlyErrors - if true normal output is suppressed
0294: * @see org.w3c.tidy.Configuration#OnlyErrors
0295: */
0296:
0297: public void setOnlyErrors(boolean OnlyErrors) {
0298: configuration.OnlyErrors = OnlyErrors;
0299: }
0300:
0301: public boolean getOnlyErrors() {
0302: return configuration.OnlyErrors;
0303: }
0304:
0305: /**
0306: * ShowWarnings - however errors are always shown
0307: * @see org.w3c.tidy.Configuration#ShowWarnings
0308: */
0309:
0310: public void setShowWarnings(boolean ShowWarnings) {
0311: configuration.ShowWarnings = ShowWarnings;
0312: }
0313:
0314: public boolean getShowWarnings() {
0315: return configuration.ShowWarnings;
0316: }
0317:
0318: /**
0319: * Quiet - no 'Parsing X', guessed DTD or summary
0320: * @see org.w3c.tidy.Configuration#Quiet
0321: */
0322:
0323: public void setQuiet(boolean Quiet) {
0324: configuration.Quiet = Quiet;
0325: }
0326:
0327: public boolean getQuiet() {
0328: return configuration.Quiet;
0329: }
0330:
0331: /**
0332: * IndentContent - indent content of appropriate tags
0333: * @see org.w3c.tidy.Configuration#IndentContent
0334: */
0335:
0336: public void setIndentContent(boolean IndentContent) {
0337: configuration.IndentContent = IndentContent;
0338: }
0339:
0340: public boolean getIndentContent() {
0341: return configuration.IndentContent;
0342: }
0343:
0344: /**
0345: * SmartIndent - does text/block level content effect indentation
0346: * @see org.w3c.tidy.Configuration#SmartIndent
0347: */
0348:
0349: public void setSmartIndent(boolean SmartIndent) {
0350: configuration.SmartIndent = SmartIndent;
0351: }
0352:
0353: public boolean getSmartIndent() {
0354: return configuration.SmartIndent;
0355: }
0356:
0357: /**
0358: * HideEndTags - suppress optional end tags
0359: * @see org.w3c.tidy.Configuration#HideEndTags
0360: */
0361:
0362: public void setHideEndTags(boolean HideEndTags) {
0363: configuration.HideEndTags = HideEndTags;
0364: }
0365:
0366: public boolean getHideEndTags() {
0367: return configuration.HideEndTags;
0368: }
0369:
0370: /**
0371: * XmlTags - treat input as XML
0372: * @see org.w3c.tidy.Configuration#XmlTags
0373: */
0374:
0375: public void setXmlTags(boolean XmlTags) {
0376: configuration.XmlTags = XmlTags;
0377: }
0378:
0379: public boolean getXmlTags() {
0380: return configuration.XmlTags;
0381: }
0382:
0383: /**
0384: * XmlOut - create output as XML
0385: * @see org.w3c.tidy.Configuration#XmlOut
0386: */
0387:
0388: public void setXmlOut(boolean XmlOut) {
0389: configuration.XmlOut = XmlOut;
0390: }
0391:
0392: public boolean getXmlOut() {
0393: return configuration.XmlOut;
0394: }
0395:
0396: /**
0397: * XHTML - output extensible HTML
0398: * @see org.w3c.tidy.Configuration#xHTML
0399: */
0400:
0401: public void setXHTML(boolean xHTML) {
0402: configuration.xHTML = xHTML;
0403: }
0404:
0405: public boolean getXHTML() {
0406: return configuration.xHTML;
0407: }
0408:
0409: /**
0410: * RawOut - avoid mapping values > 127 to entities
0411: * @see org.w3c.tidy.Configuration#RawOut
0412: */
0413:
0414: public void setRawOut(boolean RawOut) {
0415: configuration.RawOut = RawOut;
0416: }
0417:
0418: public boolean getRawOut() {
0419: return configuration.RawOut;
0420: }
0421:
0422: /**
0423: * UpperCaseTags - output tags in upper not lower case
0424: * @see org.w3c.tidy.Configuration#UpperCaseTags
0425: */
0426:
0427: public void setUpperCaseTags(boolean UpperCaseTags) {
0428: configuration.UpperCaseTags = UpperCaseTags;
0429: }
0430:
0431: public boolean getUpperCaseTags() {
0432: return configuration.UpperCaseTags;
0433: }
0434:
0435: /**
0436: * UpperCaseAttrs - output attributes in upper not lower case
0437: * @see org.w3c.tidy.Configuration#UpperCaseAttrs
0438: */
0439:
0440: public void setUpperCaseAttrs(boolean UpperCaseAttrs) {
0441: configuration.UpperCaseAttrs = UpperCaseAttrs;
0442: }
0443:
0444: public boolean getUpperCaseAttrs() {
0445: return configuration.UpperCaseAttrs;
0446: }
0447:
0448: /**
0449: * MakeClean - remove presentational clutter
0450: * @see org.w3c.tidy.Configuration#MakeClean
0451: */
0452:
0453: public void setMakeClean(boolean MakeClean) {
0454: configuration.MakeClean = MakeClean;
0455: }
0456:
0457: public boolean getMakeClean() {
0458: return configuration.MakeClean;
0459: }
0460:
0461: /**
0462: * BreakBeforeBR - o/p newline before <br> or not?
0463: * @see org.w3c.tidy.Configuration#BreakBeforeBR
0464: */
0465:
0466: public void setBreakBeforeBR(boolean BreakBeforeBR) {
0467: configuration.BreakBeforeBR = BreakBeforeBR;
0468: }
0469:
0470: public boolean getBreakBeforeBR() {
0471: return configuration.BreakBeforeBR;
0472: }
0473:
0474: /**
0475: * BurstSlides - create slides on each h2 element
0476: * @see org.w3c.tidy.Configuration#BurstSlides
0477: */
0478:
0479: public void setBurstSlides(boolean BurstSlides) {
0480: configuration.BurstSlides = BurstSlides;
0481: }
0482:
0483: public boolean getBurstSlides() {
0484: return configuration.BurstSlides;
0485: }
0486:
0487: /**
0488: * NumEntities - use numeric entities
0489: * @see org.w3c.tidy.Configuration#NumEntities
0490: */
0491:
0492: public void setNumEntities(boolean NumEntities) {
0493: configuration.NumEntities = NumEntities;
0494: }
0495:
0496: public boolean getNumEntities() {
0497: return configuration.NumEntities;
0498: }
0499:
0500: /**
0501: * QuoteMarks - output " marks as &quot;
0502: * @see org.w3c.tidy.Configuration#QuoteMarks
0503: */
0504:
0505: public void setQuoteMarks(boolean QuoteMarks) {
0506: configuration.QuoteMarks = QuoteMarks;
0507: }
0508:
0509: public boolean getQuoteMarks() {
0510: return configuration.QuoteMarks;
0511: }
0512:
0513: /**
0514: * QuoteNbsp - output non-breaking space as entity
0515: * @see org.w3c.tidy.Configuration#QuoteNbsp
0516: */
0517:
0518: public void setQuoteNbsp(boolean QuoteNbsp) {
0519: configuration.QuoteNbsp = QuoteNbsp;
0520: }
0521:
0522: public boolean getQuoteNbsp() {
0523: return configuration.QuoteNbsp;
0524: }
0525:
0526: /**
0527: * QuoteAmpersand - output naked ampersand as &
0528: * @see org.w3c.tidy.Configuration#QuoteAmpersand
0529: */
0530:
0531: public void setQuoteAmpersand(boolean QuoteAmpersand) {
0532: configuration.QuoteAmpersand = QuoteAmpersand;
0533: }
0534:
0535: public boolean getQuoteAmpersand() {
0536: return configuration.QuoteAmpersand;
0537: }
0538:
0539: /**
0540: * WrapAttVals - wrap within attribute values
0541: * @see org.w3c.tidy.Configuration#WrapAttVals
0542: */
0543:
0544: public void setWrapAttVals(boolean WrapAttVals) {
0545: configuration.WrapAttVals = WrapAttVals;
0546: }
0547:
0548: public boolean getWrapAttVals() {
0549: return configuration.WrapAttVals;
0550: }
0551:
0552: /**
0553: * WrapScriptlets - wrap within JavaScript string literals
0554: * @see org.w3c.tidy.Configuration#WrapScriptlets
0555: */
0556:
0557: public void setWrapScriptlets(boolean WrapScriptlets) {
0558: configuration.WrapScriptlets = WrapScriptlets;
0559: }
0560:
0561: public boolean getWrapScriptlets() {
0562: return configuration.WrapScriptlets;
0563: }
0564:
0565: /**
0566: * WrapSection - wrap within <![ ... ]> section tags
0567: * @see org.w3c.tidy.Configuration#WrapSection
0568: */
0569:
0570: public void setWrapSection(boolean WrapSection) {
0571: configuration.WrapSection = WrapSection;
0572: }
0573:
0574: public boolean getWrapSection() {
0575: return configuration.WrapSection;
0576: }
0577:
0578: /**
0579: * AltText - default text for alt attribute
0580: * @see org.w3c.tidy.Configuration#altText
0581: */
0582:
0583: public void setAltText(String altText) {
0584: configuration.altText = altText;
0585: }
0586:
0587: public String getAltText() {
0588: return configuration.altText;
0589: }
0590:
0591: /**
0592: * Slidestyle - style sheet for slides
0593: * @see org.w3c.tidy.Configuration#slidestyle
0594: */
0595:
0596: public void setSlidestyle(String slidestyle) {
0597: configuration.slidestyle = slidestyle;
0598: }
0599:
0600: public String getSlidestyle() {
0601: return configuration.slidestyle;
0602: }
0603:
0604: /**
0605: * XmlPi - add <?xml?> for XML docs
0606: * @see org.w3c.tidy.Configuration#XmlPi
0607: */
0608:
0609: public void setXmlPi(boolean XmlPi) {
0610: configuration.XmlPi = XmlPi;
0611: }
0612:
0613: public boolean getXmlPi() {
0614: return configuration.XmlPi;
0615: }
0616:
0617: /**
0618: * DropFontTags - discard presentation tags
0619: * @see org.w3c.tidy.Configuration#DropFontTags
0620: */
0621:
0622: public void setDropFontTags(boolean DropFontTags) {
0623: configuration.DropFontTags = DropFontTags;
0624: }
0625:
0626: public boolean getDropFontTags() {
0627: return configuration.DropFontTags;
0628: }
0629:
0630: /**
0631: * DropEmptyParas - discard empty p elements
0632: * @see org.w3c.tidy.Configuration#DropEmptyParas
0633: */
0634:
0635: public void setDropEmptyParas(boolean DropEmptyParas) {
0636: configuration.DropEmptyParas = DropEmptyParas;
0637: }
0638:
0639: public boolean getDropEmptyParas() {
0640: return configuration.DropEmptyParas;
0641: }
0642:
0643: /**
0644: * FixComments - fix comments with adjacent hyphens
0645: * @see org.w3c.tidy.Configuration#FixComments
0646: */
0647:
0648: public void setFixComments(boolean FixComments) {
0649: configuration.FixComments = FixComments;
0650: }
0651:
0652: public boolean getFixComments() {
0653: return configuration.FixComments;
0654: }
0655:
0656: /**
0657: * WrapAsp - wrap within ASP pseudo elements
0658: * @see org.w3c.tidy.Configuration#WrapAsp
0659: */
0660:
0661: public void setWrapAsp(boolean WrapAsp) {
0662: configuration.WrapAsp = WrapAsp;
0663: }
0664:
0665: public boolean getWrapAsp() {
0666: return configuration.WrapAsp;
0667: }
0668:
0669: /**
0670: * WrapJste - wrap within JSTE pseudo elements
0671: * @see org.w3c.tidy.Configuration#WrapJste
0672: */
0673:
0674: public void setWrapJste(boolean WrapJste) {
0675: configuration.WrapJste = WrapJste;
0676: }
0677:
0678: public boolean getWrapJste() {
0679: return configuration.WrapJste;
0680: }
0681:
0682: /**
0683: * WrapPhp - wrap within PHP pseudo elements
0684: * @see org.w3c.tidy.Configuration#WrapPhp
0685: */
0686:
0687: public void setWrapPhp(boolean WrapPhp) {
0688: configuration.WrapPhp = WrapPhp;
0689: }
0690:
0691: public boolean getWrapPhp() {
0692: return configuration.WrapPhp;
0693: }
0694:
0695: /**
0696: * FixBackslash - fix URLs by replacing \ with /
0697: * @see org.w3c.tidy.Configuration#FixBackslash
0698: */
0699:
0700: public void setFixBackslash(boolean FixBackslash) {
0701: configuration.FixBackslash = FixBackslash;
0702: }
0703:
0704: public boolean getFixBackslash() {
0705: return configuration.FixBackslash;
0706: }
0707:
0708: /**
0709: * IndentAttributes - newline+indent before each attribute
0710: * @see org.w3c.tidy.Configuration#IndentAttributes
0711: */
0712:
0713: public void setIndentAttributes(boolean IndentAttributes) {
0714: configuration.IndentAttributes = IndentAttributes;
0715: }
0716:
0717: public boolean getIndentAttributes() {
0718: return configuration.IndentAttributes;
0719: }
0720:
0721: /**
0722: * DocType - user specified doctype
0723: * omit | auto | strict | loose | <i>fpi</i>
0724: * where the <i>fpi</i> is a string similar to
0725: * "-//ACME//DTD HTML 3.14159//EN"
0726: * Note: for <i>fpi</i> include the double-quotes in the string.
0727: * @see org.w3c.tidy.Configuration#docTypeStr
0728: * @see org.w3c.tidy.Configuration#docTypeMode
0729: */
0730:
0731: public void setDocType(String doctype) {
0732: if (doctype != null)
0733: configuration.docTypeStr = configuration.parseDocType(
0734: doctype, "doctype");
0735: }
0736:
0737: public String getDocType() {
0738: String result = null;
0739: switch (configuration.docTypeMode) {
0740: case Configuration.DOCTYPE_OMIT:
0741: result = "omit";
0742: break;
0743: case Configuration.DOCTYPE_AUTO:
0744: result = "auto";
0745: break;
0746: case Configuration.DOCTYPE_STRICT:
0747: result = "strict";
0748: break;
0749: case Configuration.DOCTYPE_LOOSE:
0750: result = "loose";
0751: break;
0752: case Configuration.DOCTYPE_USER:
0753: result = configuration.docTypeStr;
0754: break;
0755: }
0756: return result;
0757: }
0758:
0759: /**
0760: * LogicalEmphasis - replace i by em and b by strong
0761: * @see org.w3c.tidy.Configuration#LogicalEmphasis
0762: */
0763:
0764: public void setLogicalEmphasis(boolean LogicalEmphasis) {
0765: configuration.LogicalEmphasis = LogicalEmphasis;
0766: }
0767:
0768: public boolean getLogicalEmphasis() {
0769: return configuration.LogicalEmphasis;
0770: }
0771:
0772: /**
0773: * XmlPIs - if set to true PIs must end with ?>
0774: * @see org.w3c.tidy.Configuration#XmlPIs
0775: */
0776:
0777: public void setXmlPIs(boolean XmlPIs) {
0778: configuration.XmlPIs = XmlPIs;
0779: }
0780:
0781: public boolean getXmlPIs() {
0782: return configuration.XmlPIs;
0783: }
0784:
0785: /**
0786: * EncloseText - if true text at body is wrapped in <p>'s
0787: * @see org.w3c.tidy.Configuration#EncloseBodyText
0788: */
0789:
0790: public void setEncloseText(boolean EncloseText) {
0791: configuration.EncloseBodyText = EncloseText;
0792: }
0793:
0794: public boolean getEncloseText() {
0795: return configuration.EncloseBodyText;
0796: }
0797:
0798: /**
0799: * EncloseBlockText - if true text in blocks is wrapped in <p>'s
0800: * @see org.w3c.tidy.Configuration#EncloseBlockText
0801: */
0802:
0803: public void setEncloseBlockText(boolean EncloseBlockText) {
0804: configuration.EncloseBlockText = EncloseBlockText;
0805: }
0806:
0807: public boolean getEncloseBlockText() {
0808: return configuration.EncloseBlockText;
0809: }
0810:
0811: /**
0812: * KeepFileTimes - if true last modified time is preserved<br>
0813: * <b>this is NOT supported at this time.</b>
0814: * @see org.w3c.tidy.Configuration#KeepFileTimes
0815: */
0816:
0817: public void setKeepFileTimes(boolean KeepFileTimes) {
0818: configuration.KeepFileTimes = KeepFileTimes;
0819: }
0820:
0821: public boolean getKeepFileTimes() {
0822: return configuration.KeepFileTimes;
0823: }
0824:
0825: /**
0826: * Word2000 - draconian cleaning for Word2000
0827: * @see org.w3c.tidy.Configuration#Word2000
0828: */
0829:
0830: public void setWord2000(boolean Word2000) {
0831: configuration.Word2000 = Word2000;
0832: }
0833:
0834: public boolean getWord2000() {
0835: return configuration.Word2000;
0836: }
0837:
0838: /**
0839: * TidyMark - add meta element indicating tidied doc
0840: * @see org.w3c.tidy.Configuration#TidyMark
0841: */
0842:
0843: public void setTidyMark(boolean TidyMark) {
0844: configuration.TidyMark = TidyMark;
0845: }
0846:
0847: public boolean getTidyMark() {
0848: return configuration.TidyMark;
0849: }
0850:
0851: /**
0852: * XmlSpace - if set to yes adds xml:space attr as needed
0853: * @see org.w3c.tidy.Configuration#XmlSpace
0854: */
0855:
0856: public void setXmlSpace(boolean XmlSpace) {
0857: configuration.XmlSpace = XmlSpace;
0858: }
0859:
0860: public boolean getXmlSpace() {
0861: return configuration.XmlSpace;
0862: }
0863:
0864: /**
0865: * Emacs - if true format error output for GNU Emacs
0866: * @see org.w3c.tidy.Configuration#Emacs
0867: */
0868:
0869: public void setEmacs(boolean Emacs) {
0870: configuration.Emacs = Emacs;
0871: }
0872:
0873: public boolean getEmacs() {
0874: return configuration.Emacs;
0875: }
0876:
0877: /**
0878: * LiteralAttribs - if true attributes may use newlines
0879: * @see org.w3c.tidy.Configuration#LiteralAttribs
0880: */
0881:
0882: public void setLiteralAttribs(boolean LiteralAttribs) {
0883: configuration.LiteralAttribs = LiteralAttribs;
0884: }
0885:
0886: public boolean getLiteralAttribs() {
0887: return configuration.LiteralAttribs;
0888: }
0889:
0890: /**
0891: * InputStreamName - the name of the input stream (printed in the
0892: * header information).
0893: */
0894: public void setInputStreamName(String name) {
0895: if (name != null)
0896: inputStreamName = name;
0897: }
0898:
0899: public String getInputStreamName() {
0900: return inputStreamName;
0901: }
0902:
0903: /**
0904: * Sets the configuration from a configuration file.
0905: */
0906:
0907: public void setConfigurationFromFile(String filename) {
0908: configuration.parseFile(filename);
0909: }
0910:
0911: /**
0912: * Sets the configuration from a properties object.
0913: */
0914:
0915: public void setConfigurationFromProps(Properties props) {
0916: configuration.addProps(props);
0917: }
0918:
0919: /**
0920: * first time initialization which should
0921: * precede reading the command line
0922: */
0923:
0924: private void init() {
0925: configuration = new Configuration();
0926: if (configuration == null)
0927: return;
0928:
0929: AttributeTable at = AttributeTable.getDefaultAttributeTable();
0930: if (at == null)
0931: return;
0932: TagTable tt = new TagTable();
0933: if (tt == null)
0934: return;
0935: tt.setConfiguration(configuration);
0936: configuration.tt = tt;
0937: EntityTable et = EntityTable.getDefaultEntityTable();
0938: if (et == null)
0939: return;
0940:
0941: /* Unnecessary - same initial values in Configuration
0942: Configuration.XmlTags = false;
0943: Configuration.XmlOut = false;
0944: Configuration.HideEndTags = false;
0945: Configuration.UpperCaseTags = false;
0946: Configuration.MakeClean = false;
0947: Configuration.writeback = false;
0948: Configuration.OnlyErrors = false;
0949: */
0950:
0951: configuration.errfile = null;
0952: stderr = new PrintWriter(System.err, true);
0953: errout = stderr;
0954: initialized = true;
0955: }
0956:
0957: /**
0958: * Parses InputStream in and returns the root Node.
0959: * If out is non-null, pretty prints to OutputStream out.
0960: */
0961:
0962: public Node parse(InputStream in, OutputStream out) {
0963: Node document = null;
0964:
0965: try {
0966: document = parse(in, null, out);
0967: } catch (FileNotFoundException fnfe) {
0968: } catch (IOException e) {
0969: }
0970:
0971: return document;
0972: }
0973:
0974: /**
0975: * Internal routine that actually does the parsing. The caller
0976: * can pass either an InputStream or file name. If both are passed,
0977: * the file name is preferred.
0978: */
0979:
0980: private Node parse(InputStream in, String file, OutputStream out)
0981: throws FileNotFoundException, IOException {
0982: Lexer lexer;
0983: Node document = null;
0984: Node doctype;
0985: Out o = new OutImpl(); /* normal output stream */
0986: PPrint pprint;
0987:
0988: if (!initialized)
0989: return null;
0990:
0991: if (errout == null)
0992: return null;
0993:
0994: parseErrors = 0;
0995: parseWarnings = 0;
0996:
0997: /* ensure config is self-consistent */
0998: configuration.adjust();
0999:
1000: if (file != null) {
1001: in = new FileInputStream(file);
1002: inputStreamName = file;
1003: } else if (in == null) {
1004: in = System.in;
1005: inputStreamName = "stdin";
1006: }
1007:
1008: if (in != null) {
1009: lexer = new Lexer(new StreamInImpl(in,
1010: configuration.CharEncoding, configuration.tabsize),
1011: configuration);
1012: lexer.errout = errout;
1013:
1014: /*
1015: store pointer to lexer in input stream
1016: to allow character encoding errors to be
1017: reported
1018: */
1019: lexer.in.lexer = lexer;
1020:
1021: /* Tidy doesn't alter the doctype for generic XML docs */
1022: if (configuration.XmlTags)
1023: document = ParserImpl.parseXMLDocument(lexer);
1024: else {
1025: lexer.warnings = 0;
1026: if (!configuration.Quiet)
1027: Report.helloMessage(errout, Report.RELEASE_DATE,
1028: inputStreamName);
1029:
1030: document = ParserImpl.parseDocument(lexer);
1031:
1032: if (!document.checkNodeIntegrity()) {
1033: Report.badTree(errout);
1034: return null;
1035: }
1036:
1037: Clean cleaner = new Clean(configuration.tt);
1038:
1039: /* simplifies <b><b> ... </b> ...</b> etc. */
1040: cleaner.nestedEmphasis(document);
1041:
1042: /* cleans up <dir>indented text</dir> etc. */
1043: cleaner.list2BQ(document);
1044: cleaner.bQ2Div(document);
1045:
1046: /* replaces i by em and b by strong */
1047: if (configuration.LogicalEmphasis)
1048: cleaner.emFromI(document);
1049:
1050: if (configuration.Word2000
1051: && cleaner.isWord2000(document,
1052: configuration.tt)) {
1053: /* prune Word2000's <![if ...]> ... <![endif]> */
1054: cleaner.dropSections(lexer, document);
1055:
1056: /* drop style & class attributes and empty p, span elements */
1057: cleaner.cleanWord2000(lexer, document);
1058: }
1059:
1060: /* replaces presentational markup by style rules */
1061: if (configuration.MakeClean
1062: || configuration.DropFontTags)
1063: cleaner.cleanTree(lexer, document);
1064:
1065: if (!document.checkNodeIntegrity()) {
1066: Report.badTree(errout);
1067: return null;
1068: }
1069: doctype = document.findDocType();
1070: if (document.content != null) {
1071: if (configuration.xHTML)
1072: lexer.setXHTMLDocType(document);
1073: else
1074: lexer.fixDocType(document);
1075:
1076: if (configuration.TidyMark)
1077: lexer.addGenerator(document);
1078: }
1079:
1080: /* ensure presence of initial <?XML version="1.0"?> */
1081: if (configuration.XmlOut && configuration.XmlPi)
1082: lexer.fixXMLPI(document);
1083:
1084: if (!configuration.Quiet && document.content != null) {
1085: Report.reportVersion(errout, lexer,
1086: inputStreamName, doctype);
1087: Report.reportNumWarnings(errout, lexer);
1088: }
1089: }
1090:
1091: parseWarnings = lexer.warnings;
1092: parseErrors = lexer.errors;
1093:
1094: // Try to close the InputStream but only if if we created it.
1095:
1096: if ((file != null) && (in != System.in)) {
1097: try {
1098: in.close();
1099: } catch (IOException e) {
1100: }
1101: }
1102:
1103: if (lexer.errors > 0)
1104: Report.needsAuthorIntervention(errout);
1105:
1106: o.state = StreamIn.FSM_ASCII;
1107: o.encoding = configuration.CharEncoding;
1108:
1109: if (!configuration.OnlyErrors && lexer.errors == 0) {
1110: if (configuration.BurstSlides) {
1111: Node body;
1112:
1113: body = null;
1114: /*
1115: remove doctype to avoid potential clash with
1116: markup introduced when bursting into slides
1117: */
1118: /* discard the document type */
1119: doctype = document.findDocType();
1120:
1121: if (doctype != null)
1122: Node.discardElement(doctype);
1123:
1124: /* slides use transitional features */
1125: lexer.versions |= Dict.VERS_HTML40_LOOSE;
1126:
1127: /* and patch up doctype to match */
1128: if (configuration.xHTML)
1129: lexer.setXHTMLDocType(document);
1130: else
1131: lexer.fixDocType(document);
1132:
1133: /* find the body element which may be implicit */
1134: body = document.findBody(configuration.tt);
1135:
1136: if (body != null) {
1137: pprint = new PPrint(configuration);
1138: Report.reportNumberOfSlides(errout, pprint
1139: .countSlides(body));
1140: pprint.createSlides(lexer, document);
1141: } else
1142: Report.missingBody(errout);
1143: } else if (configuration.writeback && (file != null)) {
1144: try {
1145: pprint = new PPrint(configuration);
1146: o.out = new FileOutputStream(file);
1147:
1148: if (configuration.XmlTags)
1149: pprint.printXMLTree(o, (short) 0, 0, lexer,
1150: document);
1151: else
1152: pprint.printTree(o, (short) 0, 0, lexer,
1153: document);
1154:
1155: pprint.flushLine(o, 0);
1156: o.out.close();
1157: } catch (IOException e) {
1158: errout.println(file + e.toString());
1159: }
1160: } else if (out != null) {
1161: pprint = new PPrint(configuration);
1162: o.out = out;
1163:
1164: if (configuration.XmlTags)
1165: pprint.printXMLTree(o, (short) 0, 0, lexer,
1166: document);
1167: else
1168: pprint.printTree(o, (short) 0, 0, lexer,
1169: document);
1170:
1171: pprint.flushLine(o, 0);
1172: }
1173:
1174: }
1175:
1176: Report.errorSummary(lexer);
1177: }
1178: return document;
1179: }
1180:
1181: /**
1182: * Parses InputStream in and returns a DOM Document node.
1183: * If out is non-null, pretty prints to OutputStream out.
1184: */
1185:
1186: public org.w3c.dom.Document parseDOM(InputStream in,
1187: OutputStream out) {
1188: Node document = parse(in, out);
1189: if (document != null)
1190: return (org.w3c.dom.Document) document.getAdapter();
1191: else
1192: return null;
1193: }
1194:
1195: /**
1196: * Creates an empty DOM Document.
1197: */
1198:
1199: public static org.w3c.dom.Document createEmptyDocument() {
1200: Node document = new Node(Node.RootNode, new byte[0], 0, 0);
1201: Node node = new Node(Node.StartTag, new byte[0], 0, 0, "html",
1202: new TagTable());
1203: if (document != null && node != null) {
1204: Node.insertNodeAtStart(document, node);
1205: return (org.w3c.dom.Document) document.getAdapter();
1206: } else {
1207: return null;
1208: }
1209: }
1210:
1211: /**
1212: * Pretty-prints a DOM Document.
1213: */
1214:
1215: public void pprint(org.w3c.dom.Document doc, OutputStream out) {
1216: Out o = new OutImpl();
1217: PPrint pprint;
1218: Node document;
1219:
1220: if (!(doc instanceof DOMDocumentImpl)) {
1221: return;
1222: }
1223: document = ((DOMDocumentImpl) doc).adaptee;
1224:
1225: o.state = StreamIn.FSM_ASCII;
1226: o.encoding = configuration.CharEncoding;
1227:
1228: if (out != null) {
1229: pprint = new PPrint(configuration);
1230: o.out = out;
1231:
1232: if (configuration.XmlTags)
1233: pprint.printXMLTree(o, (short) 0, 0, null, document);
1234: else
1235: pprint.printTree(o, (short) 0, 0, null, document);
1236:
1237: pprint.flushLine(o, 0);
1238: }
1239: }
1240:
1241: /**
1242: * Command line interface to parser and pretty printer.
1243: */
1244:
1245: public static void main(String[] argv) {
1246: int totalerrors = 0;
1247: int totalwarnings = 0;
1248: String file;
1249: InputStream in;
1250: String prog = "Tidy";
1251: Node document;
1252: Node doctype;
1253: Lexer lexer;
1254: String s;
1255: Out out = new OutImpl(); /* normal output stream */
1256: PPrint pprint;
1257: int argc = argv.length + 1;
1258: int argIndex = 0;
1259: Tidy tidy;
1260: Configuration configuration;
1261: String arg;
1262: String current_errorfile = "stderr";
1263:
1264: tidy = new Tidy();
1265: configuration = tidy.getConfiguration();
1266:
1267: /* read command line */
1268:
1269: while (argc > 0) {
1270: if (argc > 1 && argv[argIndex].startsWith("-")) {
1271: /* support -foo and --foo */
1272: arg = argv[argIndex].substring(1);
1273:
1274: if (arg.length() > 0 && arg.charAt(0) == '-')
1275: arg = arg.substring(1);
1276:
1277: if (arg.equals("xml"))
1278: configuration.XmlTags = true;
1279: else if (arg.equals("asxml") || arg.equals("asxhtml"))
1280: configuration.xHTML = true;
1281: else if (arg.equals("indent")) {
1282: configuration.IndentContent = true;
1283: configuration.SmartIndent = true;
1284: } else if (arg.equals("omit"))
1285: configuration.HideEndTags = true;
1286: else if (arg.equals("upper"))
1287: configuration.UpperCaseTags = true;
1288: else if (arg.equals("clean"))
1289: configuration.MakeClean = true;
1290: else if (arg.equals("raw"))
1291: configuration.CharEncoding = Configuration.RAW;
1292: else if (arg.equals("ascii"))
1293: configuration.CharEncoding = Configuration.ASCII;
1294: else if (arg.equals("latin1"))
1295: configuration.CharEncoding = Configuration.LATIN1;
1296: else if (arg.equals("utf8"))
1297: configuration.CharEncoding = Configuration.UTF8;
1298: else if (arg.equals("iso2022"))
1299: configuration.CharEncoding = Configuration.ISO2022;
1300: else if (arg.equals("mac"))
1301: configuration.CharEncoding = Configuration.MACROMAN;
1302: else if (arg.equals("numeric"))
1303: configuration.NumEntities = true;
1304: else if (arg.equals("modify"))
1305: configuration.writeback = true;
1306: else if (arg.equals("change")) /* obsolete */
1307: configuration.writeback = true;
1308: else if (arg.equals("update")) /* obsolete */
1309: configuration.writeback = true;
1310: else if (arg.equals("errors"))
1311: configuration.OnlyErrors = true;
1312: else if (arg.equals("quiet"))
1313: configuration.Quiet = true;
1314: else if (arg.equals("slides"))
1315: configuration.BurstSlides = true;
1316: else if (arg.equals("help")
1317: || argv[argIndex].charAt(1) == '?'
1318: || argv[argIndex].charAt(1) == 'h') {
1319: Report.helpText(new PrintWriter(System.out, true),
1320: prog);
1321: System.exit(1);
1322: } else if (arg.equals("config")) {
1323: if (argc >= 3) {
1324: configuration.parseFile(argv[argIndex + 1]);
1325: --argc;
1326: ++argIndex;
1327: }
1328: } else if (argv[argIndex].equals("-file")
1329: || argv[argIndex].equals("--file")
1330: || argv[argIndex].equals("-f")) {
1331: if (argc >= 3) {
1332: configuration.errfile = argv[argIndex + 1];
1333: --argc;
1334: ++argIndex;
1335: }
1336: } else if (argv[argIndex].equals("-wrap")
1337: || argv[argIndex].equals("--wrap")
1338: || argv[argIndex].equals("-w")) {
1339: if (argc >= 3) {
1340: configuration.wraplen = Integer
1341: .parseInt(argv[argIndex + 1]);
1342: --argc;
1343: ++argIndex;
1344: }
1345: } else if (argv[argIndex].equals("-version")
1346: || argv[argIndex].equals("--version")
1347: || argv[argIndex].equals("-v")) {
1348: Report.showVersion(tidy.getErrout());
1349: System.exit(0);
1350: } else {
1351: s = argv[argIndex];
1352:
1353: for (int i = 1; i < s.length(); i++) {
1354: if (s.charAt(i) == 'i') {
1355: configuration.IndentContent = true;
1356: configuration.SmartIndent = true;
1357: } else if (s.charAt(i) == 'o')
1358: configuration.HideEndTags = true;
1359: else if (s.charAt(i) == 'u')
1360: configuration.UpperCaseTags = true;
1361: else if (s.charAt(i) == 'c')
1362: configuration.MakeClean = true;
1363: else if (s.charAt(i) == 'n')
1364: configuration.NumEntities = true;
1365: else if (s.charAt(i) == 'm')
1366: configuration.writeback = true;
1367: else if (s.charAt(i) == 'e')
1368: configuration.OnlyErrors = true;
1369: else if (s.charAt(i) == 'q')
1370: configuration.Quiet = true;
1371: else
1372: Report.unknownOption(tidy.getErrout(), s
1373: .charAt(i));
1374: }
1375: }
1376:
1377: --argc;
1378: ++argIndex;
1379: continue;
1380: }
1381:
1382: /* ensure config is self-consistent */
1383: configuration.adjust();
1384:
1385: /* user specified error file */
1386: if (configuration.errfile != null) {
1387: /* is it same as the currently opened file? */
1388: if (!configuration.errfile.equals(current_errorfile)) {
1389: /* no so close previous error file */
1390:
1391: if (tidy.getErrout() != tidy.getStderr())
1392: tidy.getErrout().close();
1393:
1394: /* and try to open the new error file */
1395: try {
1396: tidy.setErrout(new PrintWriter(new FileWriter(
1397: configuration.errfile), true));
1398: current_errorfile = configuration.errfile;
1399: } catch (IOException e) {
1400: /* can't be opened so fall back to stderr */
1401: current_errorfile = "stderr";
1402: tidy.setErrout(tidy.getStderr());
1403: }
1404: }
1405: }
1406:
1407: if (argc > 1) {
1408: file = argv[argIndex];
1409: } else {
1410: file = "stdin";
1411: }
1412:
1413: try {
1414: document = tidy.parse(null, file, System.out);
1415: totalwarnings += tidy.parseWarnings;
1416: totalerrors += tidy.parseErrors;
1417: } catch (FileNotFoundException fnfe) {
1418: Report.unknownFile(tidy.getErrout(), prog, file);
1419: } catch (IOException ioe) {
1420: Report.unknownFile(tidy.getErrout(), prog, file);
1421: }
1422:
1423: --argc;
1424: ++argIndex;
1425:
1426: if (argc <= 1)
1427: break;
1428: }
1429:
1430: if (totalerrors + totalwarnings > 0)
1431: Report.generalInfo(tidy.getErrout());
1432:
1433: if (tidy.getErrout() != tidy.getStderr())
1434: tidy.getErrout().close();
1435:
1436: /* return status can be used by scripts */
1437:
1438: if (totalerrors > 0)
1439: System.exit(2);
1440:
1441: if (totalwarnings > 0)
1442: System.exit(1);
1443:
1444: /* 0 signifies all is ok */
1445: System.exit(0);
1446: }
1447: }
|