001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of HtmlCleaner may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
035: subject line.
036: */
037:
038: package org.htmlcleaner;
039:
040: import java.io.File;
041: import java.io.IOException;
042: import java.net.URL;
043:
044: /**
045: * <p>Command line usage class.</p>
046: *
047: * Created by: Vladimir Nikic <br/>
048: * Date: November, 2006.
049: */
050: public class CommandLine {
051:
052: private static String getArgValue(String[] args, String name) {
053: for (int i = 0; i < args.length; i++) {
054: String curr = args[i];
055: int eqIndex = curr.indexOf('=');
056: if (eqIndex >= 0) {
057: String argName = curr.substring(0, eqIndex).trim();
058: String argValue = curr.substring(eqIndex + 1).trim();
059:
060: if (argName.toLowerCase()
061: .startsWith(name.toLowerCase())) {
062: return argValue;
063: }
064: }
065: }
066:
067: return "";
068: }
069:
070: private static boolean toBoolean(String s) {
071: return s != null
072: && ("on".equalsIgnoreCase(s)
073: || "true".equalsIgnoreCase(s) || "yes"
074: .equalsIgnoreCase(s));
075: }
076:
077: public static void main(String[] args) throws IOException {
078: String source = getArgValue(args, "src");
079: if ("".equals(source)) {
080: System.err
081: .println("Usage: java -jar htmlcleanerXX.jar src = <url | file> [incharset = <charset>] [dest = <file>] [outcharset = <charset>] [options...]");
082: System.err.println("");
083: System.err.println("where options include:");
084: System.err
085: .println(" outputtype = simple | compact | pretty");
086: System.err.println(" advancedxmlescape = true | false");
087: System.err.println(" usecdata = true | false");
088: System.err.println(" specialentities = true | false");
089: System.err.println(" unicodechars = true | false");
090: System.err.println(" omitunknowntags = true | false");
091: System.err.println(" omitdeprtags = true | false");
092: System.err.println(" omitcomments = true | false");
093: System.err.println(" omitxmldecl = true | false");
094: System.err.println(" omitdoctypedecl = true | false");
095: System.err.println(" omitxmlnsatt = true | false");
096: System.err
097: .println(" hyphenreplacement = <string value>");
098: System.exit(1);
099: }
100:
101: String inCharset = getArgValue(args, "incharset");
102: if ("".equals(inCharset)) {
103: inCharset = HtmlCleaner.DEFAULT_CHARSET;
104: }
105:
106: String outCharset = getArgValue(args, "outcharset");
107: if ("".equals(outCharset)) {
108: outCharset = HtmlCleaner.DEFAULT_CHARSET;
109: }
110:
111: String destination = getArgValue(args, "dest");
112: String outputType = getArgValue(args, "outputtype");
113: String advancedXmlEscape = getArgValue(args,
114: "advancedxmlescape");
115: String useCData = getArgValue(args, "usecdata");
116: String translateSpecialEntities = getArgValue(args,
117: "specialentities");
118: String unicodeChars = getArgValue(args, "unicodechars");
119: String omitUnknownTags = getArgValue(args, "omitunknowntags");
120: String omitDeprecatedTags = getArgValue(args, "omitdeprtags");
121: String omitComments = getArgValue(args, "omitcomments");
122: String omitXmlDeclaration = getArgValue(args, "omitxmldecl");
123: String omitDoctypeDeclaration = getArgValue(args,
124: "omitdoctypedecl");
125: String omitXmlnsAttributes = getArgValue(args, "omitxmlnsatt");
126: String commentHyphen = getArgValue(args, "hyphenreplacement");
127:
128: HtmlCleaner cleaner = null;
129:
130: String src = source.toLowerCase();
131: if (src.startsWith("http://") || src.startsWith("https://")) {
132: cleaner = new HtmlCleaner(new URL(src), inCharset);
133: } else {
134: cleaner = new HtmlCleaner(new File(src), inCharset);
135: }
136:
137: if (!"".equals(omitUnknownTags)) {
138: cleaner.setOmitUnknownTags(toBoolean(omitUnknownTags));
139: }
140:
141: if (!"".equals(omitDeprecatedTags)) {
142: cleaner
143: .setOmitDeprecatedTags(toBoolean(omitDeprecatedTags));
144: }
145:
146: if (!"".equals(advancedXmlEscape)) {
147: cleaner.setAdvancedXmlEscape(toBoolean(advancedXmlEscape));
148: }
149:
150: if (!"".equals(useCData)) {
151: cleaner.setUseCdataForScriptAndStyle(toBoolean(useCData));
152: }
153:
154: if (!"".equals(translateSpecialEntities)) {
155: cleaner
156: .setTranslateSpecialEntities(toBoolean(translateSpecialEntities));
157: }
158:
159: if (!"".equals(unicodeChars)) {
160: cleaner.setRecognizeUnicodeChars(toBoolean(unicodeChars));
161: }
162:
163: if (!"".equals(omitComments)) {
164: cleaner.setOmitComments(toBoolean(omitComments));
165: }
166:
167: if (!"".equals(omitXmlDeclaration)) {
168: cleaner
169: .setOmitXmlDeclaration(toBoolean(omitXmlDeclaration));
170: }
171:
172: if (!"".equals(omitDoctypeDeclaration)) {
173: cleaner
174: .setOmitDoctypeDeclaration(toBoolean(omitDoctypeDeclaration));
175: }
176:
177: if (!"".equals(omitXmlnsAttributes)) {
178: cleaner
179: .setOmitXmlnsAttributes(toBoolean(omitXmlnsAttributes));
180: }
181:
182: if (!"".equals(commentHyphen)) {
183: cleaner.setHyphenReplacementInComment(commentHyphen);
184: }
185:
186: cleaner.clean(false, false);
187:
188: if ("".equals(destination)) {
189: if ("compact".equals(outputType)) {
190: cleaner.writeCompactXmlToStream(System.out, outCharset);
191: } else if ("pretty".equals(outputType)) {
192: cleaner.writePrettyXmlToStream(System.out, outCharset);
193: } else {
194: cleaner.writeXmlToStream(System.out, outCharset);
195: }
196: } else {
197: if ("compact".equals(outputType)) {
198: cleaner.writeCompactXmlToFile(destination, outCharset);
199: } else if ("pretty".equals(outputType)) {
200: cleaner.writePrettyXmlToFile(destination, outCharset);
201: } else {
202: cleaner.writeXmlToFile(destination, outCharset);
203: }
204: }
205: }
206:
207: }
|