001: /*
002: * Copyright 2002-2008 Andy Clark
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.cyberneko.html.filters;
018:
019: import java.io.OutputStream;
020: import java.io.OutputStreamWriter;
021: import java.io.PrintWriter;
022: import java.io.UnsupportedEncodingException;
023:
024: import org.cyberneko.html.HTMLConfiguration;
025: import org.cyberneko.html.HTMLElements;
026: import org.cyberneko.html.HTMLEntities;
027: import org.cyberneko.html.filters.DefaultFilter;
028:
029: import org.apache.xerces.xni.Augmentations;
030: import org.apache.xerces.xni.NamespaceContext;
031: import org.apache.xerces.xni.QName;
032: import org.apache.xerces.xni.XMLAttributes;
033: import org.apache.xerces.xni.XMLLocator;
034: import org.apache.xerces.xni.XMLResourceIdentifier;
035: import org.apache.xerces.xni.XMLString;
036: import org.apache.xerces.xni.XNIException;
037: import org.apache.xerces.xni.parser.XMLDocumentFilter;
038: import org.apache.xerces.xni.parser.XMLInputSource;
039: import org.apache.xerces.xni.parser.XMLParserConfiguration;
040:
041: /**
042: * An HTML writer written as a filter. Besides serializing the HTML
043: * event stream, the writer also passes the document events to the next
044: * stage in the pipeline. This allows applications to insert writer
045: * filters between other custom filters for debugging purposes.
046: * <p>
* Since an HTML document may have specified its encoding using the
* &lt;META&gt; tag and http-equiv/content attributes, the writer will
* automatically change any character set specified in this tag to
* match the encoding of the output stream. Therefore, the character
* encoding name used to construct the writer should be an official
* <a href='http://www.iana.org/assignments/character-sets'>IANA</a>
* encoding name and not a Java encoding name.
054: * <p>
* <strong>Note:</strong>
* The modified character set in the &lt;META&gt; tag is <em>not</em>
* propagated to the next stage in the pipeline. The changed value is
* only output to the stream; the original value is sent to the next
* stage in the pipeline.
060: *
061: * @author Andy Clark
062: *
063: * @version $Id: Writer.java,v 1.7 2005/02/14 04:01:33 andyc Exp $
064: */
065: public class Writer extends DefaultFilter {
066:
067: //
068: // Constants
069: //
070:
071: /** Notify character entity references. */
072: public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";
073:
074: /** Notify built-in entity references. */
075: public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";
076:
077: /** Augmentations feature identifier. */
078: protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
079:
080: /** Filters property identifier. */
081: protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";
082:
083: //
084: // Data
085: //
086:
087: /** The encoding. */
088: protected String fEncoding;
089:
090: /**
091: * The print writer used for serializing the document with the
092: * appropriate character encoding.
093: */
094: protected PrintWriter fPrinter;
095:
096: // state
097:
098: /** Seen root element. */
099: protected boolean fSeenRootElement;
100:
101: /** Seen http-equiv directive. */
102: protected boolean fSeenHttpEquiv;
103:
104: /** Element depth. */
105: protected int fElementDepth;
106:
107: /** Normalize character content. */
108: protected boolean fNormalize;
109:
110: /** Print characters. */
111: protected boolean fPrintChars;
112:
113: //
114: // Constructors
115: //
116:
/**
 * Constructs a writer filter that prints to standard out using the
 * UTF-8 encoding.
 *
 * @throws RuntimeException if the JVM does not support UTF-8; the
 *         underlying UnsupportedEncodingException is attached as the cause.
 */
public Writer() {
    // Note: UTF-8 should *always* be a supported encoding. Although,
    //       I've heard of the old M$ JVM not supporting it! Amazing. -Ac
    try {
        fEncoding = "UTF-8";
        fPrinter = new PrintWriter(new OutputStreamWriter(System.out, fEncoding));
    } catch (UnsupportedEncodingException e) {
        // keep the original exception as the cause so its stack trace is not lost
        throw new RuntimeException(e.getMessage(), e);
    }
} // <init>()
129:
/**
 * Constructs a writer filter that encodes its output onto the given
 * byte stream. The stream is wrapped in an OutputStreamWriter for the
 * named encoding and construction is delegated to the
 * (java.io.Writer, String) constructor.
 *
 * @param outputStream The output stream to write to.
 * @param encoding     The encoding to be used for the output. The
 *                     encoding name should be an official IANA
 *                     encoding name.
 *
 * @throws UnsupportedEncodingException if the named encoding is not
 *         supported by the OutputStreamWriter.
 */
public Writer(OutputStream outputStream, String encoding)
        throws UnsupportedEncodingException {
    this(new OutputStreamWriter(outputStream, encoding), encoding);
} // <init>(OutputStream,String)
142:
/**
 * Constructs a writer filter on top of an existing Java writer.
 * A writer that already is a PrintWriter is used directly; any other
 * writer is wrapped in a new PrintWriter.
 *
 * @param writer   The Java writer to write to.
 * @param encoding The encoding to be used for the output. The encoding
 *                 name should be an official IANA encoding name.
 */
public Writer(java.io.Writer writer, String encoding) {
    fEncoding = encoding;
    fPrinter = (writer instanceof PrintWriter)
            ? (PrintWriter) writer
            : new PrintWriter(writer);
} // <init>(java.io.Writer,String)
159:
160: //
161: // XMLDocumentHandler methods
162: //
163:
164: // since Xerces-J 2.2.0
165:
166: /** Start document. */
167: public void startDocument(XMLLocator locator, String encoding,
168: NamespaceContext nscontext, Augmentations augs)
169: throws XNIException {
170: fSeenRootElement = false;
171: fSeenHttpEquiv = false;
172: fElementDepth = 0;
173: fNormalize = true;
174: fPrintChars = true;
175: super .startDocument(locator, encoding, nscontext, augs);
176: } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
177:
178: // old methods
179:
180: /** Start document. */
181: public void startDocument(XMLLocator locator, String encoding,
182: Augmentations augs) throws XNIException {
183: startDocument(locator, encoding, null, augs);
184: } // startDocument(XMLLocator,String,Augmentations)
185:
186: /** Comment. */
187: public void comment(XMLString text, Augmentations augs)
188: throws XNIException {
189: if (fSeenRootElement && fElementDepth <= 0) {
190: fPrinter.println();
191: }
192: fPrinter.print("<!--");
193: printCharacters(text, false);
194: fPrinter.print("-->");
195: if (!fSeenRootElement) {
196: fPrinter.println();
197: }
198: fPrinter.flush();
199: } // comment(XMLString,Augmentations)
200:
201: /** Start element. */
202: public void startElement(QName element, XMLAttributes attributes,
203: Augmentations augs) throws XNIException {
204: fSeenRootElement = true;
205: fElementDepth++;
206: fNormalize = !HTMLElements.getElement(element.rawname)
207: .isSpecial();
208: printStartElement(element, attributes);
209: super .startElement(element, attributes, augs);
210: } // startElement(QName,XMLAttributes,Augmentations)
211:
212: /** Empty element. */
213: public void emptyElement(QName element, XMLAttributes attributes,
214: Augmentations augs) throws XNIException {
215: fSeenRootElement = true;
216: printStartElement(element, attributes);
217: super .emptyElement(element, attributes, augs);
218: } // emptyElement(QName,XMLAttributes,Augmentations)
219:
220: /** Characters. */
221: public void characters(XMLString text, Augmentations augs)
222: throws XNIException {
223: if (fPrintChars) {
224: printCharacters(text, fNormalize);
225: }
226: super .characters(text, augs);
227: } // characters(XMLString,Augmentations)
228:
229: /** End element. */
230: public void endElement(QName element, Augmentations augs)
231: throws XNIException {
232: fElementDepth--;
233: fNormalize = true;
234: /***
235: // NOTE: Not sure if this is what should be done in the case where
236: // the encoding is not explitly declared within the HEAD. So
237: // I'm leaving it commented out for now. -Ac
238: if (element.rawname.equalsIgnoreCase("head") && !fSeenHttpEquiv) {
239: boolean capitalize = Character.isUpperCase(element.rawname.charAt(0));
240: String ename = capitalize ? "META" : "meta";
241: QName qname = new QName(null, ename, ename, null);
242: XMLAttributes attrs = new XMLAttributesImpl();
243: QName aname = new QName(null, "http-equiv", "http-equiv", null);
244: attrs.addAttribute(aname, "CDATA", "Content-Type");
245: aname.setValues(null, "content", "content", null);
246: attrs.addAttribute(aname, "CDATA", "text/html; charset="+fEncoding);
247: super.emptyElement(qname, attrs, null);
248: }
249: /***/
250: printEndElement(element);
251: super .endElement(element, augs);
252: } // endElement(QName,Augmentations)
253:
254: /** Start general entity. */
255: public void startGeneralEntity(String name,
256: XMLResourceIdentifier id, String encoding,
257: Augmentations augs) throws XNIException {
258: fPrintChars = false;
259: if (name.startsWith("#")) {
260: try {
261: boolean hex = name.startsWith("#x");
262: int offset = hex ? 2 : 1;
263: int base = hex ? 16 : 10;
264: int value = Integer.parseInt(name.substring(offset),
265: base);
266: String entity = HTMLEntities.get(value);
267: if (entity != null) {
268: name = entity;
269: }
270: } catch (NumberFormatException e) {
271: // ignore
272: }
273: }
274: printEntity(name);
275: super .startGeneralEntity(name, id, encoding, augs);
276: } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
277:
278: /** End general entity. */
279: public void endGeneralEntity(String name, Augmentations augs)
280: throws XNIException {
281: fPrintChars = true;
282: super .endGeneralEntity(name, augs);
283: } // endGeneralEntity(String,Augmentations)
284:
285: //
286: // Protected methods
287: //
288:
289: /** Print attribute value. */
290: protected void printAttributeValue(String text) {
291: int length = text.length();
292: for (int j = 0; j < length; j++) {
293: char c = text.charAt(j);
294: if (c == '"') {
295: fPrinter.print(""");
296: } else {
297: fPrinter.print(c);
298: }
299: }
300: fPrinter.flush();
301: } // printAttributeValue(String)
302:
303: /** Print characters. */
304: protected void printCharacters(XMLString text, boolean normalize) {
305: if (normalize) {
306: for (int i = 0; i < text.length; i++) {
307: char c = text.ch[text.offset + i];
308: if (c != '\n') {
309: String entity = HTMLEntities.get(c);
310: if (entity != null) {
311: printEntity(entity);
312: } else {
313: fPrinter.print(c);
314: }
315: } else {
316: fPrinter.println();
317: }
318: }
319: } else {
320: for (int i = 0; i < text.length; i++) {
321: char c = text.ch[text.offset + i];
322: fPrinter.print(c);
323: }
324: }
325: fPrinter.flush();
326: } // printCharacters(XMLString,boolean)
327:
328: /** Print start element. */
329: protected void printStartElement(QName element,
330: XMLAttributes attributes) {
331:
332: // modify META[@http-equiv='content-type']/@content value
333: int contentIndex = -1;
334: String originalContent = null;
335: if (element.rawname.toLowerCase().equals("meta")) {
336: String httpEquiv = null;
337: int length = attributes.getLength();
338: for (int i = 0; i < length; i++) {
339: String aname = attributes.getQName(i).toLowerCase();
340: if (aname.equals("http-equiv")) {
341: httpEquiv = attributes.getValue(i);
342: } else if (aname.equals("content")) {
343: contentIndex = i;
344: }
345: }
346: if (httpEquiv != null
347: && httpEquiv.toLowerCase().equals("content-type")) {
348: fSeenHttpEquiv = true;
349: String content = null;
350: if (contentIndex != -1) {
351: originalContent = attributes.getValue(contentIndex);
352: content = originalContent.toLowerCase();
353: }
354: if (content != null) {
355: int charsetIndex = content.indexOf("charset=");
356: if (charsetIndex != -1) {
357: content = content
358: .substring(0, charsetIndex + 8);
359: } else {
360: content += ";charset=";
361: }
362: content += fEncoding;
363: attributes.setValue(contentIndex, content);
364: }
365: }
366: }
367:
368: // print element
369: fPrinter.print('<');
370: fPrinter.print(element.rawname);
371: int attrCount = attributes != null ? attributes.getLength() : 0;
372: for (int i = 0; i < attrCount; i++) {
373: String aname = attributes.getQName(i);
374: String avalue = attributes.getValue(i);
375: fPrinter.print(' ');
376: fPrinter.print(aname);
377: fPrinter.print("=\"");
378: printAttributeValue(avalue);
379: fPrinter.print('"');
380: }
381: fPrinter.print('>');
382: fPrinter.flush();
383:
384: // return original META[@http-equiv]/@content value
385: if (contentIndex != -1) {
386: attributes.setValue(contentIndex, originalContent);
387: }
388:
389: } // printStartElement(QName,XMLAttributes)
390:
391: /** Print end element. */
392: protected void printEndElement(QName element) {
393: fPrinter.print("</");
394: fPrinter.print(element.rawname);
395: fPrinter.print('>');
396: fPrinter.flush();
397: } // printEndElement(QName)
398:
399: /** Print entity. */
400: protected void printEntity(String name) {
401: fPrinter.print('&');
402: fPrinter.print(name);
403: fPrinter.print(';');
404: fPrinter.flush();
405: } // printEntity(String)
406:
407: //
408: // MAIN
409: //
410:
411: /** Main. */
412: public static void main(String[] argv) throws Exception {
413: if (argv.length == 0) {
414: printUsage();
415: System.exit(1);
416: }
417: XMLParserConfiguration parser = new HTMLConfiguration();
418: parser.setFeature(NOTIFY_CHAR_REFS, true);
419: parser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true);
420: String iencoding = null;
421: String oencoding = "Windows-1252";
422: boolean identity = false;
423: boolean purify = false;
424: for (int i = 0; i < argv.length; i++) {
425: String arg = argv[i];
426: if (arg.equals("-ie")) {
427: iencoding = argv[++i];
428: continue;
429: }
430: if (arg.equals("-e") || arg.equals("-oe")) {
431: oencoding = argv[++i];
432: continue;
433: }
434: if (arg.equals("-i")) {
435: identity = true;
436: continue;
437: }
438: if (arg.equals("-p")) {
439: purify = true;
440: continue;
441: }
442: if (arg.equals("-h")) {
443: printUsage();
444: System.exit(1);
445: }
446: java.util.Vector filtersVector = new java.util.Vector(2);
447: if (identity) {
448: filtersVector.addElement(new Identity());
449: } else if (purify) {
450: filtersVector.addElement(new Purifier());
451: }
452: filtersVector.addElement(new Writer(System.out, oencoding));
453: XMLDocumentFilter[] filters = new XMLDocumentFilter[filtersVector
454: .size()];
455: filtersVector.copyInto(filters);
456: parser.setProperty(FILTERS, filters);
457: XMLInputSource source = new XMLInputSource(null, arg, null);
458: source.setEncoding(iencoding);
459: parser.parse(source);
460: }
461: } // main(String[])
462:
463: /** Print usage. */
464: private static void printUsage() {
465: System.err.println("usage: java " + Writer.class.getName()
466: + " (options) file ...");
467: System.err.println();
468: System.err.println("options:");
469: System.err
470: .println(" -ie name Specify IANA name of input encoding.");
471: System.err
472: .println(" -oe name Specify IANA name of output encoding.");
473: System.err.println(" -i Perform identity transform.");
474: System.err
475: .println(" -p Purify output to ensure XML well-formedness.");
476: System.err.println(" -h Display help screen.");
477: System.err.println();
478: System.err.println("notes:");
479: System.err
480: .println(" The -i and -p options are mutually exclusive.");
481: System.err
482: .println(" The -e option has been replaced with -oe.");
483: } // printUsage()
484:
485: } // class Writer
|