001: /*
002: * Copyright 1999-2002,2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: // Aug 21, 2000:
018: // Added ability to omit DOCTYPE declaration.
019: // Reported by Lars Martin <lars@smb-tec.com>
020: // Aug 25, 2000:
021: // Added ability to omit comments.
022: // Contributed by Anupam Bagchi <abagchi@jtcsv.com>
023:
024: package org.jasig.portal.serialize;
025:
026: import java.io.UnsupportedEncodingException;
027:
028: import org.w3c.dom.Document;
029: import org.w3c.dom.DocumentType;
030: import org.w3c.dom.Node;
031: import org.w3c.dom.html.HTMLDocument;
032:
033: /**
034: * Specifies an output format to control the serializer. Based on the
035: * XSLT specification for output format, plus additional parameters.
036: * Used to select the suitable serializer and determine how the
037: * document should be formatted on output.
038: * <p>
039: * The two interesting constructors are:
040: * <ul>
041: * <li>{@link #OutputFormat(String,String,boolean)} creates a format
042: * for the specified method (XML, HTML, Text, etc), encoding and indentation
043: * <li>{@link #OutputFormat(Document,String,boolean)} creates a format
044: * compatible with the document type (XML, HTML, Text, etc), encoding and
045: * indentation
046: * </ul>
047: *
048: *
049: * @version $Revision: 36559 $ $Date: 2006-04-28 11:38:13 -0700 (Fri, 28 Apr 2006) $
050: * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
051: * <a href="mailto:visco@intalio.com">Keith Visco</a>
052: * @see Serializer
053: * @see Method
054: * @see LineSeparator
055: */
056: public class OutputFormat {
057:
058: public static class DTD {
059:
060: /**
061: * Public identifier for HTML 4.01 (Strict) document type.
062: */
063: public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
064:
065: /**
066: * System identifier for HTML 4.01 (Strict) document type.
067: */
068: public static final String HTMLSystemId = "http://www.w3.org/TR/html4/strict.dtd";
069:
070: /**
071: * Public identifier for XHTML 1.0 (Strict) document type.
072: */
073: public static final String XHTMLPublicId = "-//W3C//DTD XHTML 1.0 Strict//EN";
074:
075: /**
076: * System identifier for XHTML 1.0 (Strict) document type.
077: */
078: public static final String XHTMLSystemId = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
079:
080: }
081:
082: public static class Defaults {
083:
084: /**
085: * If indentation is turned on, the default identation
086: * level is 4.
087: *
088: * @see #setIndenting(boolean)
089: */
090: public static final int Indent = 4;
091:
092: /**
093: * The default encoding for Web documents it UTF-8.
094: *
095: * @see #getEncoding()
096: */
097: public static final String Encoding = "UTF-8";
098:
099: /**
100: * The default line width at which to break long lines
101: * when identing. This is set to 72.
102: */
103: public static final int LineWidth = 72;
104:
105: }
106:
107: /**
108: * Holds the output method specified for this document,
109: * or null if no method was specified.
110: */
111: private String _method;
112:
113: /**
114: * Specifies the version of the output method.
115: */
116: private String _version;
117:
118: /**
119: * The indentation level, or zero if no indentation
120: * was requested.
121: */
122: private int _indent = 0;
123:
124: /**
125: * The encoding to use, if an input stream is used.
126: * The default is always UTF-8.
127: */
128: private String _encoding = Defaults.Encoding;
129:
130: /**
131: * The EncodingInfo instance for _encoding.
132: */
133: private EncodingInfo _encodingInfo = null;
134:
135: // whether java names for encodings are permitted
136: private boolean _allowJavaNames = false;
137:
138: /**
139: * The specified media type or null.
140: */
141: private String _mediaType;
142:
143: /**
144: * The specified document type system identifier, or null.
145: */
146: private String _doctypeSystem;
147:
148: /**
149: * The specified document type public identifier, or null.
150: */
151: private String _doctypePublic;
152:
153: /**
154: * Ture if the XML declaration should be ommited;
155: */
156: private boolean _omitXmlDeclaration = false;
157:
158: /**
159: * Ture if the DOCTYPE declaration should be ommited;
160: */
161: private boolean _omitDoctype = false;
162:
163: /**
164: * Ture if comments should be ommited;
165: */
166: private boolean _omitComments = false;
167:
168: /**
169: * Ture if the comments should be ommited;
170: */
171: private boolean _stripComments = false;
172:
173: /**
174: * True if the document type should be marked as standalone.
175: */
176: private boolean _standalone = false;
177:
178: /**
179: * List of element tag names whose text node children must
180: * be output as CDATA.
181: */
182: private String[] _cdataElements;
183:
184: /**
185: * List of element tag names whose text node children must
186: * be output unescaped.
187: */
188: private String[] _nonEscapingElements;
189:
190: /**
191: * The selected line separator.
192: */
193: private String _lineSeparator = LineSeparator.Web;
194:
195: /**
196: * The line width at which to wrap long lines when indenting.
197: */
198: private int _lineWidth = Defaults.LineWidth;
199:
200: /**
201: * True if spaces should be preserved in elements that do not
202: * specify otherwise, or specify the default behavior.
203: */
204: private boolean _preserve = false;
205: /** If true, an empty string valued attribute is output as "". If false and
206: * and we are using the HTMLSerializer, then only the attribute name is
207: * serialized. Defaults to false for backwards compatibility.
208: */
209: private boolean _preserveEmptyAttributes = false;
210:
211: /**
212: * Constructs a new output format with the default values.
213: */
214: public OutputFormat() {
215: }
216:
217: /**
218: * Constructs a new output format with the default values for
219: * the specified method and encoding. If <tt>indent</tt>
220: * is true, the document will be pretty printed with the default
221: * indentation level and default line wrapping.
222: *
223: * @param method The specified output method
224: * @param encoding The specified encoding
225: * @param indenting True for pretty printing
226: * @see #setEncoding
227: * @see #setIndenting
228: * @see #setMethod
229: */
230: public OutputFormat(String method, String encoding,
231: boolean indenting) {
232: setMethod(method);
233: setEncoding(encoding);
234: setIndenting(indenting);
235: }
236:
237: /**
238: * Constructs a new output format with the proper method,
239: * document type identifiers and media type for the specified
240: * document.
241: *
242: * @param doc The document to output
243: * @see #whichMethod
244: */
245: public OutputFormat(Document doc) {
246: setMethod(whichMethod(doc));
247: setDoctype(whichDoctypePublic(doc), whichDoctypeSystem(doc));
248: setMediaType(whichMediaType(getMethod()));
249: }
250:
251: /**
252: * Constructs a new output format with the proper method,
253: * document type identifiers and media type for the specified
254: * document, and with the specified encoding. If <tt>indent</tt>
255: * is true, the document will be pretty printed with the default
256: * indentation level and default line wrapping.
257: *
258: * @param doc The document to output
259: * @param encoding The specified encoding
260: * @param indenting True for pretty printing
261: * @see #setEncoding
262: * @see #setIndenting
263: * @see #whichMethod
264: */
265: public OutputFormat(Document doc, String encoding, boolean indenting) {
266: this (doc);
267: setEncoding(encoding);
268: setIndenting(indenting);
269: }
270:
271: /**
272: * Returns the method specified for this output format.
273: * Typically the method will be <tt>xml</tt>, <tt>html</tt>
274: * or <tt>text</tt>, but it might be other values.
275: * If no method was specified, null will be returned
276: * and the most suitable method will be determined for
277: * the document by calling {@link #whichMethod}.
278: *
279: * @return The specified output method, or null
280: */
281: public String getMethod() {
282: return _method;
283: }
284:
285: /**
286: * Sets the method for this output format.
287: *
288: * @see #getMethod
289: * @param method The output method, or null
290: */
291: public void setMethod(String method) {
292: _method = method;
293: }
294:
295: /**
296: * Returns the version for this output method.
297: * If no version was specified, will return null
298: * and the default version number will be used.
299: * If the serializerr does not support that particular
300: * version, it should default to a supported version.
301: *
302: * @return The specified method version, or null
303: */
304: public String getVersion() {
305: return _version;
306: }
307:
308: /**
309: * Sets the version for this output method.
310: * For XML the value would be "1.0", for HTML
311: * it would be "4.0".
312: *
313: * @see #getVersion
314: * @param version The output method version, or null
315: */
316: public void setVersion(String version) {
317: _version = version;
318: }
319:
320: /**
321: * Returns the indentation specified. If no indentation
322: * was specified, zero is returned and the document
323: * should not be indented.
324: *
325: * @return The indentation or zero
326: * @see #setIndenting
327: */
328: public int getIndent() {
329: return _indent;
330: }
331:
332: /**
333: * Returns true if indentation was specified.
334: */
335: public boolean getIndenting() {
336: return (_indent > 0);
337: }
338:
339: /**
340: * Sets the indentation. The document will not be
341: * indented if the indentation is set to zero.
342: * Calling {@link #setIndenting} will reset this
343: * value to zero (off) or the default (on).
344: *
345: * @param indent The indentation, or zero
346: */
347: public void setIndent(int indent) {
348: if (indent < 0)
349: _indent = 0;
350: else
351: _indent = indent;
352: }
353:
354: /**
355: * Sets the indentation on and off. When set on, the default
356: * indentation level and default line wrapping is used
357: * (see {@link Defaults#Indent} and {@link Defaults#LineWidth}).
358: * To specify a different indentation level or line wrapping,
359: * use {@link #setIndent} and {@link #setLineWidth}.
360: *
361: * @param on True if indentation should be on
362: */
363: public void setIndenting(boolean on) {
364: if (on) {
365: _indent = Defaults.Indent;
366: _lineWidth = Defaults.LineWidth;
367: } else {
368: _indent = 0;
369: _lineWidth = 0;
370: }
371: }
372:
373: /**
374: * Returns the specified encoding. If no encoding was
375: * specified, the default is always "UTF-8".
376: *
377: * @return The encoding
378: */
379: public String getEncoding() {
380: return _encoding;
381: }
382:
383: /**
384: * Sets the encoding for this output method. If no
385: * encoding was specified, the default is always "UTF-8".
386: * Make sure the encoding is compatible with the one
387: * used by the {@link java.io.Writer}.
388: *
389: * @see #getEncoding
390: * @param encoding The encoding, or null
391: */
392: public void setEncoding(String encoding) {
393: _encoding = encoding;
394: _encodingInfo = null;
395: }
396:
397: /**
398: * Sets the encoding for this output method with an <code>EncodingInfo</code>
399: * instance.
400: */
401: public void setEncoding(EncodingInfo encInfo) {
402: _encoding = encInfo.getIANAName();
403: _encodingInfo = encInfo;
404: }
405:
406: /**
407: * Returns an <code>EncodingInfo<code> instance for the encoding.
408: *
409: * @see #setEncoding
410: */
411: public EncodingInfo getEncodingInfo()
412: throws UnsupportedEncodingException {
413: if (_encodingInfo == null)
414: _encodingInfo = Encodings.getEncodingInfo(_encoding,
415: _allowJavaNames);
416: return _encodingInfo;
417: }
418:
419: /**
420: * Sets whether java encoding names are permitted
421: */
422: public void setAllowJavaNames(boolean allow) {
423: _allowJavaNames = allow;
424: }
425:
426: /**
427: * Returns whether java encoding names are permitted
428: */
429: public boolean setAllowJavaNames() {
430: return _allowJavaNames;
431: }
432:
433: /**
434: * Returns the specified media type, or null.
435: * To determine the media type based on the
436: * document type, use {@link #whichMediaType}.
437: *
438: * @return The specified media type, or null
439: */
440: public String getMediaType() {
441: return _mediaType;
442: }
443:
444: /**
445: * Sets the media type.
446: *
447: * @see #getMediaType
448: * @param mediaType The specified media type
449: */
450: public void setMediaType(String mediaType) {
451: _mediaType = mediaType;
452: }
453:
454: /**
455: * Sets the document type public and system identifiers.
456: * Required only if the DOM Document or SAX events do not
457: * specify the document type, and one must be present in
458: * the serialized document. Any document type specified
459: * by the DOM Document or SAX events will override these
460: * values.
461: *
462: * @param publicId The public identifier, or null
463: * @param systemId The system identifier, or null
464: */
465: public void setDoctype(String publicId, String systemId) {
466: _doctypePublic = publicId;
467: _doctypeSystem = systemId;
468: }
469:
470: /**
471: * Returns the specified document type public identifier,
472: * or null.
473: */
474: public String getDoctypePublic() {
475: return _doctypePublic;
476: }
477:
478: /**
479: * Returns the specified document type system identifier,
480: * or null.
481: */
482: public String getDoctypeSystem() {
483: return _doctypeSystem;
484: }
485:
486: /**
487: * Returns true if comments should be ommited.
488: * The default is false.
489: */
490: public boolean getOmitComments() {
491: return _omitComments;
492: }
493:
494: /**
495: * Sets comment omitting on and off.
496: *
497: * @param omit True if comments should be ommited
498: */
499: public void setOmitComments(boolean omit) {
500: _omitComments = omit;
501: }
502:
503: /**
504: * Returns true if the DOCTYPE declaration should
505: * be ommited. The default is false.
506: */
507: public boolean getOmitDocumentType() {
508: return _omitDoctype;
509: }
510:
511: /**
512: * Sets DOCTYPE declaration omitting on and off.
513: *
514: * @param omit True if DOCTYPE declaration should be ommited
515: */
516: public void setOmitDocumentType(boolean omit) {
517: _omitDoctype = omit;
518: }
519:
520: /**
521: * Returns true if the XML document declaration should
522: * be ommited. The default is false.
523: */
524: public boolean getOmitXMLDeclaration() {
525: return _omitXmlDeclaration;
526: }
527:
528: /**
529: * Sets XML declaration omitting on and off.
530: *
531: * @param omit True if XML declaration should be ommited
532: */
533: public void setOmitXMLDeclaration(boolean omit) {
534: _omitXmlDeclaration = omit;
535: }
536:
537: /**
538: * Returns true if the document type is standalone.
539: * The default is false.
540: */
541: public boolean getStandalone() {
542: return _standalone;
543: }
544:
545: /**
546: * Sets document DTD standalone. The public and system
547: * identifiers must be null for the document to be
548: * serialized as standalone.
549: *
550: * @param standalone True if document DTD is standalone
551: */
552: public void setStandalone(boolean standalone) {
553: _standalone = standalone;
554: }
555:
556: /**
557: * Returns a list of all the elements whose text node children
558: * should be output as CDATA, or null if no such elements were
559: * specified.
560: */
561: public String[] getCDataElements() {
562: return _cdataElements;
563: }
564:
565: /**
566: * Returns true if the text node children of the given elements
567: * should be output as CDATA.
568: *
569: * @param tagName The element's tag name
570: * @return True if should serialize as CDATA
571: */
572: public boolean isCDataElement(String tagName) {
573: int i;
574:
575: if (_cdataElements == null)
576: return false;
577: for (i = 0; i < _cdataElements.length; ++i)
578: if (_cdataElements[i].equals(tagName))
579: return true;
580: return false;
581: }
582:
583: /**
584: * Sets the list of elements for which text node children
585: * should be output as CDATA.
586: *
587: * @param cdataElements List of CDATA element tag names
588: */
589: public void setCDataElements(String[] cdataElements) {
590: _cdataElements = cdataElements;
591: }
592:
593: /**
594: * Returns a list of all the elements whose text node children
595: * should be output unescaped (no character references), or null
596: * if no such elements were specified.
597: */
598: public String[] getNonEscapingElements() {
599: return _nonEscapingElements;
600: }
601:
602: /**
603: * Returns true if the text node children of the given elements
604: * should be output unescaped.
605: *
606: * @param tagName The element's tag name
607: * @return True if should serialize unescaped
608: */
609: public boolean isNonEscapingElement(String tagName) {
610: int i;
611:
612: if (_nonEscapingElements == null) {
613: return false;
614: }
615: for (i = 0; i < _nonEscapingElements.length; ++i)
616: if (_nonEscapingElements[i].equals(tagName))
617: return true;
618: return false;
619: }
620:
621: /**
622: * Sets the list of elements for which text node children
623: * should be output unescaped (no character references).
624: *
625: * @param nonEscapingElements List of unescaped element tag names
626: */
627: public void setNonEscapingElements(String[] nonEscapingElements) {
628: _nonEscapingElements = nonEscapingElements;
629: }
630:
631: /**
632: * Returns a specific line separator to use. The default is the
633: * Web line separator (<tt>\n</tt>). A string is returned to
634: * support double codes (CR + LF).
635: *
636: * @return The specified line separator
637: */
638: public String getLineSeparator() {
639: return _lineSeparator;
640: }
641:
642: /**
643: * Sets the line separator. The default is the Web line separator
644: * (<tt>\n</tt>). The machine's line separator can be obtained
645: * from the system property <tt>line.separator</tt>, but is only
646: * useful if the document is edited on machines of the same type.
647: * For general documents, use the Web line separator.
648: *
649: * @param lineSeparator The specified line separator
650: */
651: public void setLineSeparator(String lineSeparator) {
652: if (lineSeparator == null)
653: _lineSeparator = LineSeparator.Web;
654: else
655: _lineSeparator = lineSeparator;
656: }
657:
658: /**
659: * Returns true if the default behavior for this format is to
660: * preserve spaces. All elements that do not specify otherwise
661: * or specify the default behavior will be formatted based on
662: * this rule. All elements that specify space preserving will
663: * always preserve space.
664: */
665: public boolean getPreserveSpace() {
666: return _preserve;
667: }
668:
669: /**
670: * Sets space preserving as the default behavior. The default is
671: * space stripping and all elements that do not specify otherwise
672: * or use the default value will not preserve spaces.
673: *
674: * @param preserve True if spaces should be preserved
675: */
676: public void setPreserveSpace(boolean preserve) {
677: _preserve = preserve;
678: }
679:
680: /**
681: * Return the selected line width for breaking up long lines.
682: * When indenting, and only when indenting, long lines will be
683: * broken at space boundaries based on this line width.
684: * No line wrapping occurs if this value is zero.
685: */
686: public int getLineWidth() {
687: return _lineWidth;
688: }
689:
690: /**
691: * Sets the line width. If zero then no line wrapping will
692: * occur. Calling {@link #setIndenting} will reset this
693: * value to zero (off) or the default (on).
694: *
695: * @param lineWidth The line width to use, zero for default
696: * @see #getLineWidth
697: * @see #setIndenting
698: */
699: public void setLineWidth(int lineWidth) {
700: if (lineWidth <= 0)
701: _lineWidth = 0;
702: else
703: _lineWidth = lineWidth;
704: }
705:
706: /**
707: * Returns the preserveEmptyAttribute flag. If flag is false, then'
708: * attributes with empty string values are output as the attribute
709: * name only (in HTML mode).
710: * @return preserve the preserve flag
711: */
712: public boolean getPreserveEmptyAttributes() {
713: return _preserveEmptyAttributes;
714: }
715:
716: /**
717: * Sets the preserveEmptyAttribute flag. If flag is false, then'
718: * attributes with empty string values are output as the attribute
719: * name only (in HTML mode).
720: * @param preserve the preserve flag
721: */
722: public void setPreserveEmptyAttributes(boolean preserve) {
723: _preserveEmptyAttributes = preserve;
724: }
725:
726: /**
727: * Returns the last printable character based on the selected
728: * encoding. Control characters and non-printable characters
729: * are always printed as character references.
730: */
731: public char getLastPrintable() {
732: if (getEncoding() != null
733: && (getEncoding().equalsIgnoreCase("ASCII")))
734: return 0xFF;
735: else
736: return 0xFFFF;
737: }
738:
739: /**
740: * Determine the output method for the specified document.
741: * If the document is an instance of {@link org.w3c.dom.html.HTMLDocument}
742: * then the method is said to be <tt>html</tt>. If the root
743: * element is 'html' and all text nodes preceding the root
744: * element are all whitespace, then the method is said to be
745: * <tt>html</tt>. Otherwise the method is <tt>xml</tt>.
746: *
747: * @param doc The document to check
748: * @return The suitable method
749: */
750: public static String whichMethod(Document doc) {
751: Node node;
752: String value;
753: int i;
754:
755: // If document is derived from HTMLDocument then the default
756: // method is html.
757: if (doc instanceof HTMLDocument)
758: return Method.HTML;
759:
760: // Lookup the root element and the text nodes preceding it.
761: // If root element is html and all text nodes contain whitespace
762: // only, the method is html.
763:
764: // FIXME (SM) should we care about namespaces here?
765:
766: node = doc.getFirstChild();
767: while (node != null) {
768: // If the root element is html, the method is html.
769: if (node.getNodeType() == Node.ELEMENT_NODE) {
770: if (node.getNodeName().equalsIgnoreCase("html")) {
771: return Method.HTML;
772: } else if (node.getNodeName().equalsIgnoreCase("root")) {
773: return Method.FOP;
774: } else {
775: return Method.XML;
776: }
777: } else if (node.getNodeType() == Node.TEXT_NODE) {
778: // If a text node preceding the root element contains
779: // only whitespace, this might be html, otherwise it's
780: // definitely xml.
781: value = node.getNodeValue();
782: for (i = 0; i < value.length(); ++i)
783: if (value.charAt(i) != 0x20
784: && value.charAt(i) != 0x0A
785: && value.charAt(i) != 0x09
786: && value.charAt(i) != 0x0D)
787: return Method.XML;
788: }
789: node = node.getNextSibling();
790: }
791: // Anything else, the method is xml.
792: return Method.XML;
793: }
794:
795: /**
796: * Returns the document type public identifier
797: * specified for this document, or null.
798: */
799: public static String whichDoctypePublic(Document doc) {
800: DocumentType doctype;
801:
802: /* DOM Level 2 was introduced into the code base*/
803: doctype = doc.getDoctype();
804: if (doctype != null) {
805: // Note on catch: DOM Level 1 does not specify this method
806: // and the code will throw a NoSuchMethodError
807: try {
808: return doctype.getPublicId();
809: } catch (Error except) {
810: }
811: }
812:
813: if (doc instanceof HTMLDocument)
814: return DTD.XHTMLPublicId;
815: return null;
816: }
817:
818: /**
819: * Returns the document type system identifier
820: * specified for this document, or null.
821: */
822: public static String whichDoctypeSystem(Document doc) {
823: DocumentType doctype;
824:
825: /* DOM Level 2 was introduced into the code base*/
826: doctype = doc.getDoctype();
827: if (doctype != null) {
828: // Note on catch: DOM Level 1 does not specify this method
829: // and the code will throw a NoSuchMethodError
830: try {
831: return doctype.getSystemId();
832: } catch (Error except) {
833: }
834: }
835:
836: if (doc instanceof HTMLDocument)
837: return DTD.XHTMLSystemId;
838: return null;
839: }
840:
841: /**
842: * Returns the suitable media format for a document
843: * output with the specified method.
844: */
845: public static String whichMediaType(String method) {
846: if (method.equalsIgnoreCase(Method.XML))
847: return "text/xml";
848: if (method.equalsIgnoreCase(Method.HTML))
849: return "text/html";
850: if (method.equalsIgnoreCase(Method.XHTML))
851: return "text/html";
852: if (method.equalsIgnoreCase(Method.TEXT))
853: return "text/plain";
854: if (method.equalsIgnoreCase(Method.FOP))
855: return "application/pdf";
856: return null;
857: }
858:
859: }
|