001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: // Sep 14, 2000:
019: // Fixed problem with namespace handling. Contributed by
020: // David Blondeau <blondeau@intalio.com>
021: // Sep 14, 2000:
022: // Fixed serializer to report IO exception directly, instead at
023: // the end of document processing.
024: // Reported by Patrick Higgins <phiggins@transzap.com>
025: // Aug 21, 2000:
026: // Fixed bug in startDocument not calling prepare.
027: // Reported by Mikael Staldal <d96-mst-ingen-reklam@d.kth.se>
028: // Aug 21, 2000:
029: // Added ability to omit DOCTYPE declaration.
030: package org.apache.xml.serialize;
031:
032: import java.io.IOException;
033: import java.io.OutputStream;
034: import java.io.Writer;
035:
036: import org.apache.xerces.dom.DOMMessageFormatter;
037: import org.apache.xerces.util.NamespaceSupport;
038: import org.apache.xerces.util.SymbolTable;
039: import org.apache.xerces.util.XML11Char;
040: import org.apache.xerces.util.XMLChar;
041: import org.w3c.dom.DOMError;
042: import org.w3c.dom.Document;
043: import org.xml.sax.SAXException;
044:
045: /**
046: * Implements an XML serializer supporting both DOM and SAX pretty
047: * serializing. For usage instructions see {@link Serializer}.
048: * <p>
049: * If an output stream is used, the encoding is taken from the
050: * output format (defaults to <tt>UTF-8</tt>). If a writer is
051: * used, make sure the writer uses the same encoding (if applies)
052: * as specified in the output format.
053: * <p>
054: * The serializer supports both DOM and SAX. SAX serializing is done by firing
055: * SAX events and using the serializer as a document handler. DOM serializing is done
056: * by calling {@link #serialize(Document)} or by using DOM Level 3
057: * {@link org.w3c.dom.ls.LSSerializer} and
058: * serializing with {@link org.w3c.dom.ls.LSSerializer#write},
059: * {@link org.w3c.dom.ls.LSSerializer#writeToString}.
060: * <p>
 * If an I/O exception occurs while serializing, the serializer
 * will not throw an exception directly, but only throw it
 * at the end of serializing (either DOM or SAX's {@link
 * org.xml.sax.DocumentHandler#endDocument}).
065: * <p>
066: * For elements that are not specified as whitespace preserving,
067: * the serializer will potentially break long text lines at space
068: * boundaries, indent lines, and serialize elements on separate
069: * lines. Line terminators will be regarded as spaces, and
070: * spaces at beginning of line will be stripped.
071: *
072: * @deprecated This class was deprecated in Xerces 2.9.0. It is recommended
073: * that new applications use the DOM Level 3 LSSerializer or JAXP's Transformation
074: * API for XML (TrAX) for serializing XML. See the Xerces documentation for more
075: * information.
076: * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
077: * @author <a href="mailto:rahul.srivastava@sun.com">Rahul Srivastava</a>
078: * @author Elena Litani IBM
079: * @version $Revision: 476047 $ $Date: 2006-11-16 23:27:45 -0500 (Thu, 16 Nov 2006) $
080: * @see Serializer
081: */
082: public class XML11Serializer extends XMLSerializer {
083:
084: //
085: // constants
086: //
087:
088: protected static final boolean DEBUG = false;
089:
090: //
091: // data
092: //
093:
094: //
095: // DOM Level 3 implementation: variables intialized in DOMSerializerImpl
096: //
097:
098: /** stores namespaces in scope */
099: protected NamespaceSupport fNSBinder;
100:
101: /** stores all namespace bindings on the current element */
102: protected NamespaceSupport fLocalNSBinder;
103:
104: /** symbol table for serialization */
105: protected SymbolTable fSymbolTable;
106:
107: // is node dom level 1 node?
108: protected boolean fDOML1 = false;
109: // counter for new prefix names
110: protected int fNamespaceCounter = 1;
111: protected final static String PREFIX = "NS";
112:
113: /**
114: * Controls whether namespace fixup should be performed during
115: * the serialization.
116: * NOTE: if this field is set to true the following
117: * fields need to be initialized: fNSBinder, fLocalNSBinder, fSymbolTable,
118: * XMLSymbols.EMPTY_STRING, fXmlSymbol, fXmlnsSymbol, fNamespaceCounter.
119: */
120: protected boolean fNamespaces = false;
121:
122: /**
123: * Constructs a new serializer. The serializer cannot be used without
124: * calling {@link #setOutputCharStream} or {@link #setOutputByteStream}
125: * first.
126: */
127: public XML11Serializer() {
128: super ();
129: _format.setVersion("1.1");
130: }
131:
132: /**
133: * Constructs a new serializer. The serializer cannot be used without
134: * calling {@link #setOutputCharStream} or {@link #setOutputByteStream}
135: * first.
136: */
137: public XML11Serializer(OutputFormat format) {
138: super (format);
139: _format.setVersion("1.1");
140: }
141:
142: /**
143: * Constructs a new serializer that writes to the specified writer
144: * using the specified output format. If <tt>format</tt> is null,
145: * will use a default output format.
146: *
147: * @param writer The writer to use
148: * @param format The output format to use, null for the default
149: */
150: public XML11Serializer(Writer writer, OutputFormat format) {
151: super (writer, format);
152: _format.setVersion("1.1");
153: }
154:
155: /**
156: * Constructs a new serializer that writes to the specified output
157: * stream using the specified output format. If <tt>format</tt>
158: * is null, will use a default output format.
159: *
160: * @param output The output stream to use
161: * @param format The output format to use, null for the default
162: */
163: public XML11Serializer(OutputStream output, OutputFormat format) {
164: super (output, format != null ? format : new OutputFormat(
165: Method.XML, null, false));
166: _format.setVersion("1.1");
167: }
168:
169: //-----------------------------------------//
170: // SAX content handler serializing methods //
171: //-----------------------------------------//
172:
173: public void characters(char[] chars, int start, int length)
174: throws SAXException {
175: ElementState state;
176:
177: try {
178: state = content();
179:
180: // Check if text should be print as CDATA section or unescaped
181: // based on elements listed in the output format (the element
182: // state) or whether we are inside a CDATA section or entity.
183:
184: if (state.inCData || state.doCData) {
185: int saveIndent;
186:
187: // Print a CDATA section. The text is not escaped, but ']]>'
188: // appearing in the code must be identified and dealt with.
189: // The contents of a text node is considered space preserving.
190: if (!state.inCData) {
191: _printer.printText("<![CDATA[");
192: state.inCData = true;
193: }
194: saveIndent = _printer.getNextIndent();
195: _printer.setNextIndent(0);
196: char ch;
197: final int end = start + length;
198: for (int index = start; index < end; ++index) {
199: ch = chars[index];
200: if (ch == ']' && index + 2 < end
201: && chars[index + 1] == ']'
202: && chars[index + 2] == '>') {
203: _printer.printText("]]]]><![CDATA[>");
204: index += 2;
205: continue;
206: }
207: if (!XML11Char.isXML11Valid(ch)) {
208: // check if it is surrogate
209: if (++index < end) {
210: surrogates(ch, chars[index], true);
211: } else {
212: fatalError("The character '" + ch
213: + "' is an invalid XML character");
214: }
215: continue;
216: }
217: if (_encodingInfo.isPrintable(ch)
218: && XML11Char.isXML11ValidLiteral(ch)) {
219: _printer.printText(ch);
220: } else {
221: // The character is not printable -- split CDATA section
222: _printer.printText("]]>&#x");
223: _printer.printText(Integer.toHexString(ch));
224: _printer.printText(";<![CDATA[");
225: }
226: }
227: _printer.setNextIndent(saveIndent);
228:
229: } else {
230:
231: int saveIndent;
232:
233: if (state.preserveSpace) {
234: // If preserving space then hold of indentation so no
235: // excessive spaces are printed at line breaks, escape
236: // the text content without replacing spaces and print
237: // the text breaking only at line breaks.
238: saveIndent = _printer.getNextIndent();
239: _printer.setNextIndent(0);
240: printText(chars, start, length, true,
241: state.unescaped);
242: _printer.setNextIndent(saveIndent);
243: } else {
244: printText(chars, start, length, false,
245: state.unescaped);
246: }
247: }
248: } catch (IOException except) {
249: throw new SAXException(except);
250: }
251: }
252:
253: //
254: // overwrite printing functions to make sure serializer prints out valid XML
255: //
256: protected void printEscaped(String source) throws IOException {
257: int length = source.length();
258: for (int i = 0; i < length; ++i) {
259: int ch = source.charAt(i);
260: if (!XML11Char.isXML11Valid(ch)) {
261: if (++i < length) {
262: surrogates(ch, source.charAt(i), false);
263: } else {
264: fatalError("The character '" + (char) ch
265: + "' is an invalid XML character");
266: }
267: continue;
268: }
269: if (ch == '\n' || ch == '\r' || ch == '\t' || ch == 0x0085
270: || ch == 0x2028) {
271: printHex(ch);
272: } else if (ch == '<') {
273: _printer.printText("<");
274: } else if (ch == '&') {
275: _printer.printText("&");
276: } else if (ch == '"') {
277: _printer.printText(""");
278: } else if ((ch >= ' ' && _encodingInfo
279: .isPrintable((char) ch))) {
280: _printer.printText((char) ch);
281: } else {
282: printHex(ch);
283: }
284: }
285: }
286:
287: protected final void printCDATAText(String text) throws IOException {
288: int length = text.length();
289: char ch;
290:
291: for (int index = 0; index < length; ++index) {
292: ch = text.charAt(index);
293:
294: if (ch == ']' && index + 2 < length
295: && text.charAt(index + 1) == ']'
296: && text.charAt(index + 2) == '>') { // check for ']]>'
297: if (fDOMErrorHandler != null) {
298: // REVISIT: this means that if DOM Error handler is not registered we don't report any
299: // fatal errors and might serialize not wellformed document
300: if ((features & DOMSerializerImpl.SPLITCDATA) == 0
301: && (features & DOMSerializerImpl.WELLFORMED) == 0) {
302: // issue fatal error
303: String msg = DOMMessageFormatter.formatMessage(
304: DOMMessageFormatter.SERIALIZER_DOMAIN,
305: "EndingCDATA", null);
306: modifyDOMError(msg,
307: DOMError.SEVERITY_FATAL_ERROR, null,
308: fCurrentNode);
309: boolean continueProcess = fDOMErrorHandler
310: .handleError(fDOMError);
311: if (!continueProcess) {
312: throw new IOException();
313: }
314: } else {
315: // issue warning
316: String msg = DOMMessageFormatter.formatMessage(
317: DOMMessageFormatter.SERIALIZER_DOMAIN,
318: "SplittingCDATA", null);
319: modifyDOMError(msg, DOMError.SEVERITY_WARNING,
320: null, fCurrentNode);
321: fDOMErrorHandler.handleError(fDOMError);
322: }
323: }
324: // split CDATA section
325: _printer.printText("]]]]><![CDATA[>");
326: index += 2;
327: continue;
328: }
329:
330: if (!XML11Char.isXML11Valid(ch)) {
331: // check if it is surrogate
332: if (++index < length) {
333: surrogates(ch, text.charAt(index), true);
334: } else {
335: fatalError("The character '" + ch
336: + "' is an invalid XML character");
337: }
338: continue;
339: }
340: if (_encodingInfo.isPrintable(ch)
341: && XML11Char.isXML11ValidLiteral(ch)) {
342: _printer.printText(ch);
343: } else {
344: // The character is not printable -- split CDATA section
345: _printer.printText("]]>&#x");
346: _printer.printText(Integer.toHexString(ch));
347: _printer.printText(";<![CDATA[");
348: }
349: }
350: }
351:
352: // note that this "int" should, in all cases, be a char.
353: // REVISIT: make it a char...
354: protected final void printXMLChar(int ch) throws IOException {
355: if (ch == '\r' || ch == 0x0085 || ch == 0x2028) {
356: printHex(ch);
357: } else if (ch == '<') {
358: _printer.printText("<");
359: } else if (ch == '&') {
360: _printer.printText("&");
361: } else if (ch == '>') {
362: // character sequence "]]>" can't appear in content, therefore
363: // we should escape '>'
364: _printer.printText(">");
365: } else if (_encodingInfo.isPrintable((char) ch)
366: && XML11Char.isXML11ValidLiteral(ch)) {
367: _printer.printText((char) ch);
368: } else {
369: printHex(ch);
370: }
371: }
372:
373: protected final void surrogates(int high, int low, boolean inContent)
374: throws IOException {
375: if (XMLChar.isHighSurrogate(high)) {
376: if (!XMLChar.isLowSurrogate(low)) {
377: //Invalid XML
378: fatalError("The character '" + (char) low
379: + "' is an invalid XML character");
380: } else {
381: int supplemental = XMLChar.supplemental((char) high,
382: (char) low);
383: if (!XML11Char.isXML11Valid(supplemental)) {
384: //Invalid XML
385: fatalError("The character '" + (char) supplemental
386: + "' is an invalid XML character");
387: } else {
388: if (inContent && content().inCData) {
389: _printer.printText("]]>&#x");
390: _printer.printText(Integer
391: .toHexString(supplemental));
392: _printer.printText(";<![CDATA[");
393: } else {
394: printHex(supplemental);
395: }
396: }
397: }
398: } else {
399: fatalError("The character '" + (char) high
400: + "' is an invalid XML character");
401: }
402:
403: }
404:
405: protected void printText(String text, boolean preserveSpace,
406: boolean unescaped) throws IOException {
407: int index;
408: char ch;
409: int length = text.length();
410: if (preserveSpace) {
411: // Preserving spaces: the text must print exactly as it is,
412: // without breaking when spaces appear in the text and without
413: // consolidating spaces. If a line terminator is used, a line
414: // break will occur.
415: for (index = 0; index < length; ++index) {
416: ch = text.charAt(index);
417: if (!XML11Char.isXML11Valid(ch)) {
418: // check if it is surrogate
419: if (++index < length) {
420: surrogates(ch, text.charAt(index), true);
421: } else {
422: fatalError("The character '" + ch
423: + "' is an invalid XML character");
424: }
425: continue;
426: }
427: if (unescaped && XML11Char.isXML11ValidLiteral(ch)) {
428: _printer.printText(ch);
429: } else {
430: printXMLChar(ch);
431: }
432: }
433: } else {
434: // Not preserving spaces: print one part at a time, and
435: // use spaces between parts to break them into different
436: // lines. Spaces at beginning of line will be stripped
437: // by printing mechanism. Line terminator is treated
438: // no different than other text part.
439: for (index = 0; index < length; ++index) {
440: ch = text.charAt(index);
441: if (!XML11Char.isXML11Valid(ch)) {
442: // check if it is surrogate
443: if (++index < length) {
444: surrogates(ch, text.charAt(index), true);
445: } else {
446: fatalError("The character '" + ch
447: + "' is an invalid XML character");
448: }
449: continue;
450: }
451: if (unescaped && XML11Char.isXML11ValidLiteral(ch)) {
452: _printer.printText(ch);
453: } else {
454: printXMLChar(ch);
455: }
456: }
457: }
458: }
459:
460: protected void printText(char[] chars, int start, int length,
461: boolean preserveSpace, boolean unescaped)
462: throws IOException {
463:
464: if (preserveSpace) {
465: // Preserving spaces: the text must print exactly as it is,
466: // without breaking when spaces appear in the text and without
467: // consolidating spaces. If a line terminator is used, a line
468: // break will occur.
469: while (length-- > 0) {
470: char ch = chars[start++];
471: if (!XML11Char.isXML11Valid(ch)) {
472: // check if it is surrogate
473: if (length-- > 0) {
474: surrogates(ch, chars[start++], true);
475: } else {
476: fatalError("The character '" + ch
477: + "' is an invalid XML character");
478: }
479: continue;
480: }
481: if (unescaped && XML11Char.isXML11ValidLiteral(ch)) {
482: _printer.printText(ch);
483: } else {
484: printXMLChar(ch);
485: }
486: }
487: } else {
488: // Not preserving spaces: print one part at a time, and
489: // use spaces between parts to break them into different
490: // lines. Spaces at beginning of line will be stripped
491: // by printing mechanism. Line terminator is treated
492: // no different than other text part.
493: while (length-- > 0) {
494: char ch = chars[start++];
495: if (!XML11Char.isXML11Valid(ch)) {
496: // check if it is surrogate
497: if (length-- > 0) {
498: surrogates(ch, chars[start++], true);
499: } else {
500: fatalError("The character '" + ch
501: + "' is an invalid XML character");
502: }
503: continue;
504: }
505: if (unescaped && XML11Char.isXML11ValidLiteral(ch)) {
506: _printer.printText(ch);
507: } else {
508: printXMLChar(ch);
509: }
510: }
511: }
512: }
513:
514: public boolean reset() {
515: super .reset();
516: return true;
517: }
518:
519: }
|