0001: /* Copyright 2002-2005 Elliotte Rusty Harold
0002:
0003: This library is free software; you can redistribute it and/or modify
0004: it under the terms of version 2.1 of the GNU Lesser General Public
0005: License as published by the Free Software Foundation.
0006:
0007: This library is distributed in the hope that it will be useful,
0008: but WITHOUT ANY WARRANTY; without even the implied warranty of
0009: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0010: GNU Lesser General Public License for more details.
0011:
0012: You should have received a copy of the GNU Lesser General Public
0013: License along with this library; if not, write to the
0014: Free Software Foundation, Inc., 59 Temple Place, Suite 330,
0015: Boston, MA 02111-1307 USA
0016:
0017: You can contact Elliotte Rusty Harold by sending e-mail to
0018: elharo@metalab.unc.edu. Please include the word "XOM" in the
0019: subject line. The XOM home page is located at http://www.xom.nu/
0020: */
0021:
0022: package nu.xom;
0023:
0024: import java.io.IOException;
0025: import java.io.OutputStream;
0026: import java.io.OutputStreamWriter;
0027: import java.io.UnsupportedEncodingException;
0028: import java.io.Writer;
0029: import java.util.Iterator;
0030: import java.util.Locale;
0031: import java.util.Map;
0032:
0033: /**
0034: * <p>
0035: * Outputs a <code>Document</code> object in a specific encoding using
0036: * various options for controlling white space, normalization,
0037: * indenting, line breaking, and base URIs. However, in general these
0038: * options do affect the document's infoset. In particular, if you set
0039: * either the maximum line length or the indent size to a positive
0040: * value, then the serializer will not respect input white space. It
0041: * may trim leading and trailing space, condense runs of white
0042: * space to a single space, convert carriage returns and linefeeds
0043: * to spaces, add extra space where none was present before,
0044: * and otherwise muck with the document's white space.
0045: * The defaults, however, preserve all significant white space
0046: * including ignorable white space and boundary white space.
0047: * </p>
0048: *
0049: * @author Elliotte Rusty Harold
0050: * @version 1.1b7
0051: *
0052: */
0053: public class Serializer {
0054:
0055: private TextWriter escaper;
0056: private boolean preserveBaseURI = false;
0057:
/**
 * <p>
 * Creates a serializer that writes to the given stream
 * in the UTF-8 encoding.
 * </p>
 *
 * @param out the output stream that receives the document
 *
 * @throws NullPointerException if <code>out</code> is null
 */
public Serializer(OutputStream out) {

    try {
        setOutputStream(out, "UTF-8");
    } catch (UnsupportedEncodingException utf8Rejected) {
        // Only reachable when the VM refuses the name "UTF-8".
        String complaint
          = "The VM is broken. It does not understand UTF-8.";
        throw new RuntimeException(complaint);
    }

}
0077:
/**
 * <p>
 * Creates a serializer that writes to the given stream in the
 * requested encoding. The Java virtual machine must recognize the
 * encoding; otherwise the constructor throws an
 * <code>UnsupportedEncodingException</code>.
 * XOM itself currently recognizes these encodings:
 * </p>
 *
 * <ul>
 *   <li>UTF-8</li>
 *   <li>UTF-16</li>
 *   <li>UTF-16BE</li>
 *   <li>UTF-16LE</li>
 *   <li>ISO-10646-UCS-2</li>
 *   <li>ISO-8859-1</li>
 *   <li>ISO-8859-2</li>
 *   <li>ISO-8859-3</li>
 *   <li>ISO-8859-4</li>
 *   <li>ISO-8859-5</li>
 *   <li>ISO-8859-6</li>
 *   <li>ISO-8859-7</li>
 *   <li>ISO-8859-8</li>
 *   <li>ISO-8859-9</li>
 *   <li>ISO-8859-10</li>
 *   <li>ISO-8859-11 (a.k.a. TIS-620)</li>
 *   <li>ISO-8859-13</li>
 *   <li>ISO-8859-14</li>
 *   <li>ISO-8859-15</li>
 *   <li>ISO-8859-16</li>
 *   <li>IBM037 (a.k.a. CP037, EBCDIC-CP-US, EBCDIC-CP-CA,
 *       EBCDIC-CP-WA, EBCDIC-CP-NL, and CSIBM037)</li>
 *   <li>GB18030</li>
 * </ul>
 *
 * <p>
 * Encodings that are not in this list can still be used when the
 * virtual machine supports them, but they may be significantly
 * slower than the ones listed.
 * </p>
 *
 * <p>
 * Java has significant bugs in its handling of some of these
 * encodings. In some cases, such as 0x80 in Big5, XOM will escape
 * a character that should not need escaping because Java cannot
 * output that character in the specified encoding, even though
 * the output character set does contain it.
 * </p>
 *
 * @param out the output stream that receives the document
 * @param encoding the character encoding for the serialization
 *
 * @throws NullPointerException if <code>out</code>
 *     or <code>encoding</code> is null
 * @throws UnsupportedEncodingException if the VM does not
 *     support the requested encoding
 */
public Serializer(OutputStream out, String encoding)
  throws UnsupportedEncodingException {

    // The encoding is checked before the stream, so a null encoding
    // is the error reported when both arguments are null.
    if (null == encoding) {
        throw new NullPointerException("Null encoding");
    }
    setOutputStream(out, encoding);

}
0147:
0148: /**
0149: * <p>
0150: * Flushes the previous output stream and
0151: * redirects further output to the new output stream.
0152: * </p>
0153: *
0154: *
0155: * @param out the output stream to write the document on
0156:
0157: * @throws NullPointerException if <code>out</code> is null
0158: * @throws IOException if the previous output stream
0159: * encounters an I/O error when flushed
0160: *
0161: */
0162: public void setOutputStream(OutputStream out) throws IOException {
0163:
0164: // flush any data onto the old output stream
0165: this .flush();
0166: int maxLength = getMaxLength();
0167: int indent = this .getIndent();
0168: String lineSeparator = getLineSeparator();
0169: boolean nfc = getUnicodeNormalizationFormC();
0170: String encoding = escaper.getEncoding();
0171: boolean lineSeparatorSet = escaper.lineSeparatorSet;
0172: setOutputStream(out, encoding);
0173: setIndent(indent);
0174: setMaxLength(maxLength);
0175: setUnicodeNormalizationFormC(nfc);
0176: if (lineSeparatorSet)
0177: setLineSeparator(lineSeparator);
0178:
0179: }
0180:
0181: private void setOutputStream(OutputStream out, String encoding)
0182: throws UnsupportedEncodingException {
0183:
0184: if (out == null) {
0185: throw new NullPointerException("Null OutputStream");
0186: }
0187: Writer writer;
0188: String encodingUpperCase = encoding.toUpperCase(Locale.ENGLISH);
0189: if (encodingUpperCase.equals("UTF-8")) {
0190: writer = new OutputStreamWriter(out, "UTF-8");
0191: } else if (encodingUpperCase.equals("UTF-16")
0192: || encodingUpperCase.equals("ISO-10646-UCS-2")) {
0193: // For compatibility with Java 1.2 and earlier
0194: writer = new OutputStreamWriter(out, "UnicodeBig");
0195: }
0196: // Java's Cp037 encoding is broken, so we have to
0197: // provide our own.
0198: else if (encodingUpperCase.equals("IBM037")
0199: || encodingUpperCase.equals("CP037")
0200: || encodingUpperCase.equals("EBCDIC-CP-US")
0201: || encodingUpperCase.equals("EBCDIC-CP-CA")
0202: || encodingUpperCase.equals("EBCDIC-CP-WA")
0203: || encodingUpperCase.equals("EBCDIC-CP-NL")
0204: || encodingUpperCase.equals("CSIBM037")) {
0205: writer = new EBCDICWriter(out);
0206: } else if (encodingUpperCase.equals("ISO-8859-11")
0207: || encodingUpperCase.equals("TIS-620")) {
0208: // Java doesn't recognize the name ISO-8859-11 and
0209: // Java 1.3 and earlier don't recognize TIS-620
0210: writer = new OutputStreamWriter(out, "TIS620");
0211: } else
0212: writer = new OutputStreamWriter(out, encoding);
0213:
0214: writer = new UnsynchronizedBufferedWriter(writer);
0215: this .escaper = TextWriterFactory
0216: .getTextWriter(writer, encoding);
0217:
0218: }
0219:
0220: /**
0221: * <p>
0222: * Serializes a document onto the output
0223: * stream using the current options.
0224: * </p>
0225: *
0226: * @param doc the <code>Document</code> to serialize
0227: *
0228: * @throws IOException if the underlying output stream
0229: * encounters an I/O error
0230: * @throws NullPointerException if <code>doc</code> is null
0231: * @throws UnavailableCharacterException if the document contains
0232: * an unescapable character (e.g. in an element name) that is
0233: * not available in the current encoding
0234: */
0235: public void write(Document doc) throws IOException {
0236:
0237: escaper.reset();
0238: // The OutputStreamWriter automatically inserts
0239: // the byte order mark if necessary.
0240: writeXMLDeclaration();
0241: int childCount = doc.getChildCount();
0242: for (int i = 0; i < childCount; i++) {
0243: writeChild(doc.getChild(i));
0244:
0245: // Might want to remove this line break in a
0246: // non-XML serializer where it's not guaranteed to be
0247: // OK to add extra line breaks in the prolog
0248: escaper.breakLine();
0249: }
0250: escaper.flush();
0251:
0252: }
0253:
0254: /**
0255: * <p>
0256: * Writes the XML declaration onto the output stream,
0257: * followed by a line break.
0258: * </p>
0259: *
0260: * @throws IOException if the underlying output stream
0261: * encounters an I/O error
0262: */
0263: protected void writeXMLDeclaration() throws IOException {
0264:
0265: escaper.writeMarkup("<?xml version=\"1.0\" encoding=\"");
0266: escaper.writeMarkup(escaper.getEncoding());
0267: escaper.writeMarkup("\"?>");
0268: escaper.breakLine();
0269:
0270: }
0271:
0272: /**
0273: * <p>
0274: * Serializes an element onto the output stream using the current
0275: * options. The result is guaranteed to be well-formed. If
0276: * <code>element</code> does not have a parent element, the output
0277: * will also be namespace well-formed.
0278: * </p>
0279: *
0280: * <p>
0281: * If the element is empty, this method invokes
0282: * <code>writeEmptyElementTag</code>. If the element is not
0283: * empty, then:
0284: * </p>
0285: *
0286: * <ol>
0287: * <li>It calls <code>writeStartTag</code>.</li>
0288: * <li>It passes each of the element's children to
0289: * <code>writeChild</code> in order.</li>
0290: * <li>It calls <code>writeEndTag</code>.</li>
0291: * </ol>
0292: *
0293: * <p>
0294: * It may break lines or add white space if the serializer has
0295: * been configured to indent or use a maximum line length.
0296: * </p>
0297: *
0298: * @param element the <code>Element</code> to serialize
0299: *
0300: * @throws IOException if the underlying output stream
0301: * encounters an I/O error
0302: * @throws UnavailableCharacterException if the element name
0303: * contains a character that is not available in the
0304: * current encoding
0305: */
0306: protected void write(Element element) throws IOException {
0307:
0308: boolean wasPreservingWhiteSpace = escaper.isPreserveSpace();
0309: if (escaper.isIndenting() && !wasPreservingWhiteSpace
0310: && !escaper.justBroke()) {
0311: escaper.breakLine();
0312: }
0313:
0314: // workaround for case where only children are empty text nodes
0315: boolean hasRealChildren = false;
0316: int childCount = element.getChildCount();
0317: for (int i = 0; i < childCount; i++) {
0318: Node child = element.getChild(i);
0319: if (child.isText()) {
0320: Text t = (Text) child;
0321: if (t.isEmpty())
0322: continue;
0323: }
0324: hasRealChildren = true;
0325: break;
0326: }
0327:
0328: if (hasRealChildren) {
0329: writeStartTag(element);
0330: // adjust for xml:space
0331: String newXMLSpaceValue = element.getAttributeValue(
0332: "space", "http://www.w3.org/XML/1998/namespace");
0333: if (newXMLSpaceValue != null) {
0334: if ("preserve".equals(newXMLSpaceValue)) {
0335: escaper.setPreserveSpace(true);
0336: } else if ("default".equals(newXMLSpaceValue)) {
0337: escaper.setPreserveSpace(false);
0338: }
0339: }
0340:
0341: escaper.incrementIndent();
0342: // children
0343: for (int i = 0; i < childCount; i++) {
0344: Node child = element.getChild(i);
0345: // need to work around a very tricky case here where
0346: // denormalized characters cross boundaries of
0347: // consecutive text nodes
0348: if (escaper.getNFC() && child.isText()) {
0349: Text t = (Text) child;
0350: while (i < childCount - 1) { // not the last node
0351: Node next = element.getChild(i + 1);
0352: if (next.isText()) {
0353: t = new Text(t.getValue() + next.getValue());
0354: i++;
0355: } else
0356: break;
0357: }
0358: writeChild(t);
0359: } else {
0360: writeChild(child);
0361: }
0362: }
0363: escaper.decrementIndent();
0364: if (escaper.getIndent() > 0 && !escaper.isPreserveSpace()) {
0365: if (hasNonTextChildren(element)) {
0366: escaper.breakLine();
0367: }
0368: }
0369: writeEndTag(element);
0370:
0371: // restore parent value
0372: if (newXMLSpaceValue != null) {
0373: escaper.setPreserveSpace(wasPreservingWhiteSpace);
0374: }
0375:
0376: } else {
0377: writeEmptyElementTag(element);
0378: }
0379:
0380: }
0381:
0382: private boolean hasNonTextChildren(Element element) {
0383:
0384: int childCount = element.getChildCount();
0385: for (int i = 0; i < childCount; i++) {
0386: if (!element.getChild(i).isText())
0387: return true;
0388: }
0389: return false;
0390:
0391: }
0392:
0393: // writeEndTag should not normally throw UnavailableCharacterException
0394: // because that would already have been thrown for the
0395: // corresponding start-tag.
0396: /**
0397: * <p>
0398: * Writes the end-tag for an element in the form
0399: * <code></<i>name</i>></code>.
0400: * </p>
0401: *
0402: * @param element the element whose end-tag is written
0403: *
0404: * @throws IOException if the underlying output stream
0405: * encounters an I/O error
0406: */
0407: protected void writeEndTag(Element element) throws IOException {
0408: escaper.writeMarkup("</");
0409: escaper.writeMarkup(element.getQualifiedName());
0410: escaper.writeMarkup('>');
0411: }
0412:
0413: /**
0414: *
0415: * <p>
0416: * Writes the start-tag for the element including
0417: * all its namespace declarations and attributes.
0418: * </p>
0419: *
0420: * <p>
0421: * The <code>writeAttributes</code> method is called to write
0422: * all the non-namespace-declaration attributes.
0423: * The <code>writeNamespaceDeclarations</code> method
0424: * is called to write all the namespace declaration attributes.
0425: * </p>
0426: *
0427: * @param element the element whose start-tag is written
0428: *
0429: * @throws IOException if the underlying output stream
0430: * encounters an I/O error
0431: * @throws UnavailableCharacterException if the name of the element
0432: * or the name of any of its attributes contains a character
0433: * that is not available in the current encoding
0434: */
0435: protected void writeStartTag(Element element) throws IOException {
0436: writeTagBeginning(element);
0437: escaper.writeMarkup('>');
0438: }
0439:
0440: /**
0441: *
0442: * <p>
0443: * Writes an empty-element tag for the element
0444: * including all its namespace declarations and attributes.
0445: * </p>
0446: *
0447: * <p>
0448: * The <code>writeAttributes</code> method is called to write
0449: * all the non-namespace-declaration attributes.
0450: * The <code>writeNamespaceDeclarations</code> method
0451: * is called to write all the namespace declaration attributes.
0452: * </p>
0453: *
0454: * <p>
0455: * If subclasses don't wish empty-element tags to be used,
0456: * they can override this method to simply invoke
0457: * <code>writeStartTag</code> followed by
0458: * <code>writeEndTag</code>.
0459: * </p>
0460: *
0461: * @param element the element whose empty-element tag is written
0462: *
0463: * @throws IOException if the underlying output stream
0464: * encounters an I/O error
0465: * @throws UnavailableCharacterException if the name of the element or the name of
0466: * any of its attributes contains a character that is not
0467: * available in the current encoding
0468: */
0469: protected void writeEmptyElementTag(Element element)
0470: throws IOException {
0471: writeTagBeginning(element);
0472: escaper.writeMarkup("/>");
0473: }
0474:
0475: // This just extracts the commonality between writeStartTag
0476: // and writeEmptyElementTag
0477: private void writeTagBeginning(Element element) throws IOException {
0478: escaper.writeMarkup('<');
0479: escaper.writeMarkup(element.getQualifiedName());
0480: writeAttributes(element);
0481: writeNamespaceDeclarations(element);
0482: }
0483:
0484: /**
0485: * <p>
0486: * Writes all the attributes of the specified
0487: * element onto the output stream, one at a time, separated
0488: * by white space. If preserveBaseURI is true, and it is
0489: * necessary to add an <code>xml:base</code> attribute
0490: * to the element in order to preserve the base URI, then
0491: * that attribute is also written here.
0492: * Each individual attribute is written by invoking
0493: * <code>write(Attribute)</code>.
0494: * </p>
0495: *
0496: * @param element the <code>Element</code> whose attributes are
0497: * written
0498: * @throws IOException if the underlying output stream
0499: * encounters an I/O error
0500: * @throws UnavailableCharacterException if the name of any of
0501: * the element's attributes contains a character that is not
0502: * available in the current encoding
0503: */
0504: protected void writeAttributes(Element element) throws IOException {
0505:
0506: // check to see if we need an xml:base attribute
0507: if (preserveBaseURI) {
0508: ParentNode parent = element.getParent();
0509: if (element.getAttribute("base",
0510: "http://www.w3.org/XML/1998/namespace") == null) {
0511: String baseValue = element.getBaseURI();
0512: if (parent == null
0513: || parent.isDocument()
0514: || !element.getBaseURI().equals(
0515: parent.getBaseURI())) {
0516:
0517: escaper.writeMarkup(' ');
0518: Attribute baseAttribute = new Attribute("xml:base",
0519: "http://www.w3.org/XML/1998/namespace",
0520: baseValue);
0521: write(baseAttribute);
0522: }
0523: }
0524: }
0525:
0526: int attributeCount = element.getAttributeCount();
0527: for (int i = 0; i < attributeCount; i++) {
0528: Attribute attribute = element.getAttribute(i);
0529: escaper.writeMarkup(' ');
0530: write(attribute);
0531: }
0532: }
0533:
0534: /**
0535: * <p>
0536: * Writes all the namespace declaration
0537: * attributes of the specified element onto the output stream,
0538: * one at a time, separated by white space. Each individual
0539: * declaration is written by invoking
0540: * <code>writeNamespaceDeclaration</code>.
0541: * </p>
0542: *
0543: * @param element the <code>Element</code> whose namespace
0544: * declarations are written
0545: * @throws IOException if the underlying output stream
0546: * encounters an I/O error
0547: * @throws UnavailableCharacterException if any of the element's
0548: * namespace prefixes contains a character that is not
0549: * available in the current encoding
0550: */
0551: protected void writeNamespaceDeclarations(Element element)
0552: throws IOException {
0553:
0554: ParentNode parent = element.getParent();
0555:
0556: Map prefixes = element.getNamespacePrefixesInScope();
0557: Iterator iterator = prefixes.entrySet().iterator();
0558: while (iterator.hasNext()) {
0559: Map.Entry entry = (Map.Entry) iterator.next();
0560: String additionalPrefix = (String) entry.getKey();
0561: String uri = (String) entry.getValue();
0562: if (parent != null && parent.isElement()) {
0563: Element parentElement = (Element) parent;
0564: if (uri.equals(parentElement
0565: .getNamespaceURI(additionalPrefix))) {
0566: continue;
0567: }
0568: } else if (uri.equals("")) {
0569: continue; // no need to say xmlns=""
0570: }
0571:
0572: // XXX replace with a writeSpace method????
0573: escaper.writeMarkup(' ');
0574: writeNamespaceDeclaration(additionalPrefix, uri);
0575: }
0576:
0577: }
0578:
0579: /**
0580: * <p>
0581: * Writes a namespace declaration in the form
0582: * <code>xmlns:<i>prefix</i>="<i>uri</i>"</code> or
0583: * <code>xmlns="<i>uri</i>"</code>. It does not write
0584: * the spaces on either side of the namespace declaration.
0585: * These are written by <code>writeNamespaceDeclarations</code>.
0586: * </p>
0587: *
0588: * @param prefix the namespace prefix; the empty string for the
0589: * default namespace
0590: * @param uri the namespace URI
0591: *
0592: * @throws IOException if the underlying output stream
0593: * encounters an I/O error
0594: * @throws UnavailableCharacterException if the namespace prefix contains a
0595: * character that is not available in the current encoding
0596: */
0597: protected void writeNamespaceDeclaration(String prefix, String uri)
0598: throws IOException {
0599:
0600: if ("".equals(prefix)) {
0601: escaper.writeMarkup("xmlns");
0602: } else {
0603: escaper.writeMarkup("xmlns:");
0604: escaper.writeMarkup(prefix);
0605: }
0606: escaper.writeMarkup("=\"");
0607: escaper.writePCDATA(uri);
0608: escaper.writeMarkup('\"');
0609:
0610: }
0611:
0612: /**
0613: * <p>
0614: * Writes an attribute in the form
0615: * <code><i>name</i>="<i>value</i>"</code>.
0616: * Characters in the attribute value are escaped as necessary.
0617: * </p>
0618: *
0619: * @param attribute the <code>Attribute</code> to write
0620: *
0621: * @throws IOException if the underlying output stream
0622: * encounters an I/O error
0623: * @throws UnavailableCharacterException if the attribute name contains a character
0624: * that is not available in the current encoding
0625: *
0626: */
0627: protected void write(Attribute attribute) throws IOException {
0628: escaper.writeMarkup(attribute.getQualifiedName());
0629: escaper.writeMarkup("=\"");
0630: escaper.writeAttributeValue(attribute.getValue());
0631: escaper.writeMarkup('\"');
0632: }
0633:
0634: /**
0635: * <p>
0636: * Writes a comment onto the output stream using the current
0637: * options. Since character and entity references are not resolved
0638: * in comments, comments can only be serialized when all
0639: * characters they contain are available in the current
0640: * encoding.
0641: * </p>
0642: *
0643: * @param comment the <code>Comment</code> to serialize
0644: *
0645: * @throws IOException if the underlying output stream
0646: * encounters an I/O error
0647: * @throws UnavailableCharacterException if the comment contains a
0648: * character that is not available in the current encoding
0649: */
0650: protected void write(Comment comment) throws IOException {
0651: if (escaper.isIndenting())
0652: escaper.breakLine();
0653: escaper.writeMarkup("<!--");
0654: escaper.writeMarkup(comment.getValue());
0655: escaper.writeMarkup("-->");
0656: }
0657:
0658: /**
0659: * <p>
0660: * Writes a processing instruction
0661: * onto the output stream using the current options.
0662: * Since character and entity references are not resolved
0663: * in processing instructions, processing instructions
0664: * can only be serialized when all
0665: * characters they contain are available in the current
0666: * encoding.
0667: * </p>
0668: *
0669: * @param instruction the <code>ProcessingInstruction</code>
0670: * to serialize
0671: *
0672: * @throws IOException if the underlying output stream
0673: * encounters an I/O error
0674: * @throws UnavailableCharacterException if the comment contains a
0675: * character that is not available in the current encoding
0676: */
0677: protected void write(ProcessingInstruction instruction)
0678: throws IOException {
0679:
0680: if (escaper.isIndenting())
0681: escaper.breakLine();
0682: escaper.writeMarkup("<?");
0683: escaper.writeMarkup(instruction.getTarget());
0684: String value = instruction.getValue();
0685: // for canonical XML, only output a space after the target
0686: // if there is a value
0687: if (!"".equals(value)) {
0688: escaper.writeMarkup(' ');
0689: escaper.writeMarkup(value);
0690: }
0691: escaper.writeMarkup("?>");
0692:
0693: }
0694:
0695: /**
0696: * <p>
0697: * Writes a <code>Text</code> object
0698: * onto the output stream using the current options.
0699: * Reserved characters such as <, > and "
0700: * are escaped using the standard entity references
0701: * such as <code>&lt;</code>, <code>&gt;</code>,
0702: * and <code>&quot;</code>.
0703: * </p>
0704: *
0705: * <p>
0706: * Characters which cannot be encoded in the current character set
0707: * (for example, Ω in ISO-8859-1) are encoded using
0708: * character references.
0709: * </p>
0710: *
0711: * @param text the <code>Text</code> to serialize
0712: *
0713: * @throws IOException if the underlying output stream
0714: * encounters an I/O error
0715: */
0716: protected void write(Text text) throws IOException {
0717:
0718: // XXX Is there a shortcut that takes advantage of the
0719: // data being stored in UTF-8 here? perhaps even if only
0720: // when serializing to UTF-8?
0721: String value = text.getValue();
0722: if (text.isCDATASection() && value.indexOf("]]>") == -1) {
0723: if (!(escaper instanceof UnicodeWriter)) {
0724: int length = value.length();
0725: for (int i = 0; i < length; i++) {
0726: if (escaper.needsEscaping(value.charAt(i))) {
0727: // can't use CDATA section
0728: escaper.writePCDATA(value);
0729: return;
0730: }
0731: }
0732: }
0733: escaper.writeMarkup("<![CDATA[");
0734: escaper.writeMarkup(value);
0735: escaper.writeMarkup("]]>");
0736: }
0737: // is this boundary whitespace we can ignore?
0738: else if (isBoundaryWhitespace(text, value)) {
0739: return; // without writing node
0740: } else {
0741: escaper.writePCDATA(value);
0742: }
0743:
0744: }
0745:
0746: private boolean isBoundaryWhitespace(Text text, String value) {
0747:
0748: if (getIndent() <= 0)
0749: return false;
0750:
0751: ParentNode parent = text.getParent();
0752: if (parent == null) {
0753: return "".equals(value.trim());
0754: }
0755:
0756: // ???? cutting next line only breaks a few tests; and what it does
0757: // break might be better off if the breakage is accepted as correct behavior
0758: int childCount = parent.getChildCount();
0759: if (childCount == 1)
0760: return false;
0761: if (!"".equals(value.trim()))
0762: return false;
0763:
0764: // ???? This is a huge Hotspot. maybe 12% of serialization time
0765: // when indenting. Is there any way to eliminate this?
0766: // We only actually need to test a couple of positions, 0 and
0767: // parent.getChildCount()-1
0768: // Instead of getting position we could get those two elements and compare
0769: // to the text. But you still need the previous and next
0770: int position = parent.indexOf(text);
0771:
0772: Node previous = null;
0773: Node next = null;
0774:
0775: if (position != 0)
0776: previous = parent.getChild(position - 1);
0777: if (position != childCount - 1) {
0778: next = parent.getChild(position + 1);
0779: }
0780: if (previous == null || !previous.isText()) {
0781: if (next == null || !next.isText()) {
0782: return true;
0783: }
0784: }
0785:
0786: return false;
0787:
0788: }
0789:
0790: /**
0791: * <p>
0792: * Writes a <code>DocType</code> object
0793: * onto the output stream using the current options.
0794: * </p>
0795: *
0796: * @param doctype the document type declaration to serialize
0797: *
0798: * @throws IOException if the underlying
0799: * output stream encounters an I/O error
0800: * @throws UnavailableCharacterException if the document type
0801: * declaration contains a character that is not available
0802: * in the current encoding
0803: */
0804: protected void write(DocType doctype) throws IOException {
0805:
0806: escaper.writeMarkup("<!DOCTYPE ");
0807: escaper.writeMarkup(doctype.getRootElementName());
0808: if (doctype.getPublicID() != null) {
0809: escaper.writeMarkup(" PUBLIC \"" + doctype.getPublicID()
0810: + "\" \"" + doctype.getSystemID() + "\"");
0811: } else if (doctype.getSystemID() != null) {
0812: escaper.writeMarkup(" SYSTEM \"" + doctype.getSystemID()
0813: + "\"");
0814: }
0815:
0816: String internalDTDSubset = doctype.getInternalDTDSubset();
0817: if (!internalDTDSubset.equals("")) {
0818: escaper.writeMarkup(" [");
0819: escaper.breakLine();
0820: escaper.setInDocType(true);
0821: escaper.writeMarkup(internalDTDSubset);
0822: escaper.setInDocType(false);
0823: escaper.writeMarkup("]");
0824: }
0825:
0826: escaper.writeMarkup(">");
0827:
0828: }
0829:
0830: /**
0831: * <p>
0832: * Writes a child node onto the output stream using the
0833: * current options. It is invoked when walking the tree to
0834: * serialize the entire document. It is not called, and indeed
0835: * should not be called, for either the <code>Document</code>
0836: * node or for attributes.
0837: * </p>
0838: *
0839: * @param node the <code>Node</code> to serialize
0840: *
0841: * @throws IOException if the underlying output stream
0842: * encounters an I/O error
0843: * @throws XMLException if an <code>Attribute</code>, a
0844: * <code>Document</code>, or <code>Namespace</code>
0845: * is passed to this method
0846: */
0847: protected void writeChild(Node node) throws IOException {
0848:
0849: if (node.isElement()) {
0850: write((Element) node);
0851: } else if (node.isText()) {
0852: write((Text) node);
0853: } else if (node.isComment()) {
0854: write((Comment) node);
0855: } else if (node.isProcessingInstruction()) {
0856: write((ProcessingInstruction) node);
0857: } else if (node.isDocType()) {
0858: write((DocType) node);
0859: } else {
0860: throw new XMLException("Cannot write a "
0861: + node.getClass().getName()
0862: + " from the writeChild() method");
0863: }
0864:
0865: }
0866:
0867: /** <p>
0868: * Writes a string onto the underlying output stream.
0869: * Non-ASCII characters that are not available in the
0870: * current character set are encoded with numeric character
0871: * references. The three reserved characters <, >, and &
0872: * are escaped using the standard entity references
0873: * <code>&lt;</code>, <code>&gt;</code>,
0874: * and <code>&amp;</code>.
0875: * Double and single quotes are not escaped.
0876: * </p>
0877: *
0878: * @param text the parsed character data to serialize
0879: *
0880: * @throws IOException if the underlying output stream
0881: * encounters an I/O error
0882: */
0883: protected final void writeEscaped(String text) throws IOException {
0884: escaper.writePCDATA(text);
0885: }
0886:
0887: /** <p>
0888: * Writes a string onto the underlying output stream.
0889: * Non-ASCII characters that are not available in the
0890: * current character set are escaped using hexadecimal numeric
0891: * character references. Carriage returns, line feeds, and tabs
0892: * are also escaped using hexadecimal numeric character
0893: * references in order to ensure their preservation on a round
0894: * trip. The four reserved characters <, >, &,
0895: * and " are escaped using the standard entity references
0896: * <code>&lt;</code>, <code>&gt;</code>,
0897: * <code>&amp;</code>, and <code>&quot;</code>.
0898: * The single quote is not escaped.
0899: * </p>
0900: *
0901: * @param value the attribute value to serialize
0902: *
0903: * @throws IOException if the underlying output stream
0904: * encounters an I/O error
0905: */
0906: protected final void writeAttributeValue(String value)
0907: throws IOException {
0908: escaper.writeAttributeValue(value);
0909: }
0910:
0911: /** <p>
0912: * Writes a string onto the underlying output stream.
0913: * without escaping any characters.
0914: * Non-ASCII characters that are not available in the
0915: * current character set cause an <code>IOException</code>.
0916: * </p>
0917: *
0918: * @param text the <code>String</code> to serialize
0919: *
0920: * @throws IOException if the underlying output stream
0921: * encounters an I/O error or <code>text</code> contains
0922: * characters not available in the current character set
0923: */
0924: protected final void writeRaw(String text) throws IOException {
0925: escaper.writeMarkup(text);
0926: }
0927:
0928: /** <p>
0929: * Writes the current line break string
0930: * onto the underlying output stream and indents
0931: * as specified by the current level and the indent property.
0932: * </p>
0933: *
0934: * @throws IOException if the underlying output stream
0935: * encounters an I/O error
0936: */
0937: protected final void breakLine() throws IOException {
0938: escaper.breakLine();
0939: }
0940:
0941: /**
0942: * <p>
0943: * Flushes the data onto the output stream.
0944: * It is not enough to flush the output stream.
0945: * You must flush the serializer object itself because it
0946: * uses some internal buffering.
0947: * The serializer will flush the underlying output stream.
0948: * </p>
0949: *
0950: * @throws IOException if the underlying
0951: * output stream encounters an I/O error
0952: */
0953: public void flush() throws IOException {
0954: escaper.flush();
0955: }
0956:
0957: /**
0958: * <p>
0959: * Returns the number of spaces this serializer indents.
0960: * </p>
0961: *
0962: * @return the number of spaces this serializer indents
0963: * each successive level beyond the previous one
0964: */
0965: public int getIndent() {
0966: return escaper.getIndent();
0967: }
0968:
0969: /**
0970: * <p>
0971: * Sets the number of additional spaces to add to each successive
0972: * level in the hierarchy. Use 0 for no extra indenting. The
0973: * maximum indentation is in limited to approximately half the
0974: * maximum line length. The serializer will not indent further
0975: * than that no matter how many levels deep the hierarchy is.
0976: * </p>
0977: *
0978: * <p>
0979: * When this variable is set to a value greater than 0,
0980: * the serializer does not preserve white space. Spaces,
0981: * tabs, carriage returns, and line feeds can all be
0982: * interchanged at the serializer's discretion, and additional
0983: * white space may be added before and after tags.
0984: * Carriage returns, line feeds, and tabs will not be
0985: * escaped with numeric character references.
0986: * </p>
0987: *
0988: * <p>
0989: * Inside elements with an <code>xml:space="preserve"</code>
0990: * attribute, white space is preserved and no indenting
0991: * takes place, regardless of the setting of the indent
0992: * property, unless, of course, an
0993: * <code>xml:space="default"</code> attribute overrides the
0994: * <code>xml:space="preserve"</code> attribute.
0995: * </p>
0996: *
0997: * <p>
0998: * The default value for indent is 0; that is, the default is
0999: * not to add or subtract any white space from the source
1000: * document.
1001: * </p>
1002: *
1003: * @param indent the number of spaces to indent
1004: * each successive level of the hierarchy
1005: *
1006: * @throws IllegalArgumentException if indent is less than zero
1007: *
1008: */
1009: public void setIndent(int indent) {
1010: if (indent < 0) {
1011: throw new IllegalArgumentException(
1012: "Indent cannot be negative");
1013: }
1014: escaper.setIndent(indent);
1015: }
1016:
1017: /**
1018: * <p>
1019: * Returns the string used as a line separator.
1020: * This is always <code>"\n"</code>, <code>"\r"</code>,
1021: * or <code>"\r\n"</code>.
1022: * </p>
1023: *
1024: * @return the line separator
1025: */
1026: public String getLineSeparator() {
1027: return escaper.getLineSeparator();
1028: }
1029:
1030: /**
1031: * <p>
1032: * Sets the line separator. This can only be one of the
1033: * three strings <code>"\n"</code>, <code>"\r"</code>,
1034: * or <code>"\r\n"</code>. All other values are forbidden.
1035: * If this method is invoked, then
1036: * line separators in the character data will be changed to this
1037: * string. Line separators in attribute values will be changed
1038: * to the hexadecimal numeric character references corresponding
1039: * to this string.
1040: * </p>
1041: *
1042: * <p>
1043: * The default line separator is <code>"\r\n"</code>. However,
1044: * line separators in character data and attribute values are not
1045: * changed to this string, unless this method is called first.
1046: * </p>
1047: *
1048: * @param lineSeparator the line separator to set
1049: *
1050: * @throws IllegalArgumentException if you attempt to use any line
1051: * separator other than <code>"\n"</code>, <code>"\r"</code>,
1052: * or <code>"\r\n"</code>.
1053: *
1054: */
1055: public void setLineSeparator(String lineSeparator) {
1056: escaper.setLineSeparator(lineSeparator);
1057: }
1058:
1059: /**
1060: * <p>
1061: * Returns the preferred maximum line length.
1062: * </p>
1063: *
1064: * @return the preferred maximum line length.
1065: */
1066: public int getMaxLength() {
1067: return escaper.getMaxLength();
1068: }
1069:
1070: /**
1071: * <p>
1072: * Sets the suggested maximum line length for this serializer.
1073: * Setting this to 0 indicates that no automatic wrapping is to be
1074: * performed. When a line approaches this length, the serializer
1075: * begins looking for opportunities to break the line. Generally
1076: * it will break on any ASCII white space character (tab, carriage
1077: * return, linefeed, and space). In some circumstances the
1078: * serializer may not be able to break the line before the maximum
1079: * length is reached. For instance, if an element name is longer
1080: * than the maximum line length the only way to correctly
1081: * serialize it is to exceed the maximum line length. In this case,
1082: * the serializer will exceed the maximum line length.
1083: * </p>
1084: *
1085: * <p>
1086: * The default value for maximum line length is 0, which is
1087: * interpreted as no maximum line length.
1088: * Setting this to a negative value just sets it to 0.
1089: * </p>
1090: *
1091: * <p>
1092: * When this variable is set to a value greater than 0,
1093: * the serializer does not preserve white space. Spaces,
1094: * tabs, carriage returns, and line feeds can all be
1095: * interchanged at the serializer's discretion.
1096: * Carriage returns, line feeds, and tabs will not be
1097: * escaped with numeric character references.
1098: * </p>
1099: *
1100: * <p>
1101: * Inside elements with an <code>xml:space="preserve"</code>
1102: * attribute, the maximum line length is not enforced,
1103: * regardless of the setting of the this property, unless,
1104: * of course, an <code>xml:space="default"</code> attribute
1105: * overrides the <code>xml:space="preserve"</code> attribute.
1106: * </p>
1107: *
1108: * @param maxLength the preferred maximum line length
1109: */
1110: public void setMaxLength(int maxLength) {
1111: escaper.setMaxLength(maxLength);
1112: }
1113:
1114: /**
1115: * <p>
1116: * Returns true if this serializer preserves the original
1117: * base URIs by inserting extra <code>xml:base</code> attributes.
1118: * </p>
1119: *
1120: * @return true if this <code>Serializer</code> inserts
1121: * extra <code>xml:base</code> attributes to attempt to
1122: * preserve base URI information from the document.
1123: */
1124: public boolean getPreserveBaseURI() {
1125: return preserveBaseURI;
1126: }
1127:
1128: /**
1129: * <p>
1130: * Determines whether this serializer inserts
1131: * extra <code>xml:base</code> attributes to attempt to
1132: * preserve base URI information from the document.
1133: * The default is false, do not preserve base URI information.
1134: * <code>xml:base</code> attributes that have been explicitly
1135: * added to an element are always output. This property only
1136: * determines whether or not extra <code>xml:base</code>
1137: * attributes are added.
1138: * </p>
1139: *
1140: * @param preserve true if <code>xml:base</code>
1141: * attributes should be added as necessary
1142: * to preserve base URI information
1143: */
1144: public void setPreserveBaseURI(boolean preserve) {
1145: this .preserveBaseURI = preserve;
1146: }
1147:
1148: /**
1149: * <p>
1150: * Returns the name of the character encoding used by
1151: * this serializer.
1152: * </p>
1153: *
1154: * @return the encoding used for the output document
1155: */
1156: public String getEncoding() {
1157: return escaper.getEncoding();
1158: }
1159:
1160: /**
1161: * <p>
1162: * If true, this property indicates serialization will
1163: * perform Unicode normalization on all data using normalization
1164: * form C (NFC). Performing Unicode normalization may change the
1165: * document's infoset. The default is false; do not normalize.
1166: * This version is based on Unicode 4.0.
1167: * </p>
1168: *
1169: * <p>
1170: * This feature has not yet been benchmarked or optimized.
1171: * It may result in substantially slower code.
1172: * </p>
1173: *
1174: * <p>
1175: * If all your data is in the first 256 code points of Unicode
1176: * (i.e. the ISO-8859-1, Latin-1 character set), then it's
1177: * already in normalization form C and normalizing won't change
1178: * anything.
1179: * </p>
1180: *
1181: * @param normalize true if normalization is performed;
1182: * false if it isn't
1183: */
1184: public void setUnicodeNormalizationFormC(boolean normalize) {
1185: escaper.setNFC(normalize);
1186: }
1187:
1188: /**
1189: * <p>
1190: * Indicates whether serialization will
1191: * perform Unicode normalization on all data using normalization
1192: * form C (NFC). The default is false; do not normalize.
1193: * </p>
1194: *
1195: * @return true if this serializer performs Unicode
1196: * normalization; false if it doesn't
1197: */
1198: public boolean getUnicodeNormalizationFormC() {
1199: return escaper.getNFC();
1200: }
1201:
1202: /**
1203: * <p>
1204: * Returns the current column number of the output stream. This
1205: * method useful for subclasses that implement their own pretty
1206: * printing strategies by inserting white space and line breaks
1207: * at appropriate points.
1208: * </p>
1209: *
1210: * <p>
1211: * Columns are counted based on Unicode characters, not Java
1212: * chars. A surrogate pair counts as one character in this
1213: * context, not two. However, a character followed by a
1214: * combining character (e.g. e followed by combining accent
1215: * acute) counts as two characters. This latter choice
1216: * (treating combining characters like regular characters)
1217: * is under review, and may change in the future if it's not
1218: * too big a performance hit.
1219: * </p>
1220: *
1221: * @return the current column number
1222: */
1223: protected final int getColumnNumber() {
1224: return escaper.getColumnNumber();
1225: }
1226:
1227: }
|