001: /*
002: * Copyright 1999-2005 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: /*
017: * $Id: WriterToUTF8Buffered.java,v 1.9 2005/03/23 17:54:05 ytalwar Exp $
018: */
019: package org.apache.xml.serializer;
020:
021: import java.io.IOException;
022: import java.io.OutputStream;
023: import java.io.UnsupportedEncodingException;
024: import java.io.Writer;
025:
026: /**
027: * This class writes unicode characters to a byte stream (java.io.OutputStream)
028: * as quickly as possible. It buffers the output in an internal
029: * buffer which must be flushed to the OutputStream when done. This flushing
030: * is done via the close(), flush() or flushBuffer() method.
031: *
032: * This class is only used internally within Xalan.
033: *
034: * @xsl.usage internal
035: */
036: final class WriterToUTF8Buffered extends Writer implements WriterChain {
037:
038: /** number of bytes that the byte buffer can hold.
039: * This is a fixed constant is used rather than m_outputBytes.lenght for performance.
040: */
041: private static final int BYTES_MAX = 16 * 1024;
042: /** number of characters that the character buffer can hold.
043: * This is 1/3 of the number of bytes because UTF-8 encoding
044: * can expand one unicode character by up to 3 bytes.
045: */
046: private static final int CHARS_MAX = (BYTES_MAX / 3);
047:
048: // private static final int
049:
050: /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */
051: private final OutputStream m_os;
052:
053: /**
054: * The internal buffer where data is stored.
055: * (sc & sb remove final to compile in JDK 1.1.8)
056: */
057: private final byte m_outputBytes[];
058:
059: private final char m_inputChars[];
060:
061: /**
062: * The number of valid bytes in the buffer. This value is always
063: * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements
064: * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid
065: * byte data.
066: */
067: private int count;
068:
069: /**
070: * Create an buffered UTF-8 writer.
071: *
072: *
073: * @param out the underlying output stream.
074: *
075: * @throws UnsupportedEncodingException
076: */
077: public WriterToUTF8Buffered(OutputStream out)
078: throws UnsupportedEncodingException {
079: m_os = out;
080: // get 3 extra bytes to make buffer overflow checking simpler and faster
081: // we won't have to keep checking for a few extra characters
082: m_outputBytes = new byte[BYTES_MAX + 3];
083:
084: // Big enough to hold the input chars that will be transformed
085: // into output bytes in m_ouputBytes.
086: m_inputChars = new char[CHARS_MAX + 2];
087: count = 0;
088:
089: // the old body of this constructor, before the buffersize was changed to a constant
090: // this(out, 8*1024);
091: }
092:
093: /**
094: * Create an buffered UTF-8 writer to write data to the
095: * specified underlying output stream with the specified buffer
096: * size.
097: *
098: * @param out the underlying output stream.
099: * @param size the buffer size.
100: * @exception IllegalArgumentException if size <= 0.
101: */
102: // public WriterToUTF8Buffered(final OutputStream out, final int size)
103: // {
104: //
105: // m_os = out;
106: //
107: // if (size <= 0)
108: // {
109: // throw new IllegalArgumentException(
110: // SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0");
111: // }
112: //
113: // m_outputBytes = new byte[size];
114: // count = 0;
115: // }
116: /**
117: * Write a single character. The character to be written is contained in
118: * the 16 low-order bits of the given integer value; the 16 high-order bits
119: * are ignored.
120: *
121: * <p> Subclasses that intend to support efficient single-character output
122: * should override this method.
123: *
124: * @param c int specifying a character to be written.
125: * @exception IOException If an I/O error occurs
126: */
127: public void write(final int c) throws IOException {
128:
129: /* If we are close to the end of the buffer then flush it.
130: * Remember the buffer can hold a few more bytes than BYTES_MAX
131: */
132: if (count >= BYTES_MAX)
133: flushBuffer();
134:
135: if (c < 0x80) {
136: m_outputBytes[count++] = (byte) (c);
137: } else if (c < 0x800) {
138: m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
139: m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
140: } else if (c < 0x10000) {
141: m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
142: m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
143: m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
144: } else {
145: m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
146: m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
147: m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
148: m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
149: }
150:
151: }
152:
153: /**
154: * Write a portion of an array of characters.
155: *
156: * @param chars Array of characters
157: * @param start Offset from which to start writing characters
158: * @param length Number of characters to write
159: *
160: * @exception IOException If an I/O error occurs
161: *
162: * @throws java.io.IOException
163: */
164: public void write(final char chars[], final int start,
165: final int length) throws java.io.IOException {
166:
167: // We multiply the length by three since this is the maximum length
168: // of the characters that we can put into the buffer. It is possible
169: // for each Unicode character to expand to three bytes.
170:
171: int lengthx3 = 3 * length;
172:
173: if (lengthx3 >= BYTES_MAX - count) {
174: // The requested length is greater than the unused part of the buffer
175: flushBuffer();
176:
177: if (lengthx3 > BYTES_MAX) {
178: /*
179: * The requested length exceeds the size of the buffer.
180: * Cut the buffer up into chunks, each of which will
181: * not cause an overflow to the output buffer m_outputBytes,
182: * and make multiple recursive calls.
183: * Be careful about integer overflows in multiplication.
184: */
185: int split = length / CHARS_MAX;
186: final int chunks;
187: if (split > 1)
188: chunks = split;
189: else
190: chunks = 2;
191: int end_chunk = start;
192: for (int chunk = 1; chunk <= chunks; chunk++) {
193: int start_chunk = end_chunk;
194: end_chunk = start
195: + (int) ((((long) length) * chunk) / chunks);
196:
197: // Adjust the end of the chunk if it ends on a high char
198: // of a Unicode surrogate pair and low char of the pair
199: // is not going to be in the same chunk
200: final char c = chars[end_chunk - 1];
201: int ic = chars[end_chunk - 1];
202: if (c >= 0xD800 && c <= 0xDBFF) {
203: // The last Java char that we were going
204: // to process is the first of a
205: // Java surrogate char pair that
206: // represent a Unicode character.
207:
208: if (end_chunk < start + length) {
209: // Avoid spanning by including the low
210: // char in the current chunk of chars.
211: end_chunk++;
212: } else {
213: /* This is the last char of the last chunk,
214: * and it is the high char of a high/low pair with
215: * no low char provided.
216: * TODO: error message needed.
217: * The char array incorrectly ends in a high char
218: * of a high/low surrogate pair, but there is
219: * no corresponding low as the high is the last char
220: */
221: end_chunk--;
222: }
223: }
224:
225: int len_chunk = (end_chunk - start_chunk);
226: this .write(chars, start_chunk, len_chunk);
227: }
228: return;
229: }
230: }
231:
232: final int n = length + start;
233: final byte[] buf_loc = m_outputBytes; // local reference for faster access
234: int count_loc = count; // local integer for faster access
235: int i = start;
236: {
237: /* This block could be omitted and the code would produce
238: * the same result. But this block exists to give the JIT
239: * a better chance of optimizing a tight and common loop which
240: * occurs when writing out ASCII characters.
241: */
242: char c;
243: for (; i < n && (c = chars[i]) < 0x80; i++)
244: buf_loc[count_loc++] = (byte) c;
245: }
246: for (; i < n; i++) {
247:
248: final char c = chars[i];
249:
250: if (c < 0x80)
251: buf_loc[count_loc++] = (byte) (c);
252: else if (c < 0x800) {
253: buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
254: buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
255: }
256: /**
257: * The following else if condition is added to support XML 1.1 Characters for
258: * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
259: * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
260: * [1101 11yy] [yyxx xxxx] (low surrogate)
261: * * uuuuu = wwww + 1
262: */
263: else if (c >= 0xD800 && c <= 0xDBFF) {
264: char high, low;
265: high = c;
266: i++;
267: low = chars[i];
268:
269: buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
270: buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
271: buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f)
272: + ((high << 4) & 0x30));
273: buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
274: } else {
275: buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
276: buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
277: buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
278: }
279: }
280: // Store the local integer back into the instance variable
281: count = count_loc;
282:
283: }
284:
285: /**
286: * Write a string.
287: *
288: * @param s String to be written
289: *
290: * @exception IOException If an I/O error occurs
291: */
292: public void write(final String s) throws IOException {
293:
294: // We multiply the length by three since this is the maximum length
295: // of the characters that we can put into the buffer. It is possible
296: // for each Unicode character to expand to three bytes.
297: final int length = s.length();
298: int lengthx3 = 3 * length;
299:
300: if (lengthx3 >= BYTES_MAX - count) {
301: // The requested length is greater than the unused part of the buffer
302: flushBuffer();
303:
304: if (lengthx3 > BYTES_MAX) {
305: /*
306: * The requested length exceeds the size of the buffer,
307: * so break it up in chunks that don't exceed the buffer size.
308: */
309: final int start = 0;
310: int split = length / CHARS_MAX;
311: final int chunks;
312: if (split > 1)
313: chunks = split;
314: else
315: chunks = 2;
316: int end_chunk = 0;
317: for (int chunk = 1; chunk <= chunks; chunk++) {
318: int start_chunk = end_chunk;
319: end_chunk = start
320: + (int) ((((long) length) * chunk) / chunks);
321: s.getChars(start_chunk, end_chunk, m_inputChars, 0);
322: int len_chunk = (end_chunk - start_chunk);
323:
324: // Adjust the end of the chunk if it ends on a high char
325: // of a Unicode surrogate pair and low char of the pair
326: // is not going to be in the same chunk
327: final char c = m_inputChars[len_chunk - 1];
328: if (c >= 0xD800 && c <= 0xDBFF) {
329: // Exclude char in this chunk,
330: // to avoid spanning a Unicode character
331: // that is in two Java chars as a high/low surrogate
332: end_chunk--;
333: len_chunk--;
334: if (chunk == chunks) {
335: /* TODO: error message needed.
336: * The String incorrectly ends in a high char
337: * of a high/low surrogate pair, but there is
338: * no corresponding low as the high is the last char
339: * Recover by ignoring this last char.
340: */
341: }
342: }
343:
344: this .write(m_inputChars, 0, len_chunk);
345: }
346: return;
347: }
348: }
349:
350: s.getChars(0, length, m_inputChars, 0);
351: final char[] chars = m_inputChars;
352: final int n = length;
353: final byte[] buf_loc = m_outputBytes; // local reference for faster access
354: int count_loc = count; // local integer for faster access
355: int i = 0;
356: {
357: /* This block could be omitted and the code would produce
358: * the same result. But this block exists to give the JIT
359: * a better chance of optimizing a tight and common loop which
360: * occurs when writing out ASCII characters.
361: */
362: char c;
363: for (; i < n && (c = chars[i]) < 0x80; i++)
364: buf_loc[count_loc++] = (byte) c;
365: }
366: for (; i < n; i++) {
367:
368: final char c = chars[i];
369:
370: if (c < 0x80)
371: buf_loc[count_loc++] = (byte) (c);
372: else if (c < 0x800) {
373: buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
374: buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
375: }
376: /**
377: * The following else if condition is added to support XML 1.1 Characters for
378: * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
379: * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
380: * [1101 11yy] [yyxx xxxx] (low surrogate)
381: * * uuuuu = wwww + 1
382: */
383: else if (c >= 0xD800 && c <= 0xDBFF) {
384: char high, low;
385: high = c;
386: i++;
387: low = chars[i];
388:
389: buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
390: buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
391: buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f)
392: + ((high << 4) & 0x30));
393: buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
394: } else {
395: buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
396: buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
397: buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
398: }
399: }
400: // Store the local integer back into the instance variable
401: count = count_loc;
402:
403: }
404:
405: /**
406: * Flush the internal buffer
407: *
408: * @throws IOException
409: */
410: public void flushBuffer() throws IOException {
411:
412: if (count > 0) {
413: m_os.write(m_outputBytes, 0, count);
414:
415: count = 0;
416: }
417: }
418:
419: /**
420: * Flush the stream. If the stream has saved any characters from the
421: * various write() methods in a buffer, write them immediately to their
422: * intended destination. Then, if that destination is another character or
423: * byte stream, flush it. Thus one flush() invocation will flush all the
424: * buffers in a chain of Writers and OutputStreams.
425: *
426: * @exception IOException If an I/O error occurs
427: *
428: * @throws java.io.IOException
429: */
430: public void flush() throws java.io.IOException {
431: flushBuffer();
432: m_os.flush();
433: }
434:
435: /**
436: * Close the stream, flushing it first. Once a stream has been closed,
437: * further write() or flush() invocations will cause an IOException to be
438: * thrown. Closing a previously-closed stream, however, has no effect.
439: *
440: * @exception IOException If an I/O error occurs
441: *
442: * @throws java.io.IOException
443: */
444: public void close() throws java.io.IOException {
445: flushBuffer();
446: m_os.close();
447: }
448:
449: /**
450: * Get the output stream where the events will be serialized to.
451: *
452: * @return reference to the result stream, or null of only a writer was
453: * set.
454: */
455: public OutputStream getOutputStream() {
456: return m_os;
457: }
458:
459: public Writer getWriter() {
460: // Only one of getWriter() or getOutputStream() can return null
461: // This type of writer wraps an OutputStream, not a Writer.
462: return null;
463: }
464: }
|