001: /*
002: * Java HTML Tidy - JTidy
003: * HTML parser and pretty printer
004: *
005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
006: * Institute of Technology, Institut National de Recherche en
007: * Informatique et en Automatique, Keio University). All Rights
008: * Reserved.
009: *
010: * Contributing Author(s):
011: *
012: * Dave Raggett <dsr@w3.org>
013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
014: * Gary L Peskin <garyp@firstech.com> (Java development)
015: * Sami Lempinen <sami@lempinen.net> (release management)
016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
017: *
018: * The contributing author(s) would like to thank all those who
019: * helped with testing, bug fixes, and patience. This wouldn't
020: * have been possible without all of you.
021: *
022: * COPYRIGHT NOTICE:
023: *
024: * This software and documentation is provided "as is," and
025: * the copyright holders and contributing author(s) make no
026: * representations or warranties, express or implied, including
027: * but not limited to, warranties of merchantability or fitness
028: * for any particular purpose or that the use of the software or
029: * documentation will not infringe any third party patents,
030: * copyrights, trademarks or other rights.
031: *
032: * The copyright holders and contributing author(s) will not be
033: * liable for any direct, indirect, special or consequential damages
034: * arising out of any use of the software or documentation, even if
035: * advised of the possibility of such damage.
036: *
037: * Permission is hereby granted to use, copy, modify, and distribute
038: * this source code, or portions hereof, documentation and executables,
039: * for any purpose, without fee, subject to the following restrictions:
040: *
041: * 1. The origin of this source code must not be misrepresented.
042: * 2. Altered versions must be plainly marked as such and must
043: * not be misrepresented as being the original source.
044: * 3. This Copyright notice may not be removed or altered from any
045: * source or altered source distribution.
046: *
047: * The copyright holders and contributing author(s) specifically
048: * permit, without fee, and encourage the use of this source code
049: * as a component for supporting the Hypertext Markup Language in
050: * commercial products. If you use this source code in a product,
051: * acknowledgment is not required but would be appreciated.
052: *
053: */
054: package org.w3c.tidy;
055:
056: import java.io.IOException;
057: import java.io.OutputStream;
058:
059: import org.w3c.tidy.EncodingUtils.PutBytes;
060:
061: /**
062: * Output implementation. This implementation is from the c version of tidy and it doesn't take advantage of java
063: * writers.
064: * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
065: * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
066: * @author Fabrizio Giustina
067: * @version $Revision: 1.16 $ ($Author: fgiust $)
068: */
069: public class OutImpl implements Out {
070:
071: /**
072: * output encoding.
073: */
074: private int encoding;
075:
076: /**
077: * actual state for ISO 2022.
078: */
079: private int state;
080:
081: /**
082: * output stream.
083: */
084: private OutputStream out;
085:
086: /**
087: * putter callback.
088: */
089: private PutBytes putBytes;
090:
091: /**
092: * newline bytes.
093: */
094: private byte[] newline;
095:
096: /**
097: * Constructor.
098: * @param configuration actual configuration instance (needed for newline configuration)
099: * @param encoding encoding constant
100: * @param out output stream
101: */
102: public OutImpl(Configuration configuration, int encoding,
103: OutputStream out) {
104: this .encoding = encoding;
105: this .state = EncodingUtils.FSM_ASCII;
106: this .out = out;
107:
108: // copy configured newline in bytes
109: this .newline = new byte[configuration.newline.length];
110: for (int j = 0; j < configuration.newline.length; j++) {
111: this .newline[j] = (byte) configuration.newline[j];
112: }
113:
114: this .putBytes = new PutBytes() {
115:
116: private OutImpl impl;
117:
118: PutBytes setOut(OutImpl out) {
119: this .impl = out;
120: return this ;
121: }
122:
123: public void doPut(byte[] buf, int[] count) {
124: impl.outcUTF8Bytes(buf, count);
125: }
126: } // set the out instance direclty
127: .setOut(this );
128: }
129:
130: /**
131: * output UTF-8 bytes to output stream.
132: * @param buf array of bytes
133: * @param count number of bytes in buf to write
134: */
135: void outcUTF8Bytes(byte[] buf, int[] count) {
136: try {
137: for (int i = 0; i < count[0]; i++) {
138: out.write(buf[i]);
139: }
140: } catch (IOException e) {
141: System.err
142: .println("OutImpl.outcUTF8Bytes: " + e.toString());
143: }
144: }
145:
146: /**
147: * .
148: * @see org.w3c.tidy.Out#outc(byte)
149: */
150: public void outc(byte c) {
151: outc(c & 0xFF); // Convert to unsigned.
152: }
153:
154: /**
155: * @see org.w3c.tidy.Out#outc(int)
156: */
157: public void outc(int c) {
158: int ch;
159:
160: try {
161:
162: if (this .encoding == Configuration.MACROMAN) {
163: if (c < 128) {
164: out.write(c);
165: } else {
166: int i;
167:
168: for (i = 128; i < 256; i++) {
169: if (EncodingUtils.decodeMacRoman(i - 128) == c) {
170: out.write(i);
171: break;
172: }
173: }
174: }
175: } else
176:
177: if (this .encoding == Configuration.WIN1252) {
178: if (c < 128 || (c > 159 && c < 256)) {
179: out.write(c);
180: } else {
181: int i;
182:
183: for (i = 128; i < 160; i++) {
184: if (EncodingUtils.decodeWin1252(i - 128) == c) {
185: out.write(i);
186: break;
187: }
188: }
189: }
190: } else if (this .encoding == Configuration.UTF8) {
191: int[] count = new int[] { 0 };
192:
193: EncodingUtils.encodeCharToUTF8Bytes(c, null,
194: this .putBytes, count);
195: if (count[0] <= 0) {
196: /* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
197: /* replacement char 0xFFFD encoded as UTF-8 */
198: out.write(0xEF);
199: out.write(0xBF);
200: out.write(0xBF);
201: }
202: } else if (this .encoding == Configuration.ISO2022) {
203: if (c == 0x1b) /* ESC */
204: {
205: this .state = EncodingUtils.FSM_ESC;
206: } else {
207: switch (this .state) {
208: case EncodingUtils.FSM_ESC:
209: if (c == '$') {
210: this .state = EncodingUtils.FSM_ESCD;
211: } else if (c == '(') {
212: this .state = EncodingUtils.FSM_ESCP;
213: } else {
214: this .state = EncodingUtils.FSM_ASCII;
215: }
216: break;
217:
218: case EncodingUtils.FSM_ESCD:
219: if (c == '(') {
220: this .state = EncodingUtils.FSM_ESCDP;
221: } else {
222: this .state = EncodingUtils.FSM_NONASCII;
223: }
224: break;
225:
226: case EncodingUtils.FSM_ESCDP:
227: this .state = EncodingUtils.FSM_NONASCII;
228: break;
229:
230: case EncodingUtils.FSM_ESCP:
231: this .state = EncodingUtils.FSM_ASCII;
232: break;
233:
234: case EncodingUtils.FSM_NONASCII:
235: c &= 0x7F;
236: break;
237:
238: default:
239: // should not reach here
240: break;
241: }
242: }
243:
244: this .out.write(c);
245: } else if (this .encoding == Configuration.UTF16LE
246: || this .encoding == Configuration.UTF16BE
247: || this .encoding == Configuration.UTF16) {
248: int i = 1;
249: int numChars = 1;
250: int[] theChars = new int[2];
251:
252: if (c > EncodingUtils.MAX_UTF16_FROM_UCS4) {
253: // invalid UTF-16 value
254: /* ReportEncodingError(in.lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
255: c = 0;
256: numChars = 0;
257: } else if (c >= EncodingUtils.UTF16_SURROGATES_BEGIN) {
258: // encode surrogate pairs
259:
260: // check for invalid pairs
261: if (((c & 0x0000FFFE) == 0x0000FFFE)
262: || ((c & 0x0000FFFF) == 0x0000FFFF)) {
263: /* ReportEncodingError(in.lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
264: c = 0;
265: numChars = 0;
266: } else {
267: theChars[0] = (c - EncodingUtils.UTF16_SURROGATES_BEGIN)
268: / 0x400
269: + EncodingUtils.UTF16_LOW_SURROGATE_BEGIN;
270: theChars[1] = (c - EncodingUtils.UTF16_SURROGATES_BEGIN)
271: % 0x400
272: + EncodingUtils.UTF16_HIGH_SURROGATE_BEGIN;
273:
274: // output both
275: numChars = 2;
276: }
277: } else {
278: // just put the char out
279: theChars[0] = c;
280: }
281:
282: for (i = 0; i < numChars; i++) {
283: c = theChars[i];
284:
285: if (this .encoding == Configuration.UTF16LE) {
286: ch = c & 0xFF;
287: out.write(ch);
288: ch = (c >> 8) & 0xFF;
289: out.write(ch);
290: }
291:
292: else if (this .encoding == Configuration.UTF16BE
293: || this .encoding == Configuration.UTF16) {
294: ch = (c >> 8) & 0xFF;
295: out.write(ch);
296: ch = c & 0xFF;
297: out.write(ch);
298: }
299: }
300: }
301: // #431953 - start RJ
302: else if (this .encoding == Configuration.BIG5
303: || this .encoding == Configuration.SHIFTJIS) {
304: if (c < 128) {
305: this .out.write(c);
306: } else {
307: ch = (c >> 8) & 0xFF;
308: this .out.write(ch);
309: ch = c & 0xFF;
310: this .out.write(ch);
311: }
312: }
313: // #431953 - end RJ
314: else {
315: this .out.write(c);
316: }
317: } catch (IOException e) {
318: System.err.println("OutImpl.outc: " + e.toString());
319: }
320: }
321:
322: /**
323: * @see org.w3c.tidy.Out#newline()
324: */
325: public void newline() {
326: try {
327: this .out.write(this .newline);
328: this .out.flush();
329: } catch (IOException e) {
330: System.err.println("OutImpl.newline: " + e.toString());
331: }
332: }
333:
334: /**
335: * Setter for <code>out</code>.
336: * @param out The out to set.
337: */
338: public void setOut(OutputStream out) {
339: this .out = out;
340: }
341:
342: /**
343: * Output a Byte Order Mark.
344: */
345: public void outBOM() {
346: if (this .encoding == Configuration.UTF8
347: || this .encoding == Configuration.UTF16LE
348: || this .encoding == Configuration.UTF16BE
349: || this .encoding == Configuration.UTF16) {
350: outc(EncodingUtils.UNICODE_BOM); // this will take care of encoding the BOM correctly
351: }
352: }
353:
354: /**
355: * @see org.w3c.tidy.Out#close()
356: */
357: public void close() {
358: try {
359: this .out.flush();
360: this .out.close();
361: } catch (IOException e) {
362: System.err.println("OutImpl.close: " + e.toString());
363: }
364: }
365: }
|