001: /*
002: * Java HTML Tidy - JTidy
003: * HTML parser and pretty printer
004: *
005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
006: * Institute of Technology, Institut National de Recherche en
007: * Informatique et en Automatique, Keio University). All Rights
008: * Reserved.
009: *
010: * Contributing Author(s):
011: *
012: * Dave Raggett <dsr@w3.org>
013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
014: * Gary L Peskin <garyp@firstech.com> (Java development)
015: * Sami Lempinen <sami@lempinen.net> (release management)
016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
017: *
018: * The contributing author(s) would like to thank all those who
019: * helped with testing, bug fixes, and patience. This wouldn't
020: * have been possible without all of you.
021: *
022: * COPYRIGHT NOTICE:
023: *
024: * This software and documentation is provided "as is," and
025: * the copyright holders and contributing author(s) make no
026: * representations or warranties, express or implied, including
027: * but not limited to, warranties of merchantability or fitness
028: * for any particular purpose or that the use of the software or
029: * documentation will not infringe any third party patents,
030: * copyrights, trademarks or other rights.
031: *
032: * The copyright holders and contributing author(s) will not be
033: * liable for any direct, indirect, special or consequential damages
034: * arising out of any use of the software or documentation, even if
035: * advised of the possibility of such damage.
036: *
037: * Permission is hereby granted to use, copy, modify, and distribute
038: * this source code, or portions hereof, documentation and executables,
039: * for any purpose, without fee, subject to the following restrictions:
040: *
041: * 1. The origin of this source code must not be misrepresented.
042: * 2. Altered versions must be plainly marked as such and must
043: * not be misrepresented as being the original source.
044: * 3. This Copyright notice may not be removed or altered from any
045: * source or altered source distribution.
046: *
047: * The copyright holders and contributing author(s) specifically
048: * permit, without fee, and encourage the use of this source code
049: * as a component for supporting the Hypertext Markup Language in
050: * commercial products. If you use this source code in a product,
051: * acknowledgment is not required but would be appreciated.
052: *
053: */
054: package org.w3c.tidy;
055:
056: import java.io.IOException;
057: import java.io.InputStream;
058:
059: import org.w3c.tidy.EncodingUtils.GetBytes;
060:
061: /**
062: * Input Stream Implementation. This implementation is from the c version of tidy and it doesn't take advantage of java
063: * readers.
064: * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
065: * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
066: * @author Fabrizio Giustina
067: * @version $Revision: 1.28 $ ($Author: fgiust $)
068: */
069: public class StreamInImpl implements StreamIn {
070:
071: /**
072: * number of characters kept in buffer.
073: */
074: private static final int CHARBUF_SIZE = 5;
075:
076: /**
077: * needed for error reporting.
078: */
079: private Lexer lexer;
080:
081: /**
082: * character buffer.
083: */
084: private int[] charbuf = new int[CHARBUF_SIZE];
085:
086: /**
087: * actual position in buffer.
088: */
089: private int bufpos;
090:
091: /**
092: * Private unget buffer for the raw bytes read from the input stream. Normally this will only be used by the UTF-8
093: * decoder to resynchronize the input stream after finding an illegal UTF-8 sequences. But it can be used for other
094: * purposes when reading bytes in ReadCharFromStream.
095: */
096: private char[] rawBytebuf = new char[CHARBUF_SIZE];
097:
098: /**
099: * actual position in rawBytebuf.
100: */
101: private int rawBufpos;
102:
103: /**
104: * has a raw byte been pushed into stack?
105: */
106: private boolean rawPushed;
107:
108: /**
109: * looking for an UTF BOM?
110: */
111: private boolean lookingForBOM = true;
112:
113: /**
114: * has end of stream been reached?
115: */
116: private boolean endOfStream;
117:
118: private boolean pushed;
119:
120: private int tabs;
121:
122: /**
123: * tab size in chars.
124: */
125: private int tabsize;
126:
127: /**
128: * FSM for ISO2022.
129: */
130: private int state;
131:
132: /**
133: * Encoding.
134: */
135: private int encoding;
136:
137: /**
138: * current column number.
139: */
140: private int curcol;
141:
142: /**
143: * last column.
144: */
145: private int lastcol;
146:
147: /**
148: * current line number.
149: */
150: private int curline;
151:
152: /**
153: * input stream.
154: */
155: private InputStream stream;
156:
157: /**
158: * Getter.
159: */
160: private GetBytes getBytes;
161:
162: /**
163: * Avoid mapping values > 127 to entities.
164: */
165: private boolean rawOut;
166:
167: /**
168: * Instatiates a new StreamInImpl.
169: * @param stream input stream
170: * @param configuration Configuration
171: */
172: public StreamInImpl(InputStream stream, Configuration configuration) {
173: this .stream = stream;
174: this .charbuf[0] = '\0';
175: this .tabsize = configuration.tabsize;
176: this .curline = 1;
177: this .curcol = 1;
178: this .encoding = configuration.getInCharEncoding();
179: this .rawOut = configuration.rawOut;
180: this .state = EncodingUtils.FSM_ASCII;
181: this .getBytes = new GetBytes() {
182:
183: StreamInImpl in;
184:
185: GetBytes setStreamIn(StreamInImpl in) {
186: this .in = in;
187: return this ;
188: }
189:
190: public void doGet(int[] buf, int[] count, boolean unget) {
191: in.readRawBytesFromStream(buf, count, unget);
192: }
193: } // set the StreamInImpl instance directly
194: .setStreamIn(this );
195: }
196:
197: /**
198: * @see org.w3c.tidy.StreamIn#getCurcol()
199: */
200: public int getCurcol() {
201: return this .curcol;
202: }
203:
204: /**
205: * @see org.w3c.tidy.StreamIn#getCurline()
206: */
207: public int getCurline() {
208: return this .curline;
209: }
210:
211: /**
212: * Setter for <code>lexer</code>.
213: * @param lexer The lexer to set.
214: */
215: public void setLexer(Lexer lexer) {
216: this .lexer = lexer;
217: }
218:
219: /**
220: * @see org.w3c.tidy.StreamIn#readChar()
221: */
222: public int readChar() {
223: int c;
224:
225: if (this .pushed) {
226: c = this .charbuf[--(this .bufpos)];
227: if ((this .bufpos) == 0) {
228: this .pushed = false;
229: }
230:
231: if (c == '\n') {
232: this .curcol = 1;
233: this .curline++;
234: } else {
235: this .curcol++;
236: }
237:
238: return c;
239: }
240:
241: this .lastcol = this .curcol;
242:
243: if (this .tabs > 0) {
244: this .curcol++;
245: this .tabs--;
246: return ' ';
247: }
248:
249: while (true) {
250: c = readCharFromStream();
251:
252: if (c < 0) {
253: return END_OF_STREAM;
254: }
255:
256: if (c == '\n') {
257: this .curcol = 1;
258: this .curline++;
259: break;
260: }
261:
262: // #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00
263: if (c == '\r') {
264: c = readCharFromStream();
265: if (c != '\n') {
266: if (c != END_OF_STREAM) // EOF fix by Terry Teague 12 Aug 01
267: {
268: ungetChar(c);
269: }
270: c = '\n';
271: }
272: this .curcol = 1;
273: this .curline++;
274: break;
275: }
276:
277: if (c == '\t') {
278: this .tabs = this .tabsize
279: - ((this .curcol - 1) % this .tabsize) - 1;
280: this .curcol++;
281: c = ' ';
282: break;
283: }
284:
285: // strip control characters, except for Esc
286: if (c == '\033') {
287: break;
288: } else if (c == '\015' && !lexer.configuration.xmlTags) //Form Feed is allowed in HTML
289: {
290: break;
291: } else if (0 < c && c < 32) {
292: continue; // discard control char
293: }
294:
295: // watch out for chars that have already been decoded such as
296: // IS02022, UTF-8 etc, that don't require further decoding
297: if (rawOut || this .encoding == Configuration.ISO2022
298: || this .encoding == Configuration.UTF8
299: || this .encoding == Configuration.SHIFTJIS // #431953 - RJ
300: || this .encoding == Configuration.BIG5) // #431953 - RJ
301: {
302: this .curcol++;
303: break;
304: }
305:
306: // handle surrogate pairs
307: if ((this .encoding == Configuration.UTF16LE)
308: || (this .encoding == Configuration.UTF16)
309: || (this .encoding == Configuration.UTF16BE)) {
310: if (c > EncodingUtils.MAX_UTF8_FROM_UCS4) {
311: // invalid UTF-16 value
312: this .lexer.report.encodingError(this .lexer,
313: Report.INVALID_UTF16
314: | Report.DISCARDED_CHAR, c);
315: c = 0;
316: }
317: // high surrogate
318: else if (c >= EncodingUtils.UTF16_LOW_SURROGATE_BEGIN
319: && c <= EncodingUtils.UTF16_LOW_SURROGATE_END) {
320: int n, m;
321:
322: n = c;
323:
324: m = readCharFromStream();
325: if (m < 0) {
326: return END_OF_STREAM;
327: }
328: // low surrogate
329: if (m >= EncodingUtils.UTF16_HIGH_SURROGATE_BEGIN
330: && m <= EncodingUtils.UTF16_HIGH_SURROGATE_END) {
331: // pair found, recombine them
332: c = (n - EncodingUtils.UTF16_LOW_SURROGATE_BEGIN)
333: * 0x400
334: + (m - EncodingUtils.UTF16_HIGH_SURROGATE_BEGIN)
335: + 0x10000;
336:
337: // check for invalid pairs
338: if (((c & 0x0000FFFE) == 0x0000FFFE)
339: || ((c & 0x0000FFFF) == 0x0000FFFF)
340: || (c < EncodingUtils.UTF16_SURROGATES_BEGIN)) {
341: this .lexer.report.encodingError(this .lexer,
342: Report.INVALID_UTF16
343: | Report.DISCARDED_CHAR, c);
344: c = 0;
345: }
346: } else {
347: // not a valid pair
348: this .lexer.report.encodingError(this .lexer,
349: Report.INVALID_UTF16
350: | Report.DISCARDED_CHAR, c);
351: c = 0;
352: // should we unget the just read char?
353: }
354: } else {
355: // no recombination needed
356: }
357: }
358:
359: if (this .encoding == Configuration.MACROMAN) {
360: c = EncodingUtils.decodeMacRoman(c);
361: }
362:
363: // produced e.g. as a side-effect of smart quotes in Word
364: // but can't happen if using MACROMAN encoding
365: if (127 < c && c < 160) {
366: int c1 = 0;
367: int replaceMode;
368:
369: // set error position just before offending character
370: this .lexer.lines = this .curline;
371: this .lexer.columns = this .curcol;
372:
373: if ((this .encoding == Configuration.WIN1252)
374: || (this .lexer.configuration.replacementCharEncoding == Configuration.WIN1252)) {
375: c1 = EncodingUtils.decodeWin1252(c);
376: } else if (this .lexer.configuration.replacementCharEncoding == Configuration.MACROMAN) {
377: c1 = EncodingUtils.decodeMacRoman(c);
378: }
379:
380: replaceMode = TidyUtils.toBoolean(c1) ? Report.REPLACED_CHAR
381: : Report.DISCARDED_CHAR;
382:
383: if ((c1 == 0)
384: && (this .encoding == Configuration.WIN1252)
385: || (this .encoding == Configuration.MACROMAN)) {
386: this .lexer.report.encodingError(this .lexer,
387: Report.VENDOR_SPECIFIC_CHARS | replaceMode,
388: c);
389: } else if ((this .encoding != Configuration.WIN1252)
390: && (this .encoding != Configuration.MACROMAN)) {
391: this .lexer.report.encodingError(this .lexer,
392: Report.INVALID_SGML_CHARS | replaceMode, c);
393: }
394:
395: c = c1;
396: }
397:
398: if (c == 0) {
399: continue; // illegal char is discarded
400: }
401:
402: this .curcol++;
403: break;
404: }
405:
406: return c;
407: }
408:
409: /**
410: * @see org.w3c.tidy.StreamIn#ungetChar(int)
411: */
412: public void ungetChar(int c) {
413: this .pushed = true;
414: if (this .bufpos >= CHARBUF_SIZE) {
415: // pop last element
416: System.arraycopy(this .charbuf, 0, this .charbuf, 1,
417: CHARBUF_SIZE - 1);
418: this .bufpos--;
419: }
420: this .charbuf[(this .bufpos)++] = c;
421:
422: if (c == '\n') {
423: --this .curline;
424: }
425:
426: this .curcol = this .lastcol;
427: }
428:
429: /**
430: * @see org.w3c.tidy.StreamIn#isEndOfStream()
431: */
432: public boolean isEndOfStream() {
433: return this .endOfStream;
434: }
435:
436: /**
437: * @see org.w3c.tidy.StreamIn#readCharFromStream()
438: */
439: public int readCharFromStream() {
440: int c;
441: int[] n = new int[] { 0 };
442: int[] tempchar = new int[1];
443: int[] count = new int[] { 1 };
444:
445: readRawBytesFromStream(tempchar, count, false);
446: if (count[0] <= 0) {
447: endOfStream = true;
448: return END_OF_STREAM;
449: }
450:
451: c = tempchar[0];
452:
453: if (lookingForBOM
454: && (this .encoding == Configuration.UTF16
455: || this .encoding == Configuration.UTF16LE
456: || this .encoding == Configuration.UTF16BE || this .encoding == Configuration.UTF8)) {
457: // check for a Byte Order Mark
458: int c1, bom;
459:
460: lookingForBOM = false;
461:
462: if (c == END_OF_STREAM) {
463: lookingForBOM = false;
464: endOfStream = true;
465: return END_OF_STREAM;
466: }
467:
468: count[0] = 1;
469: readRawBytesFromStream(tempchar, count, false);
470: c1 = tempchar[0];
471:
472: bom = (c << 8) + c1;
473:
474: if (bom == EncodingUtils.UNICODE_BOM_BE) {
475: // big-endian UTF-16
476: if (this .encoding != Configuration.UTF16
477: && this .encoding != Configuration.UTF16BE) {
478: this .lexer.report.encodingError(this .lexer,
479: Report.ENCODING_MISMATCH,
480: Configuration.UTF16BE);
481: // non-fatal error
482: }
483: this .encoding = Configuration.UTF16BE;
484: this .lexer.configuration
485: .setInCharEncoding(Configuration.UTF16BE);
486: return EncodingUtils.UNICODE_BOM; // return decoded BOM
487: } else if (bom == EncodingUtils.UNICODE_BOM_LE) {
488: // little-endian UTF-16
489: if (this .encoding != Configuration.UTF16
490: && this .encoding != Configuration.UTF16LE) {
491: this .lexer.report.encodingError(this .lexer,
492: Report.ENCODING_MISMATCH,
493: Configuration.UTF16LE);
494: // non-fatal error
495: }
496: this .encoding = Configuration.UTF16LE;
497: this .lexer.configuration
498: .setInCharEncoding(Configuration.UTF16LE);
499: return EncodingUtils.UNICODE_BOM; // return decoded BOM
500: } else {
501: int c2;
502:
503: count[0] = 1;
504: readRawBytesFromStream(tempchar, count, false);
505: c2 = tempchar[0];
506:
507: if (((c << 16) + (c1 << 8) + c2) == EncodingUtils.UNICODE_BOM_UTF8) {
508: // UTF-8
509: this .encoding = Configuration.UTF8;
510: if (this .encoding != Configuration.UTF8) {
511: this .lexer.report.encodingError(this .lexer,
512: Report.ENCODING_MISMATCH,
513: Configuration.UTF8);
514: // non-fatal error
515: }
516: this .lexer.configuration
517: .setInCharEncoding(Configuration.UTF8);
518: return EncodingUtils.UNICODE_BOM; // return decoded BOM
519: }
520:
521: // the 2nd and/or 3rd bytes weren't what we were expecting, so unget the extra 2 bytes
522: rawPushed = true;
523:
524: if ((rawBufpos + 1) >= CHARBUF_SIZE) {
525: System.arraycopy(rawBytebuf, 2, rawBytebuf, 0,
526: CHARBUF_SIZE - 2);
527: rawBufpos -= 2;
528: }
529: // make sure the bytes are pushed in the right order
530: rawBytebuf[rawBufpos++] = (char) c2;
531: rawBytebuf[rawBufpos++] = (char) c1;
532: // drop through to code below, with the original char
533:
534: }
535: }
536:
537: this .lookingForBOM = false;
538:
539: // A document in ISO-2022 based encoding uses some ESC sequences called "designator" to switch character sets.
540: // The designators defined and used in ISO-2022-JP are: "ESC" + "(" + ? for ISO646 variants "ESC" + "$" + ? and
541: // "ESC" + "$" + "(" + ? for multibyte character sets Where ? stands for a single character used to indicate the
542: // character set for multibyte characters. Tidy handles this by preserving the escape sequence and setting the
543: // top bit of each byte for non-ascii chars. This bit is then cleared on output. The input stream keeps track of
544: // the state to determine when to set/clear the bit.
545:
546: if (this .encoding == Configuration.ISO2022) {
547: if (c == 0x1b) // ESC
548: {
549: this .state = EncodingUtils.FSM_ESC;
550: return c;
551: }
552:
553: switch (this .state) {
554: case EncodingUtils.FSM_ESC:
555: if (c == '$') {
556: this .state = EncodingUtils.FSM_ESCD;
557: } else if (c == '(') {
558: this .state = EncodingUtils.FSM_ESCP;
559: } else {
560: this .state = EncodingUtils.FSM_ASCII;
561: }
562: break;
563:
564: case EncodingUtils.FSM_ESCD:
565: if (c == '(') {
566: this .state = EncodingUtils.FSM_ESCDP;
567: } else {
568: this .state = EncodingUtils.FSM_NONASCII;
569: }
570: break;
571:
572: case EncodingUtils.FSM_ESCDP:
573: this .state = EncodingUtils.FSM_NONASCII;
574: break;
575:
576: case EncodingUtils.FSM_ESCP:
577: this .state = EncodingUtils.FSM_ASCII;
578: break;
579:
580: case EncodingUtils.FSM_NONASCII:
581: c |= 0x80;
582: break;
583:
584: default:
585: //
586: break;
587: }
588:
589: return c;
590: }
591:
592: if (this .encoding == Configuration.UTF16LE) {
593: int c1;
594:
595: count[0] = 1;
596: readRawBytesFromStream(tempchar, count, false);
597: if (count[0] <= 0) {
598: endOfStream = true;
599: return END_OF_STREAM;
600: }
601: c1 = tempchar[0];
602:
603: n[0] = (c1 << 8) + c;
604:
605: return n[0];
606: }
607:
608: // UTF-16 is big-endian by default
609: if ((this .encoding == Configuration.UTF16)
610: || (this .encoding == Configuration.UTF16BE)) {
611: int c1;
612:
613: count[0] = 1;
614: readRawBytesFromStream(tempchar, count, false);
615: if (count[0] <= 0) {
616: endOfStream = true;
617: return END_OF_STREAM;
618: }
619: c1 = tempchar[0];
620:
621: n[0] = (c << 8) + c1;
622:
623: return n[0];
624: }
625:
626: if (this .encoding == Configuration.UTF8) {
627: // deal with UTF-8 encoded char
628: int[] count2 = new int[] { 0 };
629:
630: // first byte "c" is passed in separately
631: boolean err = EncodingUtils.decodeUTF8BytesToChar(n, c,
632: new byte[0], this .getBytes, count2, 0);
633: if (!err && (n[0] == END_OF_STREAM) && (count2[0] == 1)) /* EOF */
634: {
635: endOfStream = true;
636: return END_OF_STREAM;
637: } else if (err) {
638: /* set error position just before offending character */
639: this .lexer.lines = this .curline;
640: this .lexer.columns = this .curcol;
641:
642: this .lexer.report
643: .encodingError(
644: this .lexer,
645: (short) (Report.INVALID_UTF8 | Report.REPLACED_CHAR),
646: n[0]);
647: n[0] = 0xFFFD; /* replacement char */
648: }
649:
650: return n[0];
651: }
652:
653: // #431953 - start RJ
654: // This section is suitable for any "multibyte" variable-width character encoding in which a one-byte code is
655: // less than 128, and the first byte of a two-byte code is greater or equal to 128. Note that Big5 and ShiftJIS
656: // fit into this kind, even though their second byte may be less than 128
657:
658: if ((this .encoding == Configuration.BIG5)
659: || (this .encoding == Configuration.SHIFTJIS)) {
660: if (c < 128) {
661: return c;
662: } else if ((this .encoding == Configuration.SHIFTJIS)
663: && (c >= 0xa1 && c <= 0xdf)) {
664: // 461643 - fix suggested by Rick Cameron 14 Sep 01
665: // for Shift_JIS, the values from 0xa1 through 0xdf represent singe-byte characters (U+FF61 to U+FF9F -
666: // half-shift Katakana)
667: return c;
668: } else {
669: int c1;
670: count[0] = 1;
671: readRawBytesFromStream(tempchar, count, false);
672:
673: if (count[0] <= 0) {
674: endOfStream = true;
675: return END_OF_STREAM;
676: }
677:
678: c1 = tempchar[0];
679: n[0] = (c << 8) + c1;
680: return n[0];
681: }
682: }
683: // #431953 - end RJ
684: n[0] = c;
685:
686: return n[0];
687: }
688:
689: /**
690: * Read raw bytes from stream, return <= 0 if EOF; or if "unget" is true, Unget the bytes to re-synchronize the
691: * input stream Normally UTF-8 successor bytes are read using this routine.
692: * @param buf character buffer
693: * @param count number of bytes to read
694: * @param unget unget bytes
695: */
696: protected void readRawBytesFromStream(int[] buf, int[] count,
697: boolean unget) {
698:
699: try {
700: for (int i = 0; i < count[0]; i++) {
701: if (unget) {
702:
703: int c = this .stream.read();
704:
705: // should never get here; testing for 0xFF, a valid char, is not a good idea
706: if (c == END_OF_STREAM) // || buf[i] == (unsigned char)EndOfStream
707: {
708: count[0] = -i;
709: return;
710: }
711:
712: rawPushed = true;
713:
714: if (rawBufpos >= CHARBUF_SIZE) {
715: System.arraycopy(rawBytebuf, 1, rawBytebuf, 0,
716: CHARBUF_SIZE - 1);
717: rawBufpos--;
718: }
719: rawBytebuf[rawBufpos++] = (char) buf[i];
720: } else {
721: if (rawPushed) {
722: buf[i] = rawBytebuf[--rawBufpos];
723: if (rawBufpos == 0) {
724: rawPushed = false;
725: }
726: } else {
727: int c = this .stream.read();
728: if (c == END_OF_STREAM) {
729: count[0] = -i;
730: break;
731: }
732: buf[i] = (char) c;
733: }
734: }
735: }
736: } catch (IOException e) {
737: System.err.println("StreamInImpl.readRawBytesFromStream: "
738: + e.toString());
739: }
740: return;
741: }
742:
743: }
|