001: /*
002: * @(#)StreamInImpl.java 1.11 2000/08/16
003: *
004: */
005:
006: package org.w3c.tidy;
007:
008: /**
009: *
010: * Input Stream Implementation
011: *
012: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
013: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
014: * HTML Tidy Release 4 Aug 2000</a>
015: *
016: * @author Dave Raggett <dsr@w3.org>
017: * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
018: * @version 1.0, 1999/05/22
019: * @version 1.0.1, 1999/05/29
020: * @version 1.1, 1999/06/18 Java Bean
021: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
022: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
023: * @version 1.4, 1999/09/04 DOM support
024: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
025: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
026: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
027: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
028: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
029: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
030: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
031: */
032:
033: import java.io.InputStream;
034: import java.io.IOException;
035:
036: public class StreamInImpl extends StreamIn {
037:
038: /* Mapping for Windows Western character set (128-159) to Unicode */
039: private static int[] Win2Unicode = { 0x20AC, 0x0000, 0x201A,
040: 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030,
041: 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000, 0x0000,
042: 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
043: 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E,
044: 0x0178 };
045:
046: /*
047: John Love-Jensen contributed this table for mapping MacRoman
048: character set to Unicode
049: */
050:
051: private static int[] Mac2Unicode = {
052:
053: 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
054: 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E,
055: 0x000F,
056:
057: 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016,
058: 0x0017, 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D,
059: 0x001E, 0x001F,
060:
061: 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026,
062: 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D,
063: 0x002E, 0x002F,
064:
065: 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036,
066: 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D,
067: 0x003E, 0x003F,
068:
069: 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046,
070: 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D,
071: 0x004E, 0x004F,
072:
073: 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056,
074: 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D,
075: 0x005E, 0x005F,
076:
077: 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066,
078: 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D,
079: 0x006E, 0x006F,
080:
081: 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076,
082: 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D,
083: 0x007E, 0x007F,
084: /* x7F = DEL */
085: 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC,
086: 0x00E1, 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7,
087: 0x00E9, 0x00E8,
088:
089: 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1,
090: 0x00F3, 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9,
091: 0x00FB, 0x00FC,
092:
093: 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6,
094: 0x00DF, 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260,
095: 0x00C6, 0x00D8,
096:
097: 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202,
098: 0x2211, 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9,
099: 0x00E6, 0x00F8,
100:
101: 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206,
102: 0x00AB, 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5,
103: 0x0152, 0x0153,
104:
105: 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7,
106: 0x25CA, 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A,
107: 0xFB01, 0xFB02,
108:
109: 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA,
110: 0x00C1, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC,
111: 0x00D3, 0x00D4,
112: /* xF0 = Apple Logo */
113: 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6,
114: 0x02DC, 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD,
115: 0x02DB, 0x02C7 };
116:
117: public StreamInImpl(InputStream stream, int encoding, int tabsize) {
118: this .stream = stream;
119: this .pushed = false;
120: this .c = (int) '\0';
121: this .tabs = 0;
122: this .tabsize = tabsize;
123: this .curline = 1;
124: this .curcol = 1;
125: this .encoding = encoding;
126: this .state = FSM_ASCII;
127: this .endOfStream = false;
128: }
129:
130: /* read char from stream */
131: public int readCharFromStream() {
132: int n, c, i, count;
133:
134: try {
135: c = this .stream.read();
136:
137: if (c == EndOfStream) {
138: this .endOfStream = true;
139: return c;
140: }
141:
142: /*
143: A document in ISO-2022 based encoding uses some ESC sequences
144: called "designator" to switch character sets. The designators
145: defined and used in ISO-2022-JP are:
146:
147: "ESC" + "(" + ? for ISO646 variants
148:
149: "ESC" + "$" + ? and
150: "ESC" + "$" + "(" + ? for multibyte character sets
151:
152: Where ? stands for a single character used to indicate the
153: character set for multibyte characters.
154:
155: Tidy handles this by preserving the escape sequence and
156: setting the top bit of each byte for non-ascii chars. This
157: bit is then cleared on output. The input stream keeps track
158: of the state to determine when to set/clear the bit.
159: */
160:
161: if (this .encoding == Configuration.ISO2022) {
162: if (c == 0x1b) /* ESC */
163: {
164: this .state = FSM_ESC;
165: return c;
166: }
167:
168: switch (this .state) {
169: case FSM_ESC:
170: if (c == '$')
171: this .state = FSM_ESCD;
172: else if (c == '(')
173: this .state = FSM_ESCP;
174: else
175: this .state = FSM_ASCII;
176: break;
177:
178: case FSM_ESCD:
179: if (c == '(')
180: this .state = FSM_ESCDP;
181: else
182: this .state = FSM_NONASCII;
183: break;
184:
185: case FSM_ESCDP:
186: this .state = FSM_NONASCII;
187: break;
188:
189: case FSM_ESCP:
190: this .state = FSM_ASCII;
191: break;
192:
193: case FSM_NONASCII:
194: c |= 0x80;
195: break;
196: }
197:
198: return c;
199: }
200:
201: if (this .encoding != Configuration.UTF8)
202: return c;
203:
204: /* deal with UTF-8 encoded char */
205:
206: if ((c & 0xE0) == 0xC0) /* 110X XXXX two bytes */
207: {
208: n = c & 31;
209: count = 1;
210: } else if ((c & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
211: {
212: n = c & 15;
213: count = 2;
214: } else if ((c & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
215: {
216: n = c & 7;
217: count = 3;
218: } else if ((c & 0xFC) == 0xF8) /* 1111 10XX five bytes */
219: {
220: n = c & 3;
221: count = 4;
222: } else if ((c & 0xFE) == 0xFC) /* 1111 110X six bytes */
223: {
224: n = c & 1;
225: count = 5;
226: } else
227: /* 0XXX XXXX one byte */
228: return c;
229:
230: /* successor bytes should have the form 10XX XXXX */
231: for (i = 1; i <= count; ++i) {
232: c = this .stream.read();
233:
234: if (c == EndOfStream) {
235: this .endOfStream = true;
236: return c;
237: }
238:
239: n = (n << 6) | (c & 0x3F);
240: }
241: } catch (IOException e) {
242: System.err.println("StreamInImpl.readCharFromStream: "
243: + e.toString());
244: n = EndOfStream;
245: }
246:
247: return n;
248: }
249:
250: public int readChar() {
251: int c;
252:
253: if (this .pushed) {
254: this .pushed = false;
255: c = this .c;
256:
257: if (c == '\n') {
258: this .curcol = 1;
259: this .curline++;
260: return c;
261: }
262:
263: this .curcol++;
264: return c;
265: }
266:
267: this .lastcol = this .curcol;
268:
269: if (this .tabs > 0) {
270: this .curcol++;
271: this .tabs--;
272: return ' ';
273: }
274:
275: for (;;) {
276: c = readCharFromStream();
277:
278: if (c < 0)
279: return EndOfStream;
280:
281: if (c == '\n') {
282: this .curcol = 1;
283: this .curline++;
284: break;
285: }
286:
287: if (c == '\r') {
288: c = readCharFromStream();
289: if (c != '\n') {
290: ungetChar(c);
291: c = '\n';
292: }
293: this .curcol = 1;
294: this .curline++;
295: break;
296: }
297:
298: if (c == '\t') {
299: this .tabs = this .tabsize
300: - ((this .curcol - 1) % this .tabsize) - 1;
301: this .curcol++;
302: c = ' ';
303: break;
304: }
305:
306: /* strip control characters, except for Esc */
307:
308: if (c == '\033')
309: break;
310:
311: if (0 < c && c < 32)
312: continue;
313:
314: /* watch out for IS02022 */
315:
316: if (this .encoding == Configuration.RAW
317: || this .encoding == Configuration.ISO2022) {
318: this .curcol++;
319: break;
320: }
321:
322: if (this .encoding == Configuration.MACROMAN)
323: c = Mac2Unicode[c];
324:
325: /* produced e.g. as a side-effect of smart quotes in Word */
326:
327: if (127 < c && c < 160) {
328: Report.encodingError((Lexer) this .lexer,
329: Report.WINDOWS_CHARS, c);
330:
331: c = Win2Unicode[c - 128];
332:
333: if (c == 0)
334: continue;
335: }
336:
337: this .curcol++;
338: break;
339: }
340:
341: return c;
342: }
343:
344: public void ungetChar(int c) {
345: this .pushed = true;
346: this .c = c;
347:
348: if (c == '\n') {
349: --this .curline;
350: }
351:
352: this .curcol = this .lastcol;
353: }
354:
355: public boolean isEndOfStream() {
356: return this.endOfStream;
357: }
358:
359: }
|