001: /*
002: * Copyright (c) 1998-2008 Caucho Technology -- all rights reserved
003: *
004: * This file is part of Resin(R) Open Source
005: *
006: * Each copy or derived work must preserve the copyright notice and this
007: * notice unmodified.
008: *
009: * Resin Open Source is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU General Public License as published by
011: * the Free Software Foundation; either version 2 of the License, or
012: * (at your option) any later version.
013: *
014: * Resin Open Source is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
017: * of NON-INFRINGEMENT. See the GNU General Public License for more
018: * details.
019: *
020: * You should have received a copy of the GNU General Public License
021: * along with Resin Open Source; if not, write to the
022: * Free SoftwareFoundation, Inc.
023: * 59 Temple Place, Suite 330
024: * Boston, MA 02111-1307 USA
025: *
026: * @author Scott Ferguson
027: */
028:
029: package com.caucho.vfs.i18n;
030:
031: import java.io.IOException;
032: import java.io.InputStream;
033: import java.io.Reader;
034:
035: /**
036: * Implements an encoding reader to convert the stupid
037: * windows "smart" quotes into ISO-8859-1 (Latin-1) characters.
038: *
039: * <p>The windows "smart" quotes actually do map into
040: * unicode characters. If that's what you want, use
041: * the window-1521 encoding instead. windows-hack converts
042: * to the closest latin-1 equivalent.
043: *
044: * <p>The three exceptions are the elipses '...', the
045: * trademark, and the per-mille characters. Those are translated into
046: * their unicode equivalents because there isn't a useful
047: * latin-1 equivalent.
048: */
049: public class WindowsHackReader extends EncodingReader {
050: private InputStream is;
051:
052: /**
053: * Null-arg constructor for instantiation by com.caucho.vfs.Encoding only.
054: */
055: public WindowsHackReader() {
056: }
057:
058: /**
059: * Create a windows-hack reader based on the readStream.
060: */
061: private WindowsHackReader(InputStream is) {
062: this .is = is;
063: }
064:
065: /**
066: * Create a windows-hack reader based on the readStream.
067: *
068: * @param is the input stream providing the bytes.
069: * @param javaEncoding the JDK name for the encoding.
070: *
071: * @return the windows-hack reader.
072: */
073: public Reader create(InputStream is, String javaEncoding) {
074: return new WindowsHackReader(is);
075: }
076:
077: /**
078: * Reads into a character buffer using the correct encoding.
079: */
080: public int read() throws IOException {
081: int ch1 = is.read();
082:
083: switch (ch1) {
084: case 130: // unicode 8218
085: return ',';
086:
087: case 131: // unicode 402
088: return 'f';
089:
090: case 132: // unicode 8222
091: return '"';
092:
093: case 133: // unicode 8230 "..."
094: return 8230;
095:
096: case 134: // unicode 8224 (dagger)
097: return '+';
098:
099: case 135: // unicode 8225 (double dagger)
100: return '+';
101:
102: case 136: // unicode 710
103: return '^';
104:
105: case 137: // unicode 8240 (per-mille 0/00)
106: return 8240;
107:
108: case 138: // unicode 352
109: return 'S';
110:
111: case 139: // unicode 8249
112: return '<';
113:
114: case 140: // unicode 338 (OE)
115: return 'O';
116:
117: case 145: // unicode 8216
118: case 146: // unicode 8217
119: return '\'';
120:
121: case 147: // unicode 8220
122: case 148: // unicode 8221
123: return '"';
124:
125: case 149: // unicode 8226 (bullet)
126: return '*';
127:
128: case 150: // unicode 8211
129: case 151: // unicode 8212
130: return '-';
131:
132: case 152: // unicode 732
133: return '~';
134:
135: case 153: // unicode 8482 (trademark)
136: return 8482;
137:
138: case 154: // unicode 353
139: return 's';
140:
141: case 155: // unicode 8250
142: return '>';
143:
144: case 156: // unicode 339 (oe)
145: return 'o';
146:
147: case 376: // unicode 376 (Y with umlaut)
148: return 'Y';
149:
150: default:
151: return ch1;
152: }
153: }
154:
155: /**
156: * Reads into a character buffer using the correct encoding.
157: *
158: * @param cbuf character buffer receiving the data.
159: * @param off starting offset into the buffer.
160: * @param len number of characters to read.
161: *
162: * @return the number of characters read or -1 on end of file.
163: */
164: public int read(char[] cbuf, int off, int len) throws IOException {
165: int i = 0;
166:
167: for (i = 0; i < len; i++) {
168: int ch = is.read();
169:
170: if (ch < 0)
171: return i == 0 ? -1 : i;
172:
173: switch (ch) {
174: case -1:
175: return i == 0 ? -1 : i;
176:
177: case 130: // unicode 8218
178: cbuf[off + i] = ',';
179: break;
180:
181: case 131: // unicode 402
182: cbuf[off + i] = 'f';
183: break;
184:
185: case 132: // unicode 8222
186: cbuf[off + i] = '"';
187: break;
188:
189: case 133: // unicode 8230 "..."
190: cbuf[off + i] = (char) 8230;
191: break;
192:
193: case 134: // unicode 8224 (dagger)
194: cbuf[off + i] = '+';
195: break;
196:
197: case 135: // unicode 8225 (double dagger)
198: cbuf[off + i] = '+';
199: break;
200:
201: case 136: // unicode 710
202: cbuf[off + i] = '^';
203: break;
204:
205: case 137: // unicode 8240 (per-mille 0/00)
206: cbuf[off + i] = (char) 8240;
207: break;
208:
209: case 138: // unicode 352
210: cbuf[off + i] = 'S';
211: break;
212:
213: case 139: // unicode 8249
214: cbuf[off + i] = '<';
215: break;
216:
217: case 140: // unicode 338 (OE)
218: cbuf[off + i] = 'O';
219: break;
220:
221: case 145: // unicode 8216
222: case 146: // unicode 8217
223: cbuf[off + i] = '\'';
224: break;
225:
226: case 147: // unicode 8220
227: case 148: // unicode 8221
228: cbuf[off + i] = (char) '"';
229: break;
230:
231: case 149: // unicode 8226 (bullet)
232: cbuf[off + i] = (char) '*';
233: break;
234:
235: case 150: // unicode 8211
236: case 151: // unicode 8212
237: cbuf[off + i] = (char) '-';
238: break;
239:
240: case 152: // unicode 732
241: cbuf[off + i] = (char) '~';
242: break;
243:
244: case 153: // unicode 8482 (trademark)
245: cbuf[off + i] = (char) 8482;
246: break;
247:
248: case 154: // unicode 353
249: cbuf[off + i] = 's';
250: break;
251:
252: case 155: // unicode 8250
253: cbuf[off + i] = '>';
254: break;
255:
256: case 156: // unicode 339 (oe)
257: cbuf[off + i] = 'o';
258: break;
259:
260: case 376: // unicode 376 (Y with umlaut)
261: cbuf[off + i] = 'Y';
262: break;
263:
264: default:
265: cbuf[off + i] = (char) ch;
266: }
267: }
268:
269: return i;
270: }
271: }
|