001: /*
002:
003: This software is OSI Certified Open Source Software.
004: OSI Certified is a certification mark of the Open Source Initiative.
005:
006: The license (Mozilla version 1.0) can be read at the MMBase site.
007: See http://www.MMBase.org/license
008:
009: */
010: package org.mmbase.util.transformers;
011:
012: import java.io.Reader;
013: import java.io.Writer;
014: import java.util.*;
015:
016: import org.mmbase.util.logging.*;
017:
018: /**
* Surrogates the Windows CP1252 characters that are not valid ISO-8859-1, replacing each of them
* with a plain-ASCII stand-in. It can also repair wrongly decoded Strings (byte arrays that were
* actually CP1252, but were treated as ISO-8859-1 when they were converted to a Java String).
022: *
023: * @author Michiel Meeuwissen
024: * @since MMBase-1.7.2
025: * @version $Id: CP1252Surrogator.java,v 1.6 2007/02/24 21:57:50 nklasens Exp $
026: */
027:
028: public class CP1252Surrogator extends ConfigurableReaderTransformer
029: implements CharTransformer {
030: private static final Logger log = Logging
031: .getLoggerInstance(CP1252Surrogator.class);
032:
033: public static final int WELL_ENCODED = 0;
034: public static final int WRONG_ENCODED = 1;
035:
036: public CP1252Surrogator() {
037: this (WELL_ENCODED);
038: }
039:
040: public CP1252Surrogator(int conf) {
041: super (conf);
042: }
043:
044: public Writer transform(Reader r, Writer w) {
045: try {
046: while (true) {
047: int c = r.read();
048: if (c == -1)
049: break;
050: int cp;
051: if (to == WELL_ENCODED) { // CP1252 chars appear all over the place in the unicode set, this makes a nice an clear int of it, with the ISO-8859-1 values (0-255)
052: cp = ("" + (char) c).getBytes("CP1252")[0] & 0xff; // should this really be done by a String?
053: } else {
054: cp = c;
055:
056: }
057: switch (cp) {
058: case 128:
059: w.write("EURO");
060: break; // EURO SIGN
061: case 129:
062: w.write('?');
063: break; //
064: case 130:
065: w.write(',');
066: break; // SINGLE LOW-9 QUOTATION MARK
067: case 131:
068: w.write('f');
069: break; // LATIN SMALL LETTER F WITH HOOK
070: case 132:
071: w.write(",,");
072: break; // DOUBLE LOW-9 QUOTATION MARK
073: case 133:
074: w.write("...");
075: break; // HORIZONTAL ELLIPSIS
076: case 134:
077: w.write('+');
078: break; // DAGGER
079: case 135:
080: w.write("++");
081: break; // DOUBLE DAGGER
082: case 136:
083: w.write('^');
084: break; // MODIFIER LETTER CIRCUMFLEX ACCENT
085: case 137:
086: w.write("0/00");
087: break; // PER MILLE SIGN
088: case 138:
089: w.write('S');
090: break; // LATIN CAPITAL LETTER S WITH CARON
091: case 139:
092: w.write('<');
093: break; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
094: case 140:
095: w.write("OE");
096: break; // LATIN CAPITAL LIGATURE OE
097: case 141:
098: w.write('?');
099: break; //
100: case 142:
101: w.write('Z');
102: break; // LATIN CAPITAL LETTER Z WITH CARON
103: case 143:
104: w.write('?');
105: break; //
106: case 144:
107: w.write('?');
108: break; //
109: case 145:
110: w.write('\'');
111: break; // LEFT SINGLE QUOTATION MARK
112: case 146:
113: w.write('\'');
114: break; // RIGHT SINGLE QUOTATION MARK
115: case 147:
116: w.write('\"');
117: break; // LEFT DOUBLE QUOTATION MARK
118: case 148:
119: w.write('\"');
120: break; // RIGHT DOUBLE QUOTATION MARK
121: case 149:
122: w.write('-');
123: break; // BULLET
124: case 150:
125: w.write('-');
126: break; // EN DASH
127: case 151:
128: w.write('-');
129: break; // EM DASH
130: case 152:
131: w.write('~');
132: break; // SMALL TILDE
133: case 153:
134: w.write("(TM)");
135: break; // TRADE MARK SIGN
136: case 154:
137: w.write('s');
138: break; // LATIN SMALL LETTER S WITH CARON
139: case 155:
140: w.write('>');
141: break; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
142: case 156:
143: w.write("oe");
144: break; // LATIN SMALL LIGATURE OE
145: case 157:
146: w.write('?');
147: break; //
148: case 158:
149: w.write('z');
150: break; // LATIN SMALL LETTER Z WITH CARON
151: case 159:
152: w.write('Y');
153: break; // LATIN CAPITAL LETTER Y WITH DIAERESIS
154: default:
155: w.write(c);
156: }
157: }
158: } catch (Exception e) {
159: log.error(e.toString());
160: }
161: return w;
162: }
163:
164: public Map<String, Config> transformers() {
165: Map<String, Config> h = new HashMap<String, Config>();
166: h
167: .put(
168: "CP1252_SURROGATOR",
169: new Config(
170: CP1252Surrogator.class,
171: WELL_ENCODED,
172: "Takes the java String, and surrogates the 32 characters of it which are in CP1252 but not in ISO-8859-1"));
173: h
174: .put(
175: "CP1252_WRONG_SURROGATOR",
176: new Config(
177: CP1252Surrogator.class,
178: WRONG_ENCODED,
179: "Also surrogates the characters specific to CP1252, but supposed the String originally wrong encoded (it was suppoed to be ISO-8859-1, but actually was CP1252)"));
180: return h;
181: }
182:
183: public String getEncoding() {
184: switch (to) {
185: case WELL_ENCODED:
186: return "CP1252_SURROGATOR";
187: case WRONG_ENCODED:
188: return "CP1252_WRONG_SURROGATOR";
189: default:
190: throw new UnknownCodingException(getClass(), to);
191: }
192: }
193:
194: public static byte[] getTestBytes() {
195: byte[] testBytes = new byte[32];
196: for (int i = 0; i < 32; i++) {
197: testBytes[i] = (byte) (-128 + i);
198: }
199: return testBytes;
200: }
201:
202: public static String getTestString() {
203: try {
204: return new String(getTestBytes(), "CP1252");
205: } catch (Exception e) {
206: return e.toString();
207: }
208: }
209:
210: /**
211: * For testing only.
212: *
213: * Use on a UTF-8 terminal:
214: * java -Dfile.encoding=UTF-8 org.mmbase.util.transformers.CP1252Surrogator
215: * Or, on a ISO-8859-1 terminal: (you will see question marks, for the CP1252 chars)
216: * java -Dfile.encoding=ISO-8859-1 org.mmbase.util.transformers.CP1252Surrogator
217: * Or, if - may God forbid - you have a CP1252 terminal:
218: * java -Dfile.encoding=CP1252 org.mmbase.util.transformers.CP1252Surrogator
219: *
220: * This last thing you may simulate with something like this:
221: * java -Dfile.encoding=CP1252 org.mmbase.util.transformers.CP1252Surrogator | konwert cp1252-utf8
222: *
223: */
224: public static void main(String[] args) {
225:
226: // construct a String with all specific CP1252 charachters.
227: String testStringCP1252 = "bla bla " + getTestString();
228: String testStringISO1 = "";
229: try {
230: testStringISO1 = "bla bla "
231: + new String(getTestBytes(), "ISO-8859-1"); /// it's a lie, but try it anyway.
232: } catch (Exception e) {
233: log.error("", e);
234: }
235:
236: CharTransformer transOk = new CP1252Surrogator();
237: CharTransformer transNok = new CP1252Surrogator(WRONG_ENCODED);
238: CharTransformer unicode = new UnicodeEscaper();
239:
240: System.out.println("Test-string (CP1252): " + testStringCP1252);
241: // System.out.println("Test-string (ISO-1) : " + testStringISO1); _DOES NOT MAKE SENSE_.
242:
243: System.out.println("Java-escaped (CP1252): "
244: + unicode.transform(testStringCP1252));
245: System.out.println("Java-escaped (ISO-1) : "
246: + unicode.transform(testStringISO1));
247: System.out.println("Surrogated test-string (CP1252): "
248: + transOk.transform(testStringCP1252));
249: System.out.println("Surrogated test-string (ISO-1) : "
250: + transNok.transform(testStringISO1)); // fixe the non-sensical string.
251:
252: }
253:
254: }
|