01: /* Cp1252
02: *
03: * Created on September 12, 2006
04: *
05: * Copyright (C) 2006 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.util.ms;
24:
25: import java.io.UnsupportedEncodingException;
26:
/**
 * A fast, table-driven decoder for code page 1252. It converts single bytes
 * to characters for .doc files that do not store their text as Unicode.
 *
 * <p>The Java Charset APIs seemed like overkill for these translations,
 * since 1 byte always translates into exactly 1 character. The table is
 * built once, when the class is initialized, and every later call to
 * {@link #decode(int)} is a single array lookup.
 *
 * @author pjack
 */
public class Cp1252 {

    /** Number of distinct values an unsigned byte can take. */
    private static final int TABLE_SIZE = 256;

    /**
     * The translation table. If x is an unsigned byte from a .doc
     * text stream, then XLAT[x] is the Unicode character that byte
     * represents.
     */
    private static final char[] XLAT = createTable();

    /**
     * Static utility library, do not instantiate.
     */
    private Cp1252() {
    }

    /**
     * Generates the translation table. All 256 possible byte values are
     * decoded in one pass through the Java String API using the "Cp1252"
     * charset; the character at index i of the result is the decoding of
     * byte value i (whatever the JDK decoder produces for that byte).
     *
     * @return the Cp1252 translation table, always 256 entries long
     * @throws RuntimeException if the runtime does not support "Cp1252"
     * @throws IllegalStateException if the decoder did not produce exactly
     *         one character per byte
     */
    private static char[] createTable() {
        byte[] allBytes = new byte[TABLE_SIZE];
        for (int i = 0; i < TABLE_SIZE; i++) {
            allBytes[i] = (byte) i;
        }
        try {
            // Cp1252 is a single-byte charset, so decoding the whole array
            // at once yields one char per byte, in order.
            char[] table = new String(allBytes, "Cp1252").toCharArray();
            if (table.length != TABLE_SIZE) {
                throw new IllegalStateException(
                        "Cp1252 decoding produced " + table.length
                        + " characters, expected " + TABLE_SIZE);
            }
            return table;
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the Unicode character for the given Cp1252 byte.
     *
     * <p>Only the low eight bits of the argument are used, so both an
     * unsigned value (0 to 255) and a signed Java {@code byte} (-128 to 127)
     * may be passed directly. Note that this means -1 decodes as byte 0xFF;
     * callers reading from a stream must test for end-of-stream themselves
     * before calling this method.
     *
     * @param b a byte value; only its low eight bits are significant
     * @return the Unicode character corresponding to that byte
     */
    public static char decode(int b) {
        // Mask so that sign-extended bytes (e.g. (byte) 0x80 == -128) index
        // the table correctly instead of throwing.
        return XLAT[b & 0xFF];
    }

}
|