001: /**
002: *******************************************************************************
003: * Copyright (C) 2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: *
007: *******************************************************************************
008: */package com.ibm.icu.charset;
009:
010: import java.io.*;
011:
012: import com.ibm.icu.impl.ICUBinary;
013: import com.ibm.icu.impl.ICUDebug;
014:
015: /* Format of cnvalias.icu -----------------------------------------------------
016: *
017: * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
018: * This binary form contains several tables. All indexes are to uint16_t
019: * units, and not to the bytes (uint8_t units). Addressing everything on
020: * 16-bit boundaries allows us to store more information with small index
021: * numbers, which are also 16-bit in size. The majority of the tables (except
022: * the string table) consist of 16-bit numbers.
023: *
024: * First there is the size of the Table of Contents (TOC). The TOC
025: * entries contain the size of each section. In order to find the offset of a
026: * section, you just need to sum up the sizes of the preceding sections.
027: * The TOC length and entries are an array of uint32_t values.
028: * The first section after the TOC starts immediately after the TOC.
029: *
030: * 1) This section contains a list of converters. This list contains indexes
031: * into the string table for the converter name. The index of this list is
032: * also used by other sections, which are mentioned later on.
033: * This list is not sorted.
034: *
035: * 2) This section contains a list of tags. This list contains indexes
036: * into the string table for the tag name. The index of this list is
037: * also used by other sections, which are mentioned later on.
038: * This list is in priority order of standards.
039: *
040: * 3) This section contains a list of sorted unique aliases. This
041: * list contains indexes into the string table for the alias name. The
042: * index of this list is also used by other sections, like the 4th section.
043: * The index for the 3rd and 4th section is used to get the
044: * alias -> converter name mapping. Section 3 and 4 form a two column table.
045: *
046: * 4) This section contains a list of mapped converter names. Consider this
047: * as a table that maps the 3rd section to the 1st section. This list contains
048: * indexes into the 1st section. The index of this list is the same index in
049: * the 3rd section. There is also some extra information in the high bits of
050: * each converter index in this table. Currently it's only used to say that
051: * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
052: * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
053: * the predigested form of the 5th section so that an alias lookup can be fast.
054: *
055: * 5) This section contains a 2D array with indexes to the 6th section. This
056: * section is the full form of all alias mappings. The column index is the
057: * index into the converter list (column header). The row index is the index
058: * into the tag list (row header). This 2D array is the top part of a 3D array. The
059: * third dimension is in the 6th section.
060: *
061: * 6) This is a blob of variable-length arrays. Each array starts with a size,
062: * and is followed by indexes to alias names in the string table. This is
063: * the third dimension to section 5. No other section should be referencing
064: * this section.
065: *
066: * 7) Reserved at this time (There is no information). This _usually_ has a
067: * size of 0. Future versions may add more information here.
068: *
069: * 8) This is the string table. All strings are indexed on an even address.
070: * There are two reasons for this. First many chip architectures locate strings
071: * faster on even address boundaries. Second, since all indexes are 16-bit
072: * numbers, this string table can be 128KB in size instead of 64KB when we
073: * only have strings starting on an even address.
074: *
075: *
076: * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
077: * has a unique alias among all converters. That same alias can
078: * be mentioned in other standards on different converters,
079: * but only one alias per tag can be unique.
080: *
081: *
082: * Converter Names (Usually in TR22 form)
083: * -------------------------------------------.
084: * T / /|
085: * a / / |
086: * g / / |
087: * s / / |
088: * / / |
089: * ------------------------------------------/ |
090: * A | | |
091: * l | | |
092: * i | | /
093: * a | | /
094: * s | | /
095: * e | | /
096: * s | |/
097: * -------------------------------------------
098: *
099: *
100: *
101: * Here is what it really looks like. It's like swiss cheese.
102: * There are holes. Some converters aren't recognized by
103: * a standard, or they are really old converters that the
104: * standard doesn't recognize anymore.
105: *
106: * Converter Names (Usually in TR22 form)
107: * -------------------------------------------.
108: * T /##########################################/|
109: * a / # # /#
110: * g / # ## ## ### # ### ### ### #/
111: * s / # ##### #### ## ## #/#
112: * / ### # # ## # # # ### # # #/##
113: * ------------------------------------------/# #
114: * A |### # # ## # # # ### # # #|# #
115: * l |# # # # # ## # #|# #
116: * i |# # # # # # #|#
117: * a |# #|#
118: * s | #|#
119: * e
120: * s
121: *
122: */
123:
124: final class UConverterAliasDataReader implements ICUBinary.Authenticate {
125: private final static boolean debug = ICUDebug
126: .enabled("UConverterAliasDataReader");
127:
128: /**
129: * <p>Protected constructor.</p>
130: * @param inputStream ICU uprop.dat file input stream
131: * @exception IOException throw if data file fails authentication
132: * @draft 2.1
133: */
134: protected UConverterAliasDataReader(InputStream inputStream)
135: throws IOException {
136: if (debug)
137: System.out.println("Bytes in inputStream "
138: + inputStream.available());
139:
140: unicodeVersion = ICUBinary.readHeader(inputStream,
141: DATA_FORMAT_ID, this );
142:
143: if (debug)
144: System.out.println("Bytes left in inputStream "
145: + inputStream.available());
146:
147: dataInputStream = new DataInputStream(inputStream);
148:
149: if (debug)
150: System.out.println("Bytes left in dataInputStream "
151: + dataInputStream.available());
152: }
153:
154: // protected methods -------------------------------------------------
155:
156: protected int[] readToc(int n) throws IOException {
157: int[] toc = new int[n];
158: //Read the toc
159: for (int i = 0; i < n; ++i) {
160: toc[i] = dataInputStream.readInt() & UNSIGNED_INT_MASK;
161: }
162: return toc;
163: }
164:
165: protected void read(int[] convList, int[] tagList, int[] aliasList,
166: int[] untaggedConvArray, int[] taggedAliasArray,
167: int[] taggedAliasLists, int[] optionTable,
168: byte[] stringTable, byte[] normalizedStringTable)
169: throws IOException {
170: int i;
171: //int listnum = 1;
172: //long listsize;
173:
174: for (i = 0; i < convList.length; ++i)
175: convList[i] = dataInputStream.readUnsignedShort();
176:
177: for (i = 0; i < tagList.length; ++i)
178: tagList[i] = dataInputStream.readUnsignedShort();
179:
180: for (i = 0; i < aliasList.length; ++i)
181: aliasList[i] = dataInputStream.readUnsignedShort();
182:
183: for (i = 0; i < untaggedConvArray.length; ++i)
184: untaggedConvArray[i] = dataInputStream.readUnsignedShort();
185:
186: for (i = 0; i < taggedAliasArray.length; ++i)
187: taggedAliasArray[i] = dataInputStream.readUnsignedShort();
188:
189: for (i = 0; i < taggedAliasLists.length; ++i)
190: taggedAliasLists[i] = dataInputStream.readUnsignedShort();
191:
192: for (i = 0; i < optionTable.length; ++i)
193: optionTable[i] = dataInputStream.readUnsignedShort();
194:
195: dataInputStream.read(stringTable);
196: dataInputStream.read(normalizedStringTable);
197: }
198:
199: public boolean isDataVersionAcceptable(byte version[]) {
200: return version.length >= DATA_FORMAT_VERSION.length
201: && version[0] == DATA_FORMAT_VERSION[0]
202: && version[1] == DATA_FORMAT_VERSION[1]
203: && version[2] == DATA_FORMAT_VERSION[2];
204: }
205:
206: public byte[] getUnicodeVersion() {
207: return unicodeVersion;
208: }
209:
210: // private data members -------------------------------------------------
211:
212: /**
213: * ICU data file input stream
214: */
215: private DataInputStream dataInputStream;
216:
217: private byte[] unicodeVersion;
218:
219: /**
220: * File format version that this class understands.
221: * No guarantees are made if a older version is used
222: * see store.c of gennorm for more information and values
223: */
224: // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
225: private static final byte DATA_FORMAT_ID[] = { (byte) 0x43,
226: (byte) 0x76, (byte) 0x41, (byte) 0x6c }; // dataFormat="CvAl"
227: private static final byte DATA_FORMAT_VERSION[] = { 3, 0, 1 };
228:
229: //private static final int UNSIGNED_SHORT_MASK = 0xffff;
230: private static final int UNSIGNED_INT_MASK = 0xffffffff;
231:
232: }
|