001: /**
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.text;
007:
008: import java.io.BufferedInputStream;
009: import java.io.InputStream;
010: import java.io.DataInputStream;
011: import java.io.IOException;
012:
013: import com.ibm.icu.impl.ICUData;
014: import com.ibm.icu.impl.ICUResourceBundle;
015: import com.ibm.icu.impl.Trie;
016: import com.ibm.icu.impl.CharTrie;
017:
018: /**
019: * <p>Internal class used for Rule Based Break Iterators</p>
020: * <p>This class provides access to the compiled break rule data, as
021: * it is stored in a .brk file.
022: * @internal
023: *
024: */
025: final class RBBIDataWrapper {
026: //
027: // These fields are the ready-to-use compiled rule data, as
028: // read from the file.
029: //
030: RBBIDataHeader fHeader;
031: short fFTable[];
032: short fRTable[];
033: short fSFTable[];
034: short fSRTable[];
035: CharTrie fTrie;
036: String fRuleSource;
037: int fStatusTable[];
038:
039: //
040: // Indexes to fields in the ICU4C style binary form of the RBBI Data Header
041: // Used by the rule compiler when flattening the data.
042: //
043: final static int DH_SIZE = 24;
044: final static int DH_MAGIC = 0;
045: final static int DH_FORMATVERSION = 1;
046: final static int DH_LENGTH = 2;
047: final static int DH_CATCOUNT = 3;
048: final static int DH_FTABLE = 4;
049: final static int DH_FTABLELEN = 5;
050: final static int DH_RTABLE = 6;
051: final static int DH_RTABLELEN = 7;
052: final static int DH_SFTABLE = 8;
053: final static int DH_SFTABLELEN = 9;
054: final static int DH_SRTABLE = 10;
055: final static int DH_SRTABLELEN = 11;
056: final static int DH_TRIE = 12;
057: final static int DH_TRIELEN = 13;
058: final static int DH_RULESOURCE = 14;
059: final static int DH_RULESOURCELEN = 15;
060: final static int DH_STATUSTABLE = 16;
061: final static int DH_STATUSTABLELEN = 17;
062:
063: // Index offsets to the fields in a state table row.
064: // Corresponds to struct RBBIStateTableRow in the C version.
065: //
066: final static int ACCEPTING = 0;
067: final static int LOOKAHEAD = 1;
068: final static int TAGIDX = 2;
069: final static int RESERVED = 3;
070: final static int NEXTSTATES = 4;
071:
072: // Index offsets to header fields of a state table
073: // struct RBBIStateTable {... in the C version.
074: //
075: final static int NUMSTATES = 0;
076: final static int ROWLEN = 2;
077: final static int FLAGS = 4;
078: final static int RESERVED_2 = 6;
079: final static int ROW_DATA = 8;
080:
081: // Bit selectors for the "FLAGS" field of the state table header
082: // enum RBBIStateTableFlags in the C version.
083: //
084: final static int RBBI_LOOKAHEAD_HARD_BREAK = 1;
085: final static int RBBI_BOF_REQUIRED = 2;
086:
087: // Getters for fields from the state table header
088: //
089: final static int getNumStates(short table[]) {
090: int hi = table[NUMSTATES];
091: int lo = table[NUMSTATES + 1];
092: int val = (hi << 16) + (lo & 0x0000ffff);
093: return val;
094: }
095:
096: /**
097: * Data Header. A struct-like class with the fields from the RBBI data file header.
098: */
099: final static class RBBIDataHeader {
100: int fMagic; // == 0xbla0
101: int fVersion; // == 1 (for ICU 3.2 and earlier.
102: byte[] fFormatVersion; // For ICU 3.4 and later.
103: int fLength; // Total length in bytes of this RBBI Data,
104: // including all sections, not just the header.
105: int fCatCount; // Number of character categories.
106:
107: //
108: // Offsets and sizes of each of the subsections within the RBBI data.
109: // All offsets are bytes from the start of the RBBIDataHeader.
110: // All sizes are in bytes.
111: //
112: int fFTable; // forward state transition table.
113: int fFTableLen;
114: int fRTable; // Offset to the reverse state transition table.
115: int fRTableLen;
116: int fSFTable; // safe point forward transition table
117: int fSFTableLen;
118: int fSRTable; // safe point reverse transition table
119: int fSRTableLen;
120: int fTrie; // Offset to Trie data for character categories
121: int fTrieLen;
122: int fRuleSource; // Offset to the source for for the break
123: int fRuleSourceLen; // rules. Stored UChar *.
124: int fStatusTable; // Offset to the table of rule status values
125: int fStatusTableLen;
126:
127: public RBBIDataHeader() {
128: fMagic = 0;
129: fFormatVersion = new byte[4];
130: }
131: }
132:
133: /**
134: * RBBI State Table Indexing Function. Given a state number, return the
135: * array index of the start of the state table row for that state.
136: *
137: */
138: int getRowIndex(int state) {
139: return ROW_DATA + state * (fHeader.fCatCount + 4);
140: }
141:
142: static class TrieFoldingFunc implements Trie.DataManipulate {
143: public int getFoldingOffset(int data) {
144: if ((data & 0x8000) != 0) {
145: return data & 0x7fff;
146: } else {
147: return 0;
148: }
149: }
150: }
151:
152: static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
153:
/**
 * Creates an empty wrapper with every field at its default value.
 * The static get(InputStream) method builds an instance this way and
 * then fills in the fields from the data it reads.
 */
RBBIDataWrapper() {
}
156:
157: static RBBIDataWrapper get(String name) throws IOException {
158: String fullName = "data/" + name;
159: InputStream is = ICUData.getRequiredStream(fullName);
160: return get(is);
161: }
162:
163: /*
164: * Get an RBBIDataWrapper from an InputStream onto a pre-compiled set
165: * of RBBI rules.
166: */
167: static RBBIDataWrapper get(InputStream is) throws IOException {
168: int i;
169:
170: DataInputStream dis = new DataInputStream(
171: new BufferedInputStream(is));
172: RBBIDataWrapper This = new RBBIDataWrapper();
173:
174: // Seek past the ICU data header.
175: // TODO: verify that the header looks good.
176: dis.skip(0x80);
177:
178: // Read in the RBBI data header...
179: This.fHeader = new RBBIDataHeader();
180: This.fHeader.fMagic = dis.readInt();
181: This.fHeader.fVersion = dis.readInt();
182: This.fHeader.fFormatVersion[0] = (byte) (This.fHeader.fVersion >> 24);
183: This.fHeader.fFormatVersion[1] = (byte) (This.fHeader.fVersion >> 16);
184: This.fHeader.fFormatVersion[2] = (byte) (This.fHeader.fVersion >> 8);
185: This.fHeader.fFormatVersion[3] = (byte) (This.fHeader.fVersion);
186: This.fHeader.fLength = dis.readInt();
187: This.fHeader.fCatCount = dis.readInt();
188: This.fHeader.fFTable = dis.readInt();
189: This.fHeader.fFTableLen = dis.readInt();
190: This.fHeader.fRTable = dis.readInt();
191: This.fHeader.fRTableLen = dis.readInt();
192: This.fHeader.fSFTable = dis.readInt();
193: This.fHeader.fSFTableLen = dis.readInt();
194: This.fHeader.fSRTable = dis.readInt();
195: This.fHeader.fSRTableLen = dis.readInt();
196: This.fHeader.fTrie = dis.readInt();
197: This.fHeader.fTrieLen = dis.readInt();
198: This.fHeader.fRuleSource = dis.readInt();
199: This.fHeader.fRuleSourceLen = dis.readInt();
200: This.fHeader.fStatusTable = dis.readInt();
201: This.fHeader.fStatusTableLen = dis.readInt();
202: dis.skip(6 * 4); // uint32_t fReserved[6];
203:
204: if (This.fHeader.fMagic != 0xb1a0
205: || !(This.fHeader.fVersion == 1 || // ICU 3.2 and earlier
206: This.fHeader.fFormatVersion[0] == 3) // ICU 3.4
207: ) {
208: throw new IOException(
209: "Break Iterator Rule Data Magic Number Incorrect, or unsupported data version.");
210: }
211:
212: // Current position in input stream.
213: int pos = 24 * 4; // offset of end of header, which has 24 fields, all int32_t (4 bytes)
214:
215: //
216: // Read in the Forward state transition table as an array of shorts.
217: //
218:
219: // Quick Sanity Check
220: if (This.fHeader.fFTable < pos
221: || This.fHeader.fFTable > This.fHeader.fLength) {
222: throw new IOException("Break iterator Rule data corrupt");
223: }
224:
225: // Skip over any padding preceding this table
226: dis.skip(This.fHeader.fFTable - pos);
227: pos = This.fHeader.fFTable;
228:
229: This.fFTable = new short[This.fHeader.fFTableLen / 2];
230: for (i = 0; i < This.fFTable.length; i++) {
231: This.fFTable[i] = dis.readShort();
232: pos += 2;
233: }
234:
235: //
236: // Read in the Reverse state table
237: //
238:
239: // Skip over any padding in the file
240: dis.skip(This.fHeader.fRTable - pos);
241: pos = This.fHeader.fRTable;
242:
243: // Create & fill the table itself.
244: This.fRTable = new short[This.fHeader.fRTableLen / 2];
245: for (i = 0; i < This.fRTable.length; i++) {
246: This.fRTable[i] = dis.readShort();
247: pos += 2;
248: }
249:
250: //
251: // Read in the Safe Forward state table
252: //
253: if (This.fHeader.fSFTableLen > 0) {
254: // Skip over any padding in the file
255: dis.skip(This.fHeader.fSFTable - pos);
256: pos = This.fHeader.fSFTable;
257:
258: // Create & fill the table itself.
259: This.fSFTable = new short[This.fHeader.fSFTableLen / 2];
260: for (i = 0; i < This.fSFTable.length; i++) {
261: This.fSFTable[i] = dis.readShort();
262: pos += 2;
263: }
264: }
265:
266: //
267: // Read in the Safe Reverse state table
268: //
269: if (This.fHeader.fSRTableLen > 0) {
270: // Skip over any padding in the file
271: dis.skip(This.fHeader.fSRTable - pos);
272: pos = This.fHeader.fSRTable;
273:
274: // Create & fill the table itself.
275: This.fSRTable = new short[This.fHeader.fSRTableLen / 2];
276: for (i = 0; i < This.fSRTable.length; i++) {
277: This.fSRTable[i] = dis.readShort();
278: pos += 2;
279: }
280: }
281:
282: //
283: // Unserialize the Character categories TRIE
284: // Because we can't be absolutely certain where the Trie deserialize will
285: // leave the input stream, leave position unchanged.
286: // The seek to the start of the next item following the TRIE will get us
287: // back in sync.
288: //
289: dis.skip(This.fHeader.fTrie - pos); // seek input stream from end of previous section to
290: pos = This.fHeader.fTrie; // to the start of the trie
291:
292: dis.mark(This.fHeader.fTrieLen + 100); // Mark position of start of TRIE in the input
293: // and tell Java to keep the mark valid so long
294: // as we don't go more than 100 bytes past the
295: // past the end of the TRIE.
296:
297: This.fTrie = new CharTrie(dis, fTrieFoldingFunc); // Deserialize the TRIE, leaving input
298: // stream at an unknown position, preceding the
299: // padding between TRIE and following section.
300:
301: dis.reset(); // Move input stream back to marked position at
302: // the start of the serialized TRIE. Now our
303: // "pos" variable and the input stream are in
304: // agreement.
305:
306: //
307: // Read the Rule Status Table
308: //
309: if (pos > This.fHeader.fStatusTable) {
310: throw new IOException("Break iterator Rule data corrupt");
311: }
312: dis.skip(This.fHeader.fStatusTable - pos);
313: pos = This.fHeader.fStatusTable;
314: This.fStatusTable = new int[This.fHeader.fStatusTableLen / 4];
315: for (i = 0; i < This.fStatusTable.length; i++) {
316: This.fStatusTable[i] = dis.readInt();
317: pos += 4;
318: }
319:
320: //
321: // Put the break rule source into a String
322: //
323: if (pos > This.fHeader.fRuleSource) {
324: throw new IOException("Break iterator Rule data corrupt");
325: }
326: dis.skip(This.fHeader.fRuleSource - pos);
327: pos = This.fHeader.fRuleSource;
328: StringBuffer sb = new StringBuffer(
329: This.fHeader.fRuleSourceLen / 2);
330: for (i = 0; i < This.fHeader.fRuleSourceLen; i += 2) {
331: sb.append(dis.readChar());
332: pos += 2;
333: }
334: This.fRuleSource = sb.toString();
335:
336: if (RuleBasedBreakIterator.fDebugEnv != null
337: && RuleBasedBreakIterator.fDebugEnv.indexOf("data") >= 0) {
338: This.dump();
339: }
340: return This;
341: }
342:
343: /** Debug function to display the break iterator data.
344: * @internal
345: */
346: void dump() {
347: System.out.println("RBBI Data Wrapper dump ...");
348: System.out.println();
349: System.out.println("Forward State Table");
350: dumpTable(fFTable);
351: System.out.println("Reverse State Table");
352: dumpTable(fRTable);
353: System.out.println("Forward Safe Points Table");
354: dumpTable(fSFTable);
355: System.out.println("Reverse Safe Points Table");
356: dumpTable(fSRTable);
357:
358: dumpCharCategories();
359: System.out.println("Source Rules: " + fRuleSource);
360:
361: }
362:
363: /** Fixed width int-to-string conversion.
364: * @internal
365: *
366: */
367: static public String intToString(int n, int width) {
368: StringBuffer dest = new StringBuffer(width);
369: dest.append(n);
370: while (dest.length() < width) {
371: dest.insert(0, ' ');
372: }
373: return dest.toString();
374: }
375:
376: /** Fixed width int-to-string conversion.
377: * @internal
378: *
379: */
380: static public String intToHexString(int n, int width) {
381: StringBuffer dest = new StringBuffer(width);
382: dest.append(Integer.toHexString(n));
383: while (dest.length() < width) {
384: dest.insert(0, ' ');
385: }
386: return dest.toString();
387: }
388:
389: /** Dump a state table. (A full set of RBBI rules has 4 state tables.) */
390: private void dumpTable(short table[]) {
391: if (table == null) {
392: System.out.println(" -- null -- ");
393: } else {
394: int n;
395: int state;
396: String header = " Row Acc Look Tag";
397: for (n = 0; n < fHeader.fCatCount; n++) {
398: header += intToString(n, 5);
399: }
400: System.out.println(header);
401: for (n = 0; n < header.length(); n++) {
402: System.out.print("-");
403: }
404: System.out.println();
405: for (state = 0; state < getNumStates(table); state++) {
406: dumpRow(table, state);
407: }
408: System.out.println();
409: }
410: }
411:
412: /**
413: * Dump (for debug) a single row of an RBBI state table
414: * @param table
415: * @param state
416: * @internal
417: */
418: private void dumpRow(short table[], int state) {
419: StringBuffer dest = new StringBuffer(fHeader.fCatCount * 5 + 20);
420: dest.append(intToString(state, 4));
421: int row = getRowIndex(state);
422: if (table[row + ACCEPTING] != 0) {
423: dest.append(intToString(table[row + ACCEPTING], 5));
424: } else {
425: dest.append(" ");
426: }
427: if (table[row + LOOKAHEAD] != 0) {
428: dest.append(intToString(table[row + LOOKAHEAD], 5));
429: } else {
430: dest.append(" ");
431: }
432: dest.append(intToString(table[row + TAGIDX], 5));
433:
434: for (int col = 0; col < fHeader.fCatCount; col++) {
435: dest.append(intToString(table[row + NEXTSTATES + col], 5));
436: }
437:
438: System.out.println(dest);
439: }
440:
441: private void dumpCharCategories() {
442: int n = fHeader.fCatCount;
443: String catStrings[] = new String[n + 1];
444: int rangeStart = 0;
445: int rangeEnd = 0;
446: int lastCat = -1;
447: int char32;
448: int category;
449: int lastNewline[] = new int[n + 1];
450:
451: for (category = 0; category <= fHeader.fCatCount; category++) {
452: catStrings[category] = "";
453: }
454: System.out.println("\nCharacter Categories");
455: System.out.println("--------------------");
456: for (char32 = 0; char32 <= 0x10ffff; char32++) {
457: category = fTrie.getCodePointValue(char32);
458: category &= ~0x4000; // Mask off dictionary bit.
459: if (category < 0 || category > fHeader.fCatCount) {
460: System.out.println("Error, bad category "
461: + Integer.toHexString(category) + " for char "
462: + Integer.toHexString(char32));
463: break;
464: }
465: if (category == lastCat) {
466: rangeEnd = char32;
467: } else {
468: if (lastCat >= 0) {
469: if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
470: lastNewline[lastCat] = catStrings[lastCat]
471: .length() + 10;
472: catStrings[lastCat] += "\n ";
473: }
474:
475: catStrings[lastCat] += " "
476: + Integer.toHexString(rangeStart);
477: if (rangeEnd != rangeStart) {
478: catStrings[lastCat] += "-"
479: + Integer.toHexString(rangeEnd);
480: }
481: }
482: lastCat = category;
483: rangeStart = rangeEnd = char32;
484: }
485: }
486: catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
487: if (rangeEnd != rangeStart) {
488: catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
489: }
490:
491: for (category = 0; category <= fHeader.fCatCount; category++) {
492: System.out.println(intToString(category, 5) + " "
493: + catStrings[category]);
494: }
495: System.out.println();
496: }
497:
498: public static void main(String[] args) {
499: String s;
500: if (args.length == 0) {
501: s = "char";
502: } else {
503: s = args[0];
504: }
505: System.out.println("RBBIDataWrapper.main(" + s + ") ");
506:
507: String versionedName = ICUResourceBundle.ICU_BUNDLE + "/" + s
508: + ".brk";
509:
510: try {
511: RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
512: This.dump();
513: } catch (Exception e) {
514: System.out.println("Exception: " + e.toString());
515: }
516:
517: }
518:
519: }
|