001: //
002: // Copyright (C) 2002-2006, International Business Machines Corporation and others.
003: // All Rights Reserved.
004: //
005: //
006:
007: package com.ibm.icu.text;
008:
009: import java.util.HashMap;
010: import java.util.List;
011: import java.util.ArrayList;
012: import java.util.Map;
013: import java.io.OutputStream;
014: import java.io.DataOutputStream;
015: import java.io.IOException;
016: import com.ibm.icu.impl.Assert;
017:
018: class RBBIRuleBuilder {
019: // This is the main class for building (compiling) break rules into the tables
020: // required by the runtime RBBI engine.
021: //
022:
023: String fDebugEnv; // controls debug trace output
024: String fRules; // The rule string that we are compiling
025: RBBIRuleScanner fScanner; // The scanner.
026:
027: //
028: // There are four separate parse trees generated, one for each of the
029: // forward rules, reverse rules, safe forward rules and safe reverse rules.
030: // This array references the root of each of the trees.
031: //
032: RBBINode[] fTreeRoots = new RBBINode[4];
033: static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
034: static final int fReverseTree = 1; // for each of the trees.
035: static final int fSafeFwdTree = 2; // (in C, these are pointer variables and
036: static final int fSafeRevTree = 3; // there is no array.)
037: int fDefaultTree = fForwardTree; // For rules not qualified with a !
038: // the tree to which they belong to.
039:
040: boolean fChainRules; // True for chained Unicode TR style rules.
041: // False for traditional regexp rules.
042:
043: boolean fLBCMNoChain; // True: suppress chaining of rules on
044: // chars with LineBreak property == CM.
045:
046: boolean fLookAheadHardBreak; // True: Look ahead matches cause an
047: // immediate break, no continuing for the
048: // longest match.
049:
050: RBBISetBuilder fSetBuilder; // Set and Character Category builder.
051: List fUSetNodes; // Vector of all uset nodes.
052: RBBITableBuilder fForwardTables; // State transition tables
053: RBBITableBuilder fReverseTables;
054: RBBITableBuilder fSafeFwdTables;
055: RBBITableBuilder fSafeRevTables;
056:
057: //
058: // Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
059: //
060: Map fStatusSets = new HashMap(); // Status value sets encountered so far.
061: // Map Key is the set of values.
062: // Map Value is the runtime array index.
063:
064: List fRuleStatusVals; // List of Integer objects. Has same layout as the
065: // runtime array of status (tag) values -
066: // number of values in group 1
067: // first status value in group 1
068: // 2nd status value in group 1
069: // ...
070: // number of values in group 2
071: // first status value in group 2
072: // etc.
073: //
074: // Error codes from ICU4C.
075: // using these simplified the porting, and consolidated the
076: // creation of Java exceptions
077: //
078: static final int U_BRK_ERROR_START = 0x10200;
079: /**< Start of codes indicating Break Iterator failures */
080:
081: static final int U_BRK_INTERNAL_ERROR = 0x10201;
082: /**< An internal error (bug) was detected. */
083:
084: static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202;
085: /**< Hex digits expected as part of a escaped char in a rule. */
086:
087: static final int U_BRK_SEMICOLON_EXPECTED = 0x10203;
088: /**< Missing ';' at the end of a RBBI rule. */
089:
090: static final int U_BRK_RULE_SYNTAX = 0x10204;
091: /**< Syntax error in RBBI rule. */
092:
093: static final int U_BRK_UNCLOSED_SET = 0x10205;
094: /**< UnicodeSet witing an RBBI rule missing a closing ']'. */
095:
096: static final int U_BRK_ASSIGN_ERROR = 0x10206;
097: /**< Syntax error in RBBI rule assignment statement. */
098:
099: static final int U_BRK_VARIABLE_REDFINITION = 0x10207;
100: /**< RBBI rule $Variable redefined. */
101:
102: static final int U_BRK_MISMATCHED_PAREN = 0x10208;
103: /**< Mis-matched parentheses in an RBBI rule. */
104:
105: static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209;
106: /**< Missing closing quote in an RBBI rule. */
107:
108: static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a;
109: /**< Use of an undefined $Variable in an RBBI rule. */
110:
111: static final int U_BRK_INIT_ERROR = 0x1020b;
112: /**< Initialization failure. Probable missing ICU Data. */
113:
114: static final int U_BRK_RULE_EMPTY_SET = 0x1020c;
115: /**< Rule contains an empty Unicode Set. */
116:
117: static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d;
118: /**< !!option in RBBI rules not recognized. */
119:
120: static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e;
121: /**< The {nnn} tag on a rule is mal formed */
122: static final int U_BRK_MALFORMED_SET = 0x1020f;
123:
124: static final int U_BRK_ERROR_LIMIT = 0x10210;
125:
126: /**< This must always be the last value to indicate the limit for Break Iterator failures */
127:
128: //----------------------------------------------------------------------------------------
129: //
130: // Constructor.
131: //
132: //----------------------------------------------------------------------------------------
133: RBBIRuleBuilder(String rules) {
134: fDebugEnv = System.getProperty("U_RBBIDEBUG");
135: fRules = rules;
136: fUSetNodes = new ArrayList();
137: fRuleStatusVals = new ArrayList();
138: fScanner = new RBBIRuleScanner(this );
139: fSetBuilder = new RBBISetBuilder(this );
140: }
141:
142: //----------------------------------------------------------------------------------------
143: //
144: // flattenData() - Collect up the compiled RBBI rule data and put it into
145: // the format for saving in ICU data files,
146: //
147: // See the ICU4C file common/rbidata.h for a detailed description.
148: //
149: //----------------------------------------------------------------------------------------
150: static final int align8(int i) {
151: return (i + 7) & 0xfffffff8;
152: }
153:
154: void flattenData(OutputStream os) throws IOException {
155: DataOutputStream dos = new DataOutputStream(os);
156: int i;
157:
158: // Remove comments and whitespace from the rules to make it smaller.
159: String strippedRules = RBBIRuleScanner.stripRules(fRules);
160:
161: // Calculate the size of each section in the data in bytes.
162: // Sizes here are padded up to a multiple of 8 for better memory alignment.
163: // Sections sizes actually stored in the header are for the actual data
164: // without the padding.
165: //
166: int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader));
167: int forwardTableSize = align8(fForwardTables.getTableSize());
168: int reverseTableSize = align8(fReverseTables.getTableSize());
169: int safeFwdTableSize = align8(fSafeFwdTables.getTableSize());
170: int safeRevTableSize = align8(fSafeRevTables.getTableSize());
171: int trieSize = align8(fSetBuilder.getTrieSize());
172: int statusTableSize = align8(fRuleStatusVals.size() * 4);
173: int rulesSize = align8((strippedRules.length()) * 2);
174: int totalSize = headerSize + forwardTableSize
175: + reverseTableSize + safeFwdTableSize
176: + safeRevTableSize + statusTableSize + trieSize
177: + rulesSize;
178: int outputPos = 0; // Track stream position, starting from RBBIDataHeader.
179:
180: //
181: // Write out an ICU Data Header
182: // TODO: actually create a real header, rather than just a placeholder.
183: // The empty placeholder is ok for compile-and-go from within ICU4J.
184: // Replicating the ICU4C genbrk tool for building .brk resources would need a real header.
185: //
186: byte[] ICUDataHeader = new byte[0x80];
187: dos.write(ICUDataHeader);
188:
189: //
190: // Write out the RBBIDataHeader
191: //
192: int[] header = new int[RBBIDataWrapper.DH_SIZE]; // sizeof struct RBBIDataHeader
193: header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0;
194: header[RBBIDataWrapper.DH_FORMATVERSION] = 0x03010000; // uint8_t fFormatVersion[4];
195: header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
196: header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder
197: .getNumCharCategories(); // fCatCount.
198: header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
199: header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen
200: header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE]
201: + forwardTableSize; // fRTable
202: header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen
203: header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE]
204: + reverseTableSize; // fSTable
205: header[RBBIDataWrapper.DH_SFTABLELEN] = safeFwdTableSize; // fSTableLen
206: header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE]
207: + safeFwdTableSize; // fSRTable
208: header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize; // fSRTableLen
209: header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE]
210: + safeRevTableSize; // fTrie
211: header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
212: header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
213: + header[RBBIDataWrapper.DH_TRIELEN];
214: header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen
215: header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE]
216: + statusTableSize;
217: header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules
218: .length() * 2;
219: for (i = 0; i < header.length; i++) {
220: dos.writeInt(header[i]);
221: outputPos += 4;
222: }
223:
224: // Write out the actual state tables.
225: short[] tableData;
226: tableData = fForwardTables.exportTable();
227: Assert.assrt(outputPos == header[4]);
228: for (i = 0; i < tableData.length; i++) {
229: dos.writeShort(tableData[i]);
230: outputPos += 2;
231: }
232:
233: tableData = fReverseTables.exportTable();
234: Assert.assrt(outputPos == header[6]);
235: for (i = 0; i < tableData.length; i++) {
236: dos.writeShort(tableData[i]);
237: outputPos += 2;
238: }
239:
240: Assert.assrt(outputPos == header[8]);
241: tableData = fSafeFwdTables.exportTable();
242: for (i = 0; i < tableData.length; i++) {
243: dos.writeShort(tableData[i]);
244: outputPos += 2;
245: }
246:
247: Assert.assrt(outputPos == header[10]);
248: tableData = fSafeRevTables.exportTable();
249: for (i = 0; i < tableData.length; i++) {
250: dos.writeShort(tableData[i]);
251: outputPos += 2;
252: }
253:
254: // write out the Trie table
255: Assert.assrt(outputPos == header[12]);
256: fSetBuilder.serializeTrie(os);
257: outputPos += header[13];
258: while (outputPos % 8 != 0) { // pad to an 8 byte boundary
259: dos.write(0);
260: outputPos += 1;
261: }
262:
263: // Write out the status {tag} table.
264: Assert.assrt(outputPos == header[16]);
265: for (i = 0; i < fRuleStatusVals.size(); i++) {
266: Integer val = (Integer) fRuleStatusVals.get(i);
267: dos.writeInt(val.intValue());
268: outputPos += 4;
269: }
270:
271: while (outputPos % 8 != 0) { // pad to an 8 byte boundary
272: dos.write(0);
273: outputPos += 1;
274: }
275:
276: // Write out the stripped rules (rules with extra spaces removed
277: // These go last in the data area, even though they are not last in the header.
278: Assert.assrt(outputPos == header[14]);
279: dos.writeChars(strippedRules);
280: outputPos += strippedRules.length() * 2;
281: while (outputPos % 8 != 0) { // pad to an 8 byte boundary
282: dos.write(0);
283: outputPos += 1;
284: }
285: }
286:
287: //----------------------------------------------------------------------------------------
288: //
289: // compileRules compile source rules, placing the compiled form into a output stream
290: // The compiled form is identical to that from ICU4C (Big Endian).
291: //
292: //----------------------------------------------------------------------------------------
293: static void compileRules(String rules, OutputStream os)
294: throws IOException {
295: //
296: // Read the input rules, generate a parse tree, symbol table,
297: // and list of all Unicode Sets referenced by the rules.
298: //
299: RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
300: builder.fScanner.parse();
301:
302: //
303: // UnicodeSet processing.
304: // Munge the Unicode Sets to create a set of character categories.
305: // Generate the mapping tables (TRIE) from input 32-bit characters to
306: // the character categories.
307: //
308: builder.fSetBuilder.build();
309:
310: //
311: // Generate the DFA state transition table.
312: //
313: builder.fForwardTables = new RBBITableBuilder(builder,
314: fForwardTree);
315: builder.fReverseTables = new RBBITableBuilder(builder,
316: fReverseTree);
317: builder.fSafeFwdTables = new RBBITableBuilder(builder,
318: fSafeFwdTree);
319: builder.fSafeRevTables = new RBBITableBuilder(builder,
320: fSafeRevTree);
321: builder.fForwardTables.build();
322: builder.fReverseTables.build();
323: builder.fSafeFwdTables.build();
324: builder.fSafeRevTables.build();
325: if (builder.fDebugEnv != null
326: && builder.fDebugEnv.indexOf("states") >= 0) {
327: builder.fForwardTables.printRuleStatusTable();
328: }
329:
330: //
331: // Package up the compiled data, writing it to an output stream
332: // in the serialization format. This is the same as the ICU4C runtime format.
333: //
334: builder.flattenData(os);
335: }
336: }
|