001: /*
002: *******************************************************************************
003: * Copyright (C) 2003-2006,
004: * International Business Machines Corporation and others. All Rights Reserved.
005: *******************************************************************************
006: */
007:
008: package com.ibm.icu.text;
009:
010: import java.util.List;
011: import java.util.ArrayList;
012: import java.util.Iterator;
013: import java.io.OutputStream;
014: import java.io.IOException;
015:
016: import com.ibm.icu.impl.Assert;
017: import com.ibm.icu.impl.CharTrie;
018: import com.ibm.icu.impl.Trie;
019: import com.ibm.icu.impl.IntTrieBuilder;
020:
021: //
022: // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
023: // (part of the rule building process.)
024: //
025: // Starting with the rules parse tree from the scanner,
026: //
027: // - Enumerate the set of UnicodeSets that are referenced
028: // by the RBBI rules.
029: // - compute a set of non-overlapping character ranges
030: // with all characters within a range belonging to the same
031: // set of input uniocde sets.
032: // - Derive a set of non-overlapping UnicodeSet (like things)
033: // that will correspond to columns in the state table for
034: // the RBBI execution engine. All characters within one
035: // of these sets belong to the same set of the original
036: // UnicodeSets from the user's rules.
037: // - construct the trie table that maps input characters
038: // to the index of the matching non-overlapping set of set from
039: // the previous step.
040: //
041: class RBBISetBuilder {
042: static class RangeDescriptor {
043: int fStartChar; // Start of range, unicode 32 bit value.
044: int fEndChar; // End of range, unicode 32 bit value.
045: int fNum; // runtime-mapped input value for this range.
046: List fIncludesSets; // vector of the the original
047: // Unicode sets that include this range.
048: // (Contains ptrs to uset nodes)
049: RangeDescriptor fNext; // Next RangeDescriptor in the linked list.
050:
051: RangeDescriptor() {
052: fIncludesSets = new ArrayList();
053: }
054:
055: RangeDescriptor(RangeDescriptor other) {
056: fStartChar = other.fStartChar;
057: fEndChar = other.fEndChar;
058: fNum = other.fNum;
059: fIncludesSets = new ArrayList(other.fIncludesSets);
060: }
061:
062: //-------------------------------------------------------------------------------------
063: //
064: // RangeDesriptor::split()
065: //
066: //-------------------------------------------------------------------------------------
067: void split(int where) {
068: Assert.assrt(where > fStartChar && where <= fEndChar);
069: RangeDescriptor nr = new RangeDescriptor(this );
070:
071: // RangeDescriptor copy constructor copies all fields.
072: // Only need to update those that are different after the split.
073: nr.fStartChar = where;
074: this .fEndChar = where - 1;
075: nr.fNext = this .fNext;
076: this .fNext = nr;
077:
078: // TODO: fIncludesSets is not updated. Check it out.
079: // Probably because they haven't been populated yet,
080: // but still sloppy.
081: }
082:
083: //-------------------------------------------------------------------------------------
084: //
085: // RangeDescriptor::setDictionaryFlag
086: //
087: // Character Category Numbers that include characters from
088: // the original Unicode Set named "dictionary" have bit 14
089: // set to 1. The RBBI runtime engine uses this to trigger
090: // use of the word dictionary.
091: //
092: // This function looks through the Unicode Sets that it
093: // (the range) includes, and sets the bit in fNum when
094: // "dictionary" is among them.
095: //
096: // TODO: a faster way would be to find the set node for
097: // "dictionary" just once, rather than looking it
098: // up by name every time.
099: //
100: // -------------------------------------------------------------------------------------
101: void setDictionaryFlag() {
102: int i;
103:
104: for (i = 0; i < this .fIncludesSets.size(); i++) {
105: RBBINode usetNode = (RBBINode) fIncludesSets.get(i);
106: String setName = "";
107: RBBINode setRef = usetNode.fParent;
108: if (setRef != null) {
109: RBBINode varRef = setRef.fParent;
110: if (varRef != null
111: && varRef.fType == RBBINode.varRef) {
112: setName = varRef.fText;
113: }
114: }
115: if (setName.equals("dictionary")) {
116: this .fNum |= 0x4000;
117: break;
118: }
119: }
120:
121: };
122: }
123:
124: RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us.
125: RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors
126:
127: IntTrieBuilder fTrie; // The mapping TRIE that is the end result of processing
128: int fTrieSize; // the Unicode Sets.
129:
130: // Groups correspond to character categories -
131: // groups of ranges that are in the same original UnicodeSets.
132: // fGroupCount is the index of the last used group.
133: // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
134: // State table column 0 is not used. Column 1 is for end-of-input.
135: // column 2 is for group 0. Funny counting.
136: int fGroupCount;
137:
138: boolean fSawBOF;
139:
140: //------------------------------------------------------------------------
141: //
142: // RBBISetBuilder Constructor
143: //
144: //------------------------------------------------------------------------
145: RBBISetBuilder(RBBIRuleBuilder rb) {
146: fRB = rb;
147: }
148:
149: //------------------------------------------------------------------------
150: //
151: // build Build the list of non-overlapping character ranges
152: // from the Unicode Sets.
153: //
154: //------------------------------------------------------------------------
155: void build() {
156: RBBINode usetNode;
157: RangeDescriptor rlRange;
158:
159: if (fRB.fDebugEnv != null
160: && fRB.fDebugEnv.indexOf("usets") >= 0) {
161: printSets();
162: }
163:
164: // Initialize the process by creating a single range encompassing all characters
165: // that is in no sets.
166: //
167: fRangeList = new RangeDescriptor();
168: fRangeList.fStartChar = 0;
169: fRangeList.fEndChar = 0x10ffff;
170:
171: //
172: // Find the set of non-overlapping ranges of characters
173: //
174: Iterator ni = fRB.fUSetNodes.iterator();
175: while (ni.hasNext()) {
176: usetNode = (RBBINode) ni.next();
177:
178: UnicodeSet inputSet = usetNode.fInputSet;
179: int inputSetRangeCount = inputSet.getRangeCount();
180: int inputSetRangeIndex = 0;
181: rlRange = fRangeList;
182:
183: for (;;) {
184: if (inputSetRangeIndex >= inputSetRangeCount) {
185: break;
186: }
187: int inputSetRangeBegin = inputSet
188: .getRangeStart(inputSetRangeIndex);
189: int inputSetRangeEnd = inputSet
190: .getRangeEnd(inputSetRangeIndex);
191:
192: // skip over ranges from the range list that are completely
193: // below the current range from the input unicode set.
194: while (rlRange.fEndChar < inputSetRangeBegin) {
195: rlRange = rlRange.fNext;
196: }
197:
198: // If the start of the range from the range list is before with
199: // the start of the range from the unicode set, split the range list range
200: // in two, with one part being before (wholly outside of) the unicode set
201: // and the other containing the rest.
202: // Then continue the loop; the post-split current range will then be skipped
203: // over
204: if (rlRange.fStartChar < inputSetRangeBegin) {
205: rlRange.split(inputSetRangeBegin);
206: continue;
207: }
208:
209: // Same thing at the end of the ranges...
210: // If the end of the range from the range list doesn't coincide with
211: // the end of the range from the unicode set, split the range list
212: // range in two. The first part of the split range will be
213: // wholly inside the Unicode set.
214: if (rlRange.fEndChar > inputSetRangeEnd) {
215: rlRange.split(inputSetRangeEnd + 1);
216: }
217:
218: // The current rlRange is now entirely within the UnicodeSet range.
219: // Add this unicode set to the list of sets for this rlRange
220: if (rlRange.fIncludesSets.indexOf(usetNode) == -1) {
221: rlRange.fIncludesSets.add(usetNode);
222: }
223:
224: // Advance over ranges that we are finished with.
225: if (inputSetRangeEnd == rlRange.fEndChar) {
226: inputSetRangeIndex++;
227: }
228: rlRange = rlRange.fNext;
229: }
230: }
231:
232: if (fRB.fDebugEnv != null
233: && fRB.fDebugEnv.indexOf("range") >= 0) {
234: printRanges();
235: }
236:
237: //
238: // Group the above ranges, with each group consisting of one or more
239: // ranges that are in exactly the same set of original UnicodeSets.
240: // The groups are numbered, and these group numbers are the set of
241: // input symbols recognized by the run-time state machine.
242: //
243: // Numbering: # 0 (state table column 0) is unused.
244: // # 1 is reserved - table column 1 is for end-of-input
245: // # 2 is reserved - table column 2 is for beginning-in-input
246: // # 3 is the first range list.
247: //
248: RangeDescriptor rlSearchRange;
249: for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
250: for (rlSearchRange = fRangeList; rlSearchRange != rlRange; rlSearchRange = rlSearchRange.fNext) {
251: if (rlRange.fIncludesSets
252: .equals(rlSearchRange.fIncludesSets)) {
253: rlRange.fNum = rlSearchRange.fNum;
254: break;
255: }
256: }
257: if (rlRange.fNum == 0) {
258: fGroupCount++;
259: rlRange.fNum = fGroupCount + 2;
260: rlRange.setDictionaryFlag();
261: addValToSets(rlRange.fIncludesSets, fGroupCount + 2);
262: }
263: }
264:
265: // Handle input sets that contain the special string {eof}.
266: // Column 1 of the state table is reserved for EOF on input.
267: // Column 2 is reserved for before-the-start-input.
268: // (This column can be optimized away later if there are no rule
269: // references to {bof}.)
270: // Add this column value (1 or 2) to the equivalent expression
271: // subtree for each UnicodeSet that contains the string {eof}
272: // Because {bof} and {eof} are not a characters in the normal sense,
273: // they doesn't affect the computation of ranges or TRIE.
274:
275: String eofString = "eof";
276: String bofString = "bof";
277:
278: ni = fRB.fUSetNodes.iterator();
279: while (ni.hasNext()) {
280: usetNode = (RBBINode) ni.next();
281: UnicodeSet inputSet = usetNode.fInputSet;
282: if (inputSet.contains(eofString)) {
283: addValToSet(usetNode, 1);
284: }
285: if (inputSet.contains(bofString)) {
286: addValToSet(usetNode, 2);
287: fSawBOF = true;
288: }
289: }
290:
291: if (fRB.fDebugEnv != null
292: && fRB.fDebugEnv.indexOf("rgroup") >= 0) {
293: printRangeGroups();
294: }
295: if (fRB.fDebugEnv != null
296: && fRB.fDebugEnv.indexOf("esets") >= 0) {
297: printSets();
298: }
299:
300: //IntTrieBuilder(int aliasdata[], int maxdatalength,
301: // int initialvalue, int leadunitvalue,
302: // boolean latin1linear)
303:
304: fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one)
305: 100000, // Max Data Length
306: 0, // Initial value for all code points
307: 0, // Lead Surrogate unit value,
308: true); // Keep Latin 1 in separately.
309:
310: for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
311: fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar + 1,
312: rlRange.fNum, true);
313: }
314: }
315:
316: //-----------------------------------------------------------------------------------
317: //
318: // RBBIDataManipulate A little internal class needed only to wrap of the
319: // getFoldedValue() function needed for Trie table creation.
320: //
321: //-----------------------------------------------------------------------------------
322: class RBBIDataManipulate implements IntTrieBuilder.DataManipulate {
323: public int getFoldedValue(int start, int offset) {
324: int value;
325: int limit;
326: boolean[] inBlockZero = new boolean[1];
327:
328: limit = start + 0x400;
329: while (start < limit) {
330: value = fTrie.getValue(start, inBlockZero);
331: if (inBlockZero[0]) {
332: start += IntTrieBuilder.DATA_BLOCK_LENGTH;
333: } else if (value != 0) {
334: return offset | 0x08000;
335: } else {
336: ++start;
337: }
338: }
339: return 0;
340: }
341: }
342:
343: RBBIDataManipulate dm = new RBBIDataManipulate();
344:
345: //-----------------------------------------------------------------------------------
346: //
347: // getTrieSize() Return the size that will be required to serialize the Trie.
348: //
349: //-----------------------------------------------------------------------------------
350: int getTrieSize() {
351: int size = 0;
352: try {
353: // The trie serialize function returns the size of the data written.
354: // null output stream says give size only, don't actually write anything.
355: size = fTrie.serialize(null, true, dm);
356: } catch (IOException e) {
357: Assert.assrt(false);
358: }
359: return size;
360: }
361:
362: //-----------------------------------------------------------------------------------
363: //
364: // serializeTrie() Write the serialized trie to an output stream
365: //
366: //-----------------------------------------------------------------------------------
367: void serializeTrie(OutputStream os) throws IOException {
368: fTrie.serialize(os, true, dm);
369: }
370:
371: //------------------------------------------------------------------------
372: //
373: // addValToSets Add a runtime-mapped input value to each uset from a
374: // list of uset nodes. (val corresponds to a state table column.)
375: // For each of the original Unicode sets - which correspond
376: // directly to uset nodes - a logically equivalent expression
377: // is constructed in terms of the remapped runtime input
378: // symbol set. This function adds one runtime input symbol to
379: // a list of sets.
380: //
381: // The "logically equivalent expression" is the tree for an
382: // or-ing together of all of the symbols that go into the set.
383: //
384: //------------------------------------------------------------------------
385: void addValToSets(List sets, int val) {
386: int ix;
387:
388: for (ix = 0; ix < sets.size(); ix++) {
389: RBBINode usetNode = (RBBINode) sets.get(ix);
390: addValToSet(usetNode, val);
391: }
392: }
393:
394: void addValToSet(RBBINode usetNode, int val) {
395: RBBINode leafNode = new RBBINode(RBBINode.leafChar);
396: leafNode.fVal = val;
397: if (usetNode.fLeftChild == null) {
398: usetNode.fLeftChild = leafNode;
399: leafNode.fParent = usetNode;
400: } else {
401: // There are already input symbols present for this set.
402: // Set up an OR node, with the previous stuff as the left child
403: // and the new value as the right child.
404: RBBINode orNode = new RBBINode(RBBINode.opOr);
405: orNode.fLeftChild = usetNode.fLeftChild;
406: orNode.fRightChild = leafNode;
407: orNode.fLeftChild.fParent = orNode;
408: orNode.fRightChild.fParent = orNode;
409: usetNode.fLeftChild = orNode;
410: orNode.fParent = usetNode;
411: }
412: }
413:
414: //------------------------------------------------------------------------
415: //
416: // getNumCharCategories
417: //
418: //------------------------------------------------------------------------
419: int getNumCharCategories() {
420: return fGroupCount + 3;
421: }
422:
423: //------------------------------------------------------------------------
424: //
425: // sawBOF
426: //
427: //------------------------------------------------------------------------
428: boolean sawBOF() {
429: return fSawBOF;
430: }
431:
432: //------------------------------------------------------------------------
433: //
434: // getFirstChar Given a runtime RBBI character category, find
435: // the first UChar32 that is in the set of chars
436: // in the category.
437: //------------------------------------------------------------------------
438: int getFirstChar(int category) {
439: RangeDescriptor rlRange;
440: int retVal = -1;
441: for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
442: if (rlRange.fNum == category) {
443: retVal = rlRange.fStartChar;
444: break;
445: }
446: }
447: return retVal;
448: }
449:
450: //------------------------------------------------------------------------
451: //
452: // printRanges A debugging function.
453: // dump out all of the range definitions.
454: //
455: //------------------------------------------------------------------------
456: void printRanges() {
457: RangeDescriptor rlRange;
458: int i;
459:
460: System.out.print("\n\n Nonoverlapping Ranges ...\n");
461: for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
462: System.out.print(" " + rlRange.fNum + " "
463: + (int) rlRange.fStartChar + "-"
464: + (int) rlRange.fEndChar);
465:
466: for (i = 0; i < rlRange.fIncludesSets.size(); i++) {
467: RBBINode usetNode = (RBBINode) rlRange.fIncludesSets
468: .get(i);
469: String setName = "anon";
470: RBBINode setRef = usetNode.fParent;
471: if (setRef != null) {
472: RBBINode varRef = setRef.fParent;
473: if (varRef != null
474: && varRef.fType == RBBINode.varRef) {
475: setName = varRef.fText;
476: }
477: }
478: System.out.print(setName);
479: System.out.print(" ");
480: }
481: System.out.println("");
482: }
483: }
484:
485: //------------------------------------------------------------------------
486: //
487: // printRangeGroups A debugging function.
488: // dump out all of the range groups.
489: //
490: //------------------------------------------------------------------------
491: void printRangeGroups() {
492: RangeDescriptor rlRange;
493: RangeDescriptor tRange;
494: int i;
495: int lastPrintedGroupNum = 0;
496:
497: System.out
498: .print("\nRanges grouped by Unicode Set Membership...\n");
499: for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
500: int groupNum = rlRange.fNum & 0xbfff;
501: if (groupNum > lastPrintedGroupNum) {
502: lastPrintedGroupNum = groupNum;
503: if (groupNum < 10) {
504: System.out.print(" ");
505: }
506: System.out.print(groupNum + " ");
507:
508: if ((rlRange.fNum & 0x4000) != 0) {
509: System.out.print(" <DICT> ");
510: }
511:
512: for (i = 0; i < rlRange.fIncludesSets.size(); i++) {
513: RBBINode usetNode = (RBBINode) rlRange.fIncludesSets
514: .get(i);
515: String setName = "anon";
516: RBBINode setRef = usetNode.fParent;
517: if (setRef != null) {
518: RBBINode varRef = setRef.fParent;
519: if (varRef != null
520: && varRef.fType == RBBINode.varRef) {
521: setName = varRef.fText;
522: }
523: }
524: System.out.print(setName);
525: System.out.print(" ");
526: }
527:
528: i = 0;
529: for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
530: if (tRange.fNum == rlRange.fNum) {
531: if (i++ % 5 == 0) {
532: System.out.print("\n ");
533: }
534: RBBINode.printHex((int) tRange.fStartChar, -1);
535: System.out.print("-");
536: RBBINode.printHex((int) tRange.fEndChar, 0);
537: }
538: }
539: System.out.print("\n");
540: }
541: }
542: System.out.print("\n");
543: }
544:
545: //------------------------------------------------------------------------
546: //
547: // printSets A debugging function.
548: // dump out all of the set definitions.
549: //
550: //------------------------------------------------------------------------
551: void printSets() {
552: int i;
553: System.out.print("\n\nUnicode Sets List\n------------------\n");
554: for (i = 0; i < fRB.fUSetNodes.size(); i++) {
555: RBBINode usetNode;
556: RBBINode setRef;
557: RBBINode varRef;
558: String setName;
559:
560: usetNode = (RBBINode) fRB.fUSetNodes.get(i);
561:
562: //System.out.print(" " + i + " ");
563: RBBINode.printInt(2, i);
564: setName = "anonymous";
565: setRef = usetNode.fParent;
566: if (setRef != null) {
567: varRef = setRef.fParent;
568: if (varRef != null && varRef.fType == RBBINode.varRef) {
569: setName = varRef.fText;
570: }
571: }
572: System.out.print(" " + setName);
573: System.out.print(" ");
574: System.out.print(usetNode.fText);
575: System.out.print("\n");
576: if (usetNode.fLeftChild != null) {
577: usetNode.fLeftChild.printTree(true);
578: }
579: }
580: System.out.print("\n");
581: }
582:
583: }
|