001: /*
002: * Copyright 1999-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package com.lowagie.text.pdf.hyphenation;
018:
019: import java.io.Serializable;
020: import java.util.Enumeration;
021: import java.util.Stack;
022:
023: /**
024: * <h2>Ternary Search Tree.</h2>
025: *
026: * <p>A ternary search tree is a hibrid between a binary tree and
027: * a digital search tree (trie). Keys are limited to strings.
028: * A data value of type char is stored in each leaf node.
029: * It can be used as an index (or pointer) to the data.
030: * Branches that only contain one key are compressed to one node
031: * by storing a pointer to the trailer substring of the key.
032: * This class is intended to serve as base class or helper class
033: * to implement Dictionary collections or the like. Ternary trees
034: * have some nice properties as the following: the tree can be
035: * traversed in sorted order, partial matches (wildcard) can be
036: * implemented, retrieval of all keys within a given distance
037: * from the target, etc. The storage requirements are higher than
038: * a binary tree but a lot less than a trie. Performance is
039: * comparable with a hash table, sometimes it outperforms a hash
040: * function (most of the time can determine a miss faster than a hash).</p>
041: *
042: * <p>The main purpose of this java port is to serve as a base for
043: * implementing TeX's hyphenation algorithm (see The TeXBook,
044: * appendix H). Each language requires from 5000 to 15000 hyphenation
045: * patterns which will be keys in this tree. The strings patterns
046: * are usually small (from 2 to 5 characters), but each char in the
047: * tree is stored in a node. Thus memory usage is the main concern.
048: * We will sacrify 'elegance' to keep memory requirenments to the
049: * minimum. Using java's char type as pointer (yes, I know pointer
050: * it is a forbidden word in java) we can keep the size of the node
051: * to be just 8 bytes (3 pointers and the data char). This gives
052: * room for about 65000 nodes. In my tests the english patterns
053: * took 7694 nodes and the german patterns 10055 nodes,
054: * so I think we are safe.</p>
055: *
056: * <p>All said, this is a map with strings as keys and char as value.
057: * Pretty limited!. It can be extended to a general map by
058: * using the string representation of an object and using the
059: * char value as an index to an array that contains the object
060: * values.</p>
061: *
062: * @author cav@uniscope.co.jp
063: */
064:
065: public class TernaryTree implements Cloneable, Serializable {
066:
067: /**
068: * We use 4 arrays to represent a node. I guess I should have created
069: * a proper node class, but somehow Knuth's pascal code made me forget
070: * we now have a portable language with virtual memory management and
071: * automatic garbage collection! And now is kind of late, furthermore,
072: * if it ain't broken, don't fix it.
073: */
074:
075: private static final long serialVersionUID = 5313366505322983510L;
076:
077: /**
078: * Pointer to low branch and to rest of the key when it is
079: * stored directly in this node, we don't have unions in java!
080: */
081: protected char[] lo;
082:
083: /**
084: * Pointer to high branch.
085: */
086: protected char[] hi;
087:
088: /**
089: * Pointer to equal branch and to data when this node is a string terminator.
090: */
091: protected char[] eq;
092:
093: /**
094: * <P>The character stored in this node: splitchar.
095: * Two special values are reserved:</P>
096: * <ul><li>0x0000 as string terminator</li>
097: * <li>0xFFFF to indicate that the branch starting at
098: * this node is compressed</li></ul>
099: * <p>This shouldn't be a problem if we give the usual semantics to
100: * strings since 0xFFFF is garanteed not to be an Unicode character.</p>
101: */
102: protected char[] sc;
103:
104: /**
105: * This vector holds the trailing of the keys when the branch is compressed.
106: */
107: protected CharVector kv;
108:
109: protected char root;
110: protected char freenode;
111: protected int length; // number of items in tree
112:
113: protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
114:
115: TernaryTree() {
116: init();
117: }
118:
119: protected void init() {
120: root = 0;
121: freenode = 1;
122: length = 0;
123: lo = new char[BLOCK_SIZE];
124: hi = new char[BLOCK_SIZE];
125: eq = new char[BLOCK_SIZE];
126: sc = new char[BLOCK_SIZE];
127: kv = new CharVector();
128: }
129:
130: /**
131: * Branches are initially compressed, needing
132: * one node per key plus the size of the string
133: * key. They are decompressed as needed when
134: * another key with same prefix
135: * is inserted. This saves a lot of space,
136: * specially for long keys.
137: */
138: public void insert(String key, char val) {
139: // make sure we have enough room in the arrays
140: int len = key.length() + 1; // maximum number of nodes that may be generated
141: if (freenode + len > eq.length) {
142: redimNodeArrays(eq.length + BLOCK_SIZE);
143: }
144: char strkey[] = new char[len--];
145: key.getChars(0, len, strkey, 0);
146: strkey[len] = 0;
147: root = insert(root, strkey, 0, val);
148: }
149:
150: public void insert(char[] key, int start, char val) {
151: int len = strlen(key) + 1;
152: if (freenode + len > eq.length) {
153: redimNodeArrays(eq.length + BLOCK_SIZE);
154: }
155: root = insert(root, key, start, val);
156: }
157:
158: /**
159: * The actual insertion function, recursive version.
160: */
161: private char insert(char p, char[] key, int start, char val) {
162: int len = strlen(key, start);
163: if (p == 0) {
164: // this means there is no branch, this node will start a new branch.
165: // Instead of doing that, we store the key somewhere else and create
166: // only one node with a pointer to the key
167: p = freenode++;
168: eq[p] = val; // holds data
169: length++;
170: hi[p] = 0;
171: if (len > 0) {
172: sc[p] = 0xFFFF; // indicates branch is compressed
173: lo[p] = (char) kv.alloc(len + 1); // use 'lo' to hold pointer to key
174: strcpy(kv.getArray(), lo[p], key, start);
175: } else {
176: sc[p] = 0;
177: lo[p] = 0;
178: }
179: return p;
180: }
181:
182: if (sc[p] == 0xFFFF) {
183: // branch is compressed: need to decompress
184: // this will generate garbage in the external key array
185: // but we can do some garbage collection later
186: char pp = freenode++;
187: lo[pp] = lo[p]; // previous pointer to key
188: eq[pp] = eq[p]; // previous pointer to data
189: lo[p] = 0;
190: if (len > 0) {
191: sc[p] = kv.get(lo[pp]);
192: eq[p] = pp;
193: lo[pp]++;
194: if (kv.get(lo[pp]) == 0) {
195: // key completly decompressed leaving garbage in key array
196: lo[pp] = 0;
197: sc[pp] = 0;
198: hi[pp] = 0;
199: } else {
200: // we only got first char of key, rest is still there
201: sc[pp] = 0xFFFF;
202: }
203: } else {
204: // In this case we can save a node by swapping the new node
205: // with the compressed node
206: sc[pp] = 0xFFFF;
207: hi[p] = pp;
208: sc[p] = 0;
209: eq[p] = val;
210: length++;
211: return p;
212: }
213: }
214: char s = key[start];
215: if (s < sc[p]) {
216: lo[p] = insert(lo[p], key, start, val);
217: } else if (s == sc[p]) {
218: if (s != 0) {
219: eq[p] = insert(eq[p], key, start + 1, val);
220: } else {
221: // key already in tree, overwrite data
222: eq[p] = val;
223: }
224: } else {
225: hi[p] = insert(hi[p], key, start, val);
226: }
227: return p;
228: }
229:
230: /**
231: * Compares 2 null terminated char arrays
232: */
233: public static int strcmp(char[] a, int startA, char[] b, int startB) {
234: for (; a[startA] == b[startB]; startA++, startB++) {
235: if (a[startA] == 0) {
236: return 0;
237: }
238: }
239: return a[startA] - b[startB];
240: }
241:
242: /**
243: * Compares a string with null terminated char array
244: */
245: public static int strcmp(String str, char[] a, int start) {
246: int i, d, len = str.length();
247: for (i = 0; i < len; i++) {
248: d = (int) str.charAt(i) - a[start + i];
249: if (d != 0) {
250: return d;
251: }
252: if (a[start + i] == 0) {
253: return d;
254: }
255: }
256: if (a[start + i] != 0) {
257: return (int) -a[start + i];
258: }
259: return 0;
260:
261: }
262:
263: public static void strcpy(char[] dst, int di, char[] src, int si) {
264: while (src[si] != 0) {
265: dst[di++] = src[si++];
266: }
267: dst[di] = 0;
268: }
269:
270: public static int strlen(char[] a, int start) {
271: int len = 0;
272: for (int i = start; i < a.length && a[i] != 0; i++) {
273: len++;
274: }
275: return len;
276: }
277:
278: public static int strlen(char[] a) {
279: return strlen(a, 0);
280: }
281:
282: public int find(String key) {
283: int len = key.length();
284: char strkey[] = new char[len + 1];
285: key.getChars(0, len, strkey, 0);
286: strkey[len] = 0;
287:
288: return find(strkey, 0);
289: }
290:
291: public int find(char[] key, int start) {
292: int d;
293: char p = root;
294: int i = start;
295: char c;
296:
297: while (p != 0) {
298: if (sc[p] == 0xFFFF) {
299: if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
300: return eq[p];
301: } else {
302: return -1;
303: }
304: }
305: c = key[i];
306: d = c - sc[p];
307: if (d == 0) {
308: if (c == 0) {
309: return eq[p];
310: }
311: i++;
312: p = eq[p];
313: } else if (d < 0) {
314: p = lo[p];
315: } else {
316: p = hi[p];
317: }
318: }
319: return -1;
320: }
321:
322: public boolean knows(String key) {
323: return (find(key) >= 0);
324: }
325:
326: // redimension the arrays
327: private void redimNodeArrays(int newsize) {
328: int len = newsize < lo.length ? newsize : lo.length;
329: char[] na = new char[newsize];
330: System.arraycopy(lo, 0, na, 0, len);
331: lo = na;
332: na = new char[newsize];
333: System.arraycopy(hi, 0, na, 0, len);
334: hi = na;
335: na = new char[newsize];
336: System.arraycopy(eq, 0, na, 0, len);
337: eq = na;
338: na = new char[newsize];
339: System.arraycopy(sc, 0, na, 0, len);
340: sc = na;
341: }
342:
343: public int size() {
344: return length;
345: }
346:
347: public Object clone() {
348: TernaryTree t = new TernaryTree();
349: t.lo = (char[]) this .lo.clone();
350: t.hi = (char[]) this .hi.clone();
351: t.eq = (char[]) this .eq.clone();
352: t.sc = (char[]) this .sc.clone();
353: t.kv = (CharVector) this .kv.clone();
354: t.root = this .root;
355: t.freenode = this .freenode;
356: t.length = this .length;
357:
358: return t;
359: }
360:
361: /**
362: * Recursively insert the median first and then the median of the
363: * lower and upper halves, and so on in order to get a balanced
364: * tree. The array of keys is assumed to be sorted in ascending
365: * order.
366: */
367: protected void insertBalanced(String[] k, char[] v, int offset,
368: int n) {
369: int m;
370: if (n < 1) {
371: return;
372: }
373: m = n >> 1;
374:
375: insert(k[m + offset], v[m + offset]);
376: insertBalanced(k, v, offset, m);
377:
378: insertBalanced(k, v, offset + m + 1, n - m - 1);
379: }
380:
381: /**
382: * Balance the tree for best search performance
383: */
384: public void balance() {
385: // System.out.print("Before root splitchar = "); System.out.println(sc[root]);
386:
387: int i = 0, n = length;
388: String[] k = new String[n];
389: char[] v = new char[n];
390: Iterator iter = new Iterator();
391: while (iter.hasMoreElements()) {
392: v[i] = iter.getValue();
393: k[i++] = (String) iter.nextElement();
394: }
395: init();
396: insertBalanced(k, v, 0, n);
397:
398: // With uniform letter distribution sc[root] should be around 'm'
399: // System.out.print("After root splitchar = "); System.out.println(sc[root]);
400: }
401:
402: /**
403: * Each node stores a character (splitchar) which is part of
404: * some key(s). In a compressed branch (one that only contain
405: * a single string key) the trailer of the key which is not
406: * already in nodes is stored externally in the kv array.
407: * As items are inserted, key substrings decrease.
408: * Some substrings may completely disappear when the whole
409: * branch is totally decompressed.
410: * The tree is traversed to find the key substrings actually
411: * used. In addition, duplicate substrings are removed using
412: * a map (implemented with a TernaryTree!).
413: *
414: */
415: public void trimToSize() {
416: // first balance the tree for best performance
417: balance();
418:
419: // redimension the node arrays
420: redimNodeArrays(freenode);
421:
422: // ok, compact kv array
423: CharVector kx = new CharVector();
424: kx.alloc(1);
425: TernaryTree map = new TernaryTree();
426: compact(kx, map, root);
427: kv = kx;
428: kv.trimToSize();
429: }
430:
431: private void compact(CharVector kx, TernaryTree map, char p) {
432: int k;
433: if (p == 0) {
434: return;
435: }
436: if (sc[p] == 0xFFFF) {
437: k = map.find(kv.getArray(), lo[p]);
438: if (k < 0) {
439: k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
440: strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
441: map.insert(kx.getArray(), k, (char) k);
442: }
443: lo[p] = (char) k;
444: } else {
445: compact(kx, map, lo[p]);
446: if (sc[p] != 0) {
447: compact(kx, map, eq[p]);
448: }
449: compact(kx, map, hi[p]);
450: }
451: }
452:
453: public Enumeration keys() {
454: return new Iterator();
455: }
456:
457: public class Iterator implements Enumeration {
458:
459: /**
460: * current node index
461: */
462: int cur;
463:
464: /**
465: * current key
466: */
467: String curkey;
468:
469: private class Item implements Cloneable {
470: char parent;
471: char child;
472:
473: public Item() {
474: parent = 0;
475: child = 0;
476: }
477:
478: public Item(char p, char c) {
479: parent = p;
480: child = c;
481: }
482:
483: public Object clone() {
484: return new Item(parent, child);
485: }
486:
487: }
488:
489: /**
490: * Node stack
491: */
492: Stack ns;
493:
494: /**
495: * key stack implemented with a StringBuffer
496: */
497: StringBuffer ks;
498:
499: public Iterator() {
500: cur = -1;
501: ns = new Stack();
502: ks = new StringBuffer();
503: rewind();
504: }
505:
506: public void rewind() {
507: ns.removeAllElements();
508: ks.setLength(0);
509: cur = root;
510: run();
511: }
512:
513: public Object nextElement() {
514: String res = curkey;
515: cur = up();
516: run();
517: return res;
518: }
519:
520: public char getValue() {
521: if (cur >= 0) {
522: return eq[cur];
523: }
524: return 0;
525: }
526:
527: public boolean hasMoreElements() {
528: return (cur != -1);
529: }
530:
531: /**
532: * traverse upwards
533: */
534: private int up() {
535: Item i = new Item();
536: int res = 0;
537:
538: if (ns.empty()) {
539: return -1;
540: }
541:
542: if (cur != 0 && sc[cur] == 0) {
543: return lo[cur];
544: }
545:
546: boolean climb = true;
547:
548: while (climb) {
549: i = (Item) ns.pop();
550: i.child++;
551: switch (i.child) {
552: case 1:
553: if (sc[i.parent] != 0) {
554: res = eq[i.parent];
555: ns.push(i.clone());
556: ks.append(sc[i.parent]);
557: } else {
558: i.child++;
559: ns.push(i.clone());
560: res = hi[i.parent];
561: }
562: climb = false;
563: break;
564:
565: case 2:
566: res = hi[i.parent];
567: ns.push(i.clone());
568: if (ks.length() > 0) {
569: ks.setLength(ks.length() - 1); // pop
570: }
571: climb = false;
572: break;
573:
574: default:
575: if (ns.empty()) {
576: return -1;
577: }
578: climb = true;
579: break;
580: }
581: }
582: return res;
583: }
584:
585: /**
586: * traverse the tree to find next key
587: */
588: private int run() {
589: if (cur == -1) {
590: return -1;
591: }
592:
593: boolean leaf = false;
594: while (true) {
595: // first go down on low branch until leaf or compressed branch
596: while (cur != 0) {
597: if (sc[cur] == 0xFFFF) {
598: leaf = true;
599: break;
600: }
601: ns.push(new Item((char) cur, '\u0000'));
602: if (sc[cur] == 0) {
603: leaf = true;
604: break;
605: }
606: cur = lo[cur];
607: }
608: if (leaf) {
609: break;
610: }
611: // nothing found, go up one node and try again
612: cur = up();
613: if (cur == -1) {
614: return -1;
615: }
616: }
617: // The current node should be a data node and
618: // the key should be in the key stack (at least partially)
619: StringBuffer buf = new StringBuffer(ks.toString());
620: if (sc[cur] == 0xFFFF) {
621: int p = lo[cur];
622: while (kv.get(p) != 0) {
623: buf.append(kv.get(p++));
624: }
625: }
626: curkey = buf.toString();
627: return 0;
628: }
629:
630: }
631:
632: public void printStats() {
633: System.out.println("Number of keys = "
634: + Integer.toString(length));
635: System.out
636: .println("Node count = " + Integer.toString(freenode));
637: // System.out.println("Array length = " + Integer.toString(eq.length));
638: System.out.println("Key Array length = "
639: + Integer.toString(kv.length()));
640:
641: /*
642: * for(int i=0; i<kv.length(); i++)
643: * if ( kv.get(i) != 0 )
644: * System.out.print(kv.get(i));
645: * else
646: * System.out.println("");
647: * System.out.println("Keys:");
648: * for(Enumeration enum = keys(); enum.hasMoreElements(); )
649: * System.out.println(enum.nextElement());
650: */
651:
652: }
653:
654: /* public static void main(String[] args) throws Exception {
655: TernaryTree tt = new TernaryTree();
656: tt.insert("Carlos", 'C');
657: tt.insert("Car", 'r');
658: tt.insert("palos", 'l');
659: tt.insert("pa", 'p');
660: tt.trimToSize();
661: System.out.println((char)tt.find("Car"));
662: System.out.println((char)tt.find("Carlos"));
663: System.out.println((char)tt.find("alto"));
664: tt.printStats();
665: }*/
666:
667: }
|