001: /*
002: * Copyright 1999-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: /* $Id: HyphenationTree.java 2623 2007-02-23 22:28:28Z xlv $ */
018:
019: package com.lowagie.text.pdf.hyphenation;
020:
021: import java.io.InputStream;
022: import java.util.ArrayList;
023: import java.util.HashMap;
024:
025: /**
026: * This tree structure stores the hyphenation patterns in an efficient
027: * way for fast lookup. It provides the provides the method to
028: * hyphenate a word.
029: *
030: * @author Carlos Villegas <cav@uniscope.co.jp>
031: */
032: public class HyphenationTree extends TernaryTree implements
033: PatternConsumer {
034:
035: private static final long serialVersionUID = -7763254239309429432L;
036:
037: /**
038: * value space: stores the inteletter values
039: */
040: protected ByteVector vspace;
041:
042: /**
043: * This map stores hyphenation exceptions
044: */
045: protected HashMap stoplist;
046:
047: /**
048: * This map stores the character classes
049: */
050: protected TernaryTree classmap;
051:
052: /**
053: * Temporary map to store interletter values on pattern loading.
054: */
055: private transient TernaryTree ivalues;
056:
057: public HyphenationTree() {
058: stoplist = new HashMap(23); // usually a small table
059: classmap = new TernaryTree();
060: vspace = new ByteVector();
061: vspace.alloc(1); // this reserves index 0, which we don't use
062: }
063:
064: /**
065: * Packs the values by storing them in 4 bits, two values into a byte
066: * Values range is from 0 to 9. We use zero as terminator,
067: * so we'll add 1 to the value.
068: * @param values a string of digits from '0' to '9' representing the
069: * interletter values.
070: * @return the index into the vspace array where the packed values
071: * are stored.
072: */
073: protected int packValues(String values) {
074: int i, n = values.length();
075: int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
076: int offset = vspace.alloc(m);
077: byte[] va = vspace.getArray();
078: for (i = 0; i < n; i++) {
079: int j = i >> 1;
080: byte v = (byte) ((values.charAt(i) - '0' + 1) & 0x0f);
081: if ((i & 1) == 1) {
082: va[j + offset] = (byte) (va[j + offset] | v);
083: } else {
084: va[j + offset] = (byte) (v << 4); // big endian
085: }
086: }
087: va[m - 1 + offset] = 0; // terminator
088: return offset;
089: }
090:
091: protected String unpackValues(int k) {
092: StringBuffer buf = new StringBuffer();
093: byte v = vspace.get(k++);
094: while (v != 0) {
095: char c = (char) ((v >>> 4) - 1 + '0');
096: buf.append(c);
097: c = (char) (v & 0x0f);
098: if (c == 0) {
099: break;
100: }
101: c = (char) (c - 1 + '0');
102: buf.append(c);
103: v = vspace.get(k++);
104: }
105: return buf.toString();
106: }
107:
108: public void loadSimplePatterns(InputStream stream) {
109: SimplePatternParser pp = new SimplePatternParser();
110: ivalues = new TernaryTree();
111:
112: pp.parse(stream, this );
113:
114: // patterns/values should be now in the tree
115: // let's optimize a bit
116: trimToSize();
117: vspace.trimToSize();
118: classmap.trimToSize();
119:
120: // get rid of the auxiliary map
121: ivalues = null;
122: }
123:
124: public String findPattern(String pat) {
125: int k = super .find(pat);
126: if (k >= 0) {
127: return unpackValues(k);
128: }
129: return "";
130: }
131:
132: /**
133: * String compare, returns 0 if equal or
134: * t is a substring of s
135: */
136: protected int hstrcmp(char[] s, int si, char[] t, int ti) {
137: for (; s[si] == t[ti]; si++, ti++) {
138: if (s[si] == 0) {
139: return 0;
140: }
141: }
142: if (t[ti] == 0) {
143: return 0;
144: }
145: return s[si] - t[ti];
146: }
147:
148: protected byte[] getValues(int k) {
149: StringBuffer buf = new StringBuffer();
150: byte v = vspace.get(k++);
151: while (v != 0) {
152: char c = (char) ((v >>> 4) - 1);
153: buf.append(c);
154: c = (char) (v & 0x0f);
155: if (c == 0) {
156: break;
157: }
158: c = (char) (c - 1);
159: buf.append(c);
160: v = vspace.get(k++);
161: }
162: byte[] res = new byte[buf.length()];
163: for (int i = 0; i < res.length; i++) {
164: res[i] = (byte) buf.charAt(i);
165: }
166: return res;
167: }
168:
169: /**
170: * <p>Search for all possible partial matches of word starting
171: * at index an update interletter values. In other words, it
172: * does something like:</p>
173: * <code>
174: * for(i=0; i<patterns.length; i++) {
175: * if ( word.substring(index).startsWidth(patterns[i]) )
176: * update_interletter_values(patterns[i]);
177: * }
178: * </code>
179: * <p>But it is done in an efficient way since the patterns are
180: * stored in a ternary tree. In fact, this is the whole purpose
181: * of having the tree: doing this search without having to test
182: * every single pattern. The number of patterns for languages
183: * such as English range from 4000 to 10000. Thus, doing thousands
184: * of string comparisons for each word to hyphenate would be
185: * really slow without the tree. The tradeoff is memory, but
186: * using a ternary tree instead of a trie, almost halves the
187: * the memory used by Lout or TeX. It's also faster than using
188: * a hash table</p>
189: * @param word null terminated word to match
190: * @param index start index from word
191: * @param il interletter values array to update
192: */
193: protected void searchPatterns(char[] word, int index, byte[] il) {
194: byte[] values;
195: int i = index;
196: char p, q;
197: char sp = word[i];
198: p = root;
199:
200: while (p > 0 && p < sc.length) {
201: if (sc[p] == 0xFFFF) {
202: if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
203: values = getValues(eq[p]); // data pointer is in eq[]
204: int j = index;
205: for (int k = 0; k < values.length; k++) {
206: if (j < il.length && values[k] > il[j]) {
207: il[j] = values[k];
208: }
209: j++;
210: }
211: }
212: return;
213: }
214: int d = sp - sc[p];
215: if (d == 0) {
216: if (sp == 0) {
217: break;
218: }
219: sp = word[++i];
220: p = eq[p];
221: q = p;
222:
223: // look for a pattern ending at this position by searching for
224: // the null char ( splitchar == 0 )
225: while (q > 0 && q < sc.length) {
226: if (sc[q] == 0xFFFF) { // stop at compressed branch
227: break;
228: }
229: if (sc[q] == 0) {
230: values = getValues(eq[q]);
231: int j = index;
232: for (int k = 0; k < values.length; k++) {
233: if (j < il.length && values[k] > il[j]) {
234: il[j] = values[k];
235: }
236: j++;
237: }
238: break;
239: } else {
240: q = lo[q];
241:
242: /**
243: * actually the code should be:
244: * q = sc[q] < 0 ? hi[q] : lo[q];
245: * but java chars are unsigned
246: */
247: }
248: }
249: } else {
250: p = d < 0 ? lo[p] : hi[p];
251: }
252: }
253: }
254:
255: /**
256: * Hyphenate word and return a Hyphenation object.
257: * @param word the word to be hyphenated
258: * @param remainCharCount Minimum number of characters allowed
259: * before the hyphenation point.
260: * @param pushCharCount Minimum number of characters allowed after
261: * the hyphenation point.
262: * @return a {@link Hyphenation Hyphenation} object representing
263: * the hyphenated word or null if word is not hyphenated.
264: */
265: public Hyphenation hyphenate(String word, int remainCharCount,
266: int pushCharCount) {
267: char[] w = word.toCharArray();
268: return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
269: }
270:
271: /**
272: * w = "****nnllllllnnn*****",
273: * where n is a non-letter, l is a letter,
274: * all n may be absent, the first n is at offset,
275: * the first l is at offset + iIgnoreAtBeginning;
276: * word = ".llllll.'\0'***",
277: * where all l in w are copied into word.
278: * In the first part of the routine len = w.length,
279: * in the second part of the routine len = word.length.
280: * Three indices are used:
281: * index(w), the index in w,
282: * index(word), the index in word,
283: * letterindex(word), the index in the letter part of word.
284: * The following relations exist:
285: * index(w) = offset + i - 1
286: * index(word) = i - iIgnoreAtBeginning
287: * letterindex(word) = index(word) - 1
288: * (see first loop).
289: * It follows that:
290: * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
291: * index(w) = letterindex(word) + offset + iIgnoreAtBeginning
292: */
293:
294: /**
295: * Hyphenate word and return an array of hyphenation points.
296: * @param w char array that contains the word
297: * @param offset Offset to first character in word
298: * @param len Length of word
299: * @param remainCharCount Minimum number of characters allowed
300: * before the hyphenation point.
301: * @param pushCharCount Minimum number of characters allowed after
302: * the hyphenation point.
303: * @return a {@link Hyphenation Hyphenation} object representing
304: * the hyphenated word or null if word is not hyphenated.
305: */
306: public Hyphenation hyphenate(char[] w, int offset, int len,
307: int remainCharCount, int pushCharCount) {
308: int i;
309: char[] word = new char[len + 3];
310:
311: // normalize word
312: char[] c = new char[2];
313: int iIgnoreAtBeginning = 0;
314: int iLength = len;
315: boolean bEndOfLetters = false;
316: for (i = 1; i <= len; i++) {
317: c[0] = w[offset + i - 1];
318: int nc = classmap.find(c, 0);
319: if (nc < 0) { // found a non-letter character ...
320: if (i == (1 + iIgnoreAtBeginning)) {
321: // ... before any letter character
322: iIgnoreAtBeginning++;
323: } else {
324: // ... after a letter character
325: bEndOfLetters = true;
326: }
327: iLength--;
328: } else {
329: if (!bEndOfLetters) {
330: word[i - iIgnoreAtBeginning] = (char) nc;
331: } else {
332: return null;
333: }
334: }
335: }
336: len = iLength;
337: if (len < (remainCharCount + pushCharCount)) {
338: // word is too short to be hyphenated
339: return null;
340: }
341: int[] result = new int[len + 1];
342: int k = 0;
343:
344: // check exception list first
345: String sw = new String(word, 1, len);
346: if (stoplist.containsKey(sw)) {
347: // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
348: ArrayList hw = (ArrayList) stoplist.get(sw);
349: int j = 0;
350: for (i = 0; i < hw.size(); i++) {
351: Object o = hw.get(i);
352: // j = index(sw) = letterindex(word)?
353: // result[k] = corresponding index(w)
354: if (o instanceof String) {
355: j += ((String) o).length();
356: if (j >= remainCharCount
357: && j < (len - pushCharCount)) {
358: result[k++] = j + iIgnoreAtBeginning;
359: }
360: }
361: }
362: } else {
363: // use algorithm to get hyphenation points
364: word[0] = '.'; // word start marker
365: word[len + 1] = '.'; // word end marker
366: word[len + 2] = 0; // null terminated
367: byte[] il = new byte[len + 3]; // initialized to zero
368: for (i = 0; i < len + 1; i++) {
369: searchPatterns(word, i, il);
370: }
371:
372: // hyphenation points are located where interletter value is odd
373: // i is letterindex(word),
374: // i + 1 is index(word),
375: // result[k] = corresponding index(w)
376: for (i = 0; i < len; i++) {
377: if (((il[i + 1] & 1) == 1) && i >= remainCharCount
378: && i <= (len - pushCharCount)) {
379: result[k++] = i + iIgnoreAtBeginning;
380: }
381: }
382: }
383:
384: if (k > 0) {
385: // trim result array
386: int[] res = new int[k];
387: System.arraycopy(result, 0, res, 0, k);
388: return new Hyphenation(new String(w, offset, len), res);
389: } else {
390: return null;
391: }
392: }
393:
394: /**
395: * Add a character class to the tree. It is used by
396: * {@link SimplePatternParser SimplePatternParser} as callback to
397: * add character classes. Character classes define the
398: * valid word characters for hyphenation. If a word contains
399: * a character not defined in any of the classes, it is not hyphenated.
400: * It also defines a way to normalize the characters in order
401: * to compare them with the stored patterns. Usually pattern
402: * files use only lower case characters, in this case a class
403: * for letter 'a', for example, should be defined as "aA", the first
404: * character being the normalization char.
405: */
406: public void addClass(String chargroup) {
407: if (chargroup.length() > 0) {
408: char equivChar = chargroup.charAt(0);
409: char[] key = new char[2];
410: key[1] = 0;
411: for (int i = 0; i < chargroup.length(); i++) {
412: key[0] = chargroup.charAt(i);
413: classmap.insert(key, 0, equivChar);
414: }
415: }
416: }
417:
418: /**
419: * Add an exception to the tree. It is used by
420: * {@link SimplePatternParser SimplePatternParser} class as callback to
421: * store the hyphenation exceptions.
422: * @param word normalized word
423: * @param hyphenatedword a vector of alternating strings and
424: * {@link Hyphen hyphen} objects.
425: */
426: public void addException(String word, ArrayList hyphenatedword) {
427: stoplist.put(word, hyphenatedword);
428: }
429:
430: /**
431: * Add a pattern to the tree. Mainly, to be used by
432: * {@link SimplePatternParser SimplePatternParser} class as callback to
433: * add a pattern to the tree.
434: * @param pattern the hyphenation pattern
435: * @param ivalue interletter weight values indicating the
436: * desirability and priority of hyphenating at a given point
437: * within the pattern. It should contain only digit characters.
438: * (i.e. '0' to '9').
439: */
440: public void addPattern(String pattern, String ivalue) {
441: int k = ivalues.find(ivalue);
442: if (k <= 0) {
443: k = packValues(ivalue);
444: ivalues.insert(ivalue, (char) k);
445: }
446: insert(pattern, (char) k);
447: }
448:
449: public void printStats() {
450: System.out.println("Value space size = "
451: + Integer.toString(vspace.length()));
452: super.printStats();
453: }
454: }
|