001: /*
002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
003: *
004: * This code is free software; you can redistribute it and/or modify it
005: * under the terms of the GNU General Public License version 2 only, as
006: * published by the Free Software Foundation. Sun designates this
007: * particular file as subject to the "Classpath" exception as provided
008: * by Sun in the LICENSE file that accompanied this code.
009: *
010: * This code is distributed in the hope that it will be useful, but WITHOUT
011: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
012: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
013: * version 2 for more details (a copy is included in the LICENSE file that
014: * accompanied this code).
015: *
016: * You should have received a copy of the GNU General Public License version
017: * 2 along with this work; if not, write to the Free Software Foundation,
018: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
019: *
020: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
021: * CA 95054 USA or visit www.sun.com if you need additional information or
022: * have any questions.
023: *
024: */
025:
026: /*
027: *******************************************************************************
028: *
029: * Copyright (C) 1999-2003, International Business Machines
030: * Corporation and others. All Rights Reserved.
031: *
032: *******************************************************************************
033: */
034:
035: package sun.font;
036:
037: /**
038: * <code>ScriptRun</code> is used to find runs of characters in
039: * the same script, as defined in the <code>Script</code> class.
040: * It implements a simple iterator over an array of characters.
041: * The iterator will assign <code>COMMON</code> and <code>INHERITED</code>
042: * characters to the same script as the preceeding characters. If the
043: * COMMON and INHERITED characters are first, they will be assigned to
044: * the same script as the following characters.
045: *
046: * The iterator will try to match paired punctuation. If it sees an
047: * opening punctuation character, it will remember the script that
048: * was assigned to that character, and assign the same script to the
049: * matching closing punctuation.
050: *
051: * No attempt is made to combine related scripts into a single run. In
052: * particular, Hiragana, Katakana, and Han characters will appear in seperate
053: * runs.
054:
055: * Here is an example of how to iterate over script runs:
056: * <pre>
057: * void printScriptRuns(char[] text)
058: * {
059: * ScriptRun scriptRun = new ScriptRun(text, 0, text.length);
060: *
061: * while (scriptRun.next()) {
062: * int start = scriptRun.getScriptStart();
063: * int limit = scriptRun.getScriptLimit();
064: * int script = scriptRun.getScriptCode();
065: *
066: * System.out.println("Script \"" + Script.getName(script) + "\" from " +
067: * start + " to " + limit + ".");
068: * }
069: * }
070: * </pre>
071: *
072: */
073: public final class ScriptRun {
074: private char[] text; // fixed once set by constructor
075: private int textStart;
076: private int textLimit;
077:
078: private int scriptStart; // change during iteration
079: private int scriptLimit;
080: private int scriptCode;
081:
082: private int stack[]; // stack used to handle paired punctuation if encountered
083: private int parenSP;
084:
085: public ScriptRun() {
086: // must call init later or we die.
087: }
088:
089: /**
090: * Construct a <code>ScriptRun</code> object which iterates over a subrange
091: * of the given characetrs.
092: *
093: * @param chars the array of characters over which to iterate.
094: * @param start the index of the first character over which to iterate
095: * @param count the number of characters over which to iterate
096: */
097: public ScriptRun(char[] chars, int start, int count) {
098: init(chars, start, count);
099: }
100:
101: public void init(char[] chars, int start, int count) {
102: if (chars == null || start < 0 || count < 0
103: || count > chars.length - start) {
104: throw new IllegalArgumentException();
105: }
106:
107: text = chars;
108: textStart = start;
109: textLimit = start + count;
110:
111: scriptStart = textStart;
112: scriptLimit = textStart;
113: scriptCode = Script.INVALID_CODE;
114: parenSP = 0;
115: }
116:
117: /**
118: * Get the starting index of the current script run.
119: *
120: * @return the index of the first character in the current script run.
121: */
122: public final int getScriptStart() {
123: return scriptStart;
124: }
125:
126: /**
127: * Get the index of the first character after the current script run.
128: *
129: * @return the index of the first character after the current script run.
130: */
131: public final int getScriptLimit() {
132: return scriptLimit;
133: }
134:
135: /**
136: * Get the script code for the script of the current script run.
137: *
138: * @return the script code for the script of the current script run.
139: * @see #Script
140: */
141: public final int getScriptCode() {
142: return scriptCode;
143: }
144:
145: /**
146: * Find the next script run. Returns <code>false</code> if there
147: * isn't another run, returns <code>true</code> if there is.
148: *
149: * @return <code>false</code> if there isn't another run, <code>true</code> if there is.
150: */
151: public final boolean next() {
152: int startSP = parenSP; // used to find the first new open character
153:
154: // if we've fallen off the end of the text, we're done
155: if (scriptLimit >= textLimit) {
156: return false;
157: }
158:
159: scriptCode = Script.COMMON;
160: scriptStart = scriptLimit;
161:
162: int ch;
163:
164: while ((ch = nextCodePoint()) != DONE) {
165: int sc = ScriptRunData.getScript(ch);
166: int pairIndex = sc == Script.COMMON ? getPairIndex(ch) : -1;
167:
168: // Paired character handling:
169: //
170: // if it's an open character, push it onto the stack.
171: // if it's a close character, find the matching open on the
172: // stack, and use that script code. Any non-matching open
173: // characters above it on the stack will be popped.
174: if (pairIndex >= 0) {
175: if ((pairIndex & 1) == 0) {
176: if (stack == null) {
177: stack = new int[32];
178: } else if (parenSP == stack.length) {
179: int[] newstack = new int[stack.length + 32];
180: System.arraycopy(stack, 0, newstack, 0,
181: stack.length);
182: stack = newstack;
183: }
184:
185: stack[parenSP++] = pairIndex;
186: stack[parenSP++] = scriptCode;
187: } else if (parenSP > 0) {
188: int pi = pairIndex & ~1;
189:
190: while ((parenSP -= 2) >= 0 && stack[parenSP] != pi)
191: ;
192:
193: if (parenSP >= 0) {
194: sc = stack[parenSP + 1];
195: } else {
196: parenSP = 0;
197: }
198: if (parenSP < startSP) {
199: startSP = parenSP;
200: }
201: }
202: }
203:
204: if (sameScript(scriptCode, sc)) {
205: if (scriptCode <= Script.INHERITED
206: && sc > Script.INHERITED) {
207: scriptCode = sc;
208:
209: // now that we have a final script code, fix any open
210: // characters we pushed before we knew the script code.
211: while (startSP < parenSP) {
212: stack[startSP + 1] = scriptCode;
213: startSP += 2;
214: }
215: }
216:
217: // if this character is a close paired character,
218: // pop it from the stack
219: if (pairIndex > 0 && (pairIndex & 1) != 0
220: && parenSP > 0) {
221: parenSP -= 2;
222: }
223: } else {
224: // We've just seen the first character of
225: // the next run. Back over it so we'll see
226: // it again the next time.
227: pushback(ch);
228:
229: // we're outta here
230: break;
231: }
232: }
233:
234: return true;
235: }
236:
237: static final int SURROGATE_START = 0x10000;
238: static final int LEAD_START = 0xd800;
239: static final int LEAD_LIMIT = 0xdc00;
240: static final int TAIL_START = 0xdc00;
241: static final int TAIL_LIMIT = 0xe000;
242: static final int LEAD_SURROGATE_SHIFT = 10;
243: static final int SURROGATE_OFFSET = SURROGATE_START
244: - (LEAD_START << LEAD_SURROGATE_SHIFT) - TAIL_START;
245:
246: static final int DONE = -1;
247:
248: private final int nextCodePoint() {
249: if (scriptLimit >= textLimit) {
250: return DONE;
251: }
252: int ch = text[scriptLimit++];
253: if (ch >= LEAD_START && ch < LEAD_LIMIT
254: && scriptLimit < textLimit) {
255: int nch = text[scriptLimit];
256: if (nch >= TAIL_START && nch < TAIL_LIMIT) {
257: ++scriptLimit;
258: ch = (ch << LEAD_SURROGATE_SHIFT) + nch
259: + SURROGATE_OFFSET;
260: }
261: }
262: return ch;
263: }
264:
265: private final void pushback(int ch) {
266: if (ch >= 0) {
267: if (ch >= 0x10000) {
268: scriptLimit -= 2;
269: } else {
270: scriptLimit -= 1;
271: }
272: }
273: }
274:
275: /**
276: * Compare two script codes to see if they are in the same script. If one script is
277: * a strong script, and the other is INHERITED or COMMON, it will compare equal.
278: *
279: * @param scriptOne one of the script codes.
280: * @param scriptTwo the other script code.
281: * @return <code>true</code> if the two scripts are the same.
282: * @see com.ibm.icu.lang.Script
283: */
284: private static boolean sameScript(int scriptOne, int scriptTwo) {
285: return scriptOne == scriptTwo || scriptOne <= Script.INHERITED
286: || scriptTwo <= Script.INHERITED;
287: }
288:
289: /**
290: * Find the highest bit that's set in a word. Uses a binary search through
291: * the bits.
292: *
293: * @param n the word in which to find the highest bit that's set.
294: * @return the bit number (counting from the low order bit) of the highest bit.
295: */
296: private static final byte highBit(int n) {
297: if (n <= 0) {
298: return -32;
299: }
300:
301: byte bit = 0;
302:
303: if (n >= 1 << 16) {
304: n >>= 16;
305: bit += 16;
306: }
307:
308: if (n >= 1 << 8) {
309: n >>= 8;
310: bit += 8;
311: }
312:
313: if (n >= 1 << 4) {
314: n >>= 4;
315: bit += 4;
316: }
317:
318: if (n >= 1 << 2) {
319: n >>= 2;
320: bit += 2;
321: }
322:
323: if (n >= 1 << 1) {
324: n >>= 1;
325: bit += 1;
326: }
327:
328: return bit;
329: }
330:
331: /**
332: * Search the pairedChars array for the given character.
333: *
334: * @param ch the character for which to search.
335: * @return the index of the character in the table, or -1 if it's not there.
336: */
337: private static int getPairIndex(int ch) {
338: int probe = pairedCharPower;
339: int index = 0;
340:
341: if (ch >= pairedChars[pairedCharExtra]) {
342: index = pairedCharExtra;
343: }
344:
345: while (probe > (1 << 0)) {
346: probe >>= 1;
347:
348: if (ch >= pairedChars[index + probe]) {
349: index += probe;
350: }
351: }
352:
353: if (pairedChars[index] != ch) {
354: index = -1;
355: }
356:
357: return index;
358: }
359:
360: // all common
361: private static int pairedChars[] = {
362: 0x0028,
363: 0x0029, // ascii paired punctuation // common
364: 0x003c,
365: 0x003e, // common
366: 0x005b,
367: 0x005d, // common
368: 0x007b,
369: 0x007d, // common
370: 0x00ab,
371: 0x00bb, // guillemets // common
372: 0x2018,
373: 0x2019, // general punctuation // common
374: 0x201c,
375: 0x201d, // common
376: 0x2039,
377: 0x203a, // common
378: 0x3008,
379: 0x3009, // chinese paired punctuation // common
380: 0x300a, 0x300b, 0x300c, 0x300d, 0x300e, 0x300f, 0x3010,
381: 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018, 0x3019,
382: 0x301a, 0x301b };
383:
384: private static final int pairedCharPower = 1 << highBit(pairedChars.length);
385: private static final int pairedCharExtra = pairedChars.length
386: - pairedCharPower;
387:
388: }
|