001: /*
002: *******************************************************************************
003: *
004: * Copyright (C) 1999-2006, International Business Machines
005: * Corporation and others. All Rights Reserved.
006: *
007: *******************************************************************************
008: */
009:
010: package com.ibm.icu.lang;
011:
012: import com.ibm.icu.text.UTF16;
013:
014: /**
015: * <code>UScriptRun</code> is used to find runs of characters in
016: * the same script, as defined in the <code>UScript</code> class.
017: * It implements a simple iterator over an array of characters.
018: * The iterator will assign <code>COMMON</code> and <code>INHERITED</code>
019: * characters to the same script as the preceding characters. If the
020: * COMMON and INHERITED characters are first, they will be assigned to
021: * the same script as the following characters.
022: *
023: * The iterator will try to match paired punctuation. If it sees an
024: * opening punctuation character, it will remember the script that
025: * was assigned to that character, and assign the same script to the
026: * matching closing punctuation.
027: *
028: * No attempt is made to combine related scripts into a single run. In
029: * particular, Hiragana, Katakana, and Han characters will appear in separate
030: * runs.
031:
032: * Here is an example of how to iterate over script runs:
033: * <pre>
034: * void printScriptRuns(char[] text)
035: * {
036: * UScriptRun scriptRun = new UScriptRun(text);
037: *
038: * while (scriptRun.next()) {
039: * int start = scriptRun.getScriptStart();
040: * int limit = scriptRun.getScriptLimit();
041: * int script = scriptRun.getScriptCode();
042: *
043: * System.out.println("Script \"" + UScript.getName(script) + "\" from " +
044: * start + " to " + limit + ".");
045: * }
046: * }
047: * </pre>
048: *
049: * @internal
050: * @deprecated This API is ICU internal only.
051: */
052: public final class UScriptRun {
053: /**
054: * Puts a copyright in the .class file
055: */
056: private static final String copyrightNotice = "Copyright \u00a91999-2002 IBM Corp. All rights reserved.";
057:
058: /**
059: * Construct an empty <code>UScriptRun</code> object. The <code>next()</code>
060: * method will return <code>false</code> the first time it is called.
061: *
062: * @internal
063: * @deprecated This API is ICU internal only.
064: */
065: public UScriptRun() {
066: char[] nullChars = null;
067:
068: reset(nullChars, 0, 0);
069: }
070:
071: /**
072: * Construct a <code>UScriptRun</code> object which iterates over the
073: * characters in the given string.
074: *
075: * @param text the string of characters over which to iterate.
076: *
077: * @internal
078: * @deprecated This API is ICU internal only.
079: */
080: public UScriptRun(String text) {
081: reset(text);
082: }
083:
084: /**
085: * Construct a <code>UScriptRun</code> object which iterates over a subrange
086: * of the characetrs in the given string.
087: *
088: * @param text the string of characters over which to iterate.
089: * @param start the index of the first character over which to iterate
090: * @param count the number of characters over which to iterate
091: *
092: * @internal
093: * @deprecated This API is ICU internal only.
094: */
095: public UScriptRun(String text, int start, int count) {
096: reset(text, start, count);
097: }
098:
099: /**
100: * Construct a <code>UScriptRun</code> object which iterates over the given
101: * characetrs.
102: *
103: * @param chars the array of characters over which to iterate.
104: *
105: * @internal
106: * @deprecated This API is ICU internal only.
107: */
108: public UScriptRun(char[] chars) {
109: reset(chars);
110: }
111:
112: /**
113: * Construct a <code>UScriptRun</code> object which iterates over a subrange
114: * of the given characetrs.
115: *
116: * @param chars the array of characters over which to iterate.
117: * @param start the index of the first character over which to iterate
118: * @param count the number of characters over which to iterate
119: *
120: * @internal
121: * @deprecated This API is ICU internal only.
122: */
123: public UScriptRun(char[] chars, int start, int count) {
124: reset(chars, start, count);
125: }
126:
127: /**
128: * Reset the iterator to the start of the text.
129: *
130: * @internal
131: * @deprecated This API is ICU internal only.
132: */
133: public final void reset() {
134: // empty any old parenStack contents.
135: // NOTE: this is not the most efficient way
136: // to do this, but it's the easiest to write...
137: while (stackIsNotEmpty()) {
138: pop();
139: }
140:
141: scriptStart = textStart;
142: scriptLimit = textStart;
143: scriptCode = UScript.INVALID_CODE;
144: parenSP = -1;
145: pushCount = 0;
146: fixupCount = 0;
147:
148: textIndex = textStart;
149: }
150:
151: /**
152: * Reset the iterator to iterate over the given range of the text. Throws
153: * IllegalArgumentException if the range is outside of the bounds of the
154: * character array.
155: *
156: * @param start the index of the new first character over which to iterate
157: * @param count the new number of characters over which to iterate.
158: * @exception IllegalArgumentException
159: *
160: * @internal
161: * @deprecated This API is ICU internal only.
162: */
163: public final void reset(int start, int count)
164: throws IllegalArgumentException {
165: int len = 0;
166:
167: if (text != null) {
168: len = text.length;
169: }
170:
171: if (start < 0 || count < 0 || start > len - count) {
172: throw new IllegalArgumentException();
173: }
174:
175: textStart = start;
176: textLimit = start + count;
177:
178: reset();
179: }
180:
181: /**
182: * Reset the iterator to iterate over <code>count</code> characters
183: * in <code>chars</code> starting at <code>start</code>. This allows
184: * clients to reuse an iterator.
185: *
186: * @param chars the new array of characters over which to iterate.
187: * @param start the index of the first character over which to iterate.
188: * @param count the number of characters over which to iterate.
189: *
190: * @internal
191: * @deprecated This API is ICU internal only.
192: */
193: public final void reset(char[] chars, int start, int count) {
194: if (chars == null) {
195: chars = emptyCharArray;
196: }
197:
198: text = chars;
199:
200: reset(start, count);
201: }
202:
203: /**
204: * Reset the iterator to iterate over the characters
205: * in <code>chars</code>. This allows clients to reuse an iterator.
206: *
207: * @param chars the new array of characters over which to iterate.
208: *
209: * @internal
210: * @deprecated This API is ICU internal only.
211: */
212: public final void reset(char[] chars) {
213: int length = 0;
214:
215: if (chars != null) {
216: length = chars.length;
217: }
218:
219: reset(chars, 0, length);
220: }
221:
222: /**
223: * Reset the iterator to iterate over <code>count</code> characters
224: * in <code>text</code> starting at <code>start</code>. This allows
225: * clients to reuse an iterator.
226: *
227: * @param text the new string of characters over which to iterate.
228: * @param start the index of the first character over which to iterate.
229: * @param count the nuber of characters over which to iterate.
230: *
231: * @internal
232: * @deprecated This API is ICU internal only.
233: */
234: public final void reset(String text, int start, int count) {
235: char[] chars = null;
236:
237: if (text != null) {
238: chars = text.toCharArray();
239: }
240:
241: reset(chars, start, count);
242: }
243:
244: /**
245: * Reset the iterator to iterate over the characters
246: * in <code>text</code>. This allows clients to reuse an iterator.
247: *
248: * @param text the new string of characters over which to iterate.
249: *
250: * @internal
251: * @deprecated This API is ICU internal only.
252: */
253: public final void reset(String text) {
254: int length = 0;
255:
256: if (text != null) {
257: length = text.length();
258: }
259:
260: reset(text, 0, length);
261: }
262:
263: /**
264: * Get the starting index of the current script run.
265: *
266: * @return the index of the first character in the current script run.
267: *
268: * @internal
269: * @deprecated This API is ICU internal only.
270: */
271: public final int getScriptStart() {
272: return scriptStart;
273: }
274:
275: /**
276: * Get the index of the first character after the current script run.
277: *
278: * @return the index of the first character after the current script run.
279: *
280: * @internal
281: * @deprecated This API is ICU internal only.
282: */
283: public final int getScriptLimit() {
284: return scriptLimit;
285: }
286:
287: /**
288: * Get the script code for the script of the current script run.
289: *
290: * @return the script code for the script of the current script run.
291: * @see com.ibm.icu.lang.UScript
292: *
293: * @internal
294: * @deprecated This API is ICU internal only.
295: */
296: public final int getScriptCode() {
297: return scriptCode;
298: }
299:
300: /**
301: * Find the next script run. Returns <code>false</code> if there
302: * isn't another run, returns <code>true</code> if there is.
303: *
304: * @return <code>false</code> if there isn't another run, <code>true</code> if there is.
305: *
306: * @internal
307: * @deprecated This API is ICU internal only.
308: */
309: public final boolean next() {
310: // if we've fallen off the end of the text, we're done
311: if (scriptLimit >= textLimit) {
312: return false;
313: }
314:
315: scriptCode = UScript.COMMON;
316: scriptStart = scriptLimit;
317:
318: syncFixup();
319:
320: while (textIndex < textLimit) {
321: int ch = UTF16.charAt(text, textStart, textLimit, textIndex
322: - textStart);
323: int codePointCount = UTF16.getCharCount(ch);
324: int sc = UScript.getScript(ch);
325: int pairIndex = getPairIndex(ch);
326:
327: textIndex += codePointCount;
328:
329: // Paired character handling:
330: //
331: // if it's an open character, push it onto the stack.
332: // if it's a close character, find the matching open on the
333: // stack, and use that script code. Any non-matching open
334: // characters above it on the stack will be poped.
335: if (pairIndex >= 0) {
336: if ((pairIndex & 1) == 0) {
337: push(pairIndex, scriptCode);
338: } else {
339: int pi = pairIndex & ~1;
340:
341: while (stackIsNotEmpty() && top().pairIndex != pi) {
342: pop();
343: }
344:
345: if (stackIsNotEmpty()) {
346: sc = top().scriptCode;
347: }
348: }
349: }
350:
351: if (sameScript(scriptCode, sc)) {
352: if (scriptCode <= UScript.INHERITED
353: && sc > UScript.INHERITED) {
354: scriptCode = sc;
355:
356: fixup(scriptCode);
357: }
358:
359: // if this character is a close paired character,
360: // pop the matching open character from the stack
361: if (pairIndex >= 0 && (pairIndex & 1) != 0) {
362: pop();
363: }
364: } else {
365: // We've just seen the first character of
366: // the next run. Back over it so we'll see
367: // it again the next time.
368: textIndex -= codePointCount;
369: break;
370: }
371: }
372:
373: scriptLimit = textIndex;
374: return true;
375: }
376:
377: /**
378: * Compare two script codes to see if they are in the same script. If one script is
379: * a strong script, and the other is INHERITED or COMMON, it will compare equal.
380: *
381: * @param scriptOne one of the script codes.
382: * @param scriptTwo the other script code.
383: * @return <code>true</code> if the two scripts are the same.
384: * @see com.ibm.icu.lang.UScript
385: */
386: private static boolean sameScript(int scriptOne, int scriptTwo) {
387: return scriptOne <= UScript.INHERITED
388: || scriptTwo <= UScript.INHERITED
389: || scriptOne == scriptTwo;
390: }
391:
392: /*
393: * An internal class which holds entries on the paren stack.
394: */
395: private static final class ParenStackEntry {
396: int pairIndex;
397: int scriptCode;
398:
399: public ParenStackEntry(int thePairIndex, int theScriptCode) {
400: pairIndex = thePairIndex;
401: scriptCode = theScriptCode;
402: }
403: }
404:
405: private static final int mod(int sp) {
406: return sp % PAREN_STACK_DEPTH;
407: }
408:
409: private static final int inc(int sp, int count) {
410: return mod(sp + count);
411: }
412:
413: private static final int inc(int sp) {
414: return inc(sp, 1);
415: }
416:
417: private static final int dec(int sp, int count) {
418: return mod(sp + PAREN_STACK_DEPTH - count);
419: }
420:
421: private static final int dec(int sp) {
422: return dec(sp, 1);
423: }
424:
425: private static final int limitInc(int count) {
426: if (count < PAREN_STACK_DEPTH) {
427: count += 1;
428: }
429:
430: return count;
431: }
432:
433: private final boolean stackIsEmpty() {
434: return pushCount <= 0;
435: }
436:
437: private final boolean stackIsNotEmpty() {
438: return !stackIsEmpty();
439: }
440:
441: private final void push(int pairIndex, int scriptCode) {
442: pushCount = limitInc(pushCount);
443: fixupCount = limitInc(fixupCount);
444:
445: parenSP = inc(parenSP);
446: parenStack[parenSP] = new ParenStackEntry(pairIndex, scriptCode);
447: }
448:
449: private final void pop() {
450:
451: if (stackIsEmpty()) {
452: return;
453: }
454:
455: parenStack[parenSP] = null;
456:
457: if (fixupCount > 0) {
458: fixupCount -= 1;
459: }
460:
461: pushCount -= 1;
462: parenSP = dec(parenSP);
463:
464: // If the stack is now empty, reset the stack
465: // pointers to their initial values.
466: if (stackIsEmpty()) {
467: parenSP = -1;
468: }
469: }
470:
471: private final ParenStackEntry top() {
472: return parenStack[parenSP];
473: }
474:
475: private final void syncFixup() {
476: fixupCount = 0;
477: }
478:
479: private final void fixup(int scriptCode) {
480: int fixupSP = dec(parenSP, fixupCount);
481:
482: while (fixupCount-- > 0) {
483: fixupSP = inc(fixupSP);
484: parenStack[fixupSP].scriptCode = scriptCode;
485: }
486: }
487:
488: private char[] emptyCharArray = {};
489:
490: private char[] text;
491:
492: private int textIndex;
493: private int textStart;
494: private int textLimit;
495:
496: private int scriptStart;
497: private int scriptLimit;
498: private int scriptCode;
499:
500: private static int PAREN_STACK_DEPTH = 32;
501: private static ParenStackEntry parenStack[] = new ParenStackEntry[PAREN_STACK_DEPTH];
502: private int parenSP = -1;
503: private int pushCount = 0;
504: private int fixupCount = 0;
505:
506: /**
507: * Find the highest bit that's set in a word. Uses a binary search through
508: * the bits.
509: *
510: * @param n the word in which to find the highest bit that's set.
511: * @return the bit number (counting from the low order bit) of the highest bit.
512: */
513: private static final byte highBit(int n) {
514: if (n <= 0) {
515: return -32;
516: }
517:
518: byte bit = 0;
519:
520: if (n >= 1 << 16) {
521: n >>= 16;
522: bit += 16;
523: }
524:
525: if (n >= 1 << 8) {
526: n >>= 8;
527: bit += 8;
528: }
529:
530: if (n >= 1 << 4) {
531: n >>= 4;
532: bit += 4;
533: }
534:
535: if (n >= 1 << 2) {
536: n >>= 2;
537: bit += 2;
538: }
539:
540: if (n >= 1 << 1) {
541: n >>= 1;
542: bit += 1;
543: }
544:
545: return bit;
546: }
547:
548: /**
549: * Search the pairedChars array for the given character.
550: *
551: * @param ch the character for which to search.
552: * @return the index of the character in the table, or -1 if it's not there.
553: */
554: private static int getPairIndex(int ch) {
555: int probe = pairedCharPower;
556: int index = 0;
557:
558: if (ch >= pairedChars[pairedCharExtra]) {
559: index = pairedCharExtra;
560: }
561:
562: while (probe > (1 << 0)) {
563: probe >>= 1;
564:
565: if (ch >= pairedChars[index + probe]) {
566: index += probe;
567: }
568: }
569:
570: if (pairedChars[index] != ch) {
571: index = -1;
572: }
573:
574: return index;
575: }
576:
577: private static int pairedChars[] = {
578: 0x0028,
579: 0x0029, // ascii paired punctuation
580: 0x003c, 0x003e, 0x005b, 0x005d, 0x007b,
581: 0x007d,
582: 0x00ab,
583: 0x00bb, // guillemets
584: 0x2018,
585: 0x2019, // general punctuation
586: 0x201c, 0x201d, 0x2039, 0x203a,
587: 0x3008,
588: 0x3009, // chinese paired punctuation
589: 0x300a, 0x300b, 0x300c, 0x300d, 0x300e, 0x300f, 0x3010,
590: 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018, 0x3019,
591: 0x301a, 0x301b };
592:
593: private static int pairedCharPower = 1 << highBit(pairedChars.length);
594: private static int pairedCharExtra = pairedChars.length
595: - pairedCharPower;
596: }
|