001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.xerces.utils.regex;
059:
060: /**
061: * This class represents a character class such as [a-z] or a period.
062: */
063: final class RangeToken extends Token implements java.io.Serializable {
064:
065: int[] ranges;
066: boolean sorted;
067: boolean compacted;
068: RangeToken icaseCache = null;
069: int[] map = null;
070: int nonMapIndex;
071:
072: RangeToken(int type) {
073: super (type);
074: this .setSorted(false);
075: }
076:
077: // for RANGE or NRANGE
078: protected void addRange(int start, int end) {
079: this .icaseCache = null;
080: //System.err.println("Token#addRange(): "+start+" "+end);
081: int r1, r2;
082: if (start <= end) {
083: r1 = start;
084: r2 = end;
085: } else {
086: r1 = end;
087: r2 = start;
088: }
089:
090: int pos = 0;
091: if (this .ranges == null) {
092: this .ranges = new int[2];
093: this .ranges[0] = r1;
094: this .ranges[1] = r2;
095: this .setSorted(true);
096: } else {
097: pos = this .ranges.length;
098: if (this .ranges[pos - 1] + 1 == r1) {
099: this .ranges[pos - 1] = r2;
100: return;
101: }
102: int[] temp = new int[pos + 2];
103: System.arraycopy(this .ranges, 0, temp, 0, pos);
104: this .ranges = temp;
105: if (this .ranges[pos - 1] >= r1)
106: this .setSorted(false);
107: this .ranges[pos++] = r1;
108: this .ranges[pos] = r2;
109: if (!this .sorted)
110: this .sortRanges();
111: }
112: }
113:
114: private final boolean isSorted() {
115: return this .sorted;
116: }
117:
118: private final void setSorted(boolean sort) {
119: this .sorted = sort;
120: if (!sort)
121: this .compacted = false;
122: }
123:
124: private final boolean isCompacted() {
125: return this .compacted;
126: }
127:
128: private final void setCompacted() {
129: this .compacted = true;
130: }
131:
132: protected void sortRanges() {
133: if (this .isSorted())
134: return;
135: if (this .ranges == null)
136: return;
137: //System.err.println("Do sorting: "+this.ranges.length);
138:
139: // Bubble sort
140: // Why? -- In many cases,
141: // this.ranges has few elements.
142: for (int i = this .ranges.length - 4; i >= 0; i -= 2) {
143: for (int j = 0; j <= i; j += 2) {
144: if (this .ranges[j] > this .ranges[j + 2]
145: || this .ranges[j] == this .ranges[j + 2]
146: && this .ranges[j + 1] > this .ranges[j + 3]) {
147: int tmp;
148: tmp = this .ranges[j + 2];
149: this .ranges[j + 2] = this .ranges[j];
150: this .ranges[j] = tmp;
151: tmp = this .ranges[j + 3];
152: this .ranges[j + 3] = this .ranges[j + 1];
153: this .ranges[j + 1] = tmp;
154: }
155: }
156: }
157: this .setSorted(true);
158: }
159:
160: /**
161: * this.ranges is sorted.
162: */
163: protected void compactRanges() {
164: boolean DEBUG = false;
165: if (this .ranges == null || this .ranges.length <= 2)
166: return;
167: if (this .isCompacted())
168: return;
169: int base = 0; // Index of writing point
170: int target = 0; // Index of processing point
171:
172: while (target < this .ranges.length) {
173: if (base != target) {
174: this .ranges[base] = this .ranges[target++];
175: this .ranges[base + 1] = this .ranges[target++];
176: } else
177: target += 2;
178: int baseend = this .ranges[base + 1];
179: while (target < this .ranges.length) {
180: if (baseend + 1 < this .ranges[target])
181: break;
182: if (baseend + 1 == this .ranges[target]) {
183: if (DEBUG)
184: System.err
185: .println("Token#compactRanges(): Compaction: ["
186: + this .ranges[base]
187: + ", "
188: + this .ranges[base + 1]
189: + "], ["
190: + this .ranges[target]
191: + ", "
192: + this .ranges[target + 1]
193: + "] -> ["
194: + this .ranges[base]
195: + ", "
196: + this .ranges[target + 1] + "]");
197: this .ranges[base + 1] = this .ranges[target + 1];
198: baseend = this .ranges[base + 1];
199: target += 2;
200: } else if (baseend >= this .ranges[target + 1]) {
201: if (DEBUG)
202: System.err
203: .println("Token#compactRanges(): Compaction: ["
204: + this .ranges[base]
205: + ", "
206: + this .ranges[base + 1]
207: + "], ["
208: + this .ranges[target]
209: + ", "
210: + this .ranges[target + 1]
211: + "] -> ["
212: + this .ranges[base]
213: + ", "
214: + this .ranges[base + 1]
215: + "]");
216: target += 2;
217: } else if (baseend < this .ranges[target + 1]) {
218: if (DEBUG)
219: System.err
220: .println("Token#compactRanges(): Compaction: ["
221: + this .ranges[base]
222: + ", "
223: + this .ranges[base + 1]
224: + "], ["
225: + this .ranges[target]
226: + ", "
227: + this .ranges[target + 1]
228: + "] -> ["
229: + this .ranges[base]
230: + ", "
231: + this .ranges[target + 1] + "]");
232: this .ranges[base + 1] = this .ranges[target + 1];
233: baseend = this .ranges[base + 1];
234: target += 2;
235: } else {
236: throw new RuntimeException(
237: "Token#compactRanges(): Internel Error: ["
238: + this .ranges[base] + ","
239: + this .ranges[base + 1] + "] ["
240: + this .ranges[target] + ","
241: + this .ranges[target + 1] + "]");
242: }
243: } // while
244: base += 2;
245: }
246:
247: if (base != this .ranges.length) {
248: int[] result = new int[base];
249: System.arraycopy(this .ranges, 0, result, 0, base);
250: this .ranges = result;
251: }
252: this .setCompacted();
253: }
254:
255: protected void mergeRanges(Token token) {
256: if (token.type != this .type)
257: throw new IllegalArgumentException(
258: "Token#mergeRanges(): Mismatched Type: "
259: + token.type);
260: RangeToken tok = (RangeToken) token;
261: this .sortRanges();
262: tok.sortRanges();
263: if (tok.ranges == null)
264: return;
265: this .icaseCache = null;
266: this .setSorted(true);
267: if (this .ranges == null) {
268: this .ranges = new int[tok.ranges.length];
269: System.arraycopy(tok.ranges, 0, this .ranges, 0,
270: tok.ranges.length);
271: return;
272: }
273: int[] result = new int[this .ranges.length + tok.ranges.length];
274: for (int i = 0, j = 0, k = 0; i < this .ranges.length
275: || j < tok.ranges.length;) {
276: if (i >= this .ranges.length) {
277: result[k++] = tok.ranges[j++];
278: result[k++] = tok.ranges[j++];
279: } else if (j >= tok.ranges.length) {
280: result[k++] = this .ranges[i++];
281: result[k++] = this .ranges[i++];
282: } else if (tok.ranges[j] < this .ranges[i]
283: || tok.ranges[j] == this .ranges[i]
284: && tok.ranges[j + 1] < this .ranges[i + 1]) {
285: result[k++] = tok.ranges[j++];
286: result[k++] = tok.ranges[j++];
287: } else {
288: result[k++] = this .ranges[i++];
289: result[k++] = this .ranges[i++];
290: }
291: }
292: this .ranges = result;
293: }
294:
295: protected void subtractRanges(Token token) {
296: if (token.type == NRANGE) {
297: this .intersectRanges(token);
298: return;
299: }
300: RangeToken tok = (RangeToken) token;
301: if (tok.ranges == null || this .ranges == null)
302: return;
303: this .icaseCache = null;
304: this .sortRanges();
305: this .compactRanges();
306: tok.sortRanges();
307: tok.compactRanges();
308:
309: //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
310:
311: int[] result = new int[this .ranges.length + tok.ranges.length];
312: int wp = 0, src = 0, sub = 0;
313: while (src < this .ranges.length && sub < tok.ranges.length) {
314: int srcbegin = this .ranges[src];
315: int srcend = this .ranges[src + 1];
316: int subbegin = tok.ranges[sub];
317: int subend = tok.ranges[sub + 1];
318: if (srcend < subbegin) { // Not overlapped
319: // src: o-----o
320: // sub: o-----o
321: // res: o-----o
322: // Reuse sub
323: result[wp++] = this .ranges[src++];
324: result[wp++] = this .ranges[src++];
325: } else if (srcend >= subbegin && srcbegin <= subend) { // Overlapped
326: // src: o--------o
327: // sub: o----o
328: // sub: o----o
329: // sub: o----o
330: // sub: o------------o
331: if (subbegin <= srcbegin && srcend <= subend) {
332: // src: o--------o
333: // sub: o------------o
334: // res: empty
335: // Reuse sub
336: src += 2;
337: } else if (subbegin <= srcbegin) {
338: // src: o--------o
339: // sub: o----o
340: // res: o-----o
341: // Reuse src(=res)
342: this .ranges[src] = subend + 1;
343: sub += 2;
344: } else if (srcend <= subend) {
345: // src: o--------o
346: // sub: o----o
347: // res: o-----o
348: // Reuse sub
349: result[wp++] = srcbegin;
350: result[wp++] = subbegin - 1;
351: src += 2;
352: } else {
353: // src: o--------o
354: // sub: o----o
355: // res: o-o o-o
356: // Reuse src(=right res)
357: result[wp++] = srcbegin;
358: result[wp++] = subbegin - 1;
359: this .ranges[src] = subend + 1;
360: sub += 2;
361: }
362: } else if (subend < srcbegin) {
363: // Not overlapped
364: // src: o-----o
365: // sub: o----o
366: sub += 2;
367: } else {
368: throw new RuntimeException(
369: "Token#subtractRanges(): Internal Error: ["
370: + this .ranges[src] + ","
371: + this .ranges[src + 1] + "] - ["
372: + tok.ranges[sub] + ","
373: + tok.ranges[sub + 1] + "]");
374: }
375: }
376: while (src < this .ranges.length) {
377: result[wp++] = this .ranges[src++];
378: result[wp++] = this .ranges[src++];
379: }
380: this .ranges = new int[wp];
381: System.arraycopy(result, 0, this .ranges, 0, wp);
382: // this.ranges is sorted and compacted.
383: }
384:
385: /**
386: * @param tok Ignore whether it is NRANGE or not.
387: */
388: protected void intersectRanges(Token token) {
389: RangeToken tok = (RangeToken) token;
390: if (tok.ranges == null || this .ranges == null)
391: return;
392: this .icaseCache = null;
393: this .sortRanges();
394: this .compactRanges();
395: tok.sortRanges();
396: tok.compactRanges();
397:
398: int[] result = new int[this .ranges.length + tok.ranges.length];
399: int wp = 0, src1 = 0, src2 = 0;
400: while (src1 < this .ranges.length && src2 < tok.ranges.length) {
401: int src1begin = this .ranges[src1];
402: int src1end = this .ranges[src1 + 1];
403: int src2begin = tok.ranges[src2];
404: int src2end = tok.ranges[src2 + 1];
405: if (src1end < src2begin) { // Not overlapped
406: // src1: o-----o
407: // src2: o-----o
408: // res: empty
409: // Reuse src2
410: src1 += 2;
411: } else if (src1end >= src2begin && src1begin <= src2end) { // Overlapped
412: // src1: o--------o
413: // src2: o----o
414: // src2: o----o
415: // src2: o----o
416: // src2: o------------o
417: if (src2begin <= src2begin && src1end <= src2end) {
418: // src1: o--------o
419: // src2: o------------o
420: // res: o--------o
421: // Reuse src2
422: result[wp++] = src1begin;
423: result[wp++] = src1end;
424: src1 += 2;
425: } else if (src2begin <= src1begin) {
426: // src1: o--------o
427: // src2: o----o
428: // res: o--o
429: // Reuse the rest of src1
430: result[wp++] = src1begin;
431: result[wp++] = src2end;
432: this .ranges[src1] = src2end + 1;
433: src2 += 2;
434: } else if (src1end <= src2end) {
435: // src1: o--------o
436: // src2: o----o
437: // res: o--o
438: // Reuse src2
439: result[wp++] = src2begin;
440: result[wp++] = src1end;
441: src1 += 2;
442: } else {
443: // src1: o--------o
444: // src2: o----o
445: // res: o----o
446: // Reuse the rest of src1
447: result[wp++] = src2begin;
448: result[wp++] = src2end;
449: this .ranges[src1] = src2end + 1;
450: }
451: } else if (src2end < src1begin) {
452: // Not overlapped
453: // src1: o-----o
454: // src2: o----o
455: src2 += 2;
456: } else {
457: throw new RuntimeException(
458: "Token#intersectRanges(): Internal Error: ["
459: + this .ranges[src1] + ","
460: + this .ranges[src1 + 1] + "] & ["
461: + tok.ranges[src2] + ","
462: + tok.ranges[src2 + 1] + "]");
463: }
464: }
465: while (src1 < this .ranges.length) {
466: result[wp++] = this .ranges[src1++];
467: result[wp++] = this .ranges[src1++];
468: }
469: this .ranges = new int[wp];
470: System.arraycopy(result, 0, this .ranges, 0, wp);
471: // this.ranges is sorted and compacted.
472: }
473:
474: /**
475: * for RANGE: Creates complement.
476: * for NRANGE: Creates the same meaning RANGE.
477: */
478: static Token complementRanges(Token token) {
479: if (token.type != RANGE && token.type != NRANGE)
480: throw new IllegalArgumentException(
481: "Token#complementRanges(): must be RANGE: "
482: + token.type);
483: RangeToken tok = (RangeToken) token;
484: tok.sortRanges();
485: tok.compactRanges();
486: int len = tok.ranges.length + 2;
487: if (tok.ranges[0] == 0)
488: len -= 2;
489: int last = tok.ranges[tok.ranges.length - 1];
490: if (last == UTF16_MAX)
491: len -= 2;
492: RangeToken ret = Token.createRange();
493: ret.ranges = new int[len];
494: int wp = 0;
495: if (tok.ranges[0] > 0) {
496: ret.ranges[wp++] = 0;
497: ret.ranges[wp++] = tok.ranges[0] - 1;
498: }
499: for (int i = 1; i < tok.ranges.length - 2; i += 2) {
500: ret.ranges[wp++] = tok.ranges[i] + 1;
501: ret.ranges[wp++] = tok.ranges[i + 1] - 1;
502: }
503: if (last != UTF16_MAX) {
504: ret.ranges[wp++] = last + 1;
505: ret.ranges[wp] = UTF16_MAX;
506: }
507: ret.setCompacted();
508: return ret;
509: }
510:
511: synchronized RangeToken getCaseInsensitiveToken() {
512: if (this .icaseCache != null)
513: return this .icaseCache;
514:
515: RangeToken uppers = this .type == Token.RANGE ? Token
516: .createRange() : Token.createNRange();
517: for (int i = 0; i < this .ranges.length; i += 2) {
518: for (int ch = this .ranges[i]; ch <= this .ranges[i + 1]; ch++) {
519: if (ch > 0xffff)
520: uppers.addRange(ch, ch);
521: else {
522: char uch = Character.toUpperCase((char) ch);
523: uppers.addRange(uch, uch);
524: }
525: }
526: }
527: RangeToken lowers = this .type == Token.RANGE ? Token
528: .createRange() : Token.createNRange();
529: for (int i = 0; i < uppers.ranges.length; i += 2) {
530: for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i + 1]; ch++) {
531: if (ch > 0xffff)
532: lowers.addRange(ch, ch);
533: else {
534: char uch = Character.toUpperCase((char) ch);
535: lowers.addRange(uch, uch);
536: }
537: }
538: }
539: lowers.mergeRanges(uppers);
540: lowers.mergeRanges(this );
541: lowers.compactRanges();
542:
543: this .icaseCache = lowers;
544: return lowers;
545: }
546:
547: void dumpRanges() {
548: System.err.print("RANGE: ");
549: if (this .ranges == null)
550: System.err.println(" NULL");
551: for (int i = 0; i < this .ranges.length; i += 2) {
552: System.err.print("[" + this .ranges[i] + ","
553: + this .ranges[i + 1] + "] ");
554: }
555: System.err.println("");
556: }
557:
558: boolean match(int ch) {
559: if (this .map == null)
560: this .createMap();
561: boolean ret;
562: if (this .type == RANGE) {
563: if (ch < MAPSIZE)
564: return (this .map[ch / 32] & (1 << (ch & 0x1f))) != 0;
565: ret = false;
566: for (int i = this .nonMapIndex; i < this .ranges.length; i += 2) {
567: if (this .ranges[i] <= ch && ch <= this .ranges[i + 1])
568: return true;
569: }
570: } else {
571: if (ch < MAPSIZE)
572: return (this .map[ch / 32] & (1 << (ch & 0x1f))) == 0;
573: ret = true;
574: for (int i = this .nonMapIndex; i < this .ranges.length; i += 2) {
575: if (this .ranges[i] <= ch && ch <= this .ranges[i + 1])
576: return false;
577: }
578: }
579: return ret;
580: }
581:
582: private static final int MAPSIZE = 256;
583:
584: private void createMap() {
585: int asize = MAPSIZE / 32; // 32 is the number of bits in `int'.
586: this .map = new int[asize];
587: this .nonMapIndex = this .ranges.length;
588: for (int i = 0; i < asize; i++)
589: this .map[i] = 0;
590: for (int i = 0; i < this .ranges.length; i += 2) {
591: int s = this .ranges[i];
592: int e = this .ranges[i + 1];
593: if (s < MAPSIZE) {
594: for (int j = s; j <= e && j < MAPSIZE; j++)
595: this .map[j / 32] |= 1 << (j & 0x1f); // s&0x1f : 0-31
596: } else {
597: this .nonMapIndex = i;
598: break;
599: }
600: if (e >= MAPSIZE) {
601: this .nonMapIndex = i;
602: break;
603: }
604: }
605: //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
606: }
607:
608: public String toString(int options) {
609: String ret;
610: if (this .type == RANGE) {
611: if (this == Token.token_dot)
612: ret = ".";
613: else if (this == Token.token_0to9)
614: ret = "\\d";
615: else if (this == Token.token_wordchars)
616: ret = "\\w";
617: else if (this == Token.token_spaces)
618: ret = "\\s";
619: else {
620: StringBuffer sb = new StringBuffer();
621: sb.append("[");
622: for (int i = 0; i < this .ranges.length; i += 2) {
623: if ((options & RegularExpression.SPECIAL_COMMA) != 0
624: && i > 0)
625: sb.append(",");
626: if (this .ranges[i] == this .ranges[i + 1]) {
627: sb
628: .append(escapeCharInCharClass(this .ranges[i]));
629: } else {
630: sb
631: .append(escapeCharInCharClass(this .ranges[i]));
632: sb.append((char) '-');
633: sb
634: .append(escapeCharInCharClass(this .ranges[i + 1]));
635: }
636: }
637: sb.append("]");
638: ret = sb.toString();
639: }
640: } else {
641: if (this == Token.token_not_0to9)
642: ret = "\\D";
643: else if (this == Token.token_not_wordchars)
644: ret = "\\W";
645: else if (this == Token.token_not_spaces)
646: ret = "\\S";
647: else {
648: StringBuffer sb = new StringBuffer();
649: sb.append("[^");
650: for (int i = 0; i < this .ranges.length; i += 2) {
651: if ((options & RegularExpression.SPECIAL_COMMA) != 0
652: && i > 0)
653: sb.append(",");
654: if (this .ranges[i] == this .ranges[i + 1]) {
655: sb
656: .append(escapeCharInCharClass(this .ranges[i]));
657: } else {
658: sb
659: .append(escapeCharInCharClass(this .ranges[i]));
660: sb.append('-');
661: sb
662: .append(escapeCharInCharClass(this .ranges[i + 1]));
663: }
664: }
665: sb.append("]");
666: ret = sb.toString();
667: }
668: }
669: return ret;
670: }
671:
672: private static String escapeCharInCharClass(int ch) {
673: String ret;
674: switch (ch) {
675: case '[':
676: case ']':
677: case '-':
678: case '^':
679: case ',':
680: case '\\':
681: ret = "\\" + (char) ch;
682: break;
683: case '\f':
684: ret = "\\f";
685: break;
686: case '\n':
687: ret = "\\n";
688: break;
689: case '\r':
690: ret = "\\r";
691: break;
692: case '\t':
693: ret = "\\t";
694: break;
695: case 0x1b:
696: ret = "\\e";
697: break;
698: //case 0x0b: ret = "\\v"; break;
699: default:
700: if (ch < 0x20) {
701: String pre = "0" + Integer.toHexString(ch);
702: ret = "\\x"
703: + pre.substring(pre.length() - 2, pre.length());
704: } else if (ch >= 0x10000) {
705: String pre = "0" + Integer.toHexString(ch);
706: ret = "\\v"
707: + pre.substring(pre.length() - 6, pre.length());
708: } else
709: ret = "" + (char) ch;
710: }
711: return ret;
712: }
713:
714: }
|