001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.xerces.impl.xpath.regex;
019:
020: /**
021: * This class represents a character class such as [a-z] or a period.
022: *
023: * @xerces.internal
024: *
025: * @version $Id: RangeToken.java 504431 2007-02-07 04:25:14Z mrglavas $
026: */
027: final class RangeToken extends Token implements java.io.Serializable {
028:
029: private static final long serialVersionUID = -553983121197679934L;
030:
031: int[] ranges;
032: boolean sorted;
033: boolean compacted;
034: RangeToken icaseCache = null;
035: int[] map = null;
036: int nonMapIndex;
037:
038: RangeToken(int type) {
039: super (type);
040: this .setSorted(false);
041: }
042:
043: // for RANGE or NRANGE
044: protected void addRange(int start, int end) {
045: this .icaseCache = null;
046: //System.err.println("Token#addRange(): "+start+" "+end);
047: int r1, r2;
048: if (start <= end) {
049: r1 = start;
050: r2 = end;
051: } else {
052: r1 = end;
053: r2 = start;
054: }
055:
056: int pos = 0;
057: if (this .ranges == null) {
058: this .ranges = new int[2];
059: this .ranges[0] = r1;
060: this .ranges[1] = r2;
061: this .setSorted(true);
062: } else {
063: pos = this .ranges.length;
064: if (this .ranges[pos - 1] + 1 == r1) {
065: this .ranges[pos - 1] = r2;
066: return;
067: }
068: int[] temp = new int[pos + 2];
069: System.arraycopy(this .ranges, 0, temp, 0, pos);
070: this .ranges = temp;
071: if (this .ranges[pos - 1] >= r1)
072: this .setSorted(false);
073: this .ranges[pos++] = r1;
074: this .ranges[pos] = r2;
075: if (!this .sorted)
076: this .sortRanges();
077: }
078: }
079:
080: private final boolean isSorted() {
081: return this .sorted;
082: }
083:
084: private final void setSorted(boolean sort) {
085: this .sorted = sort;
086: if (!sort)
087: this .compacted = false;
088: }
089:
090: private final boolean isCompacted() {
091: return this .compacted;
092: }
093:
094: private final void setCompacted() {
095: this .compacted = true;
096: }
097:
098: protected void sortRanges() {
099: if (this .isSorted())
100: return;
101: if (this .ranges == null)
102: return;
103: //System.err.println("Do sorting: "+this.ranges.length);
104:
105: // Bubble sort
106: // Why? -- In many cases,
107: // this.ranges has few elements.
108: for (int i = this .ranges.length - 4; i >= 0; i -= 2) {
109: for (int j = 0; j <= i; j += 2) {
110: if (this .ranges[j] > this .ranges[j + 2]
111: || this .ranges[j] == this .ranges[j + 2]
112: && this .ranges[j + 1] > this .ranges[j + 3]) {
113: int tmp;
114: tmp = this .ranges[j + 2];
115: this .ranges[j + 2] = this .ranges[j];
116: this .ranges[j] = tmp;
117: tmp = this .ranges[j + 3];
118: this .ranges[j + 3] = this .ranges[j + 1];
119: this .ranges[j + 1] = tmp;
120: }
121: }
122: }
123: this .setSorted(true);
124: }
125:
126: /**
127: * this.ranges is sorted.
128: */
129: protected void compactRanges() {
130: boolean DEBUG = false;
131: if (this .ranges == null || this .ranges.length <= 2)
132: return;
133: if (this .isCompacted())
134: return;
135: int base = 0; // Index of writing point
136: int target = 0; // Index of processing point
137:
138: while (target < this .ranges.length) {
139: if (base != target) {
140: this .ranges[base] = this .ranges[target++];
141: this .ranges[base + 1] = this .ranges[target++];
142: } else
143: target += 2;
144: int baseend = this .ranges[base + 1];
145: while (target < this .ranges.length) {
146: if (baseend + 1 < this .ranges[target])
147: break;
148: if (baseend + 1 == this .ranges[target]) {
149: if (DEBUG)
150: System.err
151: .println("Token#compactRanges(): Compaction: ["
152: + this .ranges[base]
153: + ", "
154: + this .ranges[base + 1]
155: + "], ["
156: + this .ranges[target]
157: + ", "
158: + this .ranges[target + 1]
159: + "] -> ["
160: + this .ranges[base]
161: + ", "
162: + this .ranges[target + 1] + "]");
163: this .ranges[base + 1] = this .ranges[target + 1];
164: baseend = this .ranges[base + 1];
165: target += 2;
166: } else if (baseend >= this .ranges[target + 1]) {
167: if (DEBUG)
168: System.err
169: .println("Token#compactRanges(): Compaction: ["
170: + this .ranges[base]
171: + ", "
172: + this .ranges[base + 1]
173: + "], ["
174: + this .ranges[target]
175: + ", "
176: + this .ranges[target + 1]
177: + "] -> ["
178: + this .ranges[base]
179: + ", "
180: + this .ranges[base + 1]
181: + "]");
182: target += 2;
183: } else if (baseend < this .ranges[target + 1]) {
184: if (DEBUG)
185: System.err
186: .println("Token#compactRanges(): Compaction: ["
187: + this .ranges[base]
188: + ", "
189: + this .ranges[base + 1]
190: + "], ["
191: + this .ranges[target]
192: + ", "
193: + this .ranges[target + 1]
194: + "] -> ["
195: + this .ranges[base]
196: + ", "
197: + this .ranges[target + 1] + "]");
198: this .ranges[base + 1] = this .ranges[target + 1];
199: baseend = this .ranges[base + 1];
200: target += 2;
201: } else {
202: throw new RuntimeException(
203: "Token#compactRanges(): Internel Error: ["
204: + this .ranges[base] + ","
205: + this .ranges[base + 1] + "] ["
206: + this .ranges[target] + ","
207: + this .ranges[target + 1] + "]");
208: }
209: } // while
210: base += 2;
211: }
212:
213: if (base != this .ranges.length) {
214: int[] result = new int[base];
215: System.arraycopy(this .ranges, 0, result, 0, base);
216: this .ranges = result;
217: }
218: this .setCompacted();
219: }
220:
221: protected void mergeRanges(Token token) {
222: RangeToken tok = (RangeToken) token;
223: this .sortRanges();
224: tok.sortRanges();
225: if (tok.ranges == null)
226: return;
227: this .icaseCache = null;
228: this .setSorted(true);
229: if (this .ranges == null) {
230: this .ranges = new int[tok.ranges.length];
231: System.arraycopy(tok.ranges, 0, this .ranges, 0,
232: tok.ranges.length);
233: return;
234: }
235: int[] result = new int[this .ranges.length + tok.ranges.length];
236: for (int i = 0, j = 0, k = 0; i < this .ranges.length
237: || j < tok.ranges.length;) {
238: if (i >= this .ranges.length) {
239: result[k++] = tok.ranges[j++];
240: result[k++] = tok.ranges[j++];
241: } else if (j >= tok.ranges.length) {
242: result[k++] = this .ranges[i++];
243: result[k++] = this .ranges[i++];
244: } else if (tok.ranges[j] < this .ranges[i]
245: || tok.ranges[j] == this .ranges[i]
246: && tok.ranges[j + 1] < this .ranges[i + 1]) {
247: result[k++] = tok.ranges[j++];
248: result[k++] = tok.ranges[j++];
249: } else {
250: result[k++] = this .ranges[i++];
251: result[k++] = this .ranges[i++];
252: }
253: }
254: this .ranges = result;
255: }
256:
257: protected void subtractRanges(Token token) {
258: if (token.type == NRANGE) {
259: this .intersectRanges(token);
260: return;
261: }
262: RangeToken tok = (RangeToken) token;
263: if (tok.ranges == null || this .ranges == null)
264: return;
265: this .icaseCache = null;
266: this .sortRanges();
267: this .compactRanges();
268: tok.sortRanges();
269: tok.compactRanges();
270:
271: //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
272:
273: int[] result = new int[this .ranges.length + tok.ranges.length];
274: int wp = 0, src = 0, sub = 0;
275: while (src < this .ranges.length && sub < tok.ranges.length) {
276: int srcbegin = this .ranges[src];
277: int srcend = this .ranges[src + 1];
278: int subbegin = tok.ranges[sub];
279: int subend = tok.ranges[sub + 1];
280: if (srcend < subbegin) { // Not overlapped
281: // src: o-----o
282: // sub: o-----o
283: // res: o-----o
284: // Reuse sub
285: result[wp++] = this .ranges[src++];
286: result[wp++] = this .ranges[src++];
287: } else if (srcend >= subbegin && srcbegin <= subend) { // Overlapped
288: // src: o--------o
289: // sub: o----o
290: // sub: o----o
291: // sub: o----o
292: // sub: o------------o
293: if (subbegin <= srcbegin && srcend <= subend) {
294: // src: o--------o
295: // sub: o------------o
296: // res: empty
297: // Reuse sub
298: src += 2;
299: } else if (subbegin <= srcbegin) {
300: // src: o--------o
301: // sub: o----o
302: // res: o-----o
303: // Reuse src(=res)
304: this .ranges[src] = subend + 1;
305: sub += 2;
306: } else if (srcend <= subend) {
307: // src: o--------o
308: // sub: o----o
309: // res: o-----o
310: // Reuse sub
311: result[wp++] = srcbegin;
312: result[wp++] = subbegin - 1;
313: src += 2;
314: } else {
315: // src: o--------o
316: // sub: o----o
317: // res: o-o o-o
318: // Reuse src(=right res)
319: result[wp++] = srcbegin;
320: result[wp++] = subbegin - 1;
321: this .ranges[src] = subend + 1;
322: sub += 2;
323: }
324: } else if (subend < srcbegin) {
325: // Not overlapped
326: // src: o-----o
327: // sub: o----o
328: sub += 2;
329: } else {
330: throw new RuntimeException(
331: "Token#subtractRanges(): Internal Error: ["
332: + this .ranges[src] + ","
333: + this .ranges[src + 1] + "] - ["
334: + tok.ranges[sub] + ","
335: + tok.ranges[sub + 1] + "]");
336: }
337: }
338: while (src < this .ranges.length) {
339: result[wp++] = this .ranges[src++];
340: result[wp++] = this .ranges[src++];
341: }
342: this .ranges = new int[wp];
343: System.arraycopy(result, 0, this .ranges, 0, wp);
344: // this.ranges is sorted and compacted.
345: }
346:
347: /**
348: * @param tok Ignore whether it is NRANGE or not.
349: */
350: protected void intersectRanges(Token token) {
351: RangeToken tok = (RangeToken) token;
352: if (tok.ranges == null || this .ranges == null)
353: return;
354: this .icaseCache = null;
355: this .sortRanges();
356: this .compactRanges();
357: tok.sortRanges();
358: tok.compactRanges();
359:
360: int[] result = new int[this .ranges.length + tok.ranges.length];
361: int wp = 0, src1 = 0, src2 = 0;
362: while (src1 < this .ranges.length && src2 < tok.ranges.length) {
363: int src1begin = this .ranges[src1];
364: int src1end = this .ranges[src1 + 1];
365: int src2begin = tok.ranges[src2];
366: int src2end = tok.ranges[src2 + 1];
367: if (src1end < src2begin) { // Not overlapped
368: // src1: o-----o
369: // src2: o-----o
370: // res: empty
371: // Reuse src2
372: src1 += 2;
373: } else if (src1end >= src2begin && src1begin <= src2end) { // Overlapped
374: // src1: o--------o
375: // src2: o----o
376: // src2: o----o
377: // src2: o----o
378: // src2: o------------o
379: if (src2begin <= src1begin && src1end <= src2end) {
380: // src1: o--------o
381: // src2: o------------o
382: // res: o--------o
383: // Reuse src2
384: result[wp++] = src1begin;
385: result[wp++] = src1end;
386: src1 += 2;
387: } else if (src2begin <= src1begin) {
388: // src1: o--------o
389: // src2: o----o
390: // res: o--o
391: // Reuse the rest of src1
392: result[wp++] = src1begin;
393: result[wp++] = src2end;
394: this .ranges[src1] = src2end + 1;
395: src2 += 2;
396: } else if (src1end <= src2end) {
397: // src1: o--------o
398: // src2: o----o
399: // res: o--o
400: // Reuse src2
401: result[wp++] = src2begin;
402: result[wp++] = src1end;
403: src1 += 2;
404: } else {
405: // src1: o--------o
406: // src2: o----o
407: // res: o----o
408: // Reuse the rest of src1
409: result[wp++] = src2begin;
410: result[wp++] = src2end;
411: this .ranges[src1] = src2end + 1;
412: }
413: } else if (src2end < src1begin) {
414: // Not overlapped
415: // src1: o-----o
416: // src2: o----o
417: src2 += 2;
418: } else {
419: throw new RuntimeException(
420: "Token#intersectRanges(): Internal Error: ["
421: + this .ranges[src1] + ","
422: + this .ranges[src1 + 1] + "] & ["
423: + tok.ranges[src2] + ","
424: + tok.ranges[src2 + 1] + "]");
425: }
426: }
427: while (src1 < this .ranges.length) {
428: result[wp++] = this .ranges[src1++];
429: result[wp++] = this .ranges[src1++];
430: }
431: this .ranges = new int[wp];
432: System.arraycopy(result, 0, this .ranges, 0, wp);
433: // this.ranges is sorted and compacted.
434: }
435:
436: /**
437: * for RANGE: Creates complement.
438: * for NRANGE: Creates the same meaning RANGE.
439: */
440: static Token complementRanges(Token token) {
441: if (token.type != RANGE && token.type != NRANGE)
442: throw new IllegalArgumentException(
443: "Token#complementRanges(): must be RANGE: "
444: + token.type);
445: RangeToken tok = (RangeToken) token;
446: tok.sortRanges();
447: tok.compactRanges();
448: int len = tok.ranges.length + 2;
449: if (tok.ranges[0] == 0)
450: len -= 2;
451: int last = tok.ranges[tok.ranges.length - 1];
452: if (last == UTF16_MAX)
453: len -= 2;
454: RangeToken ret = Token.createRange();
455: ret.ranges = new int[len];
456: int wp = 0;
457: if (tok.ranges[0] > 0) {
458: ret.ranges[wp++] = 0;
459: ret.ranges[wp++] = tok.ranges[0] - 1;
460: }
461: for (int i = 1; i < tok.ranges.length - 2; i += 2) {
462: ret.ranges[wp++] = tok.ranges[i] + 1;
463: ret.ranges[wp++] = tok.ranges[i + 1] - 1;
464: }
465: if (last != UTF16_MAX) {
466: ret.ranges[wp++] = last + 1;
467: ret.ranges[wp] = UTF16_MAX;
468: }
469: ret.setCompacted();
470: return ret;
471: }
472:
473: synchronized RangeToken getCaseInsensitiveToken() {
474: if (this .icaseCache != null)
475: return this .icaseCache;
476:
477: RangeToken uppers = this .type == Token.RANGE ? Token
478: .createRange() : Token.createNRange();
479: for (int i = 0; i < this .ranges.length; i += 2) {
480: for (int ch = this .ranges[i]; ch <= this .ranges[i + 1]; ch++) {
481: if (ch > 0xffff)
482: uppers.addRange(ch, ch);
483: else {
484: char uch = Character.toUpperCase((char) ch);
485: uppers.addRange(uch, uch);
486: }
487: }
488: }
489: RangeToken lowers = this .type == Token.RANGE ? Token
490: .createRange() : Token.createNRange();
491: for (int i = 0; i < uppers.ranges.length; i += 2) {
492: for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i + 1]; ch++) {
493: if (ch > 0xffff)
494: lowers.addRange(ch, ch);
495: else {
496: char uch = Character.toUpperCase((char) ch);
497: lowers.addRange(uch, uch);
498: }
499: }
500: }
501: lowers.mergeRanges(uppers);
502: lowers.mergeRanges(this );
503: lowers.compactRanges();
504:
505: this .icaseCache = lowers;
506: return lowers;
507: }
508:
509: void dumpRanges() {
510: System.err.print("RANGE: ");
511: if (this .ranges == null)
512: System.err.println(" NULL");
513: for (int i = 0; i < this .ranges.length; i += 2) {
514: System.err.print("[" + this .ranges[i] + ","
515: + this .ranges[i + 1] + "] ");
516: }
517: System.err.println("");
518: }
519:
520: boolean match(int ch) {
521: if (this .map == null)
522: this .createMap();
523: boolean ret;
524: if (this .type == RANGE) {
525: if (ch < MAPSIZE)
526: return (this .map[ch / 32] & (1 << (ch & 0x1f))) != 0;
527: ret = false;
528: for (int i = this .nonMapIndex; i < this .ranges.length; i += 2) {
529: if (this .ranges[i] <= ch && ch <= this .ranges[i + 1])
530: return true;
531: }
532: } else {
533: if (ch < MAPSIZE)
534: return (this .map[ch / 32] & (1 << (ch & 0x1f))) == 0;
535: ret = true;
536: for (int i = this .nonMapIndex; i < this .ranges.length; i += 2) {
537: if (this .ranges[i] <= ch && ch <= this .ranges[i + 1])
538: return false;
539: }
540: }
541: return ret;
542: }
543:
544: private static final int MAPSIZE = 256;
545:
546: private void createMap() {
547: int asize = MAPSIZE / 32; // 32 is the number of bits in `int'.
548: int[] map = new int[asize];
549: int nonMapIndex = this .ranges.length;
550: for (int i = 0; i < asize; ++i) {
551: map[i] = 0;
552: }
553: for (int i = 0; i < this .ranges.length; i += 2) {
554: int s = this .ranges[i];
555: int e = this .ranges[i + 1];
556: if (s < MAPSIZE) {
557: for (int j = s; j <= e && j < MAPSIZE; j++) {
558: map[j / 32] |= 1 << (j & 0x1f); // s&0x1f : 0-31
559: }
560: } else {
561: nonMapIndex = i;
562: break;
563: }
564: if (e >= MAPSIZE) {
565: nonMapIndex = i;
566: break;
567: }
568: }
569: this .map = map;
570: this .nonMapIndex = nonMapIndex;
571: //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
572: }
573:
574: public String toString(int options) {
575: String ret;
576: if (this .type == RANGE) {
577: if (this == Token.token_dot)
578: ret = ".";
579: else if (this == Token.token_0to9)
580: ret = "\\d";
581: else if (this == Token.token_wordchars)
582: ret = "\\w";
583: else if (this == Token.token_spaces)
584: ret = "\\s";
585: else {
586: StringBuffer sb = new StringBuffer();
587: sb.append("[");
588: for (int i = 0; i < this .ranges.length; i += 2) {
589: if ((options & RegularExpression.SPECIAL_COMMA) != 0
590: && i > 0)
591: sb.append(",");
592: if (this .ranges[i] == this .ranges[i + 1]) {
593: sb
594: .append(escapeCharInCharClass(this .ranges[i]));
595: } else {
596: sb
597: .append(escapeCharInCharClass(this .ranges[i]));
598: sb.append((char) '-');
599: sb
600: .append(escapeCharInCharClass(this .ranges[i + 1]));
601: }
602: }
603: sb.append("]");
604: ret = sb.toString();
605: }
606: } else {
607: if (this == Token.token_not_0to9)
608: ret = "\\D";
609: else if (this == Token.token_not_wordchars)
610: ret = "\\W";
611: else if (this == Token.token_not_spaces)
612: ret = "\\S";
613: else {
614: StringBuffer sb = new StringBuffer();
615: sb.append("[^");
616: for (int i = 0; i < this .ranges.length; i += 2) {
617: if ((options & RegularExpression.SPECIAL_COMMA) != 0
618: && i > 0)
619: sb.append(",");
620: if (this .ranges[i] == this .ranges[i + 1]) {
621: sb
622: .append(escapeCharInCharClass(this .ranges[i]));
623: } else {
624: sb
625: .append(escapeCharInCharClass(this .ranges[i]));
626: sb.append('-');
627: sb
628: .append(escapeCharInCharClass(this .ranges[i + 1]));
629: }
630: }
631: sb.append("]");
632: ret = sb.toString();
633: }
634: }
635: return ret;
636: }
637:
638: private static String escapeCharInCharClass(int ch) {
639: String ret;
640: switch (ch) {
641: case '[':
642: case ']':
643: case '-':
644: case '^':
645: case ',':
646: case '\\':
647: ret = "\\" + (char) ch;
648: break;
649: case '\f':
650: ret = "\\f";
651: break;
652: case '\n':
653: ret = "\\n";
654: break;
655: case '\r':
656: ret = "\\r";
657: break;
658: case '\t':
659: ret = "\\t";
660: break;
661: case 0x1b:
662: ret = "\\e";
663: break;
664: //case 0x0b: ret = "\\v"; break;
665: default:
666: if (ch < 0x20) {
667: String pre = "0" + Integer.toHexString(ch);
668: ret = "\\x"
669: + pre.substring(pre.length() - 2, pre.length());
670: } else if (ch >= 0x10000) {
671: String pre = "0" + Integer.toHexString(ch);
672: ret = "\\v"
673: + pre.substring(pre.length() - 6, pre.length());
674: } else
675: ret = "" + (char) ch;
676: }
677: return ret;
678: }
679:
680: }
|