001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: /**
019: * @author Nikolay A. Kuznetsov
020: * @version $Revision: 1.11.2.2 $
021: */package java.util.regex;
022:
023: import java.util.BitSet;
024:
025: /**
026: * User defined character classes ([abef]). See AbstractCharClass
027: * documentation for more details.
028: *
029: * @author Nikolay A. Kuznetsov
030: * @version $Revision: 1.11.2.2 $
031: */
032: class CharClass extends AbstractCharClass {
033:
034: // Flag indicates if we add supplement upper/lower case
035: boolean ci = false;
036:
037: boolean uci = false;
038:
039: // Flag indicates if there are unicode supplements
040: boolean hasUCI = false;
041:
042: boolean invertedSurrogates = false;
043:
044: boolean inverted = false;
045:
046: boolean hideBits = false;
047:
048: BitSet bits = new BitSet();
049:
050: AbstractCharClass nonBitSet = null;
051:
052: public CharClass() {
053: }
054:
055: public CharClass(boolean ci, boolean uci) {
056: this .ci = ci;
057: this .uci = uci;
058: }
059:
060: public CharClass(boolean negative, boolean ci, boolean uci) {
061: this (ci, uci);
062: setNegative(negative);
063: }
064:
065: /*
066: * We can use this method safely even if nonBitSet != null
067: * due to specific of range constructions in regular expressions.
068: */
069: public CharClass add(int ch) {
070: if (ci) {
071: if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
072: if (!inverted) {
073: bits.set(Pattern.getSupplement((char) ch));
074: } else {
075: bits.clear(Pattern.getSupplement((char) ch));
076: }
077: } else if (uci && ch > 128) {
078: hasUCI = true;
079: ch = Character.toLowerCase(Character.toUpperCase(ch));
080: // return this;
081: }
082: }
083:
084: if (Lexer.isHighSurrogate(ch) || Lexer.isLowSurrogate(ch)) {
085: if (!invertedSurrogates) {
086: lowHighSurrogates.set(ch - Character.MIN_SURROGATE);
087: } else {
088: lowHighSurrogates.clear(ch - Character.MIN_SURROGATE);
089: }
090: }
091:
092: if (!inverted) {
093: bits.set(ch);
094: } else
095: bits.clear(ch);
096:
097: if (!mayContainSupplCodepoints
098: && Character.isSupplementaryCodePoint(ch)) {
099: mayContainSupplCodepoints = true;
100: }
101:
102: return this ;
103: }
104:
105: /*
106: * The difference between add(AbstractCharClass) and union(AbstractCharClass)
107: * is that add() is used for constructions like "[^abc\\d]"
108: * (this pattern doesn't match "1")
109: * while union is used for constructions like "[^abc[\\d]]"
110: * (this pattern matches "1").
111: */
112: public CharClass add(final AbstractCharClass cc) {
113:
114: if (!mayContainSupplCodepoints && cc.mayContainSupplCodepoints) {
115: mayContainSupplCodepoints = true;
116: }
117:
118: if (!invertedSurrogates) {
119:
120: //A | !B = ! ((A ^ B) & B)
121: if (cc.altSurrogates) {
122: lowHighSurrogates.xor(cc.getLowHighSurrogates());
123: lowHighSurrogates.and(cc.getLowHighSurrogates());
124: altSurrogates = !altSurrogates;
125: invertedSurrogates = true;
126:
127: //A | B
128: } else {
129: lowHighSurrogates.or(cc.getLowHighSurrogates());
130: }
131: } else {
132:
133: //!A | !B = !(A & B)
134: if (cc.altSurrogates) {
135: lowHighSurrogates.and(cc.getLowHighSurrogates());
136:
137: //!A | B = !(A & !B)
138: } else {
139: lowHighSurrogates.andNot(cc.getLowHighSurrogates());
140: }
141: }
142:
143: if (!hideBits && cc.getBits() != null) {
144: if (!inverted) {
145:
146: //A | !B = ! ((A ^ B) & B)
147: if (cc.isNegative()) {
148: bits.xor(cc.getBits());
149: bits.and(cc.getBits());
150: alt = !alt;
151: inverted = true;
152:
153: //A | B
154: } else {
155: bits.or(cc.getBits());
156: }
157: } else {
158:
159: //!A | !B = !(A & B)
160: if (cc.isNegative()) {
161: bits.and(cc.getBits());
162:
163: //!A | B = !(A & !B)
164: } else {
165: bits.andNot(cc.getBits());
166: }
167: }
168: } else {
169: final boolean curAlt = alt;
170:
171: if (nonBitSet == null) {
172:
173: if (curAlt && !inverted && bits.isEmpty()) {
174: nonBitSet = new AbstractCharClass() {
175: public boolean contains(int ch) {
176: return cc.contains(ch);
177: }
178: };
179: //alt = true;
180: } else {
181:
182: /*
183: * We keep the value of alt unchanged for
184: * constructions like [^[abc]fgb] by using
185: * the formula a ^ b == !a ^ !b.
186: */
187: if (curAlt) {
188: nonBitSet = new AbstractCharClass() {
189: public boolean contains(int ch) {
190: return !((curAlt ^ bits.get(ch)) || ((curAlt ^ inverted) ^ cc
191: .contains(ch)));
192: }
193: };
194: //alt = true
195: } else {
196: nonBitSet = new AbstractCharClass() {
197: public boolean contains(int ch) {
198: return (curAlt ^ bits.get(ch))
199: || ((curAlt ^ inverted) ^ cc
200: .contains(ch));
201: }
202: };
203: //alt = false
204: }
205: }
206:
207: hideBits = true;
208: } else {
209: final AbstractCharClass nb = nonBitSet;
210:
211: if (curAlt) {
212: nonBitSet = new AbstractCharClass() {
213: public boolean contains(int ch) {
214: return !(curAlt ^ (nb.contains(ch) || cc
215: .contains(ch)));
216: }
217: };
218: //alt = true
219: } else {
220: nonBitSet = new AbstractCharClass() {
221: public boolean contains(int ch) {
222: return curAlt
223: ^ (nb.contains(ch) || cc
224: .contains(ch));
225: }
226: };
227: //alt = false
228: }
229: }
230: }
231:
232: return this ;
233: }
234:
235: public CharClass add(int st, int end) {
236: if (st > end)
237: throw new IllegalArgumentException();
238: if (!ci
239:
240: //no intersection with surrogate characters
241: && (end < Character.MIN_SURROGATE || st > Character.MAX_SURROGATE)) {
242: if (!inverted) {
243: bits.set(st, end + 1);
244: } else {
245: bits.clear(st, end + 1);
246: }
247: } else {
248: for (int i = st; i < end + 1; i++) {
249: add(i);
250: }
251: }
252: return this ;
253: }
254:
255: // OR operation
256: public void union(final AbstractCharClass clazz) {
257: if (!mayContainSupplCodepoints
258: && clazz.mayContainSupplCodepoints) {
259: mayContainSupplCodepoints = true;
260: }
261:
262: if (clazz.hasUCI())
263: this .hasUCI = true;
264:
265: if (altSurrogates ^ clazz.altSurrogates) {
266:
267: //!A | B = !(A & !B)
268: if (altSurrogates) {
269: lowHighSurrogates.andNot(clazz.getLowHighSurrogates());
270:
271: //A | !B = !((A ^ B) & B)
272: } else {
273: lowHighSurrogates.xor(clazz.getLowHighSurrogates());
274: lowHighSurrogates.and(clazz.getLowHighSurrogates());
275: altSurrogates = true;
276: }
277:
278: } else {
279:
280: //!A | !B = !(A & B)
281: if (altSurrogates) {
282: lowHighSurrogates.and(clazz.getLowHighSurrogates());
283:
284: //A | B
285: } else {
286: lowHighSurrogates.or(clazz.getLowHighSurrogates());
287: }
288: }
289:
290: if (!hideBits && clazz.getBits() != null) {
291: if (alt ^ clazz.isNegative()) {
292:
293: //!A | B = !(A & !B)
294: if (alt) {
295: bits.andNot(clazz.getBits());
296:
297: //A | !B = !((A ^ B) & B)
298: } else {
299: bits.xor(clazz.getBits());
300: bits.and(clazz.getBits());
301: alt = true;
302: }
303:
304: } else {
305:
306: //!A | !B = !(A & B)
307: if (alt) {
308: bits.and(clazz.getBits());
309:
310: //A | B
311: } else {
312: bits.or(clazz.getBits());
313: }
314: }
315: } else {
316: final boolean curAlt = alt;
317:
318: if (nonBitSet == null) {
319:
320: if (!inverted && bits.isEmpty()) {
321: if (curAlt) {
322: nonBitSet = new AbstractCharClass() {
323: public boolean contains(int ch) {
324: return !clazz.contains(ch);
325: }
326: };
327: //alt = true
328: } else {
329: nonBitSet = new AbstractCharClass() {
330: public boolean contains(int ch) {
331: return clazz.contains(ch);
332: }
333: };
334: //alt = false
335: }
336: } else {
337:
338: if (curAlt) {
339: nonBitSet = new AbstractCharClass() {
340: public boolean contains(int ch) {
341: return !(clazz.contains(ch) || (curAlt ^ bits
342: .get(ch)));
343: }
344: };
345: //alt = true
346: } else {
347: nonBitSet = new AbstractCharClass() {
348: public boolean contains(int ch) {
349: return clazz.contains(ch)
350: || (curAlt ^ bits.get(ch));
351: }
352: };
353: //alt = false
354: }
355: }
356: hideBits = true;
357: } else {
358: final AbstractCharClass nb = nonBitSet;
359:
360: if (curAlt) {
361: nonBitSet = new AbstractCharClass() {
362: public boolean contains(int ch) {
363: return !((curAlt ^ nb.contains(ch)) || clazz
364: .contains(ch));
365: }
366: };
367: //alt = true
368: } else {
369: nonBitSet = new AbstractCharClass() {
370: public boolean contains(int ch) {
371: return (curAlt ^ nb.contains(ch))
372: || clazz.contains(ch);
373: }
374: };
375: //alt = false
376: }
377: }
378: }
379: }
380:
381: // AND operation
382: public void intersection(final AbstractCharClass clazz) {
383: if (!mayContainSupplCodepoints
384: && clazz.mayContainSupplCodepoints) {
385: mayContainSupplCodepoints = true;
386: }
387:
388: if (clazz.hasUCI())
389: this .hasUCI = true;
390:
391: if (altSurrogates ^ clazz.altSurrogates) {
392:
393: //!A & B = ((A ^ B) & B)
394: if (altSurrogates) {
395: lowHighSurrogates.xor(clazz.getLowHighSurrogates());
396: lowHighSurrogates.and(clazz.getLowHighSurrogates());
397: altSurrogates = false;
398:
399: //A & !B
400: } else {
401: lowHighSurrogates.andNot(clazz.getLowHighSurrogates());
402: }
403: } else {
404:
405: //!A & !B = !(A | B)
406: if (altSurrogates) {
407: lowHighSurrogates.or(clazz.getLowHighSurrogates());
408:
409: //A & B
410: } else {
411: lowHighSurrogates.and(clazz.getLowHighSurrogates());
412: }
413: }
414:
415: if (!hideBits && clazz.getBits() != null) {
416:
417: if (alt ^ clazz.isNegative()) {
418:
419: //!A & B = ((A ^ B) & B)
420: if (alt) {
421: bits.xor(clazz.getBits());
422: bits.and(clazz.getBits());
423: alt = false;
424:
425: //A & !B
426: } else {
427: bits.andNot(clazz.getBits());
428: }
429: } else {
430:
431: //!A & !B = !(A | B)
432: if (alt) {
433: bits.or(clazz.getBits());
434:
435: //A & B
436: } else {
437: bits.and(clazz.getBits());
438: }
439: }
440: } else {
441: final boolean curAlt = alt;
442:
443: if (nonBitSet == null) {
444:
445: if (!inverted && bits.isEmpty()) {
446: if (curAlt) {
447: nonBitSet = new AbstractCharClass() {
448: public boolean contains(int ch) {
449: return !clazz.contains(ch);
450: }
451: };
452: //alt = true
453: } else {
454: nonBitSet = new AbstractCharClass() {
455: public boolean contains(int ch) {
456: return clazz.contains(ch);
457: }
458: };
459: //alt = false
460: }
461: } else {
462:
463: if (curAlt) {
464: nonBitSet = new AbstractCharClass() {
465: public boolean contains(int ch) {
466: return !(clazz.contains(ch) && (curAlt ^ bits
467: .get(ch)));
468: }
469: };
470: //alt = true
471: } else {
472: nonBitSet = new AbstractCharClass() {
473: public boolean contains(int ch) {
474: return clazz.contains(ch)
475: && (curAlt ^ bits.get(ch));
476: }
477: };
478: //alt = false
479: }
480: }
481: hideBits = true;
482: } else {
483: final AbstractCharClass nb = nonBitSet;
484:
485: if (curAlt) {
486: nonBitSet = new AbstractCharClass() {
487: public boolean contains(int ch) {
488: return !((curAlt ^ nb.contains(ch)) && clazz
489: .contains(ch));
490: }
491: };
492: //alt = true
493: } else {
494: nonBitSet = new AbstractCharClass() {
495: public boolean contains(int ch) {
496: return (curAlt ^ nb.contains(ch))
497: && clazz.contains(ch);
498: }
499: };
500: //alt = false
501: }
502: }
503: }
504: }
505:
506: /**
507: * Returns <code>true</code> if character class contains symbol specified,
508: * <code>false</code> otherwise. Note: #setNegative() method changes the
509: * meaning of contains method;
510: *
511: * @param ch
512: * @return <code>true</code> if character class contains symbol specified;
513: *
514: * TODO: currently <code>character class</code> implementation based on
515: * BitSet, but this implementation possibly will be turned to combined
516: * BitSet(for first 256 symbols) and Black/Red tree for the rest of UTF.
517: */
518: public boolean contains(int ch) {
519: if (nonBitSet == null) {
520: return this .alt ^ bits.get(ch);
521: } else {
522: return alt ^ nonBitSet.contains(ch);
523: }
524: }
525:
526: protected BitSet getBits() {
527: if (hideBits)
528: return null;
529: return bits;
530: }
531:
532: protected BitSet getLowHighSurrogates() {
533: return lowHighSurrogates;
534: }
535:
536: public AbstractCharClass getInstance() {
537:
538: if (nonBitSet == null) {
539: final BitSet bs = getBits();
540:
541: AbstractCharClass res = new AbstractCharClass() {
542: public boolean contains(int ch) {
543: return this .alt ^ bs.get(ch);
544: }
545:
546: public String toString() {
547: StringBuffer temp = new StringBuffer();
548: for (int i = bs.nextSetBit(0); i >= 0; i = bs
549: .nextSetBit(i + 1)) {
550: temp.append(Character.toChars(i));
551: temp.append('|');
552: }
553:
554: if (temp.length() > 0)
555: temp.deleteCharAt(temp.length() - 1);
556:
557: return temp.toString();
558: }
559:
560: };
561: return res.setNegative(isNegative());
562: } else {
563: return this ;
564: }
565: }
566:
567: //for debugging purposes only
568: public String toString() {
569: StringBuffer temp = new StringBuffer();
570: for (int i = bits.nextSetBit(0); i >= 0; i = bits
571: .nextSetBit(i + 1)) {
572: temp.append(Character.toChars(i));
573: temp.append('|');
574: }
575:
576: if (temp.length() > 0)
577: temp.deleteCharAt(temp.length() - 1);
578:
579: return temp.toString();
580: }
581:
582: public boolean hasUCI() {
583: return hasUCI;
584: }
585: }
|