001: /**
002: * Copyright (c) 2001, Sergey A. Samokhodkin
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without modification,
006: * are permitted provided that the following conditions are met:
007: *
008: * - Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * - Redistributions in binary form
011: * must reproduce the above copyright notice, this list of conditions and the following
012: * disclaimer in the documentation and/or other materials provided with the distribution.
013: * - Neither the name of jregex nor the names of its contributors may be used
014: * to endorse or promote products derived from this software without specific prior
015: * written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
018: * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
019: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
020: * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
021: * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
022: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
023: * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
024: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
025: * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
026: *
027: * @version 1.2_01
028: */package jregex;
029:
030: import java.util.*;
031:
032: class CharacterClass extends Term implements UnicodeConstants {
033: static final Bitset DIGIT = new Bitset();
034: static final Bitset WORDCHAR = new Bitset();
035: static final Bitset SPACE = new Bitset();
036:
037: static final Bitset UDIGIT = new Bitset();
038: static final Bitset UWORDCHAR = new Bitset();
039: static final Bitset USPACE = new Bitset();
040:
041: static final Bitset NONDIGIT = new Bitset();
042: static final Bitset NONWORDCHAR = new Bitset();
043: static final Bitset NONSPACE = new Bitset();
044:
045: static final Bitset UNONDIGIT = new Bitset();
046: static final Bitset UNONWORDCHAR = new Bitset();
047: static final Bitset UNONSPACE = new Bitset();
048:
049: private static boolean namesInitialized = false;
050:
051: static final Hashtable namedClasses = new Hashtable();
052: static final Vector unicodeBlocks = new Vector();
053: static final Vector posixClasses = new Vector();
054: static final Vector unicodeCategories = new Vector();
055:
056: //modes; used in parseGroup(()
057: private final static int ADD = 1;
058: private final static int SUBTRACT = 2;
059: private final static int INTERSECT = 3;
060:
061: private static final String blockData = "0000..007F:InBasicLatin;0080..00FF:InLatin-1Supplement;0100..017F:InLatinExtended-A;"
062: + "0180..024F:InLatinExtended-B;0250..02AF:InIPAExtensions;02B0..02FF:InSpacingModifierLetters;"
063: + "0300..036F:InCombiningDiacriticalMarks;0370..03FF:InGreek;0400..04FF:InCyrillic;0530..058F:InArmenian;"
064: + "0590..05FF:InHebrew;0600..06FF:InArabic;0700..074F:InSyriac;0780..07BF:InThaana;0900..097F:InDevanagari;"
065: + "0980..09FF:InBengali;0A00..0A7F:InGurmukhi;0A80..0AFF:InGujarati;0B00..0B7F:InOriya;0B80..0BFF:InTamil;"
066: + "0C00..0C7F:InTelugu;0C80..0CFF:InKannada;0D00..0D7F:InMalayalam;0D80..0DFF:InSinhala;0E00..0E7F:InThai;"
067: + "0E80..0EFF:InLao;0F00..0FFF:InTibetan;1000..109F:InMyanmar;10A0..10FF:InGeorgian;1100..11FF:InHangulJamo;"
068: + "1200..137F:InEthiopic;13A0..13FF:InCherokee;1400..167F:InUnifiedCanadianAboriginalSyllabics;"
069: + "1680..169F:InOgham;16A0..16FF:InRunic;1780..17FF:InKhmer;1800..18AF:InMongolian;"
070: + "1E00..1EFF:InLatinExtendedAdditional;1F00..1FFF:InGreekExtended;2000..206F:InGeneralPunctuation;"
071: + "2070..209F:InSuperscriptsAndSubscripts;20A0..20CF:InCurrencySymbols;"
072: + "20D0..20FF:InCombiningMarksForSymbols;2100..214F:InLetterLikeSymbols;2150..218F:InNumberForms;"
073: + "2190..21FF:InArrows;2200..22FF:InMathematicalOperators;2300..23FF:InMiscellaneousTechnical;"
074: + "2400..243F:InControlPictures;2440..245F:InOpticalCharacterRecognition;"
075: + "2460..24FF:InEnclosedAlphanumerics;2500..257F:InBoxDrawing;2580..259F:InBlockElements;"
076: + "25A0..25FF:InGeometricShapes;2600..26FF:InMiscellaneousSymbols;2700..27BF:InDingbats;"
077: + "2800..28FF:InBraillePatterns;2E80..2EFF:InCJKRadicalsSupplement;2F00..2FDF:InKangxiRadicals;"
078: + "2FF0..2FFF:InIdeographicDescriptionCharacters;3000..303F:InCJKSymbolsAndPunctuation;"
079: + "3040..309F:InHiragana;30A0..30FF:InKatakana;3100..312F:InBopomofo;3130..318F:InHangulCompatibilityJamo;"
080: + "3190..319F:InKanbun;31A0..31BF:InBopomofoExtended;3200..32FF:InEnclosedCJKLettersAndMonths;"
081: + "3300..33FF:InCJKCompatibility;3400..4DB5:InCJKUnifiedIdeographsExtensionA;"
082: + "4E00..9FFF:InCJKUnifiedIdeographs;A000..A48F:InYiSyllables;A490..A4CF:InYiRadicals;"
083: + "AC00..D7A3:InHangulSyllables;D800..DB7F:InHighSurrogates;DB80..DBFF:InHighPrivateUseSurrogates;"
084: + "DC00..DFFF:InLowSurrogates;E000..F8FF:InPrivateUse;F900..FAFF:InCJKCompatibilityIdeographs;"
085: + "FB00..FB4F:InAlphabeticPresentationForms;FB50..FDFF:InArabicPresentationForms-A;"
086: + "FE20..FE2F:InCombiningHalfMarks;FE30..FE4F:InCJKCompatibilityForms;FE50..FE6F:InSmallFormVariants;"
087: + "FE70..FEFE:InArabicPresentationForms-B;FEFF..FEFF:InSpecials;FF00..FFEF:InHalfWidthAndFullWidthForms;"
088: + "FFF0..FFFD:InSpecials";
089:
// Configures the predefined class bitsets and registers the POSIX classes.
// The boolean passed to setDigit/setWordChar/setSpace is the "unicode" flag;
// the NON* variants are the same sets with setPositive(false) applied.
static {
    // digits
    DIGIT.setDigit(false);
    UDIGIT.setDigit(true);
    NONDIGIT.setDigit(false);
    NONDIGIT.setPositive(false);
    UNONDIGIT.setDigit(true);
    UNONDIGIT.setPositive(false);

    // word characters
    WORDCHAR.setWordChar(false);
    UWORDCHAR.setWordChar(true);
    NONWORDCHAR.setWordChar(false);
    NONWORDCHAR.setPositive(false);
    UNONWORDCHAR.setWordChar(true);
    UNONWORDCHAR.setPositive(false);

    // whitespace
    SPACE.setSpace(false);
    USPACE.setSpace(true);
    NONSPACE.setSpace(false);
    NONSPACE.setPositive(false);
    UNONSPACE.setSpace(true);
    UNONSPACE.setPositive(false);

    initPosixClasses();
}
116:
117: private static void registerClass(String name, Bitset cls,
118: Vector realm) {
119: namedClasses.put(name, cls);
120: if (!realm.contains(name))
121: realm.addElement(name);
122: }
123:
124: private static void initPosixClasses() {
125: Bitset lower = new Bitset();
126: lower.setRange('a', 'z');
127: registerClass("Lower", lower, posixClasses);
128: Bitset upper = new Bitset();
129: upper.setRange('A', 'Z');
130: registerClass("Upper", upper, posixClasses);
131: Bitset ascii = new Bitset();
132: ascii.setRange((char) 0, (char) 0x7f);
133: registerClass("ASCII", ascii, posixClasses);
134: Bitset alpha = new Bitset();
135: alpha.add(lower);
136: alpha.add(upper);
137: registerClass("Alpha", alpha, posixClasses);
138: Bitset digit = new Bitset();
139: digit.setRange('0', '9');
140: registerClass("Digit", digit, posixClasses);
141: Bitset alnum = new Bitset();
142: alnum.add(alpha);
143: alnum.add(digit);
144: registerClass("Alnum", alnum, posixClasses);
145: Bitset punct = new Bitset();
146: punct.setChars("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~");
147: registerClass("Punct", punct, posixClasses);
148: Bitset graph = new Bitset();
149: graph.add(alnum);
150: graph.add(punct);
151: registerClass("Graph", graph, posixClasses);
152: registerClass("Print", graph, posixClasses);
153: Bitset blank = new Bitset();
154: blank.setChars(" \t");
155: registerClass("Blank", blank, posixClasses);
156: Bitset cntrl = new Bitset();
157: cntrl.setRange((char) 0, (char) 0x1f);
158: cntrl.setChar((char) 0x7f);
159: registerClass("Cntrl", cntrl, posixClasses);
160: Bitset xdigit = new Bitset();
161: xdigit.setRange('0', '9');
162: xdigit.setRange('a', 'f');
163: xdigit.setRange('A', 'F');
164: registerClass("XDigit", xdigit, posixClasses);
165: Bitset space = new Bitset();
166: space.setChars(" \t\n\r\f\u000b");
167: registerClass("Space", space, posixClasses);
168: }
169:
170: private static void initNames() {
171: initNamedCategory("C", new int[] { Cn, Cc, Cf, Co, Cs });
172: initNamedCategory("Cn", Cn);
173: initNamedCategory("Cc", Cc);
174: initNamedCategory("Cf", Cf);
175: initNamedCategory("Co", Co);
176: initNamedCategory("Cs", Cs);
177:
178: initNamedCategory("L", new int[] { Lu, Ll, Lt, Lm, Lo });
179: initNamedCategory("Lu", Lu);
180: initNamedCategory("Ll", Ll);
181: initNamedCategory("Lt", Lt);
182: initNamedCategory("Lm", Lm);
183: initNamedCategory("Lo", Lo);
184:
185: initNamedCategory("M", new int[] { Mn, Me, Mc });
186: initNamedCategory("Mn", Mn);
187: initNamedCategory("Me", Me);
188: initNamedCategory("Mc", Mc);
189:
190: initNamedCategory("N", new int[] { Nd, Nl, No });
191: initNamedCategory("Nd", Nd);
192: initNamedCategory("Nl", Nl);
193: initNamedCategory("No", No);
194:
195: initNamedCategory("Z", new int[] { Zs, Zl, Zp });
196: initNamedCategory("Zs", Zs);
197: initNamedCategory("Zl", Zl);
198: initNamedCategory("Zp", Zp);
199:
200: initNamedCategory("P", new int[] { Pd, Ps, Pi, Pe, Pf, Pc, Po });
201: initNamedCategory("Pd", Pd);
202: initNamedCategory("Ps", Ps);
203: initNamedCategory("Pi", Pi);
204: initNamedCategory("Pe", Pe);
205: initNamedCategory("Pf", Pf);
206: initNamedCategory("Pc", Pc);
207: initNamedCategory("Po", Po);
208:
209: initNamedCategory("S", new int[] { Sm, Sc, Sk, So });
210: initNamedCategory("Sm", Sm);
211: initNamedCategory("Sc", Sc);
212: initNamedCategory("Sk", Sk);
213: initNamedCategory("So", So);
214:
215: Bitset bs = new Bitset();
216: bs.setCategory(Cn);
217: registerClass("UNASSIGNED", bs, unicodeCategories);
218: bs = new Bitset();
219: bs.setCategory(Cn);
220: bs.setPositive(false);
221: registerClass("ASSIGNED", bs, unicodeCategories);
222:
223: StringTokenizer st = new StringTokenizer(blockData, ".,:;");
224: while (st.hasMoreTokens()) {
225: try {
226: int first = Integer.parseInt(st.nextToken(), 16);
227: int last = Integer.parseInt(st.nextToken(), 16);
228: String name = st.nextToken();
229: initNamedBlock(name, first, last);
230: } catch (Exception e) {
231: e.printStackTrace();
232: }
233: }
234:
235: initNamedBlock("ALL", 0, 0xffff);
236:
237: namesInitialized = true;
238: //*/
239: }
240:
241: private static void initNamedBlock(String name, int first, int last) {
242: if (first < Character.MIN_VALUE || first > Character.MAX_VALUE)
243: throw new IllegalArgumentException("wrong start code ("
244: + first + ") in block " + name);
245: if (last < Character.MIN_VALUE || last > Character.MAX_VALUE)
246: throw new IllegalArgumentException("wrong end code ("
247: + last + ") in block " + name);
248: if (last < first)
249: throw new IllegalArgumentException(
250: "end code < start code in block " + name);
251: Bitset bs = (Bitset) namedClasses.get(name);
252: if (bs == null) {
253: bs = new Bitset();
254: registerClass(name, bs, unicodeBlocks);
255: }
256: bs.setRange((char) first, (char) last);
257: }
258:
259: private static void initNamedCategory(String name, int cat) {
260: Bitset bs = new Bitset();
261: bs.setCategory(cat);
262: registerClass(name, bs, unicodeCategories);
263: }
264:
265: private static void initNamedCategory(String name, int[] cats) {
266: Bitset bs = new Bitset();
267: for (int i = 0; i < cats.length; i++) {
268: bs.setCategory(cats[i]);
269: }
270: namedClasses.put(name, bs);
271: }
272:
273: private static Bitset getNamedClass(String name) {
274: if (!namesInitialized)
275: initNames();
276: return (Bitset) namedClasses.get(name);
277: }
278:
279: static void makeICase(Term term, char c) {
280: Bitset bs = new Bitset();
281: bs.setChar(Character.toLowerCase(c));
282: bs.setChar(Character.toUpperCase(c));
283: bs.setChar(Character.toTitleCase(c));
284: Bitset.unify(bs, term);
285: }
286:
287: static void makeDigit(Term term, boolean inverse, boolean unicode) {
288: Bitset digit = unicode ? inverse ? UNONDIGIT : UDIGIT
289: : inverse ? NONDIGIT : DIGIT;
290: Bitset.unify(digit, term);
291: }
292:
293: static void makeSpace(Term term, boolean inverse, boolean unicode) {
294: Bitset space = unicode ? inverse ? UNONSPACE : USPACE
295: : inverse ? NONSPACE : SPACE;
296: Bitset.unify(space, term);
297: }
298:
299: static void makeWordChar(Term term, boolean inverse, boolean unicode) {
300: Bitset wordChar = unicode ? inverse ? UNONWORDCHAR : UWORDCHAR
301: : inverse ? NONWORDCHAR : WORDCHAR;
302: Bitset.unify(wordChar, term);
303: }
304:
305: static void makeWordBoundary(Term term, boolean inverse,
306: boolean unicode) {
307: makeWordChar(term, inverse, unicode);
308: term.type = unicode ? UBOUNDARY : BOUNDARY;
309: }
310:
311: static void makeWordStart(Term term, boolean unicode) {
312: makeWordChar(term, false, unicode);
313: term.type = unicode ? UDIRECTION : DIRECTION;
314: }
315:
316: static void makeWordEnd(Term term, boolean unicode) {
317: makeWordChar(term, true, unicode);
318: term.type = unicode ? UDIRECTION : DIRECTION;
319: }
320:
321: final static void parseGroup(char[] data, int i, int out,
322: Term term, boolean icase, boolean skipspaces,
323: boolean unicode, boolean xml) throws PatternSyntaxException {
324: Bitset sum = new Bitset();
325: Bitset bs = new Bitset();
326: int mode = ADD;
327: char c;
328: for (; i < out;) {
329: switch (c = data[i++]) {
330: case '+':
331: mode = ADD;
332: continue;
333: case '-':
334: mode = SUBTRACT;
335: continue;
336: case '&':
337: mode = INTERSECT;
338: continue;
339: case '[':
340: bs.reset();
341: i = parseClass(data, i, out, bs, icase, skipspaces,
342: unicode, xml);
343: switch (mode) {
344: case ADD:
345: sum.add(bs);
346: break;
347: case SUBTRACT:
348: sum.subtract(bs);
349: break;
350: case INTERSECT:
351: sum.intersect(bs);
352: break;
353: }
354: continue;
355: case ')':
356: throw new PatternSyntaxException(
357: "unbalanced class group");
358: }
359: }
360: Bitset.unify(sum, term);
361: }
362:
363: final static int parseClass(char[] data, int i, int out, Term term,
364: boolean icase, boolean skipspaces, boolean unicode,
365: boolean xml) throws PatternSyntaxException {
366: Bitset bs = new Bitset();
367: i = parseClass(data, i, out, bs, icase, skipspaces, unicode,
368: xml);
369: Bitset.unify(bs, term);
370: return i;
371: }
372:
373: final static int parseName(char[] data, int i, int out, Term term,
374: boolean inverse, boolean skipspaces)
375: throws PatternSyntaxException {
376: StringBuffer sb = new StringBuffer();
377: i = parseName(data, i, out, sb, skipspaces);
378: Bitset bs = getNamedClass(sb.toString());
379: if (bs == null)
380: throw new PatternSyntaxException("unknow class: {" + sb
381: + "}");
382: Bitset.unify(bs, term);
383: term.inverse = inverse;
384: return i;
385: }
386:
387: /*
388: * @param mode add/subtract
389: */
390: private final static int parseClass(char[] data, int i, int out,
391: Bitset bs, boolean icase, boolean skipspaces,
392: boolean unicode, boolean xml) throws PatternSyntaxException {
393: //System.out.println("parseClass("+new String(data)+","+i+","+out+",....)");
394: char c;
395: int prev = -1;
396: boolean isFirst = true, setFirst = false, inRange = false;
397: Bitset bs1 = null;
398: StringBuffer sb = null;
399: for (; i < out; isFirst = setFirst, setFirst = false) {
400: //System.out.println(" c="+data[i]);
401: handle_special: switch (c = data[i++]) {
402: case ']':
403: //if(inRange) throw new PatternSyntaxException("[...-] is illegal");
404: if (isFirst)
405: break; //treat as normal char
406: if (inRange) {
407: bs.setChar('-');
408: }
409: if (prev >= 0) {
410: char c1 = (char) prev;
411: if (icase) {
412: bs.setChar(Character.toLowerCase(c1));
413: bs.setChar(Character.toUpperCase(c1));
414: bs.setChar(Character.toTitleCase(c1));
415: } else
416: bs.setChar(c1);
417: }
418: return i;
419:
420: case '-':
421: if (isFirst)
422: break;
423: //if(isFirst) throw new PatternSyntaxException("[-...] is illegal");
424: if (inRange)
425: break;
426: //if(inRange) throw new PatternSyntaxException("[...--...] is illegal");
427: inRange = true;
428: continue;
429:
430: case '[':
431: if (inRange && xml) { //[..-[..]]
432: if (prev >= 0)
433: bs.setChar((char) prev);
434: if (bs1 == null)
435: bs1 = new Bitset();
436: else
437: bs1.reset();
438: i = parseClass(data, i, out, bs1, icase,
439: skipspaces, unicode, xml);
440: //System.out.println(" i="+i);
441: bs.subtract(bs1);
442: inRange = false;
443: prev = -1;
444: continue;
445: } else
446: break handle_special;
447:
448: case '^':
449: //if(!isFirst) throw new PatternSyntaxException("'^' isn't a first char in a class def");
450: //bs.setPositive(false);
451: //setFirst=true;
452: //continue;
453: if (isFirst) {
454: bs.setPositive(false);
455: setFirst = true;
456: continue;
457: }
458: //treat as normal char
459: break;
460:
461: case ' ':
462: case '\r':
463: case '\n':
464: case '\t':
465: case '\f':
466: if (skipspaces)
467: continue;
468: else
469: break handle_special;
470: case '\\':
471: Bitset negatigeClass = null;
472: boolean inv = false;
473: handle_escape: switch (c = data[i++]) {
474: case 'r':
475: c = '\r';
476: break handle_special;
477:
478: case 'n':
479: c = '\n';
480: break handle_special;
481:
482: case 'e':
483: c = '\u001B';
484: break handle_special;
485:
486: case 't':
487: c = '\t';
488: break handle_special;
489:
490: case 'f':
491: c = '\f';
492: break handle_special;
493:
494: case 'u':
495: if (i >= out - 4)
496: throw new PatternSyntaxException(
497: "incomplete escape sequence \\uXXXX");
498: c = (char) ((toHexDigit(c) << 12)
499: + (toHexDigit(data[i++]) << 8)
500: + (toHexDigit(data[i++]) << 4) + toHexDigit(data[i++]));
501: break handle_special;
502:
503: case 'v':
504: c = (char) ((toHexDigit(c) << 24)
505: + (toHexDigit(data[i++]) << 16)
506: + (toHexDigit(data[i++]) << 12)
507: + (toHexDigit(data[i++]) << 8)
508: + (toHexDigit(data[i++]) << 4) + toHexDigit(data[i++]));
509: break handle_special;
510:
511: case 'b':
512: c = 8; // backspace
513: break handle_special;
514:
515: case 'x': { // hex 2-digit number
516: int hex = 0;
517: char d;
518: if ((d = data[i++]) == '{') {
519: while ((d = data[i++]) != '}') {
520: hex = (hex << 4) + toHexDigit(d);
521: }
522: if (hex > 0xffff)
523: throw new PatternSyntaxException(
524: "\\x{<out of range>}");
525: } else {
526: hex = (toHexDigit(d) << 4)
527: + toHexDigit(data[i++]);
528: }
529: c = (char) hex;
530: break handle_special;
531: }
532: case '0': // oct 2- or 3-digit number
533: case 'o': // oct 2- or 3-digit number
534: int oct = 0;
535: for (;;) {
536: char d = data[i++];
537: if (d >= '0' && d <= '7') {
538: oct *= 8;
539: oct += d - '0';
540: if (oct > 0xffff)
541: break;
542: } else {
543: i--;
544: break;
545: }
546: }
547: c = (char) oct;
548: break handle_special;
549:
550: case 'm': // decimal number -> char
551: int dec = 0;
552: for (;;) {
553: char d = data[i++];
554: if (d >= '0' && d <= '9') {
555: dec *= 10;
556: dec += d - '0';
557: if (dec > 0xffff)
558: break;
559: } else {
560: i--;
561: break;
562: }
563: }
564: c = (char) dec;
565: break handle_special;
566:
567: case 'c': // ctrl-char
568: c = (char) (data[i++] & 0x1f);
569: break handle_special;
570:
571: //classes;
572: //
573: case 'D': // non-digit
574: negatigeClass = unicode ? UNONDIGIT : NONDIGIT;
575: break handle_escape;
576:
577: case 'S': // space
578: negatigeClass = unicode ? UNONSPACE : NONSPACE;
579: break handle_escape;
580:
581: case 'W': // space
582: negatigeClass = unicode ? UNONWORDCHAR
583: : NONWORDCHAR;
584: break handle_escape;
585:
586: case 'd': // digit
587: if (inRange)
588: throw new PatternSyntaxException(
589: "illegal range: [..." + prev
590: + "-\\d...]");
591: bs.setDigit(unicode);
592: continue;
593:
594: case 's': // digit
595: if (inRange)
596: throw new PatternSyntaxException(
597: "illegal range: [..." + prev
598: + "-\\s...]");
599: bs.setSpace(unicode);
600: continue;
601:
602: case 'w': // digit
603: if (inRange)
604: throw new PatternSyntaxException(
605: "illegal range: [..." + prev
606: + "-\\w...]");
607: bs.setWordChar(unicode);
608: continue;
609:
610: case 'P': // \\P{..}
611: inv = true;
612: case 'p': // \\p{..}
613: if (inRange)
614: throw new PatternSyntaxException(
615: "illegal range: [..." + prev
616: + "-\\w...]");
617: if (sb == null)
618: sb = new StringBuffer();
619: else
620: sb.setLength(0);
621: i = parseName(data, i, out, sb, skipspaces);
622: Bitset nc = getNamedClass(sb.toString());
623: if (nc == null)
624: throw new PatternSyntaxException(
625: "unknown named class: {" + sb + "}");
626: bs.add(nc, inv);
627: continue;
628:
629: default:
630: //other escaped treat as normal
631: break handle_special;
632: }
633: //negatigeClass;
634: //\S,\D,\W
635: if (inRange)
636: throw new PatternSyntaxException(
637: "illegal range: [..." + prev + "-\\" + c
638: + "...]");
639: bs.add(negatigeClass);
640: continue;
641: /* should probably not be here...
642: case '{': //
643: if(inRange) throw new PatternSyntaxException("illegal range: [..."+prev+"-\\w...]");
644: if(sb==null) sb=new StringBuffer();
645: else sb.setLength(0);
646: i=parseName(data,i-1,out,sb,skipspaces);
647: Bitset nc=getNamedClass(sb.toString());
648: if(nc==null) throw new PatternSyntaxException("unknown named class: {"+sb+"}");
649: bs.add(nc,false);
650: continue;
651: */
652: default:
653: }
654: //c is a normal char
655: //System.out.println(" normal c="+c+", inRange="+inRange+", prev="+(char)prev);
656: if (prev < 0) {
657: prev = c;
658: inRange = false;
659: continue;
660: }
661: if (!inRange) {
662: char c1 = (char) prev;
663: if (icase) {
664: bs.setChar(Character.toLowerCase(c1));
665: bs.setChar(Character.toUpperCase(c1));
666: bs.setChar(Character.toTitleCase(c1));
667: } else
668: bs.setChar(c1);
669: prev = c;
670: } else {
671: if (prev > c)
672: throw new PatternSyntaxException("illegal range: "
673: + prev + ">" + c);
674: char c0 = (char) prev;
675: inRange = false;
676: prev = -1;
677: if (icase) {
678: bs.setRange(Character.toLowerCase(c0), Character
679: .toLowerCase(c));
680: bs.setRange(Character.toUpperCase(c0), Character
681: .toUpperCase(c));
682: bs.setRange(Character.toTitleCase(c0), Character
683: .toTitleCase(c));
684: } else
685: bs.setRange(c0, c);
686: }
687: }
688: throw new PatternSyntaxException(
689: "unbalanced brackets in a class def");
690: }
691:
692: final static int parseName(char[] data, int i, int out,
693: StringBuffer sb, boolean skipspaces)
694: throws PatternSyntaxException {
695: char c;
696: int start = -1;
697: while (i < out) {
698: switch (c = data[i++]) {
699: case '{':
700: start = i;
701: continue;
702: case '}':
703: return i;
704: case ' ':
705: case '\r':
706: case '\n':
707: case '\t':
708: case '\f':
709: if (skipspaces)
710: continue;
711: //else pass on
712: default:
713: if (start < 0)
714: throw new PatternSyntaxException(
715: "named class doesn't start with '{'");
716: sb.append(c);
717: }
718: }
719: throw new PatternSyntaxException("wrong class name: "
720: + new String(data, i, out - i));
721: }
722:
723: static String stringValue0(boolean[] arr) {
724: /*
725: System.out.println("stringValue0():");
726: System.out.println("arr="+arr);
727: for(int i=0;i<BLOCK_SIZE;i++){
728: if(arr[i]) if(i>32 && i<127)System.out.print((char)i); else System.out.print("["+i+"]");
729: }
730: System.out.println();
731: */
732: StringBuffer b = new StringBuffer();
733: int c = 0;
734:
735: loop: for (;;) {
736: while (!arr[c]) {
737: //System.out.println(c+": "+arr[c]);
738: c++;
739: if (c >= 0xff)
740: break loop;
741: }
742: int first = c;
743: while (arr[c]) {
744: //System.out.println(c+": "+arr[c]);
745: c++;
746: if (c > 0xff)
747: break;
748: }
749: int last = c - 1;
750: if (last == first)
751: b.append(stringValue(last));
752: else {
753: b.append(stringValue(first));
754: b.append('-');
755: b.append(stringValue(last));
756: }
757: if (c > 0xff)
758: break;
759: }
760: return b.toString();
761: }
762:
763: /* Mmm.. what is it?
764: static String stringValueC(boolean[] categories){
765: StringBuffer sb=new StringBuffer();
766: for(int i=0;i<categories.length;i++){
767: if(!categories[i]) continue;
768: String name=(String)unicodeCategoryNames.get(new Integer(i));
769: sb.append('{');
770: sb.append(name);
771: sb.append('}');
772: }
773: return sb.toString();
774: }
775: */
776:
777: static String stringValue2(boolean[][] arr) {
778: StringBuffer b = new StringBuffer();
779: int c = 0;
780: loop: for (;;) {
781: boolean marked = false;
782: for (;;) {
783: boolean[] marks = arr[c >> 8];
784: if (marks != null && marks[c & 255])
785: break;
786: c++;
787: if (c > 0xffff)
788: break loop;
789: }
790: int first = c;
791: for (; c <= 0xffff;) {
792: boolean[] marks = arr[c >> 8];
793: if (marks == null || !marks[c & 255])
794: break;
795: c++;
796: }
797: int last = c - 1;
798: if (last == first)
799: b.append(stringValue(last));
800: else {
801: b.append(stringValue(first));
802: b.append('-');
803: b.append(stringValue(last));
804: }
805: if (c > 0xffff)
806: break;
807: }
808: return b.toString();
809: }
810:
811: static String stringValue(int c) {
812: StringBuffer b = new StringBuffer(5);
813: if (c < 32) {
814: switch (c) {
815: case '\r':
816: b.append("\\r");
817: break;
818: case '\n':
819: b.append("\\n");
820: break;
821: case '\t':
822: b.append("\\t");
823: break;
824: case '\f':
825: b.append("\\f");
826: break;
827: default:
828: b.append('(');
829: b.append((int) c);
830: b.append(')');
831: }
832: } else if (c < 256) {
833: b.append((char) c);
834: } else {
835: b.append('\\');
836: b.append('x');
837: b.append(Integer.toHexString(c));
838: }
839: return b.toString();
840: }
841:
842: static int toHexDigit(char d) throws PatternSyntaxException {
843: int val = 0;
844: if (d >= '0' && d <= '9')
845: val = d - '0';
846: else if (d >= 'a' && d <= 'f')
847: val = 10 + d - 'a';
848: else if (d >= 'A' && d <= 'F')
849: val = 10 + d - 'A';
850: else
851: throw new PatternSyntaxException(
852: "hexadecimal digit expected: " + d);
853: return val;
854: }
855:
856: public static void main(String[] args) {
857: if (!namesInitialized)
858: initNames();
859: if (args.length == 0) {
860: System.out.println("Class usage: \\p{Class},\\P{Class}");
861: printRealm(posixClasses, "Posix classes");
862: printRealm(unicodeCategories, "Unicode categories");
863: printRealm(unicodeBlocks, "Unicode blocks");
864: } else {
865: for (int i = 0; i < args.length; i++) {
866: System.out.print(args[i]);
867: System.out.print(": ");
868: System.out
869: .println(namedClasses.containsKey(args[i]) ? "supported"
870: : "not supported");
871: }
872: }
873: /*
874: int[][] data=new int[CATEGORY_COUNT][BLOCK_SIZE+2];
875: for(int i=Character.MIN_VALUE;i<=Character.MAX_VALUE;i++){
876: int cat=Character.getType((char)i);
877: data[cat][BLOCK_SIZE]++;
878: int b=(i>>8)&0xff;
879: if(data[cat][b]==0){
880: data[cat][b]=1;
881: data[cat][BLOCK_SIZE+1]++;
882: }
883: }
884: for(int i=0;i<CATEGORY_COUNT;i++){
885: System.out.print(unicodeCategoryNames.get(new Integer(i))+": ");
886: System.out.println(data[i][BLOCK_SIZE]+" chars, "+data[i][BLOCK_SIZE+1]+" blocks, "+(data[i][BLOCK_SIZE]/data[i][BLOCK_SIZE+1])+" chars/block");
887: }
888: */
889: }
890:
891: private static void printRealm(Vector realm, String name) {
892: System.out.println(name + ":");
893: Enumeration e = realm.elements();
894: while (e.hasMoreElements()) {
895: System.out.println(" " + e.nextElement());
896: }
897: }
898: }
|