Source Code Cross Referenced for Pattern.java in  » Apache-Harmony-Java-SE » java-package » java » util » regex » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Apache Harmony Java SE » java package » java.util.regex 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /*
0002:         *  Licensed to the Apache Software Foundation (ASF) under one or more
0003:         *  contributor license agreements.  See the NOTICE file distributed with
0004:         *  this work for additional information regarding copyright ownership.
0005:         *  The ASF licenses this file to You under the Apache License, Version 2.0
0006:         *  (the "License"); you may not use this file except in compliance with
0007:         *  the License.  You may obtain a copy of the License at
0008:         *
0009:         *     http://www.apache.org/licenses/LICENSE-2.0
0010:         *
0011:         *  Unless required by applicable law or agreed to in writing, software
0012:         *  distributed under the License is distributed on an "AS IS" BASIS,
0013:         *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014:         *  See the License for the specific language governing permissions and
0015:         *  limitations under the License.
0016:         */
0017:
0018:        /**
0019:         * @author Nikolay A. Kuznetsov
0020:         * @version $Revision: 1.36.2.2 $
0021:         */package java.util.regex;
0022:
0023:        import java.io.Serializable;
0024:
0025:        import java.util.ArrayList;
0026:
0027:        import org.apache.harmony.regex.internal.nls.Messages;
0028:
0029:        /**
0030:         * Pattern implements a compiler for regular expressions as defined by the J2SE
0031:         * specification. The regular expression syntax is largely similar to the syntax
0032:         * defined by Perl 5 but has both omissions and extensions. A formal and
0033:         * complete definition of the regular expression syntax is not provided by the
0034:         * J2SE speTBD (TODO)
0035:         * 
0036:         */
0037:        public final class Pattern implements  Serializable {
0038:
0039:            private static final long serialVersionUID = 5073258162644648461L;
0040:
0041:            static final boolean _DEBUG_ = false;
0042:
0043:            /**
0044:             * @com.intel.drl.spec_ref
0045:             */
0046:            public static final int UNIX_LINES = 1 << 0;
0047:
0048:            /**
0049:             * @com.intel.drl.spec_ref
0050:             */
0051:            public static final int CASE_INSENSITIVE = 1 << 1;
0052:
0053:            /**
0054:             * @com.intel.drl.spec_ref
0055:             */
0056:            public static final int COMMENTS = 1 << 2;
0057:
0058:            /**
0059:             * @com.intel.drl.spec_ref
0060:             */
0061:            public static final int MULTILINE = 1 << 3;
0062:
0063:            /**
0064:             * @com.intel.drl.spec_ref
0065:             */
0066:            public static final int LITERAL = 1 << 4;
0067:
0068:            /**
0069:             * @com.intel.drl.spec_ref
0070:             */
0071:            public static final int DOTALL = 1 << 5;
0072:
0073:            /**
0074:             * @com.intel.drl.spec_ref
0075:             */
0076:            public static final int UNICODE_CASE = 1 << 6;
0077:
0078:            /**
0079:             * @com.intel.drl.spec_ref
0080:             */
0081:            public static final int CANON_EQ = 1 << 7;
0082:
0083:            static final int BACK_REF_NUMBER = 10;
0084:
0085:            /**
0086:             * Bit mask that includes all defined match flags
0087:             */
0088:            static final int flagsBitMask = Pattern.UNIX_LINES
0089:                    | Pattern.CASE_INSENSITIVE | Pattern.COMMENTS
0090:                    | Pattern.MULTILINE | Pattern.DOTALL | Pattern.UNICODE_CASE
0091:                    | Pattern.CANON_EQ;
0092:
0093:            /**
0094:             * Current <code>pattern</code> to be compiled;
0095:             */
0096:            private transient Lexer lexemes = null;
0097:
0098:            /**
0099:             * Pattern compile flags;
0100:             */
0101:            private int flags = 0;
0102:
0103:            private String pattern = null;
0104:
0105:            /*
0106:             * All backreferences that may be used in pattern.
0107:             */
0108:            transient private FSet backRefs[] = new FSet[BACK_REF_NUMBER];
0109:
0110:            /*
0111:             * Is true if backreferenced sets replacement is needed
0112:             */
0113:            transient private boolean needsBackRefReplacement = false;
0114:
0115:            transient private int groupIndex = -1;
0116:
0117:            transient private int globalGroupIndex = -1;
0118:
0119:            transient private int compCount = -1;
0120:
0121:            transient private int consCount = -1;
0122:
0123:            transient AbstractSet start = null;
0124:
0125:            /**
0126:             * Create a matcher for this pattern and a given input character sequence
0127:             * 
0128:             * @param cs
0129:             *            The input character sequence
0130:             * @return A new matcher
0131:             */
0132:            public Matcher matcher(CharSequence cs) {
0133:                return new Matcher(this , cs);
0134:            }
0135:
0136:            /**
0137:             * Split an input string using the pattern as a token separator.
0138:             * 
0139:             * @param input
0140:             *            Input sequence to tokenize
0141:             * @param limit
0142:             *            If positive, the maximum number of tokens to return. If
0143:             *            negative, an indefinite number of tokens are returned. If
0144:             *            zero, an indefinite number of tokens are returned but trailing
0145:             *            empty tokens are excluded.
0146:             * @return A sequence of tokens split out of the input string.
0147:             */
0148:            public String[] split(CharSequence input, int limit) {
0149:                ArrayList res = new ArrayList();
0150:                Matcher mat = matcher(input);
0151:                int index = 0;
0152:                int curPos = 0;
0153:
0154:                if (input.length() == 0) {
0155:                    return new String[] { "" }; //$NON-NLS-1$
0156:                } else {
0157:                    while (mat.find() && (index + 1 < limit || limit <= 0)) {
0158:                        res.add(input.subSequence(curPos, mat.start())
0159:                                .toString());
0160:                        curPos = mat.end();
0161:                        index++;
0162:                    }
0163:
0164:                    res.add(input.subSequence(curPos, input.length())
0165:                            .toString());
0166:                    index++;
0167:
0168:                    /*
0169:                     * discard trailing empty strings
0170:                     */
0171:                    if (limit == 0) {
0172:                        while (--index >= 0
0173:                                && res.get(index).toString().length() == 0) {
0174:                            res.remove(index);
0175:                        }
0176:                    }
0177:                }
0178:                return (String[]) res
0179:                        .toArray(new String[index >= 0 ? index : 0]);
0180:            }
0181:
0182:            /**
0183:             * @com.intel.drl.spec_ref
0184:             */
0185:            public String[] split(CharSequence input) {
0186:                return split(input, 0);
0187:            }
0188:
0189:            /**
0190:             * Returns the pattern string passed to the compile method
0191:             * 
0192:             * @return A string representation of the pattern
0193:             */
0194:            public String pattern() {
0195:                return lexemes.toString();
0196:            }
0197:
0198:            /**
0199:             * Return a textual representation of the pattern.
0200:             * 
0201:             * @return The regular expression string
0202:             */
0203:            public String toString() {
0204:                return this .pattern();
0205:            }
0206:
0207:            /**
0208:             * Return the mask of flags used to compile the pattern
0209:             * 
0210:             * @return A mask of flags used to compile the pattern.
0211:             */
0212:            public int flags() {
0213:                return this .flags;
0214:            }
0215:
0216:            /**
0217:             * Return a compiled pattern corresponding to the input regular expression
0218:             * string.
0219:             * 
0220:             * The input <code>flags</code> is a mask of the following flags:
0221:             * <dl>
0222:             * <dt><code>UNIX_LINES</code> (0x0001)
0223:             * <dd>Enables UNIX lines mode where only \n is recognized as a line
0224:             * terminator. The default setting of this flag is <em>off</em> indicating
0225:             * that all of the following character sequences are recognized as line
0226:             * terminators: \n, \r, \r\n, NEL (&#92;u0085), &#92;u2028 and &#92;u2029.
0227:             * <dt><code>CASE_INSENSITIVE</code> (0x0002)
0228:             * <dd>Directs matching to be done in a way that ignores differences in
0229:             * case. If input character sequences are encoded in character sets other
0230:             * than ASCII, then the UNICODE_CASE must also be set to enable Unicode case
0231:             * detection.
0232:             * <dt><code>UNICODE_CASE</code> (0x0040)
0233:             * <dd>Enables Unicode case folding if used in conjunction with the
0234:             * <code>CASE_INSENSITIVE</code> flag. If <code>CASE_INSENSITIVE</code>
0235:             * is not set, then this flag has no effect.
0236:             * <dt><code>COMMENTS</code> (0x0004)
0237:             * <dd>Directs the pattern compiler to ignore whitespace and comments in
0238:             * the pattern. Whitespace consists of sequences including only these
0239:             * characters: SP (&#92;u0020), HT (\t or &#92;u0009), LF (\n or ), VT
0240:             * (&#92;u000b), FF (\f or &#92;u000c), and CR (\r or ). A comment is any
0241:             * sequence of characters beginning with the "#" (&#92;u0023) character and
0242:             * ending in a LF character.
0243:             * <dt><code>MULTILINE</code> (0x0008)
0244:             * <dd>Turns on multiple line mode for matching of character sequences. By
0245:             * default, this mode is off so that the character "^" (&#92;u005e) matches
0246:             * the beginning of the entire input sequence and the character "$"
0247:             * (&#92;u0024) matches the end of the input character sequence. In multiple
0248:             * line mode, the character "^" matches any character in the input sequence
0249:             * which immediately follows a line terminator and the character "$" matches
0250:             * any character in the input sequence which immediately precedes a line
0251:             * terminator.
0252:             * <dt><code>DOTALL</code> (0x0020)
0253:             * <dd>Enables the DOT (".") character in regular expressions to match line
0254:             * terminators. By default, line terminators are not matched by DOT.
0255:             * <dt><code>CANON_EQ</code> (0x0080)
0256:             * <dd>Enables matching of character sequences which are canonically
0257:             * equivalent according to the Unicode standard. Canonical equivalence is
0258:             * described here: http://www.unicode.org/reports/tr15/. By default,
0259:             * canonical equivalence is not detected while matching.
0260:             * </dl>
0261:             * 
0262:             * @param regex
0263:             *            A regular expression string.
0264:             * @param flags
0265:             *            A set of flags to control the compilation of the pattern.
0266:             * @return A compiled pattern
0267:             * @throws PatternSyntaxException
0268:             *             If the input regular expression does not match the required
0269:             *             grammar.
0270:             */
0271:            public static Pattern compile(String regex, int flags)
0272:                    throws PatternSyntaxException {
0273:
0274:                if ((flags != 0) && ((flags | flagsBitMask) != flagsBitMask)) {
0275:
0276:                    throw new IllegalArgumentException(Messages
0277:                            .getString("regex.1C"));
0278:                }
0279:
0280:                AbstractSet.counter = 1;
0281:
0282:                return new Pattern().compileImpl(regex, flags);
0283:            }
0284:
    /**
     * Compiles {@code regex} into this pattern instance: builds the lexer,
     * parses the whole expression into the automaton rooted at
     * {@code start}, and runs the finalization pass.
     *
     * @param regex
     *            regular expression to be compiled
     * @param flags
     *            the bit mask including CASE_INSENSITIVE, MULTILINE,
     *            DOTALL, UNICODE_CASE, and CANON_EQ
     * @return this pattern, fully compiled
     * @throws PatternSyntaxException
     *             if the parser could not consume the entire input
     */
    private Pattern compileImpl(String regex, int flags)
            throws PatternSyntaxException {
        this.lexemes = new Lexer(regex, flags);
        this.flags = flags;
        this.pattern = regex;

        // -1 as the group-kind token means "top-level expression";
        // a null 'last' makes processExpression create the FinalSet.
        start = processExpression(-1, this.flags, null);
        // Leftover tokens mean the grammar rejected part of the input,
        // e.g. an unmatched ')'.
        if (!lexemes.isEmpty()) {
            throw new PatternSyntaxException(Messages
                    .getString("regex.08"), lexemes.toString(), //$NON-NLS-1$
                    lexemes.getIndex());
        }
        finalizeCompile();
        return this;
    }
0310:
    /**
     * Parses an alternation of single letters, grammar: A->(a|)+
     * Collapses a run such as "a|b|c" — single letters separated by '|' —
     * into one character-class node instead of a full alternation tree.
     *
     * @param last
     *            the node the resulting set should transition to
     * @return the range set covering all the alternated characters
     */
    private AbstractSet processAlternations(AbstractSet last) {
        CharClass auxRange = new CharClass(
                hasFlag(Pattern.CASE_INSENSITIVE),
                hasFlag(Pattern.UNICODE_CASE));
        // A letter joins the class only if what follows it is the end of
        // the pattern (lookAhead == 0), another '|', or a closing ')'.
        while (!lexemes.isEmpty()
                && lexemes.isLetter()
                && (lexemes.lookAhead() == 0
                        || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes
                        .lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS)) {
            auxRange.add(lexemes.next());
            if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR)
                lexemes.next(); // consume the '|' separator
        }
        AbstractSet rangeSet = processRangeSet(auxRange);
        rangeSet.setNext(last);

        return rangeSet;
    }
0332:
    /**
     * Parses a full expression (possibly a group body), grammar:
     * E->AE; E->S|E; E->S; A->(a|)+ E->S(|S)*
     *
     * Builds the "close" node (an FSet variant) matching the kind of group
     * being parsed, collects the '|'-separated branches as children, and
     * wraps them in the corresponding JointSet variant.
     *
     * @param ch
     *            the group-kind token (Lexer.CHAR_* constant), or -1 for
     *            the top-level expression / a plain capturing group
     * @param newFlags
     *            flags in effect inside this expression (inline flag
     *            settings may differ from the outer scope)
     * @param last
     *            the node following this expression; null only for the
     *            top-level call, which then installs a FinalSet
     * @return the root node of the parsed (sub)expression
     */
    private AbstractSet processExpression(int ch, int newFlags,
            AbstractSet last) {
        ArrayList children = new ArrayList();
        AbstractSet child;
        int saveFlags = flags;
        FSet fSet;
        boolean saveChangedFlags = false;

        if (newFlags != flags) {
            flags = newFlags;
        }

        // Choose the group-terminator node by group kind.
        switch (ch) {
        case Lexer.CHAR_NONCAP_GROUP:
            fSet = new NonCapFSet(++consCount);
            break;

        case Lexer.CHAR_POS_LOOKAHEAD:
            /* falls through */

        case Lexer.CHAR_NEG_LOOKAHEAD:
            fSet = new AheadFSet();
            break;

        case Lexer.CHAR_POS_LOOKBEHIND:
            /* falls through */

        case Lexer.CHAR_NEG_LOOKBEHIND:
            fSet = new BehindFSet(++consCount);
            break;

        case Lexer.CHAR_ATOMIC_GROUP:
            fSet = new AtomicFSet(++consCount);
            break;

        default:
            // Capturing group (or the whole pattern): allocate the next
            // group index.
            globalGroupIndex++;
            if (last == null) {

                // Top-level call: terminate with the automaton's FinalSet
                // and keep any inline flag changes in effect.
                // expr = new StartSet();
                fSet = new FinalSet();
                saveChangedFlags = true;
            } else {

                // expr = new JointSet(globalGroupIndex);
                fSet = new FSet(globalGroupIndex);
            }
            // Remember groups 0..9 so back-references \0..\9 can be
            // resolved later (see backRefs / BACK_REF_NUMBER).
            if (globalGroupIndex > -1 && globalGroupIndex < 10) {
                backRefs[globalGroupIndex] = fSet;
            }
            break;
        }

        // Collect the '|'-separated branches until ')' or end of input.
        do {
            if (lexemes.isLetter()
                    && lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR) {
                // Single letter directly followed by '|': fold a run of
                // such letters into one character class.
                child = processAlternations(fSet);
            } else if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) {
                // Empty branch, e.g. the gap in "a||b".
                child = new EmptySet(fSet);
                lexemes.next();
            } else {
                child = processSubExpression(fSet);
                if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) {
                    lexemes.next();
                }
            }
            if (child != null) {

                //expr.addChild(child);
                children.add(child);
            }
        } while (!(lexemes.isEmpty() || (lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS)));

        // A trailing '|' (e.g. "a|") contributes one more empty branch.
        if (lexemes.back() == Lexer.CHAR_VERTICAL_BAR) {
            children.add(new EmptySet(fSet));
        }

        // Restore the caller's flags unless this is the top level, where
        // inline flag changes stay in force.
        if (flags != saveFlags && !saveChangedFlags) {
            flags = saveFlags;
            lexemes.restoreFlags(flags);
        }

        // Wrap the branches in the joint-set variant matching the group
        // kind chosen above.
        switch (ch) {
        case Lexer.CHAR_NONCAP_GROUP:
            return new NonCapJointSet(children, fSet);

        case Lexer.CHAR_POS_LOOKAHEAD:
            return new PositiveLookAhead(children, fSet);

        case Lexer.CHAR_NEG_LOOKAHEAD:
            return new NegativeLookAhead(children, fSet);

        case Lexer.CHAR_POS_LOOKBEHIND:
            return new PositiveLookBehind(children, fSet);

        case Lexer.CHAR_NEG_LOOKBEHIND:
            return new NegativeLookBehind(children, fSet);

        case Lexer.CHAR_ATOMIC_GROUP:
            return new AtomicJointSet(children, fSet);

        default:
            // Plain group: pick the cheapest representation for the
            // number of branches collected.
            switch (children.size()) {
            case 0:
                return new EmptySet(fSet);

            case 1:
                return new SingleSet((AbstractSet) children.get(0),
                        fSet);

            default:
                return new JointSet(children, fSet);
            }
        }
    }
0451:
    /**
     * Parses a literal character run, grammar: T->a+
     * Greedily collects consecutive ordinary letters into one string node
     * so that literal substrings are matched in a single step rather than
     * one node per character.
     *
     * @param last
     *            the following node (linking is done by the caller)
     * @return a SequenceSet, or its case-insensitive / Unicode
     *         case-insensitive variant depending on the compile flags
     */
    private AbstractSet processSequence(AbstractSet last) {
        StringBuffer substring = new StringBuffer();

        // Accept a letter only when the token after it cannot start a
        // quantifier: end of pattern, another plain letter, ')', '(',
        // '|' or '$'. Surrogate halves are excluded — they are handled
        // by processTerminal.
        while (!lexemes.isEmpty()
                && lexemes.isLetter()
                && !lexemes.isHighSurrogate()
                && !lexemes.isLowSurrogate()
                && ((!lexemes.isNextSpecial() && lexemes.lookAhead() == 0) // end
                        // of
                        // pattern
                        || (!lexemes.isNextSpecial() && Lexer
                                .isLetter(lexemes.lookAhead()))
                        || lexemes.lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS
                        // NOTE(review): the 0x8000ffff mask presumably strips
                        // group-kind modifier bits from the token so every
                        // '(' variant compares equal — confirm against the
                        // Lexer token encoding.
                        || (lexemes.lookAhead() & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS
                        || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes
                        .lookAhead() == Lexer.CHAR_DOLLAR)) {
            int ch = lexemes.next();

            if (Character.isSupplementaryCodePoint(ch)) {
                // Supplementary code points expand to a surrogate pair.
                substring.append(Character.toChars(ch));
            } else {
                substring.append((char) ch);
            }
        }
        if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
            return new SequenceSet(substring);
        } else if (!hasFlag(Pattern.UNICODE_CASE)) {
            return new CISequenceSet(substring);
        } else {
            return new UCISequenceSet(substring);
        }
    }
0487:
    /**
     * Parses a single (possibly decomposed) character for CANON_EQ mode,
     * grammar: D->a
     * Either recognizes a decomposed Hangul syllable (jamo L, LV or LVT)
     * or collects a base code point plus combining marks into one
     * DecomposedCharSet.
     *
     * @param last
     *            the following node (linking is done by the caller)
     * @return the node matching this decomposed character
     */
    private AbstractSet processDecomposedChar(AbstractSet last) {
        int[] codePoints = new int[Lexer.MAX_DECOMPOSITION_LENGTH];
        char[] codePointsHangul;
        int readCodePoints = 0;
        int curSymb = -1;
        int curSymbIndex = -1;

        if (!lexemes.isEmpty() && lexemes.isLetter()) {
            curSymb = lexemes.next();
            codePoints[readCodePoints] = curSymb;
            // Offset from the first leading-jamo code point (LBase);
            // a value in [0, LCount) identifies a leading consonant.
            curSymbIndex = curSymb - Lexer.LBase;
        }

        /*
         * We process decomposed Hangul syllable LV or LVT or process jamo L.
         * See http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf
         * "3.12 Conjoining Jamo Behavior"
         */
        if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.LCount)) {
            codePointsHangul = new char[Lexer.MAX_HANGUL_DECOMPOSITION_LENGTH];
            codePointsHangul[readCodePoints++] = (char) curSymb;

            // Try to extend L with a vowel jamo (V), then a trailing
            // consonant jamo (T).
            curSymb = lexemes.peek();
            curSymbIndex = curSymb - Lexer.VBase;
            if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.VCount)) {
                codePointsHangul[readCodePoints++] = (char) curSymb;
                lexemes.next();
                curSymb = lexemes.peek();
                curSymbIndex = curSymb - Lexer.TBase;
                if ((curSymbIndex >= 0)
                        && (curSymbIndex < Lexer.TCount)) {
                    codePointsHangul[readCodePoints++] = (char) curSymb;
                    lexemes.next();

                    //LVT syllable
                    return new HangulDecomposedCharSet(
                            codePointsHangul, 3);
                } else {

                    //LV syllable
                    return new HangulDecomposedCharSet(
                            codePointsHangul, 2);
                }
            } else {

                //L jamo
                // A lone leading jamo is just an ordinary character;
                // pick the char-set variant matching the case flags.
                if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
                    return new CharSet(codePointsHangul[0]);
                } else if (!hasFlag(Pattern.UNICODE_CASE)) {
                    return new CICharSet(codePointsHangul[0]);
                } else {
                    return new UCICharSet(codePointsHangul[0]);
                }
            }

            /*
             * We process single codepoint or decomposed codepoint.
             * We collect decomposed codepoint and obtain 
             * one DecomposedCharSet.
             */
        } else {
            readCodePoints++;

            // Gather combining code points up to the decomposition
            // length limit or the next character boundary.
            while ((readCodePoints < Lexer.MAX_DECOMPOSITION_LENGTH)
                    && !lexemes.isEmpty() && lexemes.isLetter()
                    && !Lexer.isDecomposedCharBoundary(lexemes.peek())) {
                codePoints[readCodePoints++] = lexemes.next();
            }

            /*
             * We have read an ordinary symbol.
             */
            if (readCodePoints == 1
                    && !Lexer
                            .hasSingleCodepointDecomposition(codePoints[0])) {
                return processCharSet(codePoints[0]);
            } else {
                if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
                    return new DecomposedCharSet(codePoints,
                            readCodePoints);
                } else if (!hasFlag(Pattern.UNICODE_CASE)) {
                    return new CIDecomposedCharSet(codePoints,
                            readCodePoints);
                } else {
                    return new UCIDecomposedCharSet(codePoints,
                            readCodePoints);
                }
            }
        }
    }
0581:
    /**
     * Parses one branch of an expression, grammar:
     * S->BS; S->QS; S->Q; B->a+
     * Produces a chain of nodes: a literal run, a quantified term, or an
     * empty set, then recurses to link the rest of the branch behind it.
     *
     * @param last
     *            the node that terminates this branch (the group's FSet)
     * @return the head of the parsed chain, or null if nothing was parsed
     * @throws PatternSyntaxException
     *             on an unmatched ')' at the top level
     */
    private AbstractSet processSubExpression(AbstractSet last) {
        AbstractSet cur;
        if (lexemes.isLetter() && !lexemes.isNextSpecial()
                && Lexer.isLetter(lexemes.lookAhead())) {
            if (hasFlag(Pattern.CANON_EQ)) {
                // Canonical-equivalence mode: read one decomposed char,
                // then apply a quantifier if one follows.
                cur = processDecomposedChar(last);
                if (!lexemes.isEmpty()

                        /* && !pattern.isQuantifier() */
                        && (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS || last instanceof FinalSet)
                        && lexemes.peek() != Lexer.CHAR_VERTICAL_BAR
                        && !lexemes.isLetter()) {
                    cur = processQuantifier(last, cur);
                }
            } else if (lexemes.isHighSurrogate()
                    || lexemes.isLowSurrogate()) {
                // Lone surrogate halves go through processTerminal rather
                // than being folded into a literal run.
                AbstractSet term = processTerminal(last);
                cur = processQuantifier(last, term);
            } else {
                cur = processSequence(last);
            }
        } else if (lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS) {
            if (last instanceof FinalSet) {
                // ')' with no open group: syntax error.
                throw new PatternSyntaxException(Messages
                        .getString("regex.09"), lexemes.toString(), //$NON-NLS-1$
                        lexemes.getIndex());
            } else {
                cur = new EmptySet(last);
            }
        } else {
            AbstractSet term = processTerminal(last);
            cur = processQuantifier(last, term);
        }

        // Link the rest of the branch behind 'cur' unless the branch ends
        // here (')' of an enclosing group, '|', or end of input).
        if (!lexemes.isEmpty()
                // && !pattern.isQuantifier()
                && (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS || last instanceof FinalSet)
                && lexemes.peek() != Lexer.CHAR_VERTICAL_BAR) {
            AbstractSet next = processSubExpression(last);
            if (cur instanceof LeafQuantifierSet
                    // TODO create personal UnifiedQuantifierSet for composite
                    // quantifiers
                    // to take into account Quantifier counters
                    // ////
                    && !(cur instanceof CompositeQuantifierSet)
                    && !(cur instanceof GroupQuantifierSet)
                    && !(cur instanceof AltQuantifierSet)
                    && !next.first(((LeafQuantifierSet) cur)
                            .getInnerSet())) {
                // Optimization: a simple quantifier whose inner set cannot
                // start 'next' can run without backtracking bookkeeping.
                cur = new UnifiedQuantifierSet((LeafQuantifierSet) cur);
            }
            // Type '+' marks a possessive-style node whose inner set is
            // the real continuation point.
            if (((char) next.getType()) == '+') {
                cur.setNext(((LeafQuantifierSet) next).getInnerSet());
            } else {
                cur.setNext(next);
            }
        } else if (cur != null) {
            cur.setNext(last);
        } else {
            return null;
        }

        if (((char) cur.getType()) == '+') {
            return ((QuantifierSet) cur).getInnerSet();
        } else {
            return cur;
        }
    }
0653:
            /**
             * Q->T(*|+|?...) also do some optimizations.
             *
             * Wraps the terminal {@code term} in the quantifier node that
             * corresponds to the next lexeme, if that lexeme is a quantifier
             * at all. Non-leaf terminals (groups, dot-sets) get the *Group*
             * quantifier variants; leaf terminals get the cheaper leaf
             * variants.
             *
             * @param last the node control transfers to after this terminal
             * @param term the terminal produced by processTerminal(); may be
             *        null (see NOTE in the leaf branch below)
             * @return the quantified node, or {@code term} unchanged when no
             *         quantifier follows
             */
            private AbstractSet processQuantifier(AbstractSet last,
                    AbstractSet term) {
                // The quantifier (if any) immediately follows the terminal.
                int quant = lexemes.peek();

                if (term != null && !(term instanceof LeafSet)) {
                    // Non-leaf terminal: group, dot-set, etc.
                    switch (quant) {
                    case Lexer.QUANT_STAR:
                    case Lexer.QUANT_PLUS: {
                        QuantifierSet q;

                        lexemes.next();
                        if (term.getType() == AbstractSet.TYPE_DOTSET) {
                            // ".*" / ".+" get specialized quantifiers that
                            // know about line terminators (unless DOTALL).
                            if (!hasFlag(Pattern.DOTALL)) {
                                q = new DotQuantifierSet(term, last, quant,
                                        AbstractLineTerminator
                                                .getInstance(flags));
                            } else {
                                q = new DotAllQuantifierSet(term, last, quant);
                            }
                        } else {
                            q = new GroupQuantifierSet(term, last, quant);
                        }
                        // Close the loop: after a successful match of term,
                        // control returns to the quantifier node.
                        term.setNext(q);
                        return q;
                    }

                    case Lexer.QUANT_STAR_R:
                    case Lexer.QUANT_PLUS_R: {
                        // Reluctant "*?" / "+?".
                        lexemes.next();
                        GroupQuantifierSet q = new ReluctantGroupQuantifierSet(
                                term, last, quant);
                        term.setNext(q);
                        return q;
                    }

                    case Lexer.QUANT_PLUS_P: {
                        lexemes.next();
                        // possessive plus will be handled by unique class
                        // and should not be postprocessed to point previous set
                        // to the inner one; QUANT_STAR_P (not quant) is passed
                        // as the node type so the '+'-postprocessing in the
                        // caller leaves this node alone.
                        return new PosPlusGroupQuantifierSet(term, last,
                                Lexer.QUANT_STAR_P);
                    }

                    case Lexer.QUANT_STAR_P: {
                        // Possessive "*+".
                        lexemes.next();
                        return new PossessiveGroupQuantifierSet(term, last,
                                quant);
                    }

                    case Lexer.QUANT_ALT: {
                        // "?": on failure the term falls straight through to
                        // last, hence term.setNext(last), not q.
                        lexemes.next();
                        AltGroupQuantifierSet q = new AltGroupQuantifierSet(
                                term, last, Lexer.QUANT_ALT);
                        term.setNext(last);
                        return q;
                    }

                    case Lexer.QUANT_ALT_P: {
                        // Possessive "?+".
                        lexemes.next();
                        return new PosAltGroupQuantifierSet(term, last,
                                Lexer.QUANT_ALT);
                    }

                    case Lexer.QUANT_ALT_R: {
                        // Reluctant "??".
                        lexemes.next();
                        RelAltGroupQuantifierSet q = new RelAltGroupQuantifierSet(
                                term, last, Lexer.QUANT_ALT);
                        term.setNext(last);
                        return q;
                    }

                    case Lexer.QUANT_COMP: {
                        // Counted quantifier "{n,m}": the Quantifier special
                        // lexeme carries the bounds; ++compCount allocates a
                        // counter slot for the matcher.
                        CompositeGroupQuantifierSet q = new CompositeGroupQuantifierSet(
                                (Quantifier) lexemes.nextSpecial(), term, last,
                                Lexer.QUANT_ALT, ++compCount);
                        term.setNext(q);
                        return q;
                    }

                    case Lexer.QUANT_COMP_P: {
                        // Possessive "{n,m}+".
                        return new PosCompositeGroupQuantifierSet(
                                (Quantifier) lexemes.nextSpecial(), term, last,
                                Lexer.QUANT_ALT, ++compCount);
                    }

                    case Lexer.QUANT_COMP_R: {
                        // Reluctant "{n,m}?".
                        RelCompositeGroupQuantifierSet q = new RelCompositeGroupQuantifierSet(
                                (Quantifier) lexemes.nextSpecial(), term, last,
                                Lexer.QUANT_ALT, ++compCount);
                        term.setNext(q);
                        return q;
                    }

                    default:
                        // Next lexeme is not a quantifier: return the
                        // terminal unchanged.
                        return term;
                    }
                } else {
                    // Leaf terminal (single character, range, ...).
                    // NOTE(review): leaf stays null when term == null; the
                    // STAR/PLUS branches below would then NPE on
                    // leaf.setNext(q). Presumably the grammar guarantees a
                    // non-null term whenever a quantifier lexeme follows -
                    // confirm.
                    LeafSet leaf = null;
                    if (term != null)
                        leaf = (LeafSet) term;
                    switch (quant) {
                    case Lexer.QUANT_STAR:
                    case Lexer.QUANT_PLUS: {
                        lexemes.next();
                        LeafQuantifierSet q = new LeafQuantifierSet(leaf, last,
                                quant);
                        leaf.setNext(q);
                        return q;
                    }

                    case Lexer.QUANT_STAR_R:
                    case Lexer.QUANT_PLUS_R: {
                        // Reluctant "*?" / "+?".
                        lexemes.next();
                        ReluctantQuantifierSet q = new ReluctantQuantifierSet(
                                leaf, last, quant);
                        leaf.setNext(q);
                        return q;
                    }

                    case Lexer.QUANT_PLUS_P:
                    case Lexer.QUANT_STAR_P: {
                        // Possessive "*+" / "++".
                        lexemes.next();
                        PossessiveQuantifierSet q = new PossessiveQuantifierSet(
                                leaf, last, quant);
                        leaf.setNext(q);
                        return q;
                    }

                    case Lexer.QUANT_ALT: {
                        // "?" over a leaf.
                        lexemes.next();
                        return new AltQuantifierSet(leaf, last, Lexer.QUANT_ALT);
                    }

                    case Lexer.QUANT_ALT_P: {
                        // Possessive "?+".
                        lexemes.next();
                        return new PossessiveAltQuantifierSet(leaf, last,
                                Lexer.QUANT_ALT_P);
                    }

                    case Lexer.QUANT_ALT_R: {
                        // Reluctant "??".
                        lexemes.next();
                        return new ReluctantAltQuantifierSet(leaf, last,
                                Lexer.QUANT_ALT_R);
                    }

                    case Lexer.QUANT_COMP: {
                        // Counted "{n,m}" over a leaf; no shared counter slot
                        // is allocated here, unlike the group variant above.
                        return new CompositeQuantifierSet((Quantifier) lexemes
                                .nextSpecial(), leaf, last, Lexer.QUANT_COMP);
                    }

                    case Lexer.QUANT_COMP_P: {
                        return new PossessiveCompositeQuantifierSet(
                                (Quantifier) lexemes.nextSpecial(), leaf, last,
                                Lexer.QUANT_COMP_P);
                    }
                    case Lexer.QUANT_COMP_R: {
                        return new ReluctantCompositeQuantifierSet(
                                (Quantifier) lexemes.nextSpecial(), leaf, last,
                                Lexer.QUANT_COMP_R);
                    }

                    default:
                        // No quantifier follows.
                        return term;
                    }
                }
            }
0826:
            /**
             * T-> letter|[range]|{char-class}|(E)
             *
             * Parses a single terminal of the pattern grammar and returns the
             * automaton node built for it. Loops only while the lexeme is an
             * inlined flags group "(?idmsux)", which changes {@code flags}
             * and produces no terminal of its own.
             */
            private AbstractSet processTerminal(AbstractSet last) {
                int ch;
                AbstractSet term = null;
                do {
                    ch = lexemes.peek();
                    // Group-start lexemes have extra data packed into the
                    // upper bits; masking with 0x8000ffff recovers the base
                    // CHAR_LEFT_PARENTHESIS token.
                    if ((ch & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS) {
                        int newFlags;
                        lexemes.next();
                        // Flags for "(?...)" constructs are packed into bits
                        // 16-23 of the lexeme.
                        newFlags = (ch & 0x00ff0000) >> 16;
                        ch = ch & 0xff00ffff;
                        if (ch == Lexer.CHAR_FLAGS) {
                            // "(?idmsux)": just switch flags; no terminal is
                            // produced, so the do-while loops once more.
                            flags = newFlags;
                        } else {
                            // A non-capturing group may carry its own flags;
                            // every other group kind inherits the current
                            // ones.
                            newFlags = (ch == Lexer.CHAR_NONCAP_GROUP) ? newFlags
                                    : flags;
                            term = processExpression(ch, newFlags, last);
                            if (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS) {
                                throw new PatternSyntaxException(
                                        Messages.getString("regex.0A"), lexemes.toString(), //$NON-NLS-1$
                                        lexemes.getIndex());
                            }
                            lexemes.next();
                        }
                    } else
                        switch (ch) {
                        case Lexer.CHAR_LEFT_SQUARE_BRACKET: {
                            // "[...]" character class; '^' right after '['
                            // negates it.
                            lexemes.next();
                            boolean negative = false;
                            if (lexemes.peek() == Lexer.CHAR_CARET) {
                                negative = true;
                                lexemes.next();
                            }

                            term = processRange(negative, last);
                            if (lexemes.peek() != Lexer.CHAR_RIGHT_SQUARE_BRACKET)
                                throw new PatternSyntaxException(
                                        Messages.getString("regex.0B"), lexemes.toString(), //$NON-NLS-1$
                                        lexemes.getIndex());
                            // Leave range-lexing mode before consuming ']'.
                            lexemes.setMode(Lexer.MODE_PATTERN);
                            lexemes.next();
                            break;
                        }

                        case Lexer.CHAR_DOT: {
                            // ".": any char; DOTALL decides whether line
                            // terminators are included.
                            lexemes.next();

                            if (!hasFlag(Pattern.DOTALL)) {
                                term = new DotSet(AbstractLineTerminator
                                        .getInstance(flags));
                            } else {
                                term = new DotAllSet();
                            }

                            break;
                        }

                        case Lexer.CHAR_CARET: {
                            // "^": start-of-line/input anchor.
                            lexemes.next();
                            consCount++;
                            if (!hasFlag(Pattern.MULTILINE)) {
                                term = new SOLSet();
                            } else {
                                term = new MultiLineSOLSet(
                                        AbstractLineTerminator
                                                .getInstance(flags));
                            }

                            break;
                        }

                        case Lexer.CHAR_DOLLAR: {
                            // "$": end-of-line/input anchor; UNIX_LINES
                            // selects the '\n'-only variants.
                            lexemes.next();
                            consCount++;
                            if (!hasFlag(Pattern.MULTILINE)) {
                                if (!hasFlag(Pattern.UNIX_LINES)) {
                                    term = new EOLSet(consCount);
                                } else {
                                    term = new UEOLSet(consCount);
                                }
                            } else {
                                if (!hasFlag(Pattern.UNIX_LINES)) {
                                    term = new MultiLineEOLSet(consCount);
                                } else {
                                    term = new UMultiLineEOLSet(consCount);
                                }
                            }

                            break;
                        }

                        case Lexer.CHAR_WORD_BOUND: {
                            // "\b"
                            lexemes.next();
                            term = new WordBoundary(true);
                            break;
                        }

                        case Lexer.CHAR_NONWORD_BOUND: {
                            // "\B"
                            lexemes.next();
                            term = new WordBoundary(false);
                            break;
                        }

                        case Lexer.CHAR_END_OF_INPUT: {
                            // "\z"
                            lexemes.next();
                            term = new EOISet();
                            break;
                        }

                        case Lexer.CHAR_END_OF_LINE: {
                            // "\Z"
                            lexemes.next();
                            term = new EOLSet(++consCount);
                            break;
                        }

                        case Lexer.CHAR_START_OF_INPUT: {
                            // "\A"
                            lexemes.next();
                            term = new SOLSet();
                            break;
                        }

                        case Lexer.CHAR_PREVIOUS_MATCH: {
                            // "\G"
                            lexemes.next();
                            term = new PreviousMatch();
                            break;
                        }

                        // "\1".."\9" back references: the lexer marks the
                        // escaped digit by setting the sign bit.
                        case 0x80000000 | '1':
                        case 0x80000000 | '2':
                        case 0x80000000 | '3':
                        case 0x80000000 | '4':
                        case 0x80000000 | '5':
                        case 0x80000000 | '6':
                        case 0x80000000 | '7':
                        case 0x80000000 | '8':
                        case 0x80000000 | '9': {
                            int number = (ch & 0x7FFFFFFF) - '0';
                            // Valid only if the referenced group was already
                            // compiled.
                            if (globalGroupIndex >= number) {
                                lexemes.next();
                                consCount++;
                                if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
                                    term = new BackReferenceSet(number,
                                            consCount);
                                } else if (!hasFlag(Pattern.UNICODE_CASE)) {
                                    term = new CIBackReferenceSet(number,
                                            consCount);
                                } else {
                                    term = new UCIBackReferenceSet(number,
                                            consCount);
                                }
                                // Flag the group so the second compilation
                                // pass (finalizeCompile) can rewrite it.
                                (backRefs[number]).isBackReferenced = true;
                                needsBackRefReplacement = true;
                                break;
                            } else {
                                throw new PatternSyntaxException(Messages
                                        .getString("regex.0C") //$NON-NLS-1$
                                        , lexemes.toString(), lexemes
                                                .getIndex());
                            }
                        }

                        case 0: {
                            // Lexeme 0 is either a predefined char class
                            // carried as a "special" (e.g. "\d", "\p{...}"),
                            // a literal NUL character, or end of input.
                            AbstractCharClass cc = null;
                            if ((cc = (AbstractCharClass) lexemes.peekSpecial()) != null) {
                                term = processRangeSet(cc);
                            } else if (!lexemes.isEmpty()) {

                                //ch == 0
                                term = new CharSet((char) ch);
                            } else {
                                term = new EmptySet(last);
                                break;
                            }
                            lexemes.next();
                            break;
                        }

                        default: {
                            if (ch >= 0 && !lexemes.isSpecial()) {
                                // Ordinary character (possibly a surrogate or
                                // supplementary code point).
                                term = processCharSet(ch);
                                lexemes.next();
                            } else if (ch == Lexer.CHAR_VERTICAL_BAR) {
                                // Empty alternative, e.g. "a|".
                                term = new EmptySet(last);
                            } else if (ch == Lexer.CHAR_RIGHT_PARENTHESIS) {
                                // Empty group body, e.g. "()"; an unmatched
                                // ')' at top level is an error.
                                if (last instanceof FinalSet) {
                                    throw new PatternSyntaxException(
                                            Messages.getString("regex.09"), lexemes.toString(), //$NON-NLS-1$
                                            lexemes.getIndex());
                                } else {
                                    term = new EmptySet(last);
                                }
                            } else {
                                throw new PatternSyntaxException(Messages
                                        .getString("regex.0D", //$NON-NLS-1$
                                                (lexemes.isSpecial() ? lexemes
                                                        .peekSpecial()
                                                        .toString() : Character
                                                        .toString((char) ch))),
                                        lexemes.toString(), lexemes.getIndex());
                            }
                        }
                        }
                    // Loop again only when an inlined flags group was
                    // consumed (it produced no terminal).
                } while (ch == Lexer.CHAR_FLAGS);
                return term;
            }
1034:
1035:            private AbstractSet processRange(boolean negative, AbstractSet last) {
1036:                AbstractCharClass res = processRangeExpression(negative);
1037:                AbstractSet rangeSet = processRangeSet(res);
1038:                rangeSet.setNext(last);
1039:
1040:                return rangeSet;
1041:            }
1042:
            /**
             * process [...] ranges
             *
             * Parses the interior of a "[...]" character class into a
             * CharClass; {@code alt} is true for a negated class ("[^...]").
             * The closing ']' is left unconsumed for the caller to verify.
             */
            private CharClass processRangeExpression(boolean alt) {
                CharClass res = new CharClass(alt,
                        hasFlag(Pattern.CASE_INSENSITIVE),
                        hasFlag(Pattern.UNICODE_CASE));
                // Last literal char seen but not yet committed to the class;
                // kept pending so a following '-' can use it as a range
                // start. -1 means "nothing pending".
                int buffer = -1;
                // True after "&&" when the next "[...]" must be intersected
                // rather than unioned.
                boolean intersection = false;
                // Records whether we were still inside the class when the
                // loop stopped (detects an unterminated "[...]").
                boolean notClosed = false;
                // ']' and some other characters are literals only when they
                // appear first in the class.
                boolean firstInClass = true;

                while (!lexemes.isEmpty()
                        && (notClosed = (lexemes.peek()) != Lexer.CHAR_RIGHT_SQUARE_BRACKET
                                || firstInClass)) {
                    switch (lexemes.peek()) {

                    case Lexer.CHAR_RIGHT_SQUARE_BRACKET: {
                        // Only reachable while firstInClass: a leading ']'
                        // is a literal.
                        if (buffer >= 0)
                            res.add(buffer);
                        buffer = ']';
                        lexemes.next();
                        break;
                    }
                    case Lexer.CHAR_LEFT_SQUARE_BRACKET: {
                        // Nested class: union by default, intersection when
                        // preceded by "&&".
                        if (buffer >= 0) {
                            res.add(buffer);
                            buffer = -1;
                        }
                        lexemes.next();
                        boolean negative = false;
                        if (lexemes.peek() == Lexer.CHAR_CARET) {
                            lexemes.next();
                            negative = true;
                        }

                        if (intersection)
                            res.intersection(processRangeExpression(negative));
                        else
                            res.union(processRangeExpression(negative));
                        intersection = false;
                        lexemes.next();
                        break;
                    }

                    case Lexer.CHAR_AMPERSAND: {
                        if (buffer >= 0)
                            res.add(buffer);
                        buffer = lexemes.next();

                        /*
                         * if there is a start for subrange we will do an intersection
                         * otherwise treat '&' as a normal character
                         */
                        if (lexemes.peek() == Lexer.CHAR_AMPERSAND) {
                            if (lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET) {
                                lexemes.next();
                                intersection = true;
                                buffer = -1;
                            } else {
                                lexemes.next();
                                if (firstInClass) {

                                    //skip "&&" at "[&&...]" or "[^&&...]"
                                    res = processRangeExpression(false);
                                } else {

                                    //ignore "&&" at "[X&&]" ending where X != empty string
                                    if (!(lexemes.peek() == Lexer.CHAR_RIGHT_SQUARE_BRACKET)) {
                                        res
                                                .intersection(processRangeExpression(false));
                                    }
                                }

                            }
                        } else {

                            //treat '&' as a normal character
                            buffer = '&';
                        }

                        break;
                    }

                    case Lexer.CHAR_HYPHEN: {
                        if (firstInClass
                                || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET
                                || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET
                                || buffer < 0) {
                            // treat hyphen as normal character
                            if (buffer >= 0)
                                res.add(buffer);
                            buffer = '-';
                            lexemes.next();
                            // range
                        } else {
                            lexemes.next();
                            int cur = lexemes.peek();

                            if (!lexemes.isSpecial()
                                    && (cur >= 0
                                            || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET
                                            || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET || buffer < 0)) {

                                try {
                                    // Strip lexer decoration from non-letter
                                    // lexemes to recover the raw code unit.
                                    if (!Lexer.isLetter(cur)) {
                                        cur = cur & 0xFFFF;
                                    }
                                    res.add(buffer, cur);
                                } catch (Exception e) {
                                    // CharClass.add rejects an inverted range
                                    // (start > end).
                                    throw new PatternSyntaxException(Messages
                                            .getString("regex.0E"), //$NON-NLS-1$
                                            pattern(), lexemes.getIndex());
                                }
                                lexemes.next();
                                buffer = -1;
                            } else {
                                throw new PatternSyntaxException(Messages
                                        .getString("regex.0E"), //$NON-NLS-1$
                                        pattern(), lexemes.getIndex());
                            }
                        }

                        break;
                    }

                    case Lexer.CHAR_CARET: {
                        // '^' anywhere but the start is a literal.
                        if (buffer >= 0)
                            res.add(buffer);
                        buffer = '^';
                        lexemes.next();
                        break;
                    }

                    case 0: {
                        // Lexeme 0: either a predefined char class carried
                        // as a "special", or a literal NUL character.
                        if (buffer >= 0)
                            res.add(buffer);
                        AbstractCharClass cs = (AbstractCharClass) lexemes
                                .peekSpecial();
                        if (cs != null) {
                            res.add(cs);
                            buffer = -1;
                        } else {
                            buffer = 0;
                        }

                        lexemes.next();
                        break;
                    }

                    default: {
                        // Ordinary character: commit the pending one, keep
                        // this one pending.
                        if (buffer >= 0)
                            res.add(buffer);
                        buffer = lexemes.next();
                        break;
                    }
                    }

                    firstInClass = false;
                }
                if (notClosed) {
                    // Input ran out before the closing ']'.
                    throw new PatternSyntaxException(Messages
                            .getString("regex.0F"), //$NON-NLS-1$
                            pattern(), lexemes.getIndex() - 1);
                }
                // Commit the last pending character.
                if (buffer >= 0)
                    res.add(buffer);
                return res;
            }
1212:
1213:            private AbstractSet processCharSet(int ch) {
1214:                boolean isSupplCodePoint = Character
1215:                        .isSupplementaryCodePoint(ch);
1216:
1217:                if (hasFlag(Pattern.CASE_INSENSITIVE)) {
1218:
1219:                    if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
1220:                        return new CICharSet((char) ch);
1221:                    } else if (hasFlag(Pattern.UNICODE_CASE) && ch > 128) {
1222:                        if (isSupplCodePoint) {
1223:                            return new UCISupplCharSet(ch);
1224:                        } else if (Lexer.isLowSurrogate(ch)) {
1225:
1226:                            //we need no UCILowSurrogateCharSet
1227:                            return new LowSurrogateCharSet((char) ch);
1228:                        } else if (Lexer.isHighSurrogate(ch)) {
1229:
1230:                            //we need no UCIHighSurrogateCharSet
1231:                            return new HighSurrogateCharSet((char) ch);
1232:                        } else {
1233:                            return new UCICharSet((char) ch);
1234:                        }
1235:                    }
1236:                }
1237:
1238:                if (isSupplCodePoint) {
1239:                    return new SupplCharSet(ch);
1240:                } else if (Lexer.isLowSurrogate(ch)) {
1241:                    return new LowSurrogateCharSet((char) ch);
1242:                } else if (Lexer.isHighSurrogate(ch)) {
1243:                    return new HighSurrogateCharSet((char) ch);
1244:                } else {
1245:                    return new CharSet((char) ch);
1246:                }
1247:            }
1248:
1249:            private AbstractSet processRangeSet(AbstractCharClass charClass) {
1250:                if (charClass.hasLowHighSurrogates()) {
1251:                    AbstractCharClass surrogates = charClass.getSurrogates();
1252:                    LowHighSurrogateRangeSet lowHighSurrRangeSet = new LowHighSurrogateRangeSet(
1253:                            surrogates);
1254:
1255:                    if (charClass.mayContainSupplCodepoints()) {
1256:                        if (!charClass.hasUCI()) {
1257:                            return new CompositeRangeSet(new SupplRangeSet(
1258:                                    charClass.getWithoutSurrogates()),
1259:                                    lowHighSurrRangeSet);
1260:                        } else {
1261:                            return new CompositeRangeSet(new UCISupplRangeSet(
1262:                                    charClass.getWithoutSurrogates()),
1263:                                    lowHighSurrRangeSet);
1264:                        }
1265:                    }
1266:
1267:                    if (!charClass.hasUCI()) {
1268:                        return new CompositeRangeSet(new RangeSet(charClass
1269:                                .getWithoutSurrogates()), lowHighSurrRangeSet);
1270:                    } else {
1271:                        return new CompositeRangeSet(new UCIRangeSet(charClass
1272:                                .getWithoutSurrogates()), lowHighSurrRangeSet);
1273:                    }
1274:                }
1275:
1276:                if (charClass.mayContainSupplCodepoints()) {
1277:                    if (!charClass.hasUCI()) {
1278:                        return new SupplRangeSet(charClass);
1279:                    } else {
1280:                        return new UCISupplRangeSet(charClass);
1281:                    }
1282:                }
1283:
1284:                if (!charClass.hasUCI()) {
1285:                    return new RangeSet(charClass);
1286:                } else {
1287:                    return new UCIRangeSet(charClass);
1288:                }
1289:            }
1290:
1291:            /**
1292:             * @com.intel.drl.spec_ref
1293:             */
1294:            public static Pattern compile(String pattern) {
1295:                return compile(pattern, 0);
1296:            }
1297:
1298:            /*
1299:             * This method do traverses of
1300:             * automata to finish compilation.
1301:             */
1302:            private void finalizeCompile() {
1303:
1304:                /*
1305:                 * Processing second pass
1306:                 */
1307:                if (needsBackRefReplacement) { //|| needsReason1 || needsReason2) {
1308:                    start.processSecondPass();
1309:                }
1310:
1311:            }
1312:
1313:            /**
1314:             * @com.intel.drl.spec_ref
1315:             */
1316:            public static boolean matches(String regex, CharSequence input) {
1317:                return Pattern.compile(regex).matcher(input).matches();
1318:            }
1319:
1320:            public static String quote(String s) {
1321:                StringBuffer sb = new StringBuffer().append("\\Q"); //$NON-NLS-1$
1322:                int apos = 0;
1323:                int k;
1324:                while ((k = s.indexOf("\\E", apos)) >= 0) { //$NON-NLS-1$
1325:                    sb.append(s.substring(apos, k + 2)).append("\\\\E\\Q"); //$NON-NLS-1$
1326:                    apos = k + 2;
1327:                }
1328:
1329:                return sb.append(s.substring(apos)).append("\\E").toString(); //$NON-NLS-1$
1330:            }
1331:
1332:            /**
1333:             * return number of groups found at compile time
1334:             */
1335:            int groupCount() {
1336:                return globalGroupIndex;
1337:            }
1338:
1339:            int compCount() {
1340:                return this .compCount + 1;
1341:            }
1342:
1343:            int consCount() {
1344:                return this .consCount + 1;
1345:            }
1346:
1347:            /**
1348:             * Returns supplementary character. At this time only for ASCII chars.
1349:             */
1350:            static char getSupplement(char ch) {
1351:                char res = ch;
1352:                if (ch >= 'a' && ch <= 'z') {
1353:                    res -= 32;
1354:                } else if (ch >= 'A' && ch <= 'Z') {
1355:                    res += 32;
1356:                }
1357:
1358:                return res;
1359:            }
1360:
1361:            /**
1362:             * @return true if pattern has specified flag
1363:             */
1364:            private boolean hasFlag(int flag) {
1365:                return (flags & flag) == flag;
1366:            }
1367:
            /**
             * Dismiss public constructor.
             * 
             * Instances are obtained only through the static compile()
             * factory methods.
             */
            private Pattern() {
            }
1374:
            /**
             * Serialization support.
             *
             * After default deserialization restores the pattern string and
             * flags, this method resets the compiler's bookkeeping state and
             * recompiles the pattern via compileImpl — presumably because
             * the compiled automaton itself is not serialized (TODO confirm
             * which fields are transient; not visible in this chunk).
             */
            private void readObject(java.io.ObjectInputStream s)
                    throws java.io.IOException, ClassNotFoundException {
                s.defaultReadObject();
                // NOTE(review): mutates shared static state; deserializing
                // two Patterns concurrently could race on this counter —
                // verify AbstractSet.counter's intended concurrency contract.
                AbstractSet.counter = 1;
                // Reset group/compilation counters before recompiling;
                // presumably these match the pre-compilation initial values
                // used by the normal construction path — TODO confirm.
                groupIndex = -1;
                globalGroupIndex = -1;
                compCount = -1;
                consCount = -1;
                backRefs = new FSet[BACK_REF_NUMBER];

                // Rebuild the compiled form from the deserialized pattern
                // text and flags.
                compileImpl(pattern, flags);

            }
1391:        }
www.java2java.com | Contact Us
Copyright 2009–2012 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.