Source Code Cross Referenced for Lexer.java in » Apache-Harmony-Java-SE » java-package » java » util » regex » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Apache Harmony Java SE » java package » java.util.regex
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        /*
0002:         *  Licensed to the Apache Software Foundation (ASF) under one or more
0003:         *  contributor license agreements.  See the NOTICE file distributed with
0004:         *  this work for additional information regarding copyright ownership.
0005:         *  The ASF licenses this file to You under the Apache License, Version 2.0
0006:         *  (the "License"); you may not use this file except in compliance with
0007:         *  the License.  You may obtain a copy of the License at
0008:         *
0009:         *     http://www.apache.org/licenses/LICENSE-2.0
0010:         *
0011:         *  Unless required by applicable law or agreed to in writing, software
0012:         *  distributed under the License is distributed on an "AS IS" BASIS,
0013:         *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014:         *  See the License for the specific language governing permissions and
0015:         *  limitations under the License.
0016:         */
0017:
0018:        /**
0019:         * @author Nikolay A. Kuznetsov
0020:         * @version $Revision: 1.21.2.2 $
0021:         */package java.util.regex;
0022:
0023:        import java.util.MissingResourceException;
0024:
0025:        import org.apache.harmony.regex.internal.nls.Messages;
0026:
0027:        /**
0028:         * The purpose of this class is to break given pattern into RE tokens; 
0029:         * 
0030:         * @author Nikolay A. Kuznetsov
0031:         * @version $Revision: 1.21.2.2 $
0032:         */
0033:        class Lexer {
0034:
0035:            public static final int CHAR_DOLLAR = 0xe0000000 | '$';
0036:
0037:            public static final int CHAR_RIGHT_PARENTHESIS = 0xe0000000 | ')';
0038:
0039:            public static final int CHAR_LEFT_SQUARE_BRACKET = 0xe0000000 | '[';
0040:
0041:            public static final int CHAR_RIGHT_SQUARE_BRACKET = 0xe0000000 | ']';
0042:
0043:            public static final int CHAR_CARET = 0xe0000000 | '^';
0044:
0045:            public static final int CHAR_VERTICAL_BAR = 0xe0000000 | '|';
0046:
0047:            public static final int CHAR_AMPERSAND = 0xe0000000 | '&';
0048:
0049:            public static final int CHAR_HYPHEN = 0xe0000000 | '-';
0050:
0051:            public static final int CHAR_DOT = 0xe0000000 | '.';
0052:
0053:            public static final int QMOD_GREEDY = 0xe0000000;
0054:
0055:            public static final int QMOD_RELUCTANT = 0xc0000000;
0056:
0057:            public static final int QMOD_POSSESSIVE = 0x80000000;
0058:
0059:            public static final int QUANT_STAR = QMOD_GREEDY | '*';
0060:
0061:            public static final int QUANT_STAR_P = QMOD_POSSESSIVE | '*';
0062:
0063:            public static final int QUANT_STAR_R = QMOD_RELUCTANT | '*';
0064:
0065:            public static final int QUANT_PLUS = QMOD_GREEDY | '+';
0066:
0067:            public static final int QUANT_PLUS_P = QMOD_POSSESSIVE | '+';
0068:
0069:            public static final int QUANT_PLUS_R = QMOD_RELUCTANT | '+';
0070:
0071:            public static final int QUANT_ALT = QMOD_GREEDY | '?';
0072:
0073:            public static final int QUANT_ALT_P = QMOD_POSSESSIVE | '?';
0074:
0075:            public static final int QUANT_ALT_R = QMOD_RELUCTANT | '?';
0076:
0077:            public static final int QUANT_COMP = QMOD_GREEDY | '{';
0078:
0079:            public static final int QUANT_COMP_P = QMOD_POSSESSIVE | '{';
0080:
0081:            public static final int QUANT_COMP_R = QMOD_RELUCTANT | '{';
0082:
0083:            public static final int CHAR_LEFT_PARENTHESIS = 0x80000000 | '(';
0084:
0085:            public static final int CHAR_NONCAP_GROUP = 0xc0000000 | '(';
0086:
0087:            public static final int CHAR_POS_LOOKAHEAD = 0xe0000000 | '(';
0088:
0089:            public static final int CHAR_NEG_LOOKAHEAD = 0xf0000000 | '(';
0090:
0091:            public static final int CHAR_POS_LOOKBEHIND = 0xf8000000 | '(';
0092:
0093:            public static final int CHAR_NEG_LOOKBEHIND = 0xfc000000 | '(';
0094:
0095:            public static final int CHAR_ATOMIC_GROUP = 0xfe000000 | '(';
0096:
0097:            public static final int CHAR_FLAGS = 0xff000000 | '(';
0098:
0099:            public static final int CHAR_START_OF_INPUT = 0x80000000 | 'A';
0100:
0101:            public static final int CHAR_WORD_BOUND = 0x80000000 | 'b';
0102:
0103:            public static final int CHAR_NONWORD_BOUND = 0x80000000 | 'B';
0104:
0105:            public static final int CHAR_PREVIOUS_MATCH = 0x80000000 | 'G';
0106:
0107:            public static final int CHAR_END_OF_INPUT = 0x80000000 | 'z';
0108:
0109:            public static final int CHAR_END_OF_LINE = 0x80000000 | 'Z';
0110:
0111:            public static final int MODE_PATTERN = 1 << 0;
0112:
0113:            public static final int MODE_RANGE = 1 << 1;
0114:
0115:            public static final int MODE_ESCAPE = 1 << 2;
0116:
0117:            //maximum length of decomposition
0118:            static final int MAX_DECOMPOSITION_LENGTH = 4;
0119:
0120:            /*
0121:             * maximum length of Hangul decomposition
0122:             * note that MAX_HANGUL_DECOMPOSITION_LENGTH <= MAX_DECOMPOSITION_LENGTH
0123:             */
0124:            static final int MAX_HANGUL_DECOMPOSITION_LENGTH = 3;
0125:
0126:            /*
0127:             * Following constants are needed for Hangul canonical decomposition.
0128:             * Hangul decomposition algorithm and constants are taken according
0129:             * to description at http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf
0130:             * "3.12 Conjoining Jamo Behavior"
0131:             */
0132:            static final int SBase = 0xAC00;
0133:
0134:            static final int LBase = 0x1100;
0135:
0136:            static final int VBase = 0x1161;
0137:
0138:            static final int TBase = 0x11A7;
0139:
0140:            static final int SCount = 11172;
0141:
0142:            static final int LCount = 19;
0143:
0144:            static final int VCount = 21;
0145:
0146:            static final int TCount = 28;
0147:
0148:            static final int NCount = 588;
0149:
0150:            //table that contains canonical decomposition mappings
0151:            private static IntArrHash decompTable = null;
0152:
0153:            //table that contains canonical combining classes
0154:            private static IntHash canonClassesTable = null;
0155:
0156:            private static int canonClassesTableSize;
0157:
0158:            /*
0159:             * Table that contains information about Unicode codepoints with
0160:             * single codepoint decomposition
0161:             */
0162:            private static IntHash singleDecompTable = null;
0163:
0164:            private static int singleDecompTableSize;
0165:
0166:            private char[] pattern = null;
0167:
0168:            private int flags = 0;
0169:
0170:            private int mode = 1;
0171:
0172:            // when in literal mode, this field will save the previous one
0173:            private int saved_mode = 0;
0174:
0175:            // previous char read
0176:            private int lookBack;
0177:
0178:            //current character read
0179:            private int ch;
0180:
0181:            //next character
0182:            private int lookAhead;
0183:
0184:            //index of last char in pattern plus one
0185:            private int patternFullLength = 0;
0186:
0187:            // cur special token
0188:            private SpecialToken curST = null;
0189:
0190:            // next special token
0191:            private SpecialToken lookAheadST = null;
0192:
0193:            //  cur char being processed
0194:            private int index = 0;
0195:
0196:            //  previous non-whitespace character index;
0197:            private int prevNW = 0;
0198:
0199:            //  cur token start index
0200:            private int curToc = 0;
0201:
0202:            //  look ahead token index
0203:            private int lookAheadToc = 0;
0204:
0205:            //  original string representing pattern    
0206:            private String orig = null;
0207:
0208:            public Lexer(String pattern, int flags) {
0209:                orig = pattern;
0210:                if ((flags & Pattern.LITERAL) > 0) {
0211:                    pattern = Pattern.quote(pattern);
0212:                } else if ((flags & Pattern.CANON_EQ) > 0) {
0213:                    pattern = Lexer.normalize(pattern);
0214:                }
0215:
0216:                this .pattern = new char[pattern.length() + 2];
0217:                System.arraycopy(pattern.toCharArray(), 0, this .pattern, 0,
0218:                        pattern.length());
0219:                this .pattern[this .pattern.length - 1] = 0;
0220:                this .pattern[this .pattern.length - 2] = 0;
0221:                patternFullLength = this .pattern.length;
0222:                this .flags = flags;
0223:                // read first two tokens;
0224:                movePointer();
0225:                movePointer();
0226:
0227:            }
0228:
0229:            /**
0230:             * Returns current character w/o reading next one; if there are no more
0231:             * characters returns 0;
0232:             * 
0233:             * @return current character;
0234:             */
0235:            public int peek() {
0236:                return ch;
0237:            }
0238:
0239:            /**
0240:             * Set the Lexer to PATTERN or RANGE mode; Lexer interpret character two
0241:             * different ways in parser or range modes.
0242:             * 
0243:             * @param mode
0244:             *            Lexer.PATTERN or Lexer.RANGE
0245:             */
0246:            public void setMode(int mode) {
0247:                if (mode > 0 && mode < 3) {
0248:                    this .mode = mode;
0249:                }
0250:
0251:                if (mode == Lexer.MODE_PATTERN) {
0252:                    reread();
0253:                }
0254:            }
0255:
0256:            /**
0257:             * Restores flags for Lexer
0258:             * 
0259:             * @param flags
0260:             */
0261:            public void restoreFlags(int flags) {
0262:                this .flags = flags;
0263:                lookAhead = ch;
0264:                lookAheadST = curST;
0265:
0266:                //curToc is an index of closing bracket )
0267:                index = curToc + 1;
0268:                lookAheadToc = curToc;
0269:                movePointer();
0270:            }
0271:
0272:            public SpecialToken peekSpecial() {
0273:                return curST;
0274:            }
0275:
0276:            /**
0277:             * Returns true, if current token is special, i.e. quantifier, or other 
0278:             * compound token.
0279:             * 
0280:             * @return - true if current token is special, false otherwise.
0281:             */
0282:            public boolean isSpecial() {
0283:                return curST != null;
0284:            }
0285:
0286:            public boolean isQuantifier() {
0287:                return isSpecial()
0288:                        && curST.getType() == SpecialToken.TOK_QUANTIFIER;
0289:            }
0290:
0291:            public boolean isNextSpecial() {
0292:                return lookAheadST != null;
0293:            }
0294:
0295:            /**
0296:             * Returns current character and moves string index to the next one;
0297:             * 
0298:             */
0299:            public int next() {
0300:                movePointer();
0301:                return lookBack;
0302:            }
0303:
0304:            /**
0305:             * Returns current special token and moves string index to the next one;
0306:             */
0307:            public SpecialToken nextSpecial() {
0308:                SpecialToken res = curST;
0309:                movePointer();
0310:                return res;
0311:            }
0312:
0313:            /**
0314:             * Returns nest symbol read.
0315:             */
0316:            public int lookAhead() {
0317:                return lookAhead;
0318:            }
0319:
0320:            /**
0321:             * Returns previous character.
0322:             */
0323:            public int back() {
0324:                return lookBack;
0325:            }
0326:
0327:            /**
0328:             * Normalize given expression.
0329:             * 
0330:             * @param input - expression to normalize
0331:             * @return normalized expression.
0332:             */
0333:            static String normalize(String input) {
0334:                char[] inputChars = input.toCharArray();
0335:                int inputLength = inputChars.length;
0336:                int resCodePointsIndex = 0;
0337:                int inputCodePointsIndex = 0;
0338:                int decompHangulIndex = 0;
0339:
0340:                //codePoints of input
0341:                int[] inputCodePoints = new int[inputLength];
0342:
0343:                //result of canonical decomposition of input
0344:                int[] resCodePoints = new int[inputLength
0345:                        * MAX_DECOMPOSITION_LENGTH];
0346:
0347:                //current symbol's codepoint
0348:                int ch;
0349:
0350:                //current symbol's decomposition
0351:                int[] decomp;
0352:
0353:                //result of canonical and Hangul decomposition of input
0354:                int[] decompHangul;
0355:
0356:                //result of canonical decomposition of input in UTF-16 encoding
0357:                StringBuffer result = new StringBuffer();
0358:
0359:                decompTable = HashDecompositions.getHashDecompositions();
0360:                canonClassesTable = CanClasses.getHashCanClasses();
0361:                canonClassesTableSize = canonClassesTable.size;
0362:                singleDecompTable = SingleDecompositions
0363:                        .getHashSingleDecompositions();
0364:                singleDecompTableSize = singleDecompTable.size;
0365:
0366:                for (int i = 0; i < inputLength; i += Character.charCount(ch)) {
0367:                    ch = Character.codePointAt(inputChars, i);
0368:                    inputCodePoints[inputCodePointsIndex++] = ch;
0369:                }
0370:
0371:                /*
0372:                 * Canonical decomposition based on mappings in decompTable
0373:                 */
0374:                for (int i = 0; i < inputCodePointsIndex; i++) {
0375:                    ch = inputCodePoints[i];
0376:
0377:                    decomp = Lexer.getDecomposition(ch);
0378:                    if (decomp == null) {
0379:                        resCodePoints[resCodePointsIndex++] = ch;
0380:                    } else {
0381:                        int curSymbDecompLength = decomp.length;
0382:
0383:                        for (int j = 0; j < curSymbDecompLength; j++) {
0384:                            resCodePoints[resCodePointsIndex++] = decomp[j];
0385:                        }
0386:                    }
0387:                }
0388:
0389:                /*
0390:                 * Canonical ordering.
0391:                 * See http://www.unicode.org/reports/tr15/#Decomposition for
0392:                 * details
0393:                 */
0394:                resCodePoints = Lexer.getCanonicalOrder(resCodePoints,
0395:                        resCodePointsIndex);
0396:
0397:                /*
0398:                 * Decomposition for Hangul syllables.
0399:                 * See http://www.unicode.org/reports/tr15/#Hangul for
0400:                 * details
0401:                 */
0402:                decompHangul = new int[resCodePoints.length];
0403:
0404:                for (int i = 0; i < resCodePointsIndex; i++) {
0405:                    int curSymb = resCodePoints[i];
0406:
0407:                    decomp = getHangulDecomposition(curSymb);
0408:                    if (decomp == null) {
0409:                        decompHangul[decompHangulIndex++] = curSymb;
0410:                    } else {
0411:
0412:                        /*
0413:                         * Note that Hangul decompositions have length that is
0414:                         * equal 2 or 3.
0415:                         */
0416:                        decompHangul[decompHangulIndex++] = decomp[0];
0417:                        decompHangul[decompHangulIndex++] = decomp[1];
0418:                        if (decomp.length == 3) {
0419:                            decompHangul[decompHangulIndex++] = decomp[2];
0420:                        }
0421:                    }
0422:                }
0423:
0424:                /*
0425:                 * Translating into UTF-16 encoding
0426:                 */
0427:                for (int i = 0; i < decompHangulIndex; i++) {
0428:                    result.append(Character.toChars(decompHangul[i]));
0429:                }
0430:
0431:                return result.toString();
0432:            }
0433:
0434:            /**
0435:             * Rearrange codepoints according
0436:             * to canonical order.
0437:             * 
0438:             * @param inputInts - array that contains Unicode codepoints
0439:             * @param length - index of last Unicode codepoint plus 1
0440:             * 
0441:             * @return array that contains rearranged codepoints.
0442:             */
0443:            static int[] getCanonicalOrder(int[] inputInts, int length) {
0444:                int inputLength = (length < inputInts.length) ? length
0445:                        : inputInts.length;
0446:
0447:                /*
0448:                 * Simple bubble-sort algorithm.
0449:                 * Note that many codepoints have 0
0450:                 * canonical class, so this algorithm works
0451:                 * almost lineary in overwhelming majority
0452:                 * of cases. This is due to specific of Unicode
0453:                 * combining classes and codepoints.
0454:                 */
0455:                for (int i = 1; i < inputLength; i++) {
0456:                    int j = i - 1;
0457:                    int iCanonicalClass = getCanonicalClass(inputInts[i]);
0458:                    int ch;
0459:
0460:                    if (iCanonicalClass == 0) {
0461:                        continue;
0462:                    }
0463:
0464:                    while (j > -1) {
0465:                        if (getCanonicalClass(inputInts[j]) > iCanonicalClass) {
0466:                            j = j - 1;
0467:                        } else {
0468:                            break;
0469:                        }
0470:                    }
0471:
0472:                    ch = inputInts[i];
0473:                    for (int k = i; k > j + 1; k--) {
0474:                        inputInts[k] = inputInts[k - 1];
0475:                    }
0476:                    inputInts[j + 1] = ch;
0477:                }
0478:
0479:                return inputInts;
0480:            }
0481:
0482:            /**
0483:             * Reread current character, may be require if previous token changes mode
0484:             * to one with different character interpretation.
0485:             *
0486:             */
0487:            private void reread() {
0488:                lookAhead = ch;
0489:                lookAheadST = curST;
0490:                index = lookAheadToc;
0491:                lookAheadToc = curToc;
0492:                movePointer();
0493:            }
0494:
0495:            /**
0496:             * Moves pointer one position right; save current character to lookBack;
0497:             * lookAhead to current one and finally read one more to lookAhead;
0498:             */
0499:            private void movePointer() {
0500:                // swap pointers
0501:                lookBack = ch;
0502:                ch = lookAhead;
0503:                curST = lookAheadST;
0504:                curToc = lookAheadToc;
0505:                lookAheadToc = index;
0506:                boolean reread;
0507:                do {
0508:                    reread = false;
0509:                    // read next character analyze it and construct token:
0510:                    // //
0511:
0512:                    lookAhead = (index < pattern.length) ? nextCodePoint() : 0;
0513:                    lookAheadST = null;
0514:
0515:                    if (mode == Lexer.MODE_ESCAPE) {
0516:                        if (lookAhead == '\\') {
0517:
0518:                            //need not care about supplementary codepoints here 
0519:                            lookAhead = (index < pattern.length) ? pattern[nextIndex()]
0520:                                    : 0;
0521:
0522:                            switch (lookAhead) {
0523:                            case 'E': {
0524:                                mode = saved_mode;
0525:
0526:                                lookAhead = (index <= pattern.length - 2) ? nextCodePoint()
0527:                                        : 0;
0528:                                break;
0529:                            }
0530:
0531:                            default: {
0532:                                lookAhead = '\\';
0533:                                index = prevNW;
0534:                                return;
0535:                            }
0536:                            }
0537:                        } else {
0538:                            return;
0539:                        }
0540:                    }
0541:
0542:                    if (lookAhead == '\\') {
0543:
0544:                        lookAhead = (index < pattern.length - 2) ? nextCodePoint()
0545:                                : -1;
0546:                        switch (lookAhead) {
0547:                        case -1:
0548:                            throw new PatternSyntaxException(
0549:                                    Messages.getString("regex.10"), this .toString(), index); //$NON-NLS-1$
0550:                        case 'P':
0551:                        case 'p': {
0552:                            String cs = parseCharClassName();
0553:                            boolean negative = false;
0554:
0555:                            if (lookAhead == 'P')
0556:                                negative = true;
0557:                            ;
0558:                            try {
0559:                                lookAheadST = AbstractCharClass
0560:                                        .getPredefinedClass(cs, negative);
0561:                            } catch (MissingResourceException mre) {
0562:                                throw new PatternSyntaxException(Messages
0563:                                        .getString("regex.11" //$NON-NLS-1$
0564:                                                , cs), this .toString(), index);
0565:                            }
0566:                            lookAhead = 0;
0567:                            break;
0568:                        }
0569:
0570:                        case 'w':
0571:                        case 's':
0572:                        case 'd':
0573:                        case 'W':
0574:                        case 'S':
0575:                        case 'D': {
0576:                            lookAheadST = CharClass.getPredefinedClass(
0577:                                    new String(pattern, prevNW, 1), false);
0578:                            lookAhead = 0;
0579:                            break;
0580:                        }
0581:
0582:                        case 'Q': {
0583:                            saved_mode = mode;
0584:                            mode = Lexer.MODE_ESCAPE;
0585:                            reread = true;
0586:                            break;
0587:                        }
0588:
0589:                        case 't':
0590:                            lookAhead = '\t';
0591:                            break;
0592:                        case 'n':
0593:                            lookAhead = '\n';
0594:                            break;
0595:                        case 'r':
0596:                            lookAhead = '\r';
0597:                            break;
0598:                        case 'f':
0599:                            lookAhead = '\f';
0600:                            break;
0601:                        case 'a':
0602:                            lookAhead = '\u0007';
0603:                            break;
0604:                        case 'e':
0605:                            lookAhead = '\u001B';
0606:                            break;
0607:
0608:                        case '1':
0609:                        case '2':
0610:                        case '3':
0611:                        case '4':
0612:                        case '5':
0613:                        case '6':
0614:                        case '7':
0615:                        case '8':
0616:                        case '9': {
0617:                            if (mode == Lexer.MODE_PATTERN) {
0618:                                lookAhead = 0x80000000 | lookAhead;
0619:                            }
0620:                            break;
0621:                        }
0622:
0623:                        case '0':
0624:                            lookAhead = readOctals();
0625:                            break;
0626:                        case 'x':
0627:                            lookAhead = readHex("hexadecimal", 2); //$NON-NLS-1$
0628:                            break;
0629:                        case 'u':
0630:                            lookAhead = readHex("Unicode", 4); //$NON-NLS-1$
0631:                            break;
0632:
0633:                        case 'b':
0634:                            lookAhead = CHAR_WORD_BOUND;
0635:                            break;
0636:                        case 'B':
0637:                            lookAhead = CHAR_NONWORD_BOUND;
0638:                            break;
0639:                        case 'A':
0640:                            lookAhead = CHAR_START_OF_INPUT;
0641:                            break;
0642:                        case 'G':
0643:                            lookAhead = CHAR_PREVIOUS_MATCH;
0644:                            break;
0645:                        case 'Z':
0646:                            lookAhead = CHAR_END_OF_LINE;
0647:                            break;
0648:                        case 'z':
0649:                            lookAhead = CHAR_END_OF_INPUT;
0650:                            break;
0651:                        case 'c': {
0652:                            if (index < pattern.length - 2) {
0653:
0654:                                //need not care about supplementary codepoints here
0655:                                lookAhead = (pattern[nextIndex()] & 0x1f);
0656:                                break;
0657:                            } else {
0658:                                throw new PatternSyntaxException(Messages
0659:                                        .getString("regex.12") //$NON-NLS-1$
0660:                                        , this .toString(), index);
0661:                            }
0662:                        }
0663:                        case 'C':
0664:                        case 'E':
0665:                        case 'F':
0666:                        case 'H':
0667:                        case 'I':
0668:                        case 'J':
0669:                        case 'K':
0670:                        case 'L':
0671:                        case 'M':
0672:                        case 'N':
0673:                        case 'O':
0674:                        case 'R':
0675:                        case 'T':
0676:                        case 'U':
0677:                        case 'V':
0678:                        case 'X':
0679:                        case 'Y':
0680:                        case 'g':
0681:                        case 'h':
0682:                        case 'i':
0683:                        case 'j':
0684:                        case 'k':
0685:                        case 'l':
0686:                        case 'm':
0687:                        case 'o':
0688:                        case 'q':
0689:                        case 'y':
0690:                            throw new PatternSyntaxException(Messages
0691:                                    .getString("regex.13") //$NON-NLS-1$
0692:                                    , this .toString(), index);
0693:
0694:                        default:
0695:                            break;
0696:                        }
0697:                    } else if (mode == Lexer.MODE_PATTERN) {
0698:                        switch (lookAhead) {
0699:                        case '+':
0700:                        case '*':
0701:                        case '?': {
0702:                            char mod = (index < pattern.length) ? pattern[index]
0703:                                    : '*';
0704:                            switch (mod) {
0705:                            case '+': {
0706:                                lookAhead = lookAhead | Lexer.QMOD_POSSESSIVE;
0707:                                nextIndex();
0708:                                break;
0709:                            }
0710:                            case '?': {
0711:                                lookAhead = lookAhead | Lexer.QMOD_RELUCTANT;
0712:                                nextIndex();
0713:                                break;
0714:                            }
0715:                            default: {
0716:                                lookAhead = lookAhead | Lexer.QMOD_GREEDY;
0717:                                break;
0718:                            }
0719:                            }
0720:
0721:                            break;
0722:                        }
0723:
0724:                        case '{': {
0725:                            lookAheadST = processQuantifier(lookAhead);
0726:                            break;
0727:                        }
0728:
0729:                        case '$':
0730:                            lookAhead = CHAR_DOLLAR;
0731:                            break;
0732:                        case '(': {
0733:                            if (pattern[index] == '?') {
0734:                                nextIndex();
0735:                                char nonCap = pattern[index];
0736:                                boolean behind = false;
0737:                                do {
0738:                                    if (!behind) {
0739:                                        switch (nonCap) {
0740:                                        case '!':
0741:                                            lookAhead = CHAR_NEG_LOOKAHEAD;
0742:                                            nextIndex();
0743:                                            break;
0744:                                        case '=':
0745:                                            lookAhead = CHAR_POS_LOOKAHEAD;
0746:                                            nextIndex();
0747:                                            break;
0748:                                        case '>':
0749:                                            lookAhead = CHAR_ATOMIC_GROUP;
0750:                                            nextIndex();
0751:                                            break;
0752:                                        case '<': {
0753:                                            nextIndex();
0754:                                            nonCap = pattern[index];
0755:                                            behind = true;
0756:                                            break;
0757:                                        }
0758:                                        default: {
0759:                                            lookAhead = readFlags();
0760:
0761:                                            /*
0762:                                             * We return res = res | 1 << 8
0763:                                             * from readFlags() if we read
0764:                                             * (?idmsux-idmsux)
0765:                                             */
0766:                                            if (lookAhead >= 256) {
0767:
0768:                                                //Erase auxiliary bit
0769:                                                lookAhead = (lookAhead & 0xff);
0770:                                                flags = lookAhead;
0771:                                                lookAhead = lookAhead << 16;
0772:                                                lookAhead = CHAR_FLAGS
0773:                                                        | lookAhead;
0774:                                            } else {
0775:                                                flags = lookAhead;
0776:                                                lookAhead = lookAhead << 16;
0777:                                                lookAhead = CHAR_NONCAP_GROUP
0778:                                                        | lookAhead;
0779:                                            }
0780:                                            break;
0781:                                        }
0782:                                        }
0783:                                    } else {
0784:                                        behind = false;
0785:                                        switch (nonCap) {
0786:                                        case '!':
0787:                                            lookAhead = CHAR_NEG_LOOKBEHIND;
0788:                                            nextIndex();
0789:                                            break;
0790:                                        case '=':
0791:                                            lookAhead = CHAR_POS_LOOKBEHIND;
0792:                                            nextIndex();
0793:                                            break;
0794:                                        default:
0795:                                            throw new PatternSyntaxException(
0796:                                                    Messages
0797:                                                            .getString("regex.14") //$NON-NLS-1$
0798:                                                    , this .toString(), index);
0799:                                        }
0800:                                    }
0801:                                } while (behind);
0802:                            } else {
0803:                                lookAhead = CHAR_LEFT_PARENTHESIS;
0804:                            }
0805:                            break;
0806:                        }
0807:
0808:                        case ')':
0809:                            lookAhead = CHAR_RIGHT_PARENTHESIS;
0810:                            break;
0811:                        case '[': {
0812:                            lookAhead = CHAR_LEFT_SQUARE_BRACKET;
0813:                            setMode(Lexer.MODE_RANGE);
0814:                            break;
0815:                        }
0816:                        case ']': {
0817:                            if (mode == Lexer.MODE_RANGE) {
0818:                                lookAhead = CHAR_RIGHT_SQUARE_BRACKET;
0819:                            }
0820:                            break;
0821:                        }
0822:                        case '^':
0823:                            lookAhead = CHAR_CARET;
0824:                            break;
0825:                        case '|':
0826:                            lookAhead = CHAR_VERTICAL_BAR;
0827:                            break;
0828:                        case '.':
0829:                            lookAhead = CHAR_DOT;
0830:                            break;
0831:                        default:
0832:                            break;
0833:                        }
0834:                    } else if (mode == Lexer.MODE_RANGE) {
0835:                        switch (lookAhead) {
0836:                        case '[':
0837:                            lookAhead = CHAR_LEFT_SQUARE_BRACKET;
0838:                            break;
0839:                        case ']':
0840:                            lookAhead = CHAR_RIGHT_SQUARE_BRACKET;
0841:                            break;
0842:                        case '^':
0843:                            lookAhead = CHAR_CARET;
0844:                            break;
0845:                        case '&':
0846:                            lookAhead = CHAR_AMPERSAND;
0847:                            break;
0848:                        case '-':
0849:                            lookAhead = CHAR_HYPHEN;
0850:                            break;
0851:                        default:
0852:                            break;
0853:                        }
0854:                    }
0855:                } while (reread);
0856:            }
0857:
0858:            /**
0859:             * Parse character classes names and verifies correction of the syntax;
0860:             */
0861:            private String parseCharClassName() {
0862:                StringBuffer sb = new StringBuffer(10);
0863:                if (index < pattern.length - 2) {
0864:                    // one symbol family
0865:                    if (pattern[index] != '{') {
0866:                        return "Is" + new String(pattern, nextIndex(), 1); //$NON-NLS-1$
0867:                    }
0868:
0869:                    nextIndex();
0870:                    char ch = 0;
0871:                    while (index < pattern.length - 2
0872:                            && (ch = pattern[nextIndex()]) != '}') {
0873:                        sb.append((char) ch);
0874:                    }
0875:                    if (ch != '}')
0876:                        throw new PatternSyntaxException(Messages
0877:                                .getString("regex.15"), this  //$NON-NLS-1$
0878:                                .toString(), index);
0879:                }
0880:
0881:                if (sb.length() == 0)
0882:                    throw new PatternSyntaxException(Messages
0883:                            .getString("regex.16"), this .toString(), //$NON-NLS-1$
0884:                            index);
0885:
0886:                String res = sb.toString();
0887:                if (res.length() == 1)
0888:                    return "Is" + res; //$NON-NLS-1$
0889:                return (res.length() > 3 && (res.startsWith("Is") || res //$NON-NLS-1$
0890:                        .startsWith("In"))) ? res.substring(2) : res; //$NON-NLS-1$
0891:            }
0892:
0893:            /**
0894:             * Process given character in assumption that it's quantifier.
0895:             */
0896:            private Quantifier processQuantifier(int ch) {
0897:                StringBuffer sb = new StringBuffer(4);
0898:                int min = -1;
0899:                int max = Integer.MAX_VALUE;
0900:                while (index < pattern.length
0901:                        && (ch = pattern[nextIndex()]) != '}') {
0902:                    if (ch == ',' && min < 0) {
0903:                        try {
0904:                            min = Integer.parseInt(sb.toString(), 10);
0905:                            sb.delete(0, sb.length());
0906:                        } catch (NumberFormatException nfe) {
0907:                            throw new PatternSyntaxException(Messages
0908:                                    .getString("regex.17"), this  //$NON-NLS-1$
0909:                                    .toString(), index);
0910:                        }
0911:                    } else {
0912:                        sb.append((char) ch);
0913:                    }
0914:                }
0915:                if (ch != '}') {
0916:                    throw new PatternSyntaxException(Messages
0917:                            .getString("regex.17"), //$NON-NLS-1$
0918:                            this .toString(), index);
0919:                }
0920:                if (sb.length() > 0) {
0921:                    try {
0922:                        max = Integer.parseInt(sb.toString(), 10);
0923:                        if (min < 0)
0924:                            min = max;
0925:                    } catch (NumberFormatException nfe) {
0926:                        throw new PatternSyntaxException(Messages
0927:                                .getString("regex.17"), this  //$NON-NLS-1$
0928:                                .toString(), index);
0929:                    }
0930:                } else if (min < 0) {
0931:                    throw new PatternSyntaxException(Messages
0932:                            .getString("regex.17"), //$NON-NLS-1$
0933:                            this .toString(), index);
0934:                }
0935:                if ((min | max | max - min) < 0) {
0936:                    throw new PatternSyntaxException(Messages
0937:                            .getString("regex.17"), //$NON-NLS-1$
0938:                            this .toString(), index);
0939:                }
0940:
0941:                char mod = (index < pattern.length) ? pattern[index] : '*';
0942:
0943:                switch (mod) {
0944:                case '+':
0945:                    lookAhead = Lexer.QUANT_COMP_P;
0946:                    nextIndex();
0947:                    break;
0948:                case '?':
0949:                    lookAhead = Lexer.QUANT_COMP_R;
0950:                    nextIndex();
0951:                    break;
0952:                default:
0953:                    lookAhead = Lexer.QUANT_COMP;
0954:                    break;
0955:                }
0956:                return new Quantifier(min, max);
0957:            }
0958:
0959:            public String toString() {
0960:                return orig;
0961:            }
0962:
0963:            /**
0964:             * Checks if there are any characters in the pattern.
0965:             * 
0966:             * @return true if there are no more characters in the pattern.
0967:             */
0968:            public boolean isEmpty() {
0969:                return ch == 0 && lookAhead == 0 && index == patternFullLength
0970:                        && !isSpecial();
0971:            }
0972:
0973:            /**
0974:             * Returns true if current character is plain token.
0975:             */
0976:            public static boolean isLetter(int ch) {
0977:
0978:                //all supplementary codepoints have integer value that is >= 0;
0979:                return ch >= 0;
0980:            }
0981:
0982:            /**
0983:             * Return true if current character is letter, false otherwise; This is
0984:             * shortcut to static method isLetter to check the current character.
0985:             * 
0986:             * @return true if current character is letter, false otherwise
0987:             */
0988:            public boolean isLetter() {
0989:                return !isEmpty() && !isSpecial() && isLetter(ch);
0990:            }
0991:
0992:            /*
0993:             * Note that Character class methods
0994:             * isHighSurrogate(), isLowSurrogate()
0995:             * take char parameter while we need an int
0996:             * parameter without truncation to char value
0997:             */
0998:            public boolean isHighSurrogate() {
0999:                return (ch <= 0xDBFF) && (ch >= 0xD800);
1000:            }
1001:
1002:            public boolean isLowSurrogate() {
1003:                return (ch <= 0xDFFF) && (ch >= 0xDC00);
1004:            }
1005:
1006:            public static boolean isHighSurrogate(int ch) {
1007:                return (ch <= 0xDBFF) && (ch >= 0xD800);
1008:            }
1009:
1010:            public static boolean isLowSurrogate(int ch) {
1011:                return (ch <= 0xDFFF) && (ch >= 0xDC00);
1012:            }
1013:
1014:            /**
1015:             * Process hexadecimal integer. 
1016:             */
1017:            private int readHex(String radixName, int max) {
1018:                StringBuffer st = new StringBuffer(max);
1019:                int length = pattern.length - 2;
1020:                int i;
1021:                for (i = 0; i < max && index < length; i++) {
1022:                    st.append((char) pattern[nextIndex()]);
1023:                }
1024:                if (i == max) {
1025:                    try {
1026:                        return Integer.parseInt(st.toString(), 16);
1027:                    } catch (NumberFormatException nfe) {
1028:                    }
1029:                }
1030:
1031:                throw new PatternSyntaxException(Messages.getString(
1032:                        "regex.18", radixName) //$NON-NLS-1$
1033:                        , this .toString(), index);
1034:            }
1035:
1036:            /**
1037:             * Process octal integer.
1038:             */
1039:            private int readOctals() {
1040:                char ch;
1041:                int max = 3;
1042:                int i = 1;
1043:                int first;
1044:                int res;
1045:                int length = pattern.length - 2;
1046:
1047:                switch (first = Character.digit((ch = pattern[index]), 8)) {
1048:                case -1:
1049:                    throw new PatternSyntaxException(Messages
1050:                            .getString("regex.19") //$NON-NLS-1$
1051:                            , this .toString(), index);
1052:                default: {
1053:                    if (first > 3)
1054:                        max--;
1055:                    nextIndex();
1056:                    res = first;
1057:                }
1058:                }
1059:
1060:                while (i < max
1061:                        && index < length
1062:                        && (first = Character.digit((ch = pattern[index]), 8)) >= 0) {
1063:                    res = res * 8 + first;
1064:                    nextIndex();
1065:                    i++;
1066:                }
1067:
1068:                return res;
1069:            }
1070:
1071:            /**
1072:             * Process expression flags given with (?idmsux-idmsux)
1073:             */
1074:            private int readFlags() {
1075:                char ch;
1076:                boolean pos = true;
1077:                int res = flags;
1078:
1079:                while (index < pattern.length) {
1080:                    ch = pattern[index];
1081:                    switch (ch) {
1082:                    case '-':
1083:                        if (!pos) {
1084:                            throw new PatternSyntaxException(Messages
1085:                                    .getString("regex.1A") //$NON-NLS-1$
1086:                                    , this .toString(), index);
1087:                        }
1088:                        pos = false;
1089:                        break;
1090:
1091:                    case 'i':
1092:                        res = pos ? res | Pattern.CASE_INSENSITIVE
1093:                                : (res ^ Pattern.CASE_INSENSITIVE) & res;
1094:                        break;
1095:
1096:                    case 'd':
1097:                        res = pos ? res | Pattern.UNIX_LINES
1098:                                : (res ^ Pattern.UNIX_LINES) & res;
1099:                        break;
1100:
1101:                    case 'm':
1102:                        res = pos ? res | Pattern.MULTILINE
1103:                                : (res ^ Pattern.MULTILINE) & res;
1104:                        break;
1105:
1106:                    case 's':
1107:                        res = pos ? res | Pattern.DOTALL
1108:                                : (res ^ Pattern.DOTALL) & res;
1109:                        break;
1110:
1111:                    case 'u':
1112:                        res = pos ? res | Pattern.UNICODE_CASE
1113:                                : (res ^ Pattern.UNICODE_CASE) & res;
1114:                        break;
1115:
1116:                    case 'x':
1117:                        res = pos ? res | Pattern.COMMENTS
1118:                                : (res ^ Pattern.COMMENTS) & res;
1119:                        break;
1120:
1121:                    case ':':
1122:                        nextIndex();
1123:                        return res;
1124:
1125:                    case ')':
1126:                        nextIndex();
1127:                        return res | (1 << 8);
1128:
1129:                    default:
1130:                        // ignore invalid flags (HARMONY-2127)
1131:                    }
1132:                    nextIndex();
1133:                }
1134:                throw new PatternSyntaxException(
1135:                        Messages.getString("regex.1A"), //$NON-NLS-1$
1136:                        this .toString(), index);
1137:            }
1138:
1139:            /**
1140:             * Returns next character index to read and moves pointer to the next one.
1141:             * If comments flag is on this method will skip comments and whitespaces.
1142:             * 
1143:             * The following actions are equivalent if comments flag is off ch =
1144:             * pattern[index++] == ch = pattern[nextIndex]
1145:             * 
1146:             * @return next character index to read.
1147:             */
1148:            private int nextIndex() {
1149:                prevNW = index;
1150:                if ((flags & Pattern.COMMENTS) != 0) {
1151:                    skipComments();
1152:                } else {
1153:                    index++;
1154:                }
1155:                return prevNW;
1156:            }
1157:
1158:            /**
1159:             * Skips comments and whitespaces
1160:             */
1161:            private int skipComments() {
1162:                int length = pattern.length - 2;
1163:                index++;
1164:                do {
1165:                    while (index < length
1166:                            && Character.isWhitespace(pattern[index]))
1167:                        index++;
1168:                    if (index < length && pattern[index] == '#') {
1169:                        index++;
1170:                        while (index < length
1171:                                && !isLineSeparator(pattern[index]))
1172:                            index++;
1173:                    } else
1174:                        return index;
1175:                } while (true);
1176:            }
1177:
1178:            private boolean isLineSeparator(int ch) {
1179:                return (ch == '\n' || ch == '\r' || ch == '\u0085' || (ch | 1) == '\u2029');
1180:            }
1181:
1182:            /**
1183:             * Gets decomposition for given codepoint from
1184:             * decomposition mappings table.
1185:             * 
1186:             * @param ch - Unicode codepoint
1187:             * @return array of codepoints that is a canonical
1188:             *         decomposition of ch.
1189:             */
1190:            static int[] getDecomposition(int ch) {
1191:                return decompTable.get(ch);
1192:            }
1193:
1194:            /**
1195:             * Gets decomposition for given Hangul syllable. 
1196:             * This is an implementation of Hangul decomposition algorithm 
1197:             * according to http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf
1198:             * "3.12 Conjoining Jamo Behavior".
1199:             * 
1200:             * @param ch - given Hangul syllable
1201:             * @return canonical decomposition of ch.
1202:             */
1203:            static int[] getHangulDecomposition(int ch) {
1204:                int SIndex = ch - SBase;
1205:
1206:                if (SIndex < 0 || SIndex >= SCount) {
1207:                    return null;
1208:                } else {
1209:                    int L = LBase + SIndex / NCount;
1210:                    int V = VBase + (SIndex % NCount) / TCount;
1211:                    int T = SIndex % TCount;
1212:                    int decomp[];
1213:
1214:                    if (T == 0) {
1215:                        decomp = new int[] { L, V };
1216:                    } else {
1217:                        T = TBase + T;
1218:                        decomp = new int[] { L, V, T };
1219:                    }
1220:                    return decomp;
1221:                }
1222:            }
1223:
1224:            /**
1225:             * Gets canonical class for given codepoint from
1226:             * decomposition mappings table.
1227:             * 
1228:             * @param - ch Unicode codepoint
1229:             * @return canonical class for given Unicode codepoint
1230:             *         that is represented by ch.
1231:             */
1232:            static int getCanonicalClass(int ch) {
1233:                int canClass = canonClassesTable.get(ch);
1234:
1235:                return (canClass == canonClassesTableSize) ? 0 : canClass;
1236:            }
1237:
1238:            /**
1239:             * Tests if given codepoint is a canonical decomposition of another
1240:             * codepoint.
1241:             * 
1242:             * @param ch - codepoint to test
1243:             * @return true if ch is a decomposition.
1244:             */
1245:            static boolean hasSingleCodepointDecomposition(int ch) {
1246:                int hasSingleDecomp = singleDecompTable.get(ch);
1247:
1248:                /*
1249:                 * singleDecompTable doesn't contain ch 
1250:                 * == (hasSingleDecomp == singleDecompTableSize)
1251:                 */
1252:                return (hasSingleDecomp == singleDecompTableSize) ? false
1253:                        : true;
1254:            }
1255:
1256:            /**
1257:             * Tests if given codepoint has canonical decomposition
1258:             * and given codepoint's canonical class is not 0.
1259:             * 
1260:             * @param ch - codepoint to test
1261:             * @return true if canonical class is not 0 and ch has a decomposition.
1262:             */
1263:            static boolean hasDecompositionNonNullCanClass(int ch) {
1264:                return ch == 0x0340 | ch == 0x0341 | ch == 0x0343
1265:                        | ch == 0x0344;
1266:            }
1267:
1268:            private int nextCodePoint() {
1269:                char high = pattern[nextIndex()];
1270:
1271:                if (Character.isHighSurrogate(high)) {
1272:
1273:                    //low and high char may be delimited by spaces
1274:                    int lowExpectedIndex = prevNW + 1;
1275:
1276:                    if (lowExpectedIndex < pattern.length) {
1277:                        char low = pattern[lowExpectedIndex];
1278:                        if (Character.isLowSurrogate(low)) {
1279:                            nextIndex();
1280:                            return Character.toCodePoint(high, low);
1281:                        }
1282:                    }
1283:                }
1284:
1285:                return (int) high;
1286:            }
1287:
1288:            /**
1289:             * Tests Unicode codepoint if it is a boundary
1290:             * of decomposed Unicode codepoint. 
1291:             * 
1292:             * @param ch - Unicode codepoint to test
1293:             * @return true if given codepoint is a boundary.
1294:             */
1295:            static boolean isDecomposedCharBoundary(int ch) {
1296:                int canClass = canonClassesTable.get(ch);
1297:
1298:                //Lexer.getCanonicalClass(ch) == 0
1299:                boolean isBoundary = (canClass == canonClassesTableSize);
1300:
1301:                return isBoundary;
1302:            }
1303:
1304:            /**
1305:             * Returns the curr. character index.
1306:             */
1307:            public int getIndex() {
1308:                return curToc;
1309:            }
1310:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.