Source Code Cross Referenced for RE.java in » Web-Crawler » WebSPHINX » org » apache » regexp » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » WebSPHINX » org.apache.regexp
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        package org.apache.regexp;
0002:
0003:        /*
0004:         * ====================================================================
0005:         * 
0006:         * The Apache Software License, Version 1.1
0007:         *
0008:         * Copyright (c) 1999 The Apache Software Foundation.  All rights 
0009:         * reserved.
0010:         *
0011:         * Redistribution and use in source and binary forms, with or without
0012:         * modification, are permitted provided that the following conditions
0013:         * are met:
0014:         *
0015:         * 1. Redistributions of source code must retain the above copyright
0016:         *    notice, this list of conditions and the following disclaimer. 
0017:         *
0018:         * 2. Redistributions in binary form must reproduce the above copyright
0019:         *    notice, this list of conditions and the following disclaimer in
0020:         *    the documentation and/or other materials provided with the
0021:         *    distribution.
0022:         *
0023:         * 3. The end-user documentation included with the redistribution, if
0024:         *    any, must include the following acknowlegement:  
0025:         *       "This product includes software developed by the 
0026:         *        Apache Software Foundation (http://www.apache.org/)."
0027:         *    Alternately, this acknowlegement may appear in the software itself,
0028:         *    if and wherever such third-party acknowlegements normally appear.
0029:         *
0030:         * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
0031:         *    Foundation" must not be used to endorse or promote products derived
0032:         *    from this software without prior written permission. For written 
0033:         *    permission, please contact apache@apache.org.
0034:         *
0035:         * 5. Products derived from this software may not be called "Apache"
0036:         *    nor may "Apache" appear in their names without prior written
0037:         *    permission of the Apache Group.
0038:         *
0039:         * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0040:         * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0041:         * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0042:         * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0043:         * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0044:         * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0045:         * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0046:         * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0047:         * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0048:         * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0049:         * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0050:         * SUCH DAMAGE.
0051:         * ====================================================================
0052:         *
0053:         * This software consists of voluntary contributions made by many
0054:         * individuals on behalf of the Apache Software Foundation.  For more
0055:         * information on the Apache Software Foundation, please see
0056:         * <http://www.apache.org/>.
0057:         *
0058:         */
0059:
0060:        import java.util.Vector;
0061:
0062:        /**
0063:         * RE is an efficient, lightweight regular expression evaluator/matcher class.
0064:         * Regular expressions are pattern descriptions which enable sophisticated matching of
0065:         * strings.  In addition to being able to match a string against a pattern, you
0066:         * can also extract parts of the match.  This is especially useful in text parsing!
0067:         * Details on the syntax of regular expression patterns are given below.
0068:         *
0069:         * <p>
0070:         *
0071:         * To compile a regular expression (RE), you can simply construct an RE matcher
0072:         * object from the string specification of the pattern, like this:
0073:         *
0074:         * <pre>
0075:         *
0076:         *     RE r = new RE("a*b");
0077:         *
0078:         * </pre>
0079:         *
0080:         * <p>
0081:         *
0082:         * Once you have done this, you can call either of the RE.match methods to
0083:         * perform matching on a String.  For example:
0084:         *
0085:         * <pre>
0086:         *
0087:         *     boolean matched = r.match("aaaab");
0088:         *
0089:         * </pre>
0090:         *
0091:         * will cause the boolean matched to be set to true because the
0092:         * pattern "a*b" matches the string "aaaab".
0093:         *
0094:         * <p>
0095:         * If you were interested in the <i>number</i> of a's which matched the first
0096:         * part of our example expression, you could change the expression to
0097:         * "(a*)b".  Then when you compiled the expression and matched it against
0098:         * something like "xaaaab", you would get results like this:
0099:         *
0100:         * <pre>
0101:         *
0102:         *     RE r = new RE("(a*)b");                  // Compile expression
0103:         *     boolean matched = r.match("xaaaab");     // Match against "xaaaab"
0104:         *
0105:         * <br>
0106:         *
0107:         *     String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
0108:         *     String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
0109:         *
0110:         * <br>
0111:         *
0112:         *     int startWholeExpr = getParenStart(0);   // startWholeExpr will be index 1
0113:         *     int endWholeExpr = getParenEnd(0);       // endWholeExpr will be index 6
0114:         *     int lenWholeExpr = getParenLength(0);    // lenWholeExpr will be 5
0115:         *
0116:         * <br>
0117:         *
0118:         *     int startInside = getParenStart(1);      // startInside will be index 1
0119:         *     int endInside = getParenEnd(1);          // endInside will be index 5
0120:         *     int lenInside = getParenLength(1);       // lenInside will be 4
0121:         *
0122:         * </pre>
0123:         *
0124:         * You can also refer to the contents of a parenthesized expression within
0125:         * a regular expression itself.  This is called a 'backreference'.  The first
0126:         * backreference in a regular expression is denoted by \1, the second by \2
0127:         * and so on.  So the expression:
0128:         *
0129:         * <pre>
0130:         *
0131:         *     ([0-9]+)=\1
0132:         *
0133:         * </pre>
0134:         *
0135:         * will match any string of the form n=n (like 0=0 or 2=2).
0136:         *
0137:         * <p>
0138:         *
0139:         * The full regular expression syntax accepted by RE is described here:
0140:         *
0141:         * <pre>
0142:         *
0143:         * <br>
0144:         *
0145:         *  <b><font face=times roman>Characters</font></b>
0146:         *
0147:         * <br>
0148:         *
0149:         *    <i>unicodeChar</i>          Matches any identical unicode character
0150:         *    \                    Used to quote a meta-character (like '*')
0151:         *    \\                   Matches a single '\' character
0152:         *    \0nnn                Matches a given octal character
0153:         *    \xhh                 Matches a given 8-bit hexadecimal character
0154:         *    \\uhhhh               Matches a given 16-bit hexadecimal character
0155: *    \t                   Matches an ASCII tab character
0156: *    \n                   Matches an ASCII newline character
0157: *    \r                   Matches an ASCII return character
0158: *    \f                   Matches an ASCII form feed character
0159: *
0160: * <br>
0161: *
0162: *  <b><font face=times roman>Character Classes</font></b>
0163: *
0164: * <br>
0165: *
0166: *    [abc]                Simple character class
0167: *    [a-zA-Z]             Character class with ranges
0168: *    [^abc]               Negated character class
0169: *
0170: * <br>
0171: *
0172: *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
0173: *
0174: * <br>
0175: *
0176: *    [:alnum:]            Alphanumeric characters. 
0177: *    [:alpha:]            Alphabetic characters. 
0178: *    [:blank:]            Space and tab characters. 
0179: *    [:cntrl:]            Control characters. 
0180: *    [:digit:]            Numeric characters. 
0181: *    [:graph:]            Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.) 
0182: *    [:lower:]            Lower-case alphabetic characters. 
0183: *    [:print:]            Printable characters (characters that are not control characters.) 
0184: *    [:punct:]            Punctuation characters (characters that are not letter, digits, control characters, or space characters). 
0185: *    [:space:]            Space characters (such as space, tab, and formfeed, to name a few). 
0186: *    [:upper:]            Upper-case alphabetic characters. 
0187: *    [:xdigit:]           Characters that are hexadecimal digits.
0188: *         
0189: * <br>
0190: *
0191: *  <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
0192: *
0193: * <br>
0194: *
0195: *    [:javastart:]        Start of a Java identifier
0196: *    [:javapart:]         Part of a Java identifier
0197: *
0198: * <br>
0199: *         
0200: *  <b><font face=times roman>Predefined Classes</font></b>
0201: *
0202: * <br>
0203: *
0204: *    .                    Matches any character other than newline
0205: *    \w                   Matches a "word" character (alphanumeric plus "_")
0206: *    \W                   Matches a non-word character
0207: *    \s                   Matches a whitespace character
0208: *    \S                   Matches a non-whitespace character
0209: *    \d                   Matches a digit character
0210: *    \D                   Matches a non-digit character
0211: *
0212: * <br>
0213: *
0214: *  <b><font face=times roman>Boundary Matchers</font></b>
0215: *
0216: * <br>
0217: *
0218: *    ^                    Matches only at the beginning of a line
0219: *    $                    Matches only at the end of a line
0220: *    \b                   Matches only at a word boundary
0221: *    \B                   Matches only at a non-word boundary
0222: *
0223: * <br>
0224: *
0225: *  <b><font face=times roman>Greedy Closures</font></b>
0226: *
0227: * <br>
0228: *
0229: *    A*                   Matches A 0 or more times (greedy)
0230: *    A+                   Matches A 1 or more times (greedy)
0231: *    A?                   Matches A 1 or 0 times (greedy)
0232: *    A{n}                 Matches A exactly n times (greedy)
0233: *    A{n,}                Matches A at least n times (greedy)
0234: *    A{n,m}               Matches A at least n but not more than m times (greedy)
0235: *
0236: * <br>
0237: *
0238: *  <b><font face=times roman>Reluctant Closures</font></b>
0239: *
0240: * <br>
0241: *
0242: *    A*?                  Matches A 0 or more times (reluctant)
0243: *    A+?                  Matches A 1 or more times (reluctant)
0244: *    A??                  Matches A 0 or 1 times (reluctant)
0245: *
0246: * <br>
0247: *
0248: *  <b><font face=times roman>Logical Operators</font></b>
0249: *
0250: * <br>
0251: *
0252: *    AB                   Matches A followed by B
0253: *    A|B                  Matches either A or B
0254: *    (A)                  Used for subexpression grouping
0255: *
0256: * <br>
0257: *
0258: *  <b><font face=times roman>Backreferences</font></b>
0259: *
0260: * <br>
0261: *
0262: *    \1                   Backreference to 1st parenthesized subexpression
0263: *    \2                   Backreference to 2nd parenthesized subexpression
0264: *    \3                   Backreference to 3rd parenthesized subexpression
0265: *    \4                   Backreference to 4th parenthesized subexpression
0266: *    \5                   Backreference to 5th parenthesized subexpression
0267: *    \6                   Backreference to 6th parenthesized subexpression
0268: *    \7                   Backreference to 7th parenthesized subexpression
0269: *    \8                   Backreference to 8th parenthesized subexpression
0270: *    \9                   Backreference to 9th parenthesized subexpression
0271: *
0272: * <br>
0273: *
0274: * </pre>
0275: *
0276: * <p>
0277: *
0278: * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
0279: * match as many elements of the string as possible without causing the overall
0280: * match to fail.  If you want a closure to be reluctant (non-greedy), you can
0281: * simply follow it with a '?'.  A reluctant closure will match as few elements
0282: * of the string as possible when finding matches.  {m,n} closures don't currently
0283: * support reluctancy.
0284: *
0285: * <p>
0286: *
0287: * RE runs programs compiled by the RECompiler class.  But the RE matcher class
0288: * does not include the actual regular expression compiler for reasons of
0289: * efficiency.  In fact, if you want to pre-compile one or more regular expressions,
0290: * the 'recompile' class can be invoked from the command line to produce compiled
0291: * output like this:
0292: *
0293: * <pre>
0294: *
0295: *    // Pre-compiled regular expression "a*b"
0296: *    char[] re1Instructions =
0297: *    {
0298: *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
0299: *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
0300: *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
0301: *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
0302: *        0x0000,
0303: *    };
0304: *
0305: *    <br>
0306: *
0307: *    REProgram re1 = new REProgram(re1Instructions);
0308: *
0309: * </pre>
0310: *
0311: * You can then construct a regular expression matcher (RE) object from the pre-compiled
0312: * expression re1 and thus avoid the overhead of compiling the expression at runtime.
0313: * If you require more dynamic regular expressions, you can construct a single RECompiler
0314: * object and re-use it to compile each expression.  Similarly, you can change the
0315: * program run by a given matcher object at any time.  However, RE and RECompiler are
0316: * not threadsafe (for efficiency reasons, and because requiring thread safety in this
0317: * class is deemed to be a rare requirement), so you will need to construct a separate
0318: * compiler or matcher object for each thread (unless you do thread synchronization
0319: * yourself).
0320: *
0321: * </pre>
0322: * <br><p><br>
0323: *
0324: * <font color=red>
0325: * <i>ISSUES:</i>
0326: *
0327: * <ul>
0328: *  <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
0329: *  <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
0330: *  <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
0331: *  <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
0332: *  <li>Should there be different matching styles (simple, POSIX, Perl etc?)
0333: *  <li>Should RE support character iterators (for backwards RE matching!)?
0334: *  <li>Should RE support reluctant {m,n} closures (does anyone care)?
0335: *  <li>Not *all* possibilities are considered for greediness when backreferences
0336: *      are involved (as POSIX suggests should be the case).  The POSIX RE
0337: *      "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
0338: *      of acdacaa where \1 is "a".  This is not the case in this RE package,
0339: *      and actually Perl doesn't go to this extent either!  Until someone
0340: *      actually complains about this, I'm not sure it's worth "fixing".
0341: *      If it ever is fixed, test #137 in RETest.txt should be updated.
0342: * </ul>
0343: *
0344: * </font>
0345: *
0346: * @see recompile
0347: * @see RECompiler
0348: *
0349: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
0350: * @version $Id: RE.java,v 1.1.1.1 2002/01/31 03:14:36 rcm Exp $
0351: */
0352:        public class RE {
0353:            /**
0354:             * Specifies normal, case-sensitive matching behaviour.
0355:             */
0356:            public static final int MATCH_NORMAL = 0x0000;
0357:
0358:            /**
0359:             * Flag to indicate that matching should be case-independent (folded)
0360:             */
0361:            public static final int MATCH_CASEINDEPENDENT = 0x0001;
0362:
0363:            /**
0364:             * Newlines should match as BOL/EOL (^ and $)
0365:             */
0366:            public static final int MATCH_MULTILINE = 0x0002;
0367:
0368:            /**
0369:             * Consider all input a single body of text - newlines are matched by .
0370:             */
0371:            public static final int MATCH_SINGLELINE = 0x0004;
0372:
0373:            /************************************************
0374:             *                                              *
0375:             * The format of a node in a program is:        *
0376:             *                                              *
0377:             * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
0378:             *                                              *
0379:             * char OPCODE - instruction                    *
0380:             * char OPDATA - modifying data                 *
0381:             * char OPNEXT - next node (relative offset)    *
0382:             *                                              *
0383:             ************************************************/
0384:
0385:            //   Opcode              Char       Opdata/Operand  Meaning
0386:            //   ----------          ---------- --------------- --------------------------------------------------
0387:            static final char OP_END = 'E'; //                 end of program
0388:            static final char OP_BOL = '^'; //                 match only if at beginning of line
0389:            static final char OP_EOL = '$'; //                 match only if at end of line
0390:            static final char OP_ANY = '.'; //                 match any single character except newline
0391:            static final char OP_ANYOF = '['; // count/ranges    match any char in the list of ranges
0392:            static final char OP_BRANCH = '|'; // node            match this alternative or the next one
0393:            static final char OP_ATOM = 'A'; // length/string   length of string followed by string itself
0394:            static final char OP_STAR = '*'; // node            kleene closure
0395:            static final char OP_PLUS = '+'; // node            positive closure
0396:            static final char OP_MAYBE = '?'; // node            optional closure
0397:            static final char OP_ESCAPE = '\\'; // escape          special escape code char class (escape is E_* code)
0398:            static final char OP_OPEN = '('; // number          nth opening paren
0399:            static final char OP_CLOSE = ')'; // number          nth closing paren
0400:            static final char OP_BACKREF = '#'; // number          reference nth already matched parenthesized string
0401:            static final char OP_GOTO = 'G'; //                 nothing but a (back-)pointer
0402:            static final char OP_NOTHING = 'N'; //                 match null string such as in '(a|)'
0403:            static final char OP_RELUCTANTSTAR = '8'; // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
0404:            static final char OP_RELUCTANTPLUS = '='; // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
0405:            static final char OP_RELUCTANTMAYBE = '/'; // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
0406:            static final char OP_POSIXCLASS = 'P'; // classid         one of the posix character classes
0407:
0408:            // Escape codes
0409:            static final char E_ALNUM = 'w'; // Alphanumeric
0410:            static final char E_NALNUM = 'W'; // Non-alphanumeric
0411:            static final char E_BOUND = 'b'; // Word boundary
0412:            static final char E_NBOUND = 'B'; // Non-word boundary
0413:            static final char E_SPACE = 's'; // Whitespace
0414:            static final char E_NSPACE = 'S'; // Non-whitespace
0415:            static final char E_DIGIT = 'd'; // Digit
0416:            static final char E_NDIGIT = 'D'; // Non-digit
0417:
0418:            // Posix character classes
0419:            static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
0420:            static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics 
0421:            static final char POSIX_CLASS_BLANK = 'b'; // Blanks
0422:            static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
0423:            static final char POSIX_CLASS_DIGIT = 'd'; // Digits
0424:            static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
0425:            static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
0426:            static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
0427:            static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
0428:            static final char POSIX_CLASS_SPACE = 's'; // Spaces
0429:            static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
0430:            static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
0431:            static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
0432:            static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
0433:
0434:            // Limits
0435:            static final int maxNode = 65536; // Maximum number of nodes in a program
0436:            static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
0437:
0438:            // Node layout constants
0439:            static final int offsetOpcode = 0; // Opcode offset (first character)
0440:            static final int offsetOpdata = 1; // Opdata offset (second char)
0441:            static final int offsetNext = 2; // Next index offset (third char)
0442:            static final int nodeSize = 3; // Node size (in chars)
0443:
0444:            /** Line Separator */
0445:            static final String NEWLINE = System.getProperty("line.separator");
0446:
0447:            // State of current program
0448:            REProgram program; // Compiled regular expression 'program'
0449:            CharacterIterator search; // The string being matched against
0450:            int idx; // Current index in string being searched
0451:            int matchFlags; // Match behaviour flags
0452:
0453:            // Parenthesized subexpressions
0454:            int parenCount; // Number of subexpressions matched (num open parens + 1)
0455:            int start0; // Cache of start[0]
0456:            int end0; // Cache of start[0]
0457:            int start1; // Cache of start[1]
0458:            int end1; // Cache of start[1]
0459:            int start2; // Cache of start[2]
0460:            int end2; // Cache of start[2]
0461:            int[] startn; // Lazy-alloced array of sub-expression starts
0462:            int[] endn; // Lazy-alloced array of sub-expression ends
0463:
0464:            // Backreferences
0465:            int[] startBackref; // Lazy-alloced array of backref starts
0466:            int[] endBackref; // Lazy-alloced array of backref ends
0467:
0468:            /**
0469:             * Constructs a regular expression matcher from a String by compiling it
0470:             * using a new instance of RECompiler.  If you will be compiling many
0471:             * expressions, you may prefer to use a single RECompiler object instead.
0472:             * @param pattern The regular expression pattern to compile.
0473:             * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0474:             * @see RECompiler
0475:             * @see recompile
0476:             */
0477:            public RE(String pattern) throws RESyntaxException {
0478:                this (pattern, MATCH_NORMAL);
0479:            }
0480:
0481:            /**
0482:             * Constructs a regular expression matcher from a String by compiling it
0483:             * using a new instance of RECompiler.  If you will be compiling many
0484:             * expressions, you may prefer to use a single RECompiler object instead.
0485:             * @param pattern The regular expression pattern to compile.
0486:             * @param matchFlags The matching style
0487:             * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0488:             * @see RECompiler
0489:             * @see recompile
0490:             */
0491:            public RE(String pattern, int matchFlags) throws RESyntaxException {
0492:                this (new RECompiler().compile(pattern));
0493:                setMatchFlags(matchFlags);
0494:            }
0495:
0496:            /**
0497:             * Construct a matcher for a pre-compiled regular expression from program
0498:             * (bytecode) data.  Permits special flags to be passed in to modify matching
0499:             * behaviour.
0500:             * @param program Compiled regular expression program (see RECompiler and/or recompile)
0501:             * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
0502:             *
0503:             * <pre>
0504:             *
0505:             *   MATCH_NORMAL              // Normal (case-sensitive) matching
0506:             *   MATCH_CASEINDEPENDENT     // Case folded comparisons
0507:             *   MATCH_MULTILINE           // Newline matches as BOL/EOL
0508:             *
0509:             * </pre>
0510:             *
0511:             * @see RECompiler
0512:             * @see REProgram
0513:             * @see recompile
0514:             */
0515:            public RE(REProgram program, int matchFlags) {
0516:                setProgram(program);
0517:                setMatchFlags(matchFlags);
0518:            }
0519:
0520:            /**
0521:             * Construct a matcher for a pre-compiled regular expression from program
0522:             * (bytecode) data.
0523:             * @param program Compiled regular expression program
0524:             * @see RECompiler
0525:             * @see recompile
0526:             */
0527:            public RE(REProgram program) {
0528:                this (program, MATCH_NORMAL);
0529:            }
0530:
0531:            /**
0532:             * Constructs a regular expression matcher with no initial program.
0533:             * This is likely to be an uncommon practice, but is still supported.
0534:             */
0535:            public RE() {
0536:                this ((REProgram) null, MATCH_NORMAL);
0537:            }
0538:
0539:            /**
0540:             * Converts a 'simplified' regular expression to a full regular expression
0541:             * @param pattern The pattern to convert
0542:             * @return The full regular expression
0543:             */
0544:            public static String simplePatternToFullRegularExpression(
0545:                    String pattern) {
0546:                StringBuffer buf = new StringBuffer();
0547:                for (int i = 0; i < pattern.length(); i++) {
0548:                    char c = pattern.charAt(i);
0549:                    switch (c) {
0550:                    case '*':
0551:                        buf.append(".*");
0552:                        break;
0553:
0554:                    case '.':
0555:                    case '[':
0556:                    case ']':
0557:                    case '\\':
0558:                    case '+':
0559:                    case '?':
0560:                    case '{':
0561:                    case '}':
0562:                    case '$':
0563:                    case '^':
0564:                    case '|':
0565:                    case '(':
0566:                    case ')':
0567:                        buf.append('\\');
0568:                    default:
0569:                        buf.append(c);
0570:                        break;
0571:                    }
0572:                }
0573:                return buf.toString();
0574:            }
0575:
0576:            /**
0577:             * Sets match behaviour flags which alter the way RE does matching.
0578:             * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
0579:             *
0580:             * <pre>
0581:             *
0582:             *   MATCH_NORMAL              // Normal (case-sensitive) matching
0583:             *   MATCH_CASEINDEPENDENT     // Case folded comparisons
0584:             *   MATCH_MULTILINE           // Newline matches as BOL/EOL
0585:             *
0586:             * </pre>
0587:             *
0588:             */
0589:            public void setMatchFlags(int matchFlags) {
0590:                this .matchFlags = matchFlags;
0591:            }
0592:
0593:            /**
0594:             * Returns the current match behaviour flags.
0595:             * @return Current match behaviour flags (RE.MATCH_*).
0596:             *
0597:             * <pre>
0598:             *
0599:             *   MATCH_NORMAL              // Normal (case-sensitive) matching
0600:             *   MATCH_CASEINDEPENDENT     // Case folded comparisons
0601:             *   MATCH_MULTILINE           // Newline matches as BOL/EOL
0602:             *
0603:             * </pre>
0604:             *
0605:             * @see #setMatchFlags
0606:             *
0607:             */
0608:            public int getMatchFlags() {
0609:                return matchFlags;
0610:            }
0611:
0612:            /**
0613:             * Sets the current regular expression program used by this matcher object.
0614:             * @param program Regular expression program compiled by RECompiler.
0615:             * @see RECompiler
0616:             * @see REProgram
0617:             * @see recompile
0618:             */
0619:            public void setProgram(REProgram program) {
0620:                this .program = program;
0621:            }
0622:
0623:            /**
0624:             * Returns the current regular expression program in use by this matcher object.
0625:             * @return Regular expression program
0626:             * @see #setProgram
0627:             */
0628:            public REProgram getProgram() {
0629:                return program;
0630:            }
0631:
0632:            /**
0633:             * Returns the number of parenthesized subexpressions available after a successful match.
0634:             * @return Number of available parenthesized subexpressions
0635:             */
0636:            public int getParenCount() {
0637:                return parenCount;
0638:            }
0639:
0640:            /**
0641:             * Gets the contents of a parenthesized subexpression after a successful match.
0642:             * @param which Nesting level of subexpression
0643:             * @return String
0644:             */
0645:            public String getParen(int which) {
0646:                int start;
0647:                if (which < parenCount && (start = getParenStart(which)) >= 0) {
0648:                    return search.substring(start, getParenEnd(which));
0649:                }
0650:                return null;
0651:            }
0652:
0653:            /**
0654:             * Returns the start index of a given paren level.
0655:             * @param which Nesting level of subexpression
0656:             * @return String index 
0657:             */
0658:            public final int getParenStart(int which) {
0659:                if (which < parenCount) {
0660:                    switch (which) {
0661:                    case 0:
0662:                        return start0;
0663:
0664:                    case 1:
0665:                        return start1;
0666:
0667:                    case 2:
0668:                        return start2;
0669:
0670:                    default:
0671:                        if (startn == null) {
0672:                            allocParens();
0673:                        }
0674:                        return startn[which];
0675:                    }
0676:                }
0677:                return -1;
0678:            }
0679:
0680:            /**
0681:             * Returns the end index of a given paren level.
0682:             * @param which Nesting level of subexpression
0683:             * @return String index 
0684:             */
0685:            public final int getParenEnd(int which) {
0686:                if (which < parenCount) {
0687:                    switch (which) {
0688:                    case 0:
0689:                        return end0;
0690:
0691:                    case 1:
0692:                        return end1;
0693:
0694:                    case 2:
0695:                        return end2;
0696:
0697:                    default:
0698:                        if (endn == null) {
0699:                            allocParens();
0700:                        }
0701:                        return endn[which];
0702:                    }
0703:                }
0704:                return -1;
0705:            }
0706:
0707:            /**
0708:             * Returns the length of a given paren level.
0709:             * @param which Nesting level of subexpression
0710:             * @return Number of characters in the parenthesized subexpression
0711:             */
0712:            public final int getParenLength(int which) {
0713:                if (which < parenCount) {
0714:                    return getParenEnd(which) - getParenStart(which);
0715:                }
0716:                return -1;
0717:            }
0718:
0719:            /**
0720:             * Sets the start of a paren level
0721:             * @param which Which paren level
0722:             * @param i Index in input array
0723:             */
0724:            protected final void setParenStart(int which, int i) {
0725:                if (which < parenCount) {
0726:                    switch (which) {
0727:                    case 0:
0728:                        start0 = i;
0729:                        break;
0730:
0731:                    case 1:
0732:                        start1 = i;
0733:                        break;
0734:
0735:                    case 2:
0736:                        start2 = i;
0737:                        break;
0738:
0739:                    default:
0740:                        if (startn == null) {
0741:                            allocParens();
0742:                        }
0743:                        startn[which] = i;
0744:                        break;
0745:                    }
0746:                }
0747:            }
0748:
0749:            /**
0750:             * Sets the end of a paren level
0751:             * @param which Which paren level
0752:             * @param i Index in input array
0753:             */
0754:            protected final void setParenEnd(int which, int i) {
0755:                if (which < parenCount) {
0756:                    switch (which) {
0757:                    case 0:
0758:                        end0 = i;
0759:                        break;
0760:
0761:                    case 1:
0762:                        end1 = i;
0763:                        break;
0764:
0765:                    case 2:
0766:                        end2 = i;
0767:                        break;
0768:
0769:                    default:
0770:                        if (endn == null) {
0771:                            allocParens();
0772:                        }
0773:                        endn[which] = i;
0774:                        break;
0775:                    }
0776:                }
0777:            }
0778:
0779:            /**
0780:             * Throws an Error representing an internal error condition probably resulting
0781:             * from a bug in the regular expression compiler (or possibly data corruption).
0782:             * In practice, this should be very rare.
0783:             * @param s Error description
0784:             */
0785:            protected void internalError(String s) throws Error {
0786:                throw new Error("RE internal error: " + s);
0787:            }
0788:
0789:            /**
0790:             * Performs lazy allocation of subexpression arrays
0791:             */
0792:            private final void allocParens() {
0793:                // Allocate arrays for subexpressions
0794:                startn = new int[maxParen];
0795:                endn = new int[maxParen];
0796:
0797:                // Set sub-expression pointers to invalid values
0798:                for (int i = 0; i < maxParen; i++) {
0799:                    startn[i] = -1;
0800:                    endn[i] = -1;
0801:                }
0802:            }
0803:
0804:            /**
0805:             * Try to match a string against a subset of nodes in the program
0806:             * @param firstNode Node to start at in program
0807:             * @param lastNode Last valid node (used for matching a subexpression without
0808:             * matching the rest of the program as well).
0809:             * @param idxStart Starting position in character array
0810:             * @return Final input array index if match succeeded.  -1 if not.
0811:             */
0812:            protected int matchNodes(int firstNode, int lastNode, int idxStart) {
0813:                // Our current place in the string
0814:                int idx = idxStart;
0815:
0816:                // Loop while node is valid
0817:                int next, opcode, opdata;
0818:                int idxNew;
0819:                char[] instruction = program.instruction;
0820:                for (int node = firstNode; node < lastNode;) {
0821:                    opcode = instruction[node + offsetOpcode];
0822:                    next = node + (short) instruction[node + offsetNext];
0823:                    opdata = instruction[node + offsetOpdata];
0824:
0825:                    switch (opcode) {
0826:                    case OP_RELUCTANTMAYBE: {
0827:                        int once = 0;
0828:                        do {
0829:                            // Try to match the rest without using the reluctant subexpr
0830:                            if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0831:                                return idxNew;
0832:                            }
0833:                        } while ((once++ == 0)
0834:                                && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
0835:                        return -1;
0836:                    }
0837:
0838:                    case OP_RELUCTANTPLUS:
0839:                        while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) {
0840:                            // Try to match the rest without using the reluctant subexpr
0841:                            if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0842:                                return idxNew;
0843:                            }
0844:                        }
0845:                        return -1;
0846:
0847:                    case OP_RELUCTANTSTAR:
0848:                        do {
0849:                            // Try to match the rest without using the reluctant subexpr
0850:                            if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0851:                                return idxNew;
0852:                            }
0853:                        } while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
0854:                        return -1;
0855:
0856:                    case OP_OPEN:
0857:
0858:                        // Match subexpression
0859:                        if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
0860:                            startBackref[opdata] = idx;
0861:                        }
0862:                        if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0863:                            // Increase valid paren count
0864:                            if ((opdata + 1) > parenCount) {
0865:                                parenCount = opdata + 1;
0866:                            }
0867:
0868:                            // Don't set paren if already set later on
0869:                            if (getParenStart(opdata) == -1) {
0870:                                setParenStart(opdata, idx);
0871:                            }
0872:                        }
0873:                        return idxNew;
0874:
0875:                    case OP_CLOSE:
0876:
0877:                        // Done matching subexpression
0878:                        if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
0879:                            endBackref[opdata] = idx;
0880:                        }
0881:                        if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0882:                            // Increase valid paren count
0883:                            if ((opdata + 1) > parenCount) {
0884:                                parenCount = opdata + 1;
0885:                            }
0886:
0887:                            // Don't set paren if already set later on
0888:                            if (getParenEnd(opdata) == -1) {
0889:                                setParenEnd(opdata, idx);
0890:                            }
0891:                        }
0892:                        return idxNew;
0893:
0894:                    case OP_BACKREF: {
0895:                        // Get the start and end of the backref
0896:                        int s = startBackref[opdata];
0897:                        int e = endBackref[opdata];
0898:
0899:                        // We don't know the backref yet
0900:                        if (s == -1 || e == -1) {
0901:                            return -1;
0902:                        }
0903:
0904:                        // The backref is empty size
0905:                        if (s == e) {
0906:                            break;
0907:                        }
0908:
0909:                        // Get the length of the backref
0910:                        int l = e - s;
0911:
0912:                        // If there's not enough input left, give up.
0913:                        if (search.isEnd(idx + l - 1)) {
0914:                            return -1;
0915:                        }
0916:
0917:                        // Case fold the backref?
0918:                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
0919:                            // Compare backref to input, case-folding as we go
0920:                            for (int i = 0; i < l; i++) {
0921:                                if (Character.toLowerCase(search.charAt(idx++)) != Character
0922:                                        .toLowerCase(search.charAt(s + i))) {
0923:                                    return -1;
0924:                                }
0925:                            }
0926:                        } else {
0927:                            // Compare backref to input
0928:                            for (int i = 0; i < l; i++) {
0929:                                if (search.charAt(idx++) != search
0930:                                        .charAt(s + i)) {
0931:                                    return -1;
0932:                                }
0933:                            }
0934:                        }
0935:                    }
0936:                        break;
0937:
0938:                    case OP_BOL:
0939:
0940:                        // Fail if we're not at the start of the string
0941:                        if (idx != 0) {
0942:                            // If we're multiline matching, we could still be at the start of a line
0943:                            if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
0944:                                // If not at start of line, give up
0945:                                if (idx <= 0 || !isNewline(idx - 1)) {
0946:                                    return -1;
0947:                                } else {
0948:                                    break;
0949:                                }
0950:                            }
0951:                            return -1;
0952:                        }
0953:                        break;
0954:
0955:                    case OP_EOL:
0956:
0957:                        // If we're not at the end of string
0958:                        if (!search.isEnd(0) && !search.isEnd(idx)) {
0959:                            // If we're multi-line matching
0960:                            if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
0961:                                // Give up if we're not at the end of a line
0962:                                if (!isNewline(idx)) {
0963:                                    return -1;
0964:                                } else {
0965:                                    break;
0966:                                }
0967:                            }
0968:                            return -1;
0969:                        }
0970:                        break;
0971:
0972:                    case OP_ESCAPE:
0973:
0974:                        // Which escape?
0975:                        switch (opdata) {
0976:                        // Word boundary match
0977:                        case E_NBOUND:
0978:                        case E_BOUND: {
0979:                            char cLast = ((idx == getParenStart(0)) ? '\n'
0980:                                    : search.charAt(idx - 1));
0981:                            char cNext = ((search.isEnd(idx)) ? '\n' : search
0982:                                    .charAt(idx));
0983:                            if ((Character.isLetterOrDigit(cLast) == Character
0984:                                    .isLetterOrDigit(cNext)) == (opdata == E_BOUND)) {
0985:                                return -1;
0986:                            }
0987:                        }
0988:                            break;
0989:
0990:                        // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
0991:                        case E_ALNUM:
0992:                        case E_NALNUM:
0993:                        case E_DIGIT:
0994:                        case E_NDIGIT:
0995:                        case E_SPACE:
0996:                        case E_NSPACE:
0997:
0998:                            // Give up if out of input
0999:                            if (search.isEnd(idx)) {
1000:                                return -1;
1001:                            }
1002:
1003:                            // Switch on escape
1004:                            switch (opdata) {
1005:                            case E_ALNUM:
1006:                            case E_NALNUM:
1007:                                if (!(Character.isLetterOrDigit(search
1008:                                        .charAt(idx)) == (opdata == E_ALNUM))) {
1009:                                    return -1;
1010:                                }
1011:                                break;
1012:
1013:                            case E_DIGIT:
1014:                            case E_NDIGIT:
1015:                                if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT))) {
1016:                                    return -1;
1017:                                }
1018:                                break;
1019:
1020:                            case E_SPACE:
1021:                            case E_NSPACE:
1022:                                if (!(Character
1023:                                        .isWhitespace(search.charAt(idx)) == (opdata == E_SPACE))) {
1024:                                    return -1;
1025:                                }
1026:                                break;
1027:                            }
1028:                            idx++;
1029:                            break;
1030:
1031:                        default:
1032:                            internalError("Unrecognized escape '" + opdata
1033:                                    + "'");
1034:                        }
1035:                        break;
1036:
1037:                    case OP_ANY:
1038:
1039:                        if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1040:                            // Match anything
1041:                            if (search.isEnd(idx)) {
1042:                                return -1;
1043:                            }
1044:                            idx++;
1045:                            break;
1046:                        } else {
1047:                            // Match anything but a newline
1048:                            if (search.isEnd(idx)
1049:                                    || search.charAt(idx++) == '\n') {
1050:                                return -1;
1051:                            }
1052:                            break;
1053:                        }
1054:
1055:                    case OP_ATOM: {
1056:                        // Match an atom value
1057:                        if (search.isEnd(idx)) {
1058:                            return -1;
1059:                        }
1060:
1061:                        // Get length of atom and starting index
1062:                        int lenAtom = opdata;
1063:                        int startAtom = node + nodeSize;
1064:
1065:                        // Give up if not enough input remains to have a match
1066:                        if (search.isEnd(lenAtom + idx - 1)) {
1067:                            return -1;
1068:                        }
1069:
1070:                        // Match atom differently depending on casefolding flag
1071:                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
1072:                            for (int i = 0; i < lenAtom; i++) {
1073:                                if (Character.toLowerCase(search.charAt(idx++)) != Character
1074:                                        .toLowerCase(instruction[startAtom + i])) {
1075:                                    return -1;
1076:                                }
1077:                            }
1078:                        } else {
1079:                            for (int i = 0; i < lenAtom; i++) {
1080:                                if (search.charAt(idx++) != instruction[startAtom
1081:                                        + i]) {
1082:                                    return -1;
1083:                                }
1084:                            }
1085:                        }
1086:                    }
1087:                        break;
1088:
1089:                    case OP_POSIXCLASS: {
1090:                        // Out of input?
1091:                        if (search.isEnd(idx)) {
1092:                            return -1;
1093:                        }
1094:
1095:                        switch (opdata) {
1096:                        case POSIX_CLASS_ALNUM:
1097:                            if (!Character.isLetterOrDigit(search.charAt(idx))) {
1098:                                return -1;
1099:                            }
1100:                            break;
1101:
1102:                        case POSIX_CLASS_ALPHA:
1103:                            if (!Character.isLetter(search.charAt(idx))) {
1104:                                return -1;
1105:                            }
1106:                            break;
1107:
1108:                        case POSIX_CLASS_DIGIT:
1109:                            if (!Character.isDigit(search.charAt(idx))) {
1110:                                return -1;
1111:                            }
1112:                            break;
1113:
1114:                        case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1115:                            if (!Character.isSpaceChar(search.charAt(idx))) {
1116:                                return -1;
1117:                            }
1118:                            break;
1119:
1120:                        case POSIX_CLASS_SPACE:
1121:                            if (!Character.isWhitespace(search.charAt(idx))) {
1122:                                return -1;
1123:                            }
1124:                            break;
1125:
1126:                        case POSIX_CLASS_CNTRL:
1127:                            if (Character.getType(search.charAt(idx)) != Character.CONTROL) {
1128:                                return -1;
1129:                            }
1130:                            break;
1131:
1132:                        case POSIX_CLASS_GRAPH: // JWL - bugbug???
1133:                            switch (Character.getType(search.charAt(idx))) {
1134:                            case Character.MATH_SYMBOL:
1135:                            case Character.CURRENCY_SYMBOL:
1136:                            case Character.MODIFIER_SYMBOL:
1137:                            case Character.OTHER_SYMBOL:
1138:                                break;
1139:
1140:                            default:
1141:                                return -1;
1142:                            }
1143:                            break;
1144:
1145:                        case POSIX_CLASS_LOWER:
1146:                            if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) {
1147:                                return -1;
1148:                            }
1149:                            break;
1150:
1151:                        case POSIX_CLASS_UPPER:
1152:                            if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) {
1153:                                return -1;
1154:                            }
1155:                            break;
1156:
1157:                        case POSIX_CLASS_PRINT:
1158:                            if (Character.getType(search.charAt(idx)) == Character.CONTROL) {
1159:                                return -1;
1160:                            }
1161:                            break;
1162:
1163:                        case POSIX_CLASS_PUNCT: {
1164:                            int type = Character.getType(search.charAt(idx));
1165:                            switch (type) {
1166:                            case Character.DASH_PUNCTUATION:
1167:                            case Character.START_PUNCTUATION:
1168:                            case Character.END_PUNCTUATION:
1169:                            case Character.CONNECTOR_PUNCTUATION:
1170:                            case Character.OTHER_PUNCTUATION:
1171:                                break;
1172:
1173:                            default:
1174:                                return -1;
1175:                            }
1176:                        }
1177:                            break;
1178:
1179:                        case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1180:                        {
1181:                            boolean isXDigit = ((search.charAt(idx) >= '0' && search
1182:                                    .charAt(idx) <= '9')
1183:                                    || (search.charAt(idx) >= 'a' && search
1184:                                            .charAt(idx) <= 'f') || (search
1185:                                    .charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1186:                            if (!isXDigit) {
1187:                                return -1;
1188:                            }
1189:                        }
1190:                            break;
1191:
1192:                        case POSIX_CLASS_JSTART:
1193:                            if (!Character.isJavaIdentifierStart(search
1194:                                    .charAt(idx))) {
1195:                                return -1;
1196:                            }
1197:                            break;
1198:
1199:                        case POSIX_CLASS_JPART:
1200:                            if (!Character.isJavaIdentifierPart(search
1201:                                    .charAt(idx))) {
1202:                                return -1;
1203:                            }
1204:                            break;
1205:
1206:                        default:
1207:                            internalError("Bad posix class");
1208:                            break;
1209:                        }
1210:
1211:                        // Matched.
1212:                        idx++;
1213:                    }
1214:                        break;
1215:
1216:                    case OP_ANYOF: {
1217:                        // Out of input?
1218:                        if (search.isEnd(idx)) {
1219:                            return -1;
1220:                        }
1221:
1222:                        // Get character to match against character class and maybe casefold
1223:                        char c = search.charAt(idx);
1224:                        boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1225:                        if (caseFold) {
1226:                            c = Character.toLowerCase(c);
1227:                        }
1228:
1229:                        // Loop through character class checking our match character
1230:                        int idxRange = node + nodeSize;
1231:                        int idxEnd = idxRange + (opdata * 2);
1232:                        boolean match = false;
1233:                        for (int i = idxRange; i < idxEnd;) {
1234:                            // Get start, end and match characters
1235:                            char s = instruction[i++];
1236:                            char e = instruction[i++];
1237:
1238:                            // Fold ends of range and match character
1239:                            if (caseFold) {
1240:                                s = Character.toLowerCase(s);
1241:                                e = Character.toLowerCase(e);
1242:                            }
1243:
1244:                            // If the match character is in range, break out
1245:                            if (c >= s && c <= e) {
1246:                                match = true;
1247:                                break;
1248:                            }
1249:                        }
1250:
1251:                        // Fail if we didn't match the character class
1252:                        if (!match) {
1253:                            return -1;
1254:                        }
1255:                        idx++;
1256:                    }
1257:                        break;
1258:
1259:                    case OP_BRANCH: {
1260:                        // Check for choices
1261:                        if (instruction[next + offsetOpcode] != OP_BRANCH) {
1262:                            // If there aren't any other choices, just evaluate this branch.
1263:                            node += nodeSize;
1264:                            continue;
1265:                        }
1266:
1267:                        // Try all available branches
1268:                        short nextBranch;
1269:                        do {
1270:                            // Try matching the branch against the string
1271:                            if ((idxNew = matchNodes(node + nodeSize, maxNode,
1272:                                    idx)) != -1) {
1273:                                return idxNew;
1274:                            }
1275:
1276:                            // Go to next branch (if any)
1277:                            nextBranch = (short) instruction[node + offsetNext];
1278:                            node += nextBranch;
1279:                        } while (nextBranch != 0
1280:                                && (instruction[node + offsetOpcode] == OP_BRANCH));
1281:
1282:                        // Failed to match any branch!
1283:                        return -1;
1284:                    }
1285:
1286:                    case OP_NOTHING:
1287:                    case OP_GOTO:
1288:
1289:                        // Just advance to the next node without doing anything
1290:                        break;
1291:
1292:                    case OP_END:
1293:
1294:                        // Match has succeeded!
1295:                        setParenEnd(0, idx);
1296:                        return idx;
1297:
1298:                    default:
1299:
1300:                        // Corrupt program
1301:                        internalError("Invalid opcode '" + opcode + "'");
1302:                    }
1303:
1304:                    // Advance to the next node in the program
1305:                    node = next;
1306:                }
1307:
1308:                // We "should" never end up here
1309:                internalError("Corrupt program");
1310:                return -1;
1311:            }
1312:
1313:            /**
1314:             * Match the current regular expression program against the current
1315:             * input string, starting at index i of the input string.  This method
1316:             * is only meant for internal use.
1317:             * @param i The input string index to start matching at
1318:             * @return True if the input matched the expression
1319:             */
1320:            protected boolean matchAt(int i) {
1321:                // Initialize start pointer, paren cache and paren count
1322:                start0 = -1;
1323:                end0 = -1;
1324:                start1 = -1;
1325:                end1 = -1;
1326:                start2 = -1;
1327:                end2 = -1;
1328:                startn = null;
1329:                endn = null;
1330:                parenCount = 1;
1331:                setParenStart(0, i);
1332:
1333:                // Allocate backref arrays (unless optimizations indicate otherwise)
1334:                if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1335:                    startBackref = new int[maxParen];
1336:                    endBackref = new int[maxParen];
1337:                }
1338:
1339:                // Match against string
1340:                int idx;
1341:                if ((idx = matchNodes(0, maxNode, i)) != -1) {
1342:                    setParenEnd(0, idx);
1343:                    return true;
1344:                }
1345:
1346:                // Didn't match
1347:                parenCount = 0;
1348:                return false;
1349:            }
1350:
1351:            /**
1352:             * Matches the current regular expression program against a character array,
1353:             * starting at a given index.
1354:             * @param search String to match against
1355:             * @param i Index to start searching at
1356:             * @return True if string matched
1357:             */
1358:            public boolean match(String search, int i) {
1359:                return match(new StringCharacterIterator(search), i);
1360:            }
1361:
1362:            /**
1363:             * Matches the current regular expression program against a character array,
1364:             * starting at a given index.
1365:             * @param search String to match against
1366:             * @param i Index to start searching at
1367:             * @return True if string matched
1368:             */
1369:            public boolean match(CharacterIterator search, int i) {
1370:                // There is no compiled program to search with!
1371:                if (program == null) {
1372:                    // This should be uncommon enough to be an error case rather
1373:                    // than an exception (which would have to be handled everywhere)
1374:                    internalError("No RE program to run!");
1375:                }
1376:
1377:                // Save string to search
1378:                this .search = search;
1379:
1380:                // Can we optimize the search by looking for a prefix string?
1381:                if (program.prefix == null) {
1382:                    // Unprefixed matching must try for a match at each character
1383:                    for (; !search.isEnd(i - 1); i++) {
1384:                        // Try a match at index i
1385:                        if (matchAt(i)) {
1386:                            return true;
1387:                        }
1388:                    }
1389:                    return false;
1390:                } else {
1391:                    // Prefix-anchored matching is possible
1392:                    boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1393:                    char[] prefix = program.prefix;
1394:                    for (; !search.isEnd(i + prefix.length - 1); i++) {
1395:                        // If the first character of the prefix matches
1396:                        boolean match = false;
1397:                        if (caseIndependent)
1398:                            match = Character.toLowerCase(search.charAt(i)) == Character
1399:                                    .toLowerCase(prefix[0]);
1400:                        else
1401:                            match = search.charAt(i) == prefix[0];
1402:                        if (match) {
1403:                            // Save first character position
1404:                            int firstChar = i++;
1405:                            int k;
1406:                            for (k = 1; k < prefix.length;) {
1407:                                // If there's a mismatch of any character in the prefix, give up
1408:                                if (caseIndependent)
1409:                                    match = Character.toLowerCase(search
1410:                                            .charAt(i++)) == Character
1411:                                            .toLowerCase(prefix[k++]);
1412:                                else
1413:                                    match = search.charAt(i++) == prefix[k++];
1414:                                if (!match) {
1415:                                    break;
1416:                                }
1417:                            }
1418:
1419:                            // See if the whole prefix string matched
1420:                            if (k == prefix.length) {
1421:                                // We matched the full prefix at firstChar, so try it
1422:                                if (matchAt(firstChar)) {
1423:                                    return true;
1424:                                }
1425:                            }
1426:
1427:                            // Match failed, reset i to continue the search
1428:                            i = firstChar;
1429:                        }
1430:                    }
1431:                    return false;
1432:                }
1433:            }
1434:
1435:            /**
1436:             * Matches the current regular expression program against a String.
1437:             * @param search String to match against
1438:             * @return True if string matched
1439:             */
1440:            public boolean match(String search) {
1441:                return match(search, 0);
1442:            }
1443:
1444:            /**
1445:             * Splits a string into an array of strings on regular expression boundaries.
1446:             * This function works the same way as the Perl function of the same name.
1447:             * Given a regular expression of "[ab]+" and a string to split of
1448:             * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1449:             * "[xyzzy, yyz, 123]".
1450:             * @param s String to split on this regular exression
1451:             * @return Array of strings
1452:             */
1453:            public String[] split(String s) {
1454:                // Create new vector
1455:                Vector v = new Vector();
1456:
1457:                // Start at position 0 and search the whole string
1458:                int pos = 0;
1459:                int len = s.length();
1460:
1461:                // Try a match at each position
1462:                while (pos < len && match(s, pos)) {
1463:                    // Get start of match
1464:                    int start = getParenStart(0);
1465:
1466:                    // Get end of match
1467:                    int newpos = getParenEnd(0);
1468:
1469:                    // Check if no progress was made
1470:                    if (newpos == pos) {
1471:                        v.addElement(s.substring(pos, start + 1));
1472:                        newpos++;
1473:                    } else {
1474:                        v.addElement(s.substring(pos, start));
1475:                    }
1476:
1477:                    // Move to new position
1478:                    pos = newpos;
1479:                }
1480:
1481:                // Push remainder if it's not empty
1482:                String remainder = s.substring(pos);
1483:                if (remainder.length() != 0) {
1484:                    v.addElement(remainder);
1485:                }
1486:
1487:                // Return vector as an array of strings
1488:                String[] ret = new String[v.size()];
1489:                v.copyInto(ret);
1490:                return ret;
1491:            }
1492:
1493:            /**
1494:             * Flag bit that indicates that subst should replace all occurrences of this
1495:             * regular expression.
1496:             */
1497:            public static final int REPLACE_ALL = 0x0000;
1498:
1499:            /**
1500:             * Flag bit that indicates that subst should only replace the first occurrence
1501:             * of this regular expression.
1502:             */
1503:            public static final int REPLACE_FIRSTONLY = 0x0001;
1504:
1505:            /**
1506:             * Substitutes a string for this regular expression in another string.
1507:             * This method works like the Perl function of the same name.
1508:             * Given a regular expression of "a*b", a String to substituteIn of
1509:             * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1510:             * resulting String returned by subst would be "-foo-garply-wacky-".
1511:             * @param substituteIn String to substitute within
1512:             * @param substitution String to substitute for all matches of this regular expression.
1513:             * @return The string substituteIn with zero or more occurrences of the current
1514:             * regular expression replaced with the substitution String (if this regular
1515:             * expression object doesn't match at any position, the original String is returned
1516:             * unchanged).
1517:             */
1518:            public String subst(String substituteIn, String substitution) {
1519:                return subst(substituteIn, substitution, REPLACE_ALL);
1520:            }
1521:
1522:            /**
1523:             * Substitutes a string for this regular expression in another string.
1524:             * This method works like the Perl function of the same name.
1525:             * Given a regular expression of "a*b", a String to substituteIn of
1526:             * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1527:             * resulting String returned by subst would be "-foo-garply-wacky-".
1528:             * @param substituteIn String to substitute within
1529:             * @param substitution String to substitute for matches of this regular expression
1530:             * @param flags One or more bitwise flags from REPLACE_*.  If the REPLACE_FIRSTONLY
1531:             * flag bit is set, only the first occurrence of this regular expression is replaced.
1532:             * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1533:             * replaced.
1534:             * @return The string substituteIn with zero or more occurrences of the current
1535:             * regular expression replaced with the substitution String (if this regular
1536:             * expression object doesn't match at any position, the original String is returned
1537:             * unchanged).
1538:             */
1539:            public String subst(String substituteIn, String substitution,
1540:                    int flags) {
1541:                // String to return
1542:                StringBuffer ret = new StringBuffer();
1543:
1544:                // Start at position 0 and search the whole string
1545:                int pos = 0;
1546:                int len = substituteIn.length();
1547:
1548:                // Try a match at each position
1549:                while (pos < len && match(substituteIn, pos)) {
1550:                    // Append string before match
1551:                    ret.append(substituteIn.substring(pos, getParenStart(0)));
1552:
1553:                    // Append substitution
1554:                    ret.append(substitution);
1555:
1556:                    // Move forward, skipping past match
1557:                    int newpos = getParenEnd(0);
1558:
1559:                    // We always want to make progress! 
1560:                    if (newpos == pos) {
1561:                        newpos++;
1562:                    }
1563:
1564:                    // Try new position
1565:                    pos = newpos;
1566:
1567:                    // Break out if we're only supposed to replace one occurrence
1568:                    if ((flags & REPLACE_FIRSTONLY) != 0) {
1569:                        break;
1570:                    }
1571:                }
1572:
1573:                // If there's remaining input, append it
1574:                if (pos < len) {
1575:                    ret.append(substituteIn.substring(pos));
1576:                }
1577:
1578:                // Return string buffer as string 
1579:                return ret.toString();
1580:            }
1581:
1582:            /**
1583:             * Returns an array of Strings, whose toString representation matches a regular
1584:             * expression. This method works like the Perl function of the same name.  Given
1585:             * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1586:             * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1587:             * @param search Array of Objects to search 
1588:             * @return Array of Objects whose toString value matches this regular expression.
1589:             */
1590:            public String[] grep(Object[] search) {
1591:                // Create new vector to hold return items
1592:                Vector v = new Vector();
1593:
1594:                // Traverse array of objects
1595:                for (int i = 0; i < search.length; i++) {
1596:                    // Get next object as a string
1597:                    String s = search[i].toString();
1598:
1599:                    // If it matches this regexp, add it to the list
1600:                    if (match(s)) {
1601:                        v.addElement(s);
1602:                    }
1603:                }
1604:
1605:                // Return vector as an array of strings
1606:                String[] ret = new String[v.size()];
1607:                v.copyInto(ret);
1608:                return ret;
1609:            }
1610:
1611:            /** @return true if at the i-th position in the 'search' a newline ends */
1612:            private boolean isNewline(int i) {
1613:
1614:                if (i < NEWLINE.length() - 1) {
1615:                    return false;
1616:                }
1617:
1618:                if (search.charAt(i) == '\n') {
1619:                    return true;
1620:                }
1621:
1622:                for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
1623:                    if (NEWLINE.charAt(j) != search.charAt(i)) {
1624:                        return false;
1625:                    }
1626:                }
1627:                return true;
1628:            }
1629:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.