parser.cs :  » 2.6.4-mono-.net-core » System.Text » System » Text » RegularExpressions » Syntax » C# / CSharp Open Source

Home
C# / CSharp Open Source
1.2.6.4 mono .net core
2.2.6.4 mono core
3.Aspect Oriented Frameworks
4.Bloggers
5.Build Systems
6.Business Application
7.Charting Reporting Tools
8.Chat Servers
9.Code Coverage Tools
10.Content Management Systems CMS
11.CRM ERP
12.Database
13.Development
14.Email
15.Forum
16.Game
17.GIS
18.GUI
19.IDEs
20.Installers Generators
21.Inversion of Control Dependency Injection
22.Issue Tracking
23.Logging Tools
24.Message
25.Mobile
26.Network Clients
27.Network Servers
28.Office
29.PDF
30.Persistence Frameworks
31.Portals
32.Profilers
33.Project Management
34.RSS RDF
35.Rule Engines
36.Script
37.Search Engines
38.Sound Audio
39.Source Control
40.SQL Clients
41.Template Engines
42.Testing
43.UML
44.Web Frameworks
45.Web Service
46.Web Testing
47.Wiki Engines
48.Windows Presentation Foundation
49.Workflows
50.XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » 2.6.4 mono .net core » System.Text 
System.Text » System » Text » RegularExpressions » Syntax » parser.cs
//
// assembly:  System
// namespace:  System.Text.RegularExpressions
// file:  parser.cs
//
// author:  Dan Lewis (dlewis@gmx.co.uk)
//     (c) 2002

//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
// 
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//

using System;
using System.Collections;
using System.Globalization;

namespace System.Text.RegularExpressions.Syntax{

  class Parser {
    // Reads a base-10 number from 'str' starting at 'ptr'.  At least one
    // digit is required and there is no upper limit on the digit count.
    // The result, and whether 'ptr' advances, is whatever ParseNumber reports.
    public static int ParseDecimal (string str, ref int ptr) {
      int value = ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
      return value;
    }

    // Reads a base-8 number of one to three digits from 'str' starting at
    // 'ptr'.  The result, and whether 'ptr' advances, is whatever
    // ParseNumber reports.
    public static int ParseOctal (string str, ref int ptr) {
      int value = ParseNumber (str, ref ptr, 8, 1, 3);
      return value;
    }

    // Reads a base-16 number from 'str' starting at 'ptr', passing 'digits'
    // to ParseNumber as both the minimum and the maximum digit count.
    public static int ParseHex (string str, ref int ptr, int digits) {
      int value = ParseNumber (str, ref ptr, 16, digits, digits);
      return value;
    }

    // Reads an unsigned number in base 'b' from 'str' starting at 'ptr'.
    //
    //   min - fewest digits that must be present
    //   max - most digits that will be consumed; a value below 'min' means
    //         "no limit"
    //
    // Returns the parsed value and moves 'ptr' past the digits.  When fewer
    // than 'min' digits are available the result is -1 and 'ptr' is left
    // untouched.
    public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
      if (max < min)
        max = Int32.MaxValue;

      int pos = ptr;
      int value = 0;
      int count = 0;

      for (; count < max && pos < str.Length; ++ pos, ++ count) {
        int digit = ParseDigit (str[pos], b, count);
        if (digit < 0)
          break;        // 'pos' stays on the first non-digit

        value = value * b + digit;
      }

      if (count < min)
        return -1;

      ptr = pos;
      return value;
    }

    // Reads a group name from 'str' starting at 'ptr'.
    //
    // A name that starts with a digit is read as a decimal group number and
    // returned in its canonical string form; a number that is not positive
    // yields null.  Otherwise the longest run of name characters (see
    // IsNameChar) is returned, or null when that run is empty.
    //
    // No bounds checks are made here: indexing past the end of 'str' raises
    // IndexOutOfRangeException.
    public static string ParseName (string str, ref int ptr) {
      if (Char.IsDigit (str[ptr])) {
        int number = ParseNumber (str, ref ptr, 10, 1, 0);
        return number > 0 ? number.ToString () : null;
      }

      int begin = ptr;
      while (IsNameChar (str[ptr]))
        ++ ptr;

      int length = ptr - begin;
      return length > 0 ? str.Substring (begin, length) : null;
    }

    // Returns 'str' with every character that this parser treats specially
    // turned into an escape sequence.
    //
    // Metacharacters, '#' and the space character are prefixed with a
    // backslash; tab, newline, carriage return and form feed become \t, \n,
    // \r and \f.  All other characters are copied unchanged.
    public static string Escape (string str) {
      // A StringBuilder keeps this linear in the input length; repeated
      // string concatenation would copy the partial result on every step.
      StringBuilder result = new StringBuilder (str.Length + 8);

      foreach (char c in str) {
        switch (c) {
        case '\\': case '*': case '+': case '?': case '|':
        case '{': case '[': case '(': case ')': case '^':
        case '$': case '.': case '#': case ' ':
          result.Append ('\\').Append (c);
          break;

        case '\t': result.Append ("\\t"); break;
        case '\n': result.Append ("\\n"); break;
        case '\r': result.Append ("\\r"); break;
        case '\f': result.Append ("\\f"); break;

        default: result.Append (c); break;
        }
      }

      return result.ToString ();
    }

    // Converts the escape sequences in 'str' back to the characters they
    // denote.  A string without any backslash is returned as-is; otherwise
    // a fresh Parser does the work via ParseString.
    public static string Unescape (string str) {
      bool has_escapes = str.IndexOf ('\\') >= 0;
      if (!has_escapes)
        return str;

      Parser parser = new Parser ();
      return parser.ParseString (str);
    }

    // public instance

    // Creates a parser with empty capture and reference tables.
    public Parser () {
      caps = new ArrayList ();
      refs = new Hashtable ();
    }

    // Parses 'pattern' under 'options' and returns the resulting expression
    // tree with its GroupCount filled in.
    //
    // All per-parse state is reset first, so one Parser instance can be
    // reused.  The parsing helpers index the pattern without bounds checks;
    // an IndexOutOfRangeException escaping from them is reported as an
    // "Unexpected end of pattern." parse error.
    public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
      // reset state left over from any previous parse
      this.pattern = pattern;
      ptr = 0;
      num_groups = 0;
      caps.Clear ();
      refs.Clear ();

      try {
        RegularExpression root = new RegularExpression ();
        ParseGroup (root, options, null);
        ResolveReferences ();
        root.GroupCount = num_groups;

        return root;
      }
      catch (IndexOutOfRangeException) {
        throw NewParseException ("Unexpected end of pattern.");
      }
    }

    // Fills 'mapping' with group-name -> group-index entries for the
    // pattern parsed last and returns the current value of 'gap'.
    //
    // "0" is always mapped to 0.  A group without a name is keyed by its
    // index in string form.  Two groups sharing a key must agree on the
    // index, otherwise a SystemException is thrown.
    public int GetMapping (Hashtable mapping)
    {
      mapping.Add ("0", 0);

      foreach (CapturingGroup group in caps) {
        string key = group.Name;
        if (key == null)
          key = group.Index.ToString ();

        if (!mapping.Contains (key)) {
          mapping.Add (key, group.Index);
          continue;
        }

        // key seen before: it has to refer to the same group number
        if ((int) mapping [key] != group.Index)
          throw new SystemException ("invalid state");
      }

      return gap;
    }

    // private methods

    // Parses a sequence of terms into 'group', stopping at the ')' that
    // closes it or at the end of the pattern.
    //
    //   group     - receives the parsed content.  When it is a
    //               RegularExpression this is the top-level call and a ')'
    //               is an error; otherwise a missing ')' is an error.
    //   options   - options in effect.  An inline option construct such as
    //               (?i) changes them for the remainder of this group.
    //   assertion - when non-null, the conditional whose branches are being
    //               parsed: '|' then separates the true and false branches
    //               instead of building an alternation, and the assertion
    //               itself is what gets appended to 'group'.
    //
    // Each loop iteration (1) reads one term, (2) looks for a quantifier
    // following it and (3) appends the result.  Plain characters are
    // collected in 'literal' and emitted as a single Literal whenever
    // something other than a plain character has to be appended.
    //
    // Throws ArgumentException (from NewParseException) for a quantifier
    // with nothing to repeat, unbalanced parentheses, or more than two
    // branches in a conditional.
    private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
      bool is_top_level = group is RegularExpression;
    
      Alternation alternation = null;
      string literal = null;

      Group current = new Group ();
      Expression expr = null;
      bool closed = false;

      while (true) {
        ConsumeWhitespace (IsIgnorePatternWhitespace (options));
        if (ptr >= pattern.Length)
          break;
        
        // (1) Parse for Expressions
      
        char ch = pattern[ptr ++];
        
        switch (ch) {
        case '^': {
          Position pos =
            IsMultiline (options) ? Position.StartOfLine : Position.Start;
          expr = new PositionAssertion (pos);
          break;
        }

        case '$': {
          Position pos =
            IsMultiline (options) ? Position.EndOfLine : Position.End;
          expr = new PositionAssertion (pos);
          break;
        }

        case '.': {
          Category cat =
            IsSingleline (options) ? Category.AnySingleline : Category.Any;
          expr = new CharacterClass (cat, false);
          break;
        }

        case '\\': {
          // a character escape yields a plain character; anything else is
          // tried as a special construct (class, anchor, back-reference)
          int c = ParseEscape ();
          if (c >= 0)
            ch = (char)c;
          else {
            expr = ParseSpecial (options);

            if (expr == null)
              ch = pattern[ptr ++];    // default escape
          }
          break;
        }

        case '[': {
          expr = ParseCharacterClass (options);
          break;
        }

        case '(': {
          bool ignore = IsIgnoreCase (options);
          expr = ParseGroupingConstruct (ref options);
          if (expr == null) {
            // The construct produced no expression (inline options or a
            // comment).  If case sensitivity changed, the characters
            // collected so far must be emitted now, with the setting that
            // was in effect while they were read ('ignore'), not with the
            // new one.
            if (literal != null && IsIgnoreCase (options) != ignore) {
              current.AppendExpression (new Literal (literal, ignore));
              literal = null;
            }

            continue;
          }
          break;
        }

        case ')': {
          closed = true;
          goto EndOfGroup;
        }

        case '|': {
          if (literal != null) {
            current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
            literal = null;
          }

          if (assertion != null) {
            // conditional: first '|' ends the true branch, a second one
            // would end the false branch, a third is an error
            if (assertion.TrueExpression == null)
              assertion.TrueExpression = current;
            else if (assertion.FalseExpression == null)
              assertion.FalseExpression = current;
            else
              throw NewParseException ("Too many | in (?()|).");
          }
          else {
            if (alternation == null)
              alternation = new Alternation ();

            alternation.AddAlternative (current);
          }

          current = new Group ();
          continue;
        }

        case '*': case '+': case '?': {
          // a quantifier here has nothing in front of it to repeat
          throw NewParseException ("Bad quantifier.");
        }

        default: 
          break;    // literal character
        }

        ConsumeWhitespace (IsIgnorePatternWhitespace (options));
        
        // (2) Check for Repetitions
        
        if (ptr < pattern.Length) {
          char k = pattern[ptr];
          int min = 0, max = 0;
          bool lazy = false;
          bool haveRep = false;


          if (k == '?' || k == '*' || k == '+') {
            ++ ptr;
            haveRep = true;

            switch (k) {
            case '?': min = 0; max = 1; break;
            case '*': min = 0; max = 0x7fffffff; break;
            case '+': min = 1; max = 0x7fffffff; break;
            }
          } else if (k == '{' && ptr + 1 < pattern.Length) {
            // a '{' that does not start a well-formed {n,m} is rewound and
            // later read as an ordinary character
            int saved_ptr = ptr;
            ++ptr;
            haveRep = ParseRepetitionBounds (out min, out max, options);
            if (!haveRep)
              ptr = saved_ptr;
          }

          if (haveRep) {
            ConsumeWhitespace (IsIgnorePatternWhitespace (options));
            if (ptr < pattern.Length && pattern[ptr] == '?') {
              ++ ptr;
              lazy = true;
            }

            Repetition repetition = new Repetition (min, max, lazy);

            // a quantifier after a plain character applies to that one
            // character only, not to the literal collected before it
            if (expr == null)
              repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
            else
              repetition.Expression = expr;

            expr = repetition;
          }
        }

        // (3) Append Expression and/or Literal

        if (expr == null) {
          if (literal == null)
            literal = "";
          literal += ch;
        }
        else {
          if (literal != null) {
            current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
            literal = null;
          }

          current.AppendExpression (expr);
          expr = null;
        }

        if (is_top_level && ptr >= pattern.Length)
          goto EndOfGroup;
      }

    EndOfGroup:
      if (is_top_level && closed)
        throw NewParseException ("Too many )'s.");
      if (!is_top_level && !closed)
        throw NewParseException ("Not enough )'s.");
        
    
      // clean up literals and alternations

      if (literal != null)
        current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));

      if (assertion != null) {
        if (assertion.TrueExpression == null)
          assertion.TrueExpression = current;
        else
          assertion.FalseExpression = current;
        
        group.AppendExpression (assertion);
      }
      else if (alternation != null) {
        alternation.AddAlternative (current);
        group.AppendExpression (alternation);
      }
      else
        group.AppendExpression (current);
    }

    // Parses the construct that follows a '(' which has already been
    // consumed, and returns the expression it denotes.
    //
    // Returns null when the construct contributes no expression of its own:
    // an inline option change "(?imnsx-imnsx)", in which case 'options' is
    // updated for the caller, or a "(?#...)" comment.
    //
    // Constructs recognised after "(?": ':' non-capturing group, '>'
    // non-backtracking group, option letters, '=' '!' '<=' '<!' assertions,
    // '<name>' / 'name' capturing and balancing groups, '(' conditional and
    // '#' comment.  Anything else is a parse error.
    //
    // Capturing groups are recorded in 'caps'; expressions that refer to a
    // group by name are recorded in 'refs' for ResolveReferences.
    private Expression ParseGroupingConstruct (ref RegexOptions options) {
      if (pattern[ptr] != '?') {
        // plain "(...)": capturing unless ExplicitCapture is in effect
        Group group;

        if (IsExplicitCapture (options))
          group = new Group ();
        else {
          group = new CapturingGroup ();
          caps.Add (group);
        }

        ParseGroup (group, options, null);
        return group;
      }
      else
        ++ ptr;

      switch (pattern[ptr]) {
      case ':': {            // non-capturing group
        ++ ptr;
        Group group = new Group ();
        ParseGroup (group, options, null);

        return group;
      }

      case '>': {            // non-backtracking group
        ++ ptr;
        Group group = new NonBacktrackingGroup ();
        ParseGroup (group, options, null);
        
        return group;
      }

      case 'i': case 'm': case 'n':
      case 's': case 'x': case '-': {        // options
        // work on a copy so that "(?i:...)" does not affect the caller
        RegexOptions o = options;
        ParseOptions (ref o, false);
        if (pattern[ptr] == '-') {
          ++ ptr;
          ParseOptions (ref o, true);
        }

        if (pattern[ptr] == ':') {      // pass options to child group
          ++ ptr;
          Group group = new Group ();
          ParseGroup (group, o, null);
          return group;
        }
        else if (pattern[ptr] == ')') {      // change options of enclosing group
          ++ ptr;
          options = o;
          return null;
        }
        else
          throw NewParseException ("Bad options");
      }

      case '<': case '=': case '!': {        // lookahead/lookbehind
        ExpressionAssertion asn = new ExpressionAssertion ();
        if (!ParseAssertionType (asn))
          goto case '\'';        // it's a (?<name> ) construct

        Group test = new Group ();
        ParseGroup (test, options, null);

        asn.TestExpression = test;
        return asn;
      }

      case '\'': {            // named/balancing group
        // the closing delimiter matches the opening one: <...> or '...'
        char delim;
        if (pattern[ptr] == '<')
          delim = '>';
        else
          delim = '\'';

        ++ ptr;
        string name = ParseName ();

        if (pattern[ptr] == delim) {
          // capturing group

          if (name == null)
            throw NewParseException ("Bad group name.");

          ++ ptr;
          CapturingGroup cap = new CapturingGroup ();
          cap.Name = name;
          caps.Add (cap);
          ParseGroup (cap, options, null);

          return cap;
        }
        else if (pattern[ptr] == '-') {
          // balancing group

          ++ ptr;
          string balance_name = ParseName ();
          if (balance_name == null || pattern[ptr] != delim)
            throw NewParseException ("Bad balancing group name.");

          ++ ptr;
          BalancingGroup bal = new BalancingGroup ();
          bal.Name = name;
          
          // 'name' may be null here, i.e. the form (?<-other>...)
          if(bal.IsNamed) {
            caps.Add (bal);
          }

          // the group to balance against is resolved after parsing
          refs.Add (bal, balance_name);

          ParseGroup (bal, options, null);

          return bal;
        }
        else
          throw NewParseException ("Bad group name.");
      }

      case '(': {            // expression/capture test
        Assertion asn;
      
        ++ ptr;
        int p = ptr;
        string name = ParseName ();
        if (name == null || pattern[ptr] != ')') {  // expression test
          // FIXME MS implementation doesn't seem to
          // implement this version of (?(x) ...)

          // not a bare name: rewind and parse the test as an expression
          ptr = p;
          ExpressionAssertion expr_asn = new ExpressionAssertion ();

          if (pattern[ptr] == '?') {
            ++ ptr;
            if (!ParseAssertionType (expr_asn))
              throw NewParseException ("Bad conditional.");
          }
          else {
            expr_asn.Negate = false;
            expr_asn.Reverse = false;
          }

          Group test = new Group ();
          ParseGroup (test, options, null);
          expr_asn.TestExpression = test;
          asn = expr_asn;
        }
        else {            // capture test
          ++ ptr;
          asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
          refs.Add (asn, name);
        }

        // the branches of the conditional are parsed with 'asn' passed as
        // the assertion argument of ParseGroup
        Group group = new Group ();
        ParseGroup (group, options, asn);
        return group;
      }

      case '#': {            // comment
        ++ ptr;
        while (pattern[ptr ++] != ')') {
          if (ptr >= pattern.Length)
            throw NewParseException ("Unterminated (?#...) comment.");
        }
        return null;
      }

      default:             // error
        throw NewParseException ("Bad grouping construct.");
      }
    }

    // Recognises the marker of a lookaround at 'ptr': "=" or "!" for a
    // lookahead, "<=" or "<!" for a lookbehind.
    //
    // On success the Negate and Reverse properties of 'assertion' are set,
    // the marker is consumed and true is returned.  Otherwise nothing is
    // changed and false is returned.
    private bool ParseAssertionType (ExpressionAssertion assertion) {
      bool lookbehind = pattern[ptr] == '<';
      char kind = lookbehind ? pattern[ptr + 1] : pattern[ptr];

      bool negated;
      if (kind == '=')
        negated = false;
      else if (kind == '!')
        negated = true;
      else
        return false;

      assertion.Negate = negated;
      assertion.Reverse = lookbehind;
      ptr += lookbehind ? 2 : 1;

      return true;
    }

    // Consumes a run of inline option letters (i, m, n, s, x) at 'ptr' and
    // applies each one to 'options': the flag is cleared when 'negate' is
    // true and set otherwise.  Stops, without consuming it, at the first
    // character that is not an option letter.
    private void ParseOptions (ref RegexOptions options, bool negate) {
      for (;; ++ ptr) {
        RegexOptions flag;

        switch (pattern[ptr]) {
        case 'i': flag = RegexOptions.IgnoreCase; break;
        case 'm': flag = RegexOptions.Multiline; break;
        case 'n': flag = RegexOptions.ExplicitCapture; break;
        case 's': flag = RegexOptions.Singleline; break;
        case 'x': flag = RegexOptions.IgnorePatternWhitespace; break;
        default:
          return;
        }

        if (negate)
          options &= ~flag;
        else
          options |= flag;
      }
    }

    // Parses a bracketed character set; the opening '[' has already been
    // consumed.  Returns the CharacterClass describing the set.
    //
    // Handled forms: a leading '^' negates the set; a ']' directly after
    // the opening (or after '^') is a member, not the terminator; 'a-z'
    // ranges; character escapes; the category escapes \d \w \s \p{..} and
    // their upper-case negations; \b meaning backspace.  A '-' that cannot
    // start or finish a range is taken as a member.
    //
    // Throws ArgumentException (from NewParseException) for a range whose
    // end is a category, a range in reverse order, or a missing ']'.
    private Expression ParseCharacterClass (RegexOptions options) {
      bool negate = false;
      if (pattern[ptr] == '^') {
        negate = true;
        ++ ptr;
      }
      
      bool ecma = IsECMAScript (options);
      CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));

      if (pattern[ptr] == ']') {
        cls.AddCharacter (']');
        ++ ptr;
      }

      int c = -1;
      int last = -1;        // previous plain character, or -1 if none can start a range
      bool range = false;   // true after "x-" while waiting for the range end
      bool closed = false;
      while (ptr < pattern.Length) {
        c = pattern[ptr ++];

        if (c == ']') {
          closed = true;
          break;
        }

        if (c == '-' && last >= 0 && !range) {
          range = true;
          continue;
        }

        if (c == '\\') {
          c = ParseEscape ();
          if (c >= 0)
            goto char_recognized;

          // didn't recognize escape
          c = pattern [ptr ++];
          switch (c) {
          case 'b':
            c = '\b';
            goto char_recognized;

          case 'd': case 'D':
            cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
            break;
            
          case 'w': case 'W':
            cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
            break;
            
          case 's': case 'S':
            cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
            break;
            
          case 'p': case 'P':
            cls.AddCategory (ParseUnicodeCategory (), c == 'P');  // ignore ecma
            break;

          default:    // add escaped character
            goto char_recognized;
          }

          // if the pattern looks like [a-\s] ...
          // ('c' is an int: cast it so the message shows the letter, not its code)
          if (range)
            throw NewParseException ("character range cannot have category \\" + (char) c);

          last = -1;
          continue;
        }

      char_recognized:
        if (range) {
          // if 'range' is true, we know that 'last >= 0'
          // (both bounds are ints: cast so the message shows the characters)
          if (c < last)
            throw NewParseException ("[" + (char) last + "-" + (char) c + "] range in reverse order.");
          cls.AddRange ((char)last, (char)c);
          last = -1;
          range = false;
          continue;
        }

        cls.AddCharacter ((char)c);
        last = c;
      }

      if (!closed)
        throw NewParseException ("Unterminated [] set.");

      // "x-]" : the dash never became a range, so it is a member
      if (range)
        cls.AddCharacter ('-');

      return cls;
    }

    // Parses the inside of a {n}, {n,} or {n,m} quantifier; 'ptr' must be
    // just past the '{'.
    //
    // Returns true and sets 'min' / 'max' when the text up to and including
    // the closing '}' has that shape.  Returns false otherwise; 'ptr' is
    // NOT restored in that case, the caller is expected to rewind it.
    //
    // A number that is absent is carried as -1 (the failure value of
    // ParseNumber).  An absent upper bound means "unbounded" and is stored
    // as 0x7fffffff.  An explicit upper bound of 0, as in {0} or {0,0}, is
    // kept as 0.
    //
    // Throws ArgumentException (from NewParseException) when the upper
    // bound is smaller than the lower bound.
    private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
      int n, m;
      min = max = 0;

      /* check syntax */

      ConsumeWhitespace (IsIgnorePatternWhitespace (options));

      if (pattern[ptr] == ',') {
        n = -1;
      } else {
        n = ParseNumber (10, 1, 0);
        ConsumeWhitespace (IsIgnorePatternWhitespace (options));
      }

      switch (pattern[ptr ++]) {
      case '}':
        m = n;
        break;
      case ',':
        ConsumeWhitespace (IsIgnorePatternWhitespace (options));
        m = ParseNumber (10, 1, 0);
        ConsumeWhitespace (IsIgnorePatternWhitespace (options));
        if (pattern[ptr ++] != '}')
          return false;
        break;
      default:
        return false;
      }

      /* check bounds and ordering */

      // NOTE(review): with int operands this comparison can never be true;
      // kept as written so the documented limit stays next to the checks.
      if (n > 0x7fffffff || m > 0x7fffffff)
        throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
      if (m >= 0 && m < n)
        throw NewParseException ("Illegal {x, y} with x > y.");

      /* assign min and max */

      min = n;
      // m == 0 is a real upper bound; only a missing one (m < 0) is unbounded
      if (m >= 0)
        max = m;
      else
        max = 0x7fffffff;

      return true;
    }

    // Parses the "{name}" part of a \p or \P escape and returns the
    // category that CategoryUtils.CategoryFromName associates with 'name'.
    //
    // Throws ArgumentException (from NewParseException) when a brace or the
    // name is missing, or when the name maps to Category.None.
    private Category ParseUnicodeCategory () {
      char open = pattern[ptr ++];
      if (open != '{')
        throw NewParseException ("Incomplete \\p{X} character escape.");

      string property = ParseName (pattern, ref ptr);
      if (property == null)
        throw NewParseException ("Incomplete \\p{X} character escape.");

      Category result = CategoryUtils.CategoryFromName (property);
      if (result == Category.None)
        throw NewParseException ("Unknown property '" + property + "'.");

      char close = pattern[ptr ++];
      if (close != '}')
        throw NewParseException ("Incomplete \\p{X} character escape.");

      return result;
    }

    // Parses the part of an escape that follows the backslash when it is
    // not a simple character escape: a category (\d \w \s \p and their
    // upper-case negations), a position (\A \Z \z \G \b \B), a numbered
    // back-reference (\1 ...) or a named one (\k<name>, \k'name').
    //
    // Returns the expression, or null with 'ptr' restored to where it was
    // on entry when the character is none of the above.  Back-references
    // are recorded in 'refs' and bound to their group later.
    private Expression ParseSpecial (RegexOptions options) {
      int saved = ptr;
      bool ecma = IsECMAScript (options);
      char code = pattern[ptr ++];

      switch (code) {

      // categories; the upper-case letter is the negated form

      case 'd': case 'D':
        return new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, code == 'D');

      case 'w': case 'W':
        return new CharacterClass (ecma ? Category.EcmaWord : Category.Word, code == 'W');

      case 's': case 'S':
        return new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, code == 'S');

      case 'p': case 'P':
        // this is odd - ECMAScript isn't supposed to support Unicode,
        // yet \p{..} compiles and runs under the MS implementation
        // identically to canonical mode. That's why the value of
        // ecma is ignored here.
        return new CharacterClass (ParseUnicodeCategory (), code == 'P');

      // positions

      case 'A': return new PositionAssertion (Position.StartOfString);
      case 'Z': return new PositionAssertion (Position.End);
      case 'z': return new PositionAssertion (Position.EndOfString);
      case 'G': return new PositionAssertion (Position.StartOfScan);
      case 'b': return new PositionAssertion (Position.Boundary);
      case 'B': return new PositionAssertion (Position.NonBoundary);

      // references

      case '1': case '2': case '3': case '4': case '5':
      case '6': case '7': case '8': case '9': {
        // step back so that the first digit is part of the number
        ptr --;
        int number = ParseNumber (10, 1, 0);
        if (number < 0) {
          ptr = saved;
          return null;
        }

        // FIXME test if number is within number of assigned groups
        // this may present a problem for right-to-left matching

        Reference numbered = new BackslashNumber (IsIgnoreCase (options), ecma);
        refs.Add (numbered, number.ToString ());
        return numbered;
      }

      case 'k': {
        char delim = pattern[ptr ++];
        if (delim == '<')
          delim = '>';
        else if (delim != '\'')
          throw NewParseException ("Malformed \\k<...> named backreference.");

        string target = ParseName ();
        if (target == null || pattern[ptr] != delim)
          throw NewParseException ("Malformed \\k<...> named backreference.");

        ++ ptr;
        Reference named = new Reference (IsIgnoreCase (options));
        refs.Add (named, target);
        return named;
      }

      default:
        // not a special escape: undo the read and let the caller decide
        ptr = saved;
        return null;
      }
    }

    // Parses a character escape; 'ptr' must be just past the backslash.
    //
    // Returns the code of the escaped character for \a \t \r \v \f \n \e
    // \\, octal escapes starting with 0, \xHH, \uHHHH and \cX.  Returns -1,
    // with 'ptr' restored to where it was on entry, for any other
    // character so that the caller can try a different interpretation.
    //
    // \cX accepts the letter in either case (\cA and \ca both give 0x01),
    // as well as the other characters from '@' to '_'.
    //
    // Throws ArgumentException when the backslash is the last character of
    // the pattern, when \x or \u has too few hex digits, or when \c is
    // followed by a character that does not name a control character.
    private int ParseEscape () {
      int p = ptr;
      int c;

      if (p >= pattern.Length)
        throw new ArgumentException (
            String.Format ("Parsing \"{0}\" - Illegal \\ at end of " + 
                "pattern.", pattern), pattern);
      
      switch (pattern[ptr ++]) {
  
      // standard escapes (except \b)

      case 'a': return '\u0007';
      case 't': return '\u0009';
      case 'r': return '\u000d';
      case 'v': return '\u000b';
      case 'f': return '\u000c';
      case 'n': return '\u000a';
      case 'e': return '\u001b';
      case '\\': return '\\';

      // character codes

      case '0':
        //
        // Turns out that octal values can be specified
        // without a leading zero.   But also the limit
        // of three character should include this first
        // one.  
        //
        ptr--;
        int prevptr = ptr;
        int result = ParseOctal (pattern, ref ptr);
        if (result == -1 && prevptr == ptr)
          return 0;

        return result;

      case 'x':
        c = ParseHex (pattern, ref ptr, 2);
        if (c < 0)
          throw NewParseException ("Insufficient hex digits");

        return c;

      case 'u':
        c = ParseHex (pattern, ref ptr, 4);
        if (c < 0)
          throw NewParseException ("Insufficient hex digits");
        
        return c;

      // control characters

      case 'c':
        c = pattern[ptr ++];
        // fold a lower-case letter onto its upper-case counterpart so that
        // \ca is treated like \cA
        if (c >= 'a' && c <= 'z')
          c -= 'a' - 'A';
        if (c >= '@' && c <= '_')
          return c - '@';
        else
          throw NewParseException ("Unrecognized control character.");

      // unknown escape

      default:
        ptr = p;
        return -1;
      }
    }

    // Instance shortcut: reads a group name from the current pattern at the
    // current position using the static ParseName.
    private string ParseName () {
      string name = Parser.ParseName (pattern, ref ptr);
      return name;
    }

    // Tells whether 'c' may appear in a group name: connector punctuation
    // (such as '_') and letters or digits qualify, except that modifier
    // letters are rejected.
    private static bool IsNameChar (char c) {
      switch (Char.GetUnicodeCategory (c)) {
      case UnicodeCategory.ModifierLetter:
        return false;
      case UnicodeCategory.ConnectorPunctuation:
        return true;
      default:
        return Char.IsLetterOrDigit (c);
      }
    }
  
    // Instance shortcut: reads a number from the current pattern at the
    // current position using the static ParseNumber.
    private int ParseNumber (int b, int min, int max) {
      int value = Parser.ParseNumber (pattern, ref ptr, b, min, max);
      return value;
    }

    // Returns the value of 'c' as a digit in base 'b', or -1 when 'c' is
    // not a valid digit in that base.  Only the bases 8, 10 and 16 are
    // supported; any other base yields -1.  The parameter 'n' is not used.
    private static int ParseDigit (char c, int b, int n) {
      int value;

      if (c >= '0' && c <= '9')
        value = c - '0';
      else if (c >= 'a' && c <= 'f')
        value = 10 + c - 'a';
      else if (c >= 'A' && c <= 'F')
        value = 10 + c - 'A';
      else
        return -1;

      if (b != 8 && b != 10 && b != 16)
        return -1;

      // a hex letter or a high decimal digit is out of range for a smaller base
      return value < b ? value : -1;
    }

    // Advances 'ptr' over everything that carries no meaning at this point
    // of the pattern.
    //
    // "(?#...)" comments are always skipped.  When 'ignore' is true, white
    // space and '#' comments running through the next newline are skipped
    // as well.  A comment without its terminator extends to the end of the
    // pattern.
    private void ConsumeWhitespace (bool ignore) {
      while (ptr < pattern.Length) {
        char cur = pattern[ptr];

        if (cur == '(') {
          // at least "(?#" plus one more character must be present
          bool is_comment = ptr + 3 < pattern.Length
            && pattern[ptr + 1] == '?'
            && pattern[ptr + 2] == '#';
          if (!is_comment)
            return;

          ptr += 3;
          while (ptr < pattern.Length) {
            if (pattern[ptr ++] == ')')
              break;
          }
          continue;
        }

        if (!ignore)
          return;

        if (cur == '#') {
          while (ptr < pattern.Length) {
            if (pattern[ptr ++] == '\n')
              break;
          }
        }
        else if (Char.IsWhiteSpace (cur)) {
          do {
            ++ ptr;
          } while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]));
        }
        else
          return;
      }
    }

    // Returns 'pattern' with its escape sequences replaced by the
    // characters they stand for.  Replaces the parser's current pattern
    // and position.
    //
    // A backslash followed by something ParseEscape does not recognise
    // yields that character itself, except that \b yields a backspace.
    private string ParseString (string pattern) {
      this.pattern = pattern;
      this.ptr = 0;

      StringBuilder unescaped = new StringBuilder (pattern.Length);
      while (ptr < pattern.Length) {
        int code = pattern[ptr ++];

        if (code != '\\') {
          unescaped.Append ((char) code);
          continue;
        }

        code = ParseEscape ();
        if (code < 0) {
          // unknown escape: take the escaped character literally
          code = pattern[ptr ++];
          if (code == 'b')
            code = '\b';
        }

        unescaped.Append ((char) code);
      }

      return unescaped.ToString ();
    }

    // Assigns a number to every capturing group collected in 'caps' and
    // then binds every entry of 'refs' to the group it names.
    //
    // Numbering happens in two passes over 'caps': unnamed groups first,
    // numbered from 1 in the order they were added, then named groups.
    // 'dict' maps both numbers (as strings) and names to their group.
    // 'num_groups' is incremented for each group that receives a new
    // number and 'gap' is set from the next free number.
    //
    // Throws ArgumentException (from NewParseException) when a reference
    // names a group that does not exist, subject to the exceptions handled
    // in the last loop.
    private void ResolveReferences ()
    {
      int gid = 1;
      Hashtable dict = new Hashtable ();
      ArrayList explicit_numeric_groups = null;

      // number unnamed groups

      foreach (CapturingGroup group in caps) {
        if (group.Name != null)
          continue;

        dict.Add (gid.ToString (), group);
        group.Index = gid ++;
        ++ num_groups;
      }

      // number named groups

      foreach (CapturingGroup group in caps) {
        if (group.Name == null)
          continue;

        // a name seen before shares the number of its first occurrence
        if (dict.Contains (group.Name)) {
          CapturingGroup prev = (CapturingGroup) dict [group.Name];
          group.Index = prev.Index;

          if (group.Index == gid)
            gid ++;
          else if (group.Index > gid)
            explicit_numeric_groups.Add (group);
          continue;
        }

        // a name consisting only of digits, e.g. (?<5>...), asks for that number
        if (Char.IsDigit (group.Name [0])) {
          int ptr = 0;
          int group_gid = ParseDecimal (group.Name, ref ptr);
          if (ptr == group.Name.Length) {
            group.Index = group_gid;
            dict.Add (group.Name, group);
            ++ num_groups;

            if (group_gid == gid) {
              gid ++;
            } else {
              // all numbers before 'gid' are already in the dictionary.  So, we know group_gid > gid
              if (explicit_numeric_groups == null)
                explicit_numeric_groups = new ArrayList (4);
              explicit_numeric_groups.Add (group);
            }

            continue;
          }
        }

        // ordinary name: take the next number that is not yet in use
        string gid_s = gid.ToString ();
        while (dict.Contains (gid_s))
          gid_s = (++gid).ToString ();

        dict.Add (gid_s, group);
        dict.Add (group.Name, group);
        group.Index = gid ++;
        ++ num_groups;
      }

      gap = gid; // == 1 + num_groups, if explicit_numeric_groups == null

      if (explicit_numeric_groups != null)
        HandleExplicitNumericGroups (explicit_numeric_groups);

      // resolve references

      foreach (Expression expr in refs.Keys) {
        string name = (string) refs [expr];
        if (!dict.Contains (name)) {
          // a capture test on an unknown non-numeric name is left unbound
          if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
            continue;
          // a numbered reference gets a chance to resolve itself
          BackslashNumber bn = expr as BackslashNumber;
          if (bn != null && bn.ResolveReference (name, dict))
            continue;
          throw NewParseException ("Reference to undefined group " +
            (Char.IsDigit (name[0]) ? "number " : "name ") +
            name);
        }

        CapturingGroup group = (CapturingGroup)dict[name];
        if (expr is Reference)
          ((Reference)expr).CapturingGroup = group;
        else if (expr is CaptureAssertion)
          ((CaptureAssertion)expr).CapturingGroup = group;
        else if (expr is BalancingGroup)
          ((BalancingGroup)expr).Balance = group;
      }
    }

    // Renumbers the groups whose explicit number did not match the next
    // free number when they were encountered in ResolveReferences.
    //
    // The list is sorted in place first; the ordering is whatever the
    // elements' own comparison defines (presumably by Index - TODO
    // confirm against CapturingGroup).  'gap' is then advanced past every
    // listed group whose Index equals the running counter, and the
    // remaining groups receive consecutive numbers starting at the new
    // 'gap'.  Groups that shared an Index before keep sharing one.
    private void HandleExplicitNumericGroups (ArrayList explicit_numeric_groups)
    {
      int gid = gap;
      int i = 0;
      int n_explicit = explicit_numeric_groups.Count;

      explicit_numeric_groups.Sort ();

      // move 'gap' forward to skip over all explicit groups that
      // turn out to match their index
      for (; i < n_explicit; ++i) {
        CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
        if (g.Index > gid)
          break;
        if (g.Index == gid)
          gid ++;
      }

      gap = gid;

      // re-index all further groups so that the indexes are contiguous
      int prev = gid;
      for (; i < n_explicit; ++i) {
        CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
        if (g.Index == prev) {
          // same original number as the previous entry: reuse its new number
          g.Index = gid - 1;
        } else {
          prev = g.Index;
          g.Index = gid ++;
        }
      }
    }

    // flag helper functions

    // True when the IgnoreCase flag is set in 'options'.
    private static bool IsIgnoreCase (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.IgnoreCase;
      return masked == RegexOptions.IgnoreCase;
    }

    // True when the Multiline flag is set in 'options'.
    private static bool IsMultiline (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.Multiline;
      return masked == RegexOptions.Multiline;
    }

    // True when the ExplicitCapture flag is set in 'options'.
    private static bool IsExplicitCapture (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.ExplicitCapture;
      return masked == RegexOptions.ExplicitCapture;
    }
  
    // True when the Singleline flag is set in 'options'.
    private static bool IsSingleline (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.Singleline;
      return masked == RegexOptions.Singleline;
    }

    // True when the IgnorePatternWhitespace flag is set in 'options'.
    private static bool IsIgnorePatternWhitespace (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.IgnorePatternWhitespace;
      return masked == RegexOptions.IgnorePatternWhitespace;
    }

    // True when the ECMAScript flag is set in 'options'.
    private static bool IsECMAScript (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.ECMAScript;
      return masked == RegexOptions.ECMAScript;
    }

    // exception creation

    // Builds (but does not throw) the exception used to report a parse
    // error: the message is prefixed with the pattern being parsed, and the
    // pattern is also passed as the second constructor argument.
    private ArgumentException NewParseException (string msg) {
      string full = "parsing \"" + pattern + "\" - " + msg;
      return new ArgumentException (full, pattern);
    }

    // Text currently being parsed.
    private string pattern;
    // Index into 'pattern' of the next character to be read.
    private int ptr;

    // Capturing groups, in the order they were added during parsing.
    private ArrayList caps;
    // Maps each expression that refers to a group (back-reference, capture
    // test, balancing group) to the name of that group; bound to the
    // actual groups by ResolveReferences.
    private Hashtable refs;
    // Incremented by ResolveReferences for every group that receives a new
    // number; copied to RegularExpression.GroupCount.
    private int num_groups;
    // Set by ResolveReferences and HandleExplicitNumericGroups; returned by
    // GetMapping.
    private int gap;
  }
}
www.java2v.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.