re2.py :  » Web-Services » python-xmltv » pytvgrab-lib-0.5.1 » lib » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Web Services » python xmltv 
python xmltv » pytvgrab lib 0.5.1 » lib » re2.py
#!/usr/bin/env python
# -----------------------------------------------------------------------
# Copyright (C) 2003 Chris Ottrey.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
# -----------------------------------------------------------------------
#
# This code is part of the pytvgrab project:
#    http://pytvgrab.sourceforge.net
#
# -----------------------------------------------------------------------
# Subversion Information, do not edit
#
# $Rev: 246 $
# $LastChangedDate: 2004-10-14 03:16:23 +1000 (Thu, 14 Oct 2004) $
# $LastChangedRevision: 246 $
# $LastChangedBy: gustavo $
#
# $Log: $
#
# TODO - Currently the following happens...
# 
# >>> re2.extract('(?P<word>\w+)',  'hello')
# {'word': 'hello'}
# >>> re2.extract('(?P<word>\w+)+', 'hello')
# {'word': 'hello'}
# >>> re2.extract('(?P<word>\w+)*', 'hello')
# {'word': 'hello'}
# >>> re2.extract('(?P<a>(?P<number>\d+)(?P<word>\w+))', '34cat')
# {'a': {'word': 'cat', 'number': '34'}}
# >>> re2.extract('(?P<a>(?P<number>\d+)(?P<word>\w+)+)', '34cat')
# {'a': {'word': 'cat', 'number': '34'}}
#
# ... I'd like the output to instead be:
#
# >>> re2.extract('(?P<word>\w+)', 'hello there')
# {'word': 'hello'}
# >>> re2.extract('(?P<word>\w+)+', 'hello')
# {'word': ['hello']}
# >>> re2.extract('(?P<word>\w+)*', 'hello')
# {'word': ['hello']}
# >>> re2.extract('(?P<a>(?P<number>\d+)(?P<word>\w+))',  '34cat')
# {'a': {'word': 'cat', 'number': '34'}}
# >>> re2.extract('(?P<a>(?P<number>\d+)(?P<word>\w+)+)', '34cat')
# {'a': {'word': ['cat'], 'number': '34'}}
#
# i.e. hopefully that will allow lists to be returned whenever a group carries a '+' or '*' suffix.
#
# To do this: have a look at compress_list and Group.pattern[-1]


"""\
Compile a recursively matching regular expression.
And use that to extract a hierarchical object.

Usage:

>>> import re2
>>> pat=re2.compile('(?P<number>\d+)((?P<word>\w+)(,)?)+')
>>> match=pat.extract('34cat,dog,fish')
>>> match
{'word': ['cat', 'dog', 'fish'], 'number': '34'}
>>> match.number
'34'
>>> match.word
['cat', 'dog', 'fish']
>>> print match
---
number: '34'
word:
    - cat
    - dog
    - fish

>>> pat='(?P<verse>(?P<number, float>\d+.?\d*) (?P<activity, upper>[^,]+))(,)?'
>>> buf='12 drummers drumming, 11.01 pipers piping, 10 lords a-leaping'
>>> re2.extract(pat, buf)
{'verse': [{'number': 12.0, 'activity': 'DRUMMERS DRUMMING'}, {'number': 11.01, 'activity': 'PIPERS PIPING'}, {'number': 10.0, 'activity': 'LORDS A-LEAPING'}]}
>>> print re2.extract(pat, buf)
---
verse:
    -
        activity: DRUMMERS DRUMMING
        number: 12.0
    -
        activity: PIPERS PIPING
        number: 11.01
    -
        activity: LORDS A-LEAPING
        number: 10.0

>>> buf='12 drummers drumming, 11.01 pipers piping, 10 lords a-leaping'
>>> def exaggerate(n): return 3*float(n)
...
>>> pat='(?P<verse>(?P<number, exaggerate>\d+.?\d*) (?P<activity, upper>[^,]+))(,)?'
>>> print re2.extract(pat, buf)
---
verse:
    -
        activity: DRUMMERS DRUMMING
        number: 36.0
    -
        activity: PIPERS PIPING
        number: 33.03
    -
        activity: LORDS A-LEAPING
        number: 30.0


Note: See unit test cases for more examples.
"""

import re
import string
import sys
# yaml is optional: it is only used to pretty-print matches, so record
# whether it could be imported instead of failing.
try:
  import yaml
except ImportError:
  USING_YAML=False
else:
  USING_YAML=True

# flags
# Flag constants mirrored from the standard ``re`` module, under both their
# long names and the usual one-letter aliases.
IGNORECASE = re.IGNORECASE # ignore case
LOCALE     = re.LOCALE     # assume current 8-bit locale
UNICODE    = re.UNICODE    # assume unicode locale
MULTILINE  = re.MULTILINE  # make anchors look for newline
DOTALL     = re.DOTALL     # make dot match newline
VERBOSE    = re.VERBOSE    # ignore whitespace and comments
I = IGNORECASE
L = LOCALE
U = UNICODE
M = MULTILINE
S = DOTALL
X = VERBOSE

# Names exported by a star-import of this module.
# NOTE(review): the public helper class Group and the flag constants are
# not listed here -- confirm whether that is intentional.
__all__=['compile', 'extract']

def compile(pattern, flags=0):
  """Build a reusable matcher for a recursive pattern.

  pattern -- the regular expression text.
  flags   -- flags handed on to the matcher (0 by default).

  Returns the _Group built from the arguments; call its extract()
  method on each buffer to be parsed.
  """
  matcher=_Group(pattern, flags)
  return matcher
# compile()

def extract(pattern, buf, flags=0):
  """Compile pattern and immediately run it against buf.

  pattern -- the regular expression text.
  buf     -- the text to parse.
  flags   -- flags handed on to the matcher (0 by default).

  Returns whatever the extract() method of the freshly built _Group
  returns for buf.
  """
  matcher=_Group(pattern, flags)
  return matcher.extract(buf)
# extract()

class _Match(dict):
  """Dictionary whose entries can also be read as attributes.

  Built by _objectify.  Assigning ``match.name = value`` stores the value
  both as an attribute and as the item ``match['name']``.  The one
  exception is the attribute ``_value``, which is kept as an attribute
  only so that it never shows up among the dictionary keys.
  """

  def __setattr__(self, attr, value):
    """Store value as an attribute and, unless attr is '_value', as an item."""
    hidden=(attr == '_value')
    if not hidden:
      # Mirror the attribute as a dictionary entry so that both
      # match['attribute'] and match.attribute work.
      self[attr]=value
    self.__dict__[attr]=value
  # __setattr__()

  def __repr__(self):
    """Return a dict-like representation listing only the dictionary items."""
    pairs=['%s: %s' % (repr(key), repr(self[key])) for key in self.keys()]
    return '{%s}' % ', '.join(pairs)
  # __repr__()

  def __str__(self):
    """Return a YAML dump when yaml is available, else the repr.

    Without yaml a warning is written to stderr on every call.
    """
    fallback=repr(self)
    if USING_YAML:
      return yaml.dump(self)
    sys.stderr.write("""
WARNING: The following string representing the matched object will be much
         easier to debug if you first install yaml!
python yaml is available from http://python.yaml.org (2004/08/27)

""")
    return fallback
  # __str__()

# _Match

def _objectify(d):
  """Convert nested plain dictionaries into nested _Match objects.

  Plain dictionaries become _Match instances (so values can be read as
  match.attribute as well as match['attribute']), plain lists are
  rebuilt with every element converted, and anything else is returned
  unchanged.  Only exact dict and list types are converted.
  """

  kind=type(d)
  if kind is dict:
    match=_Match()
    for key, value in d.items():
      match.__setattr__(key, _objectify(value))
    return match
  if kind is list:
    return [_objectify(element) for element in d]
  return d
# _objectify()

def _compress(d):
  """Flatten the anonymous '_group*' levels of a _Group._match() result.

  Every key starting with '_group' is removed and its contents are
  merged into the enclosing dictionary.
  Eg.
    {'d': {'_group1': [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]}}
  will become:
    {'d': {'a': [1, 2], 'b': ['x', 'y']}}

  Dictionaries and lists are processed recursively; any other value is
  returned unchanged.
  """

  def is_anonymous(key):
    """True for the automatically generated '_group*' keys."""
    return key[:6] == '_group'
  # is_anonymous()

  def get_keys(entries):
    """Collect every key used by any dictionary in entries.

    Needed because a key may be missing from some of the entries.
    """
    merged={}
    for entry in entries:
      if type(entry) is dict:
        merged.update(entry)
    return merged.keys()
  # get_keys()

  def compress_list(items):
    """Compress each element; unwrap single-element lists.

    A two-element list whose second element is the empty string
    collapses to its first (original, uncompressed) element.
    """
    squeezed=[_compress(element) for element in items]
    if len(squeezed) == 1:
      return squeezed[0]
    if len(squeezed) == 2 and items[1] == '':
      return items[0]
    return squeezed
  # compress_list()

  def compress_dict(d):
    """Merge the contents of every '_group*' key into one flat dictionary."""
    flat={}
    for name, value in d.items():
      if not is_anonymous(name):
        flat[name]=_compress(value)
        continue

      # An anonymous group: shuffle its contents up one level.
      kind=type(value)
      if kind is dict:
        flat.update(compress_dict(value))
      elif kind is list:
        for key in get_keys(value):
          if key == '_value':    # the raw text of the group is dropped here
            continue
          column=[]
          for entry in value:
            try:
              column.append(entry[key])
            except KeyError:
              pass
          flat[key]=compress_list(column)
      # any other kind of value is dropped

    # Merging may have surfaced new '_group*' keys; go round again until
    # none are left.
    if any(is_anonymous(name) for name in flat.keys()):
      flat=compress_dict(flat)

    return flat
  # compress_dict()

  kind=type(d)
  if kind is dict:
    return compress_dict(d)
  if kind is list:
    return compress_list(d)
  return d
# _compress


class _Group:
  """One node in the tree of groups that makes up a recursive pattern.

  The pattern text is split into its top-level parenthesised groups.  Each
  capturing group becomes a child _Group (and so on, recursively), while
  the text between the groups is kept as a %-template in self.buf with one
  '%s' placeholder per child.  Matching works level by level: this node's
  regex captures the text spanned by each child, and every child is then
  matched against its own span.

  Attributes set by the constructor:
    flags   -- re flags, shared by all nodes of the tree.
    name    -- the group name, or '_group<N>' for unnamed groups.
    action  -- optional callable applied to each match of a leaf group.
    mult    -- the '+', '*' or '?' that followed the group ('' if none).
    buf     -- %-template of this node's pattern (literal '%' doubled).
    group   -- list of child _Group objects, in pattern order.
    pattern -- text of the regex compiled for this node.
    pat     -- the compiled regex.
  """

  # Recognises text that is a single parenthesised group, optionally in
  # the form (?P<name>...) or (?P<name, action>...).  The action stops at
  # the first '>' so that nested named groups are not swallowed by it.
  _group_pat=re.compile(r'^\s*[(](?:[?]P<(?P<name>\w+)(,\s*(?P<action>\w[^>]*))?>|)(?P<pattern>.*)[)]\s*$', re.DOTALL)

  def __init__(self, buf, flags=0, group_no=0, mult=''):
    """Parse buf and compile the regex for this node and its children.

    buf      -- pattern text; either one whole group or a sequence of
                literal text and groups.
    flags    -- re flags used for every regex in the tree.
    group_no -- index used to build the name of an unnamed group.
    mult     -- multiplier that followed the group in its parent.

    Raises Exception if the group names an action that cannot be found.
    """
    self.flags=flags
    self.name='_group%d' % group_no
    self.action=None
    self.mult=mult
    pattern=buf

    text=buf.strip()
    m=self._group_pat.match(buf)
    # _group_pat is greedy and would also accept '(a)(b)'; only unwrap the
    # outer parentheses when they really enclose the whole text and the
    # group is a capturing one.
    if m and not self._is_passthrough(text) and self._is_single_group(text):
      d=m.groupdict()
      if d['name']:
        self.name=d['name']
      if d['action']:
        self.action=self._find_action(d['action'], buf)
      pattern=d['pattern']

    self.buf, self.group=self._split(pattern)
    self._set_pattern()
  # __init__()

  def _find_action(self, name, buf):
    """Return the callable called name, or raise Exception if unknown.

    The name is looked up, in order, in the globals of the first calling
    frame outside this module, in the builtins, in the string module and
    finally among the methods of str.
    """
    depth=0
    try:
      while sys._getframe(depth).f_globals.get('__name__') == __name__:
        depth+=1
      scope=sys._getframe(depth).f_globals
    except ValueError:
      # Every frame on the stack belongs to this module.
      scope={}
    action=scope.get(name)
    if not action:
      builtins=string.__builtins__
      if not isinstance(builtins, dict):
        builtins=builtins.__dict__
      action=builtins.get(name)
    if not action:
      action=string.__dict__.get(name)
    if not action:
      # e.g. 'upper' where the string module no longer provides it.
      action=getattr(str, name, None)
    if not action:
      raise Exception('Unknown action %s in group %s ' % (repr(name), repr(buf)))
    return action
  # _find_action()

  def _scan(self, text):
    """Yield (char, role) for each character of text.

    role is '(' or ')' for a parenthesis that opens or closes a group, and
    '' for everything else: escaped characters, members of a [...]
    character class and ordinary text.
    """
    escape=False
    in_class=False
    at_head=False    # True while a ']' would still be a literal class member
    prev=''
    for c in text:
      role=''
      if escape:
        escape=False
      elif c == '\\':
        escape=True
        at_head=False
      elif in_class:
        if c == ']' and not at_head:
          in_class=False
        elif not (c == '^' and prev == '['):
          # anything but the negating '^' ends the head of the class
          at_head=False
      elif c == '[':
        in_class=True
        at_head=True
      elif c in ('(', ')'):
        role=c
      prev=c
      yield c, role
  # _scan()

  def _is_passthrough(self, item):
    """True for '(?...)' extension groups other than named groups.

    Such groups (non-capturing groups, lookarounds, comments, ...) are
    copied into the regex unchanged instead of becoming child groups.
    """
    return item[:2] == '(?' and item[:4] != '(?P<'
  # _is_passthrough()

  def _is_single_group(self, text):
    """True if the parenthesis opening text is closed by its last character."""
    depth=0
    last=len(text)-1
    for i, (c, role) in enumerate(self._scan(text)):
      if role == '(':
        depth=depth+1
      elif role == ')':
        depth=depth-1
        if depth <= 0:
          return depth == 0 and i == last
    return False
  # _is_single_group()

  def _render(self, item, mult, group):
    """Return the template text for one finished top-level group.

    Pass-through groups are returned literally; any other group is
    appended to group as a child _Group and represented by '%s'.
    """
    if self._is_passthrough(item):
      return item.replace('%', '%%')
    group.append(_Group(item, self.flags, len(group), mult))
    return '%s'
  # _render()

  def _split(self, pattern):
    """Split pattern into a %-template and its list of child groups.

    Returns (template, children).  The template holds the text outside the
    capturing groups with literal '%' doubled and one '%s' per child.  A
    '+', '*' or '?' directly after a capturing group is stored as that
    child's multiplier instead of being kept in the template.
    """
    group=[]
    new_buf=''
    item=''        # text of the group currently being collected
    depth=0        # parenthesis nesting level
    closed=False   # a top-level group ended on the previous character
    for c, role in self._scan(pattern):
      if closed:
        closed=False
        mult=''
        if c in ('+', '*', '?') and not self._is_passthrough(item):
          mult=c
        new_buf=new_buf+self._render(item, mult, group)
        if mult:
          continue
        # otherwise c is ordinary input and is handled below

      if depth == 0 and role != '(':
        new_buf=new_buf+c.replace('%', '%%')
        continue

      if depth == 0:
        item=''
      item=item+c
      if role == '(':
        depth=depth+1
      elif role == ')':
        depth=depth-1
        closed=(depth == 0)

    if closed:
      new_buf=new_buf+self._render(item, '', group)
    elif depth > 0:
      # Unterminated group: keep the text so that re reports the error.
      new_buf=new_buf+item.replace('%', '%%')
    return new_buf, group
  # split()

  def _set_pattern(self):
    """Build and compile the regex for this node."""
    self.pattern=self._get_pattern()
    self.pat=re.compile(self.pattern, self.flags)
  # _set_pattern()

  def _match(self, buf):
    """Match this node against buf and return the raw result.

    Returns None when nothing matches, otherwise {name: result}, where
    result is a list with one entry per match, or the bare entry when
    there is exactly one match.  For a node with children each entry is a
    dictionary holding the matched text under '_value' plus the results of
    the children; for a leaf it is the matched text, passed through the
    action if one was given.
    """
    found=[]
    for m in self.pat.findall(buf):       # Here's the call to regex
      if self.group:
        # m[0] is this group's own text; m[i+1] is the span of child i.
        d={'_value': m[0]}
        for i, g in enumerate(self.group):
          gg=g._match(m[i+1])
          if gg:
            d.update(gg)
        found.append(d)
      else:
        if self.action:
          m=self.action(m)
        found.append(m)

    if not found:
      return None
    if len(found) == 1:
      # A single match is returned bare rather than as a one-element list.
      return {self.name: found[0]}
    return {self.name: found}
  # _match()

  def _get_pattern(self):
    """Return the regex text for this node: one named group around the body."""
    return '(?P<%s>%s)' % (self.name, self._get_buf(True))
  # _get_pattern()

  def _get_buf(self, top=False):
    """Fill the template with the children's regex text.

    With top true each child is rendered as a capturing group, otherwise
    as a non-capturing one.
    """
    l=[]
    for g in self.group:
      l.append(g.__str__(top))
    return self.buf % tuple(l)
  # _get_buf()

  def extract(self, buf):
    """Match buf and return the compressed, objectified result."""
    d=self._match(buf)
    d2=_compress(d)
    return _objectify(d2)
  # extract()

  def __str__(self, top=False):
    """Return this group as regex text for inclusion in a parent pattern.

    The body is always non-capturing and followed by the multiplier; with
    top true the whole is wrapped in a capturing group so that the parent
    can recover the text this group spanned.
    """
    result='(?:%s)%s' % (self._get_buf(), self.mult)
    if top:
      result='(%s)' % result
    return result
  # __str__()

# _Group

class Group:
  """Helper class for extraction.

  Subclass it, set the class attributes below and call extract() on the
  class itself:

    pattern -- the pattern text.
    flags   -- flags handed on to the matcher.
    pat     -- cache for the compiled matcher; filled on first use.
  """

  pattern=''
  flags=0
  pat=None

  def extract(cls, buf):
    """Return the result of matching this class's pattern against buf.

    The pattern is compiled on first use and cached on the class.  The
    cache is looked up in the class's own namespace only, so a subclass
    that overrides pattern never reuses a matcher compiled for one of its
    base classes.
    """
    if cls.__dict__.get('pat') is None:
      cls.pat=_Group(cls.pattern, cls.flags)
    return cls.pat.extract(buf)
  # extract()
  extract=classmethod(extract)
  
# Group

# --------------  Unit Tests  -------------- #
import sys
# Prefer unittest2 when it is installed and fall back to the standard
# unittest module otherwise.  Only a failed import triggers the fallback;
# any other error is allowed to surface.
USING_UNITTEST2=False
try:
  import unittest2 as unittest
  USING_UNITTEST2=True
except ImportError:
  import unittest

class _Group_UnitTest(unittest.TestCase):
  """Unit tests for compile() and extract().

  setUp() parses a handful of sample texts; each test then checks one
  value of the extracted objects.
  """

  def setUp(self):
    # reg1 and reg2 describe the same paragraph/sentence/word structure,
    # once in verbose form (going down to single letters) and once compact.
    reg1=r"""
      (?P<paragraph>
        (
          (?P<sentence>
            (
              (?P<word>
                (?P<letter>\w)+
              )
              [ \n]*
            )+
            [.]
          )
          [\n]?
          (?:[ ][ ])?
        )+
      )+"""
    reg2=r"""(?P<paragraph>((?P<sentence>((?P<word>\w+)[ \n]*)+[.])[\n]?(?:[ ][ ])?)+)+"""
    buf="""\
This is the first groovy sentence of the first paragraph.
This is the second funky sentence.  And this is the stylish third.

THiS is tHE FIrst uGLy seNTenCe Of tHe sEcond PaRAgrapH.  THiS iS the cruddy lASt
sEnTEnCe Of ThE SeconD pARAgraph."""
    g1=compile(reg1, re.VERBOSE)
    self.m1=g1.extract(buf)
    g2=compile(reg2)
    self.m2=g2.extract(buf)

    # A number followed by a comma separated list of words.
    reg3=r"""
      (?P<number>[0-9]+)
      (
        (?P<word>[a-z]+)
        (,)?
      )+"""
    buf3='34cat,dog,fish'
    g3=compile(reg3, re.VERBOSE)
    self.m3=g3.extract(buf3)    

    reg4=r"""(?P<number>[0-9]+)((?P<word>[a-z]+)(?:,)?)+"""
    g4=compile(reg4)
    self.m4=g4.extract(buf3)    

    # Repeated named groups; reg6 additionally converts the values with
    # the actions named in the groups.
    buf5='12 drummers drumming, 11 pipers piping, 10 lords a-leaping'
    reg5=r'(?P<verse>(?P<number>\d+) (?P<activity>[^,]+))(,)?'
    self.m5=extract(reg5, buf5)

    buf6='12 drummers drumming, 11.01 pipers piping, 10 lords a-leaping'
    reg6=r'(?P<verse>(?P<number, float>\d+.?\d*) (?P<activity, upper>[^,]+))(,)?'
    self.m6=extract(reg6, buf6)

  # setUp()

  def test01(self): v=self.m1.paragraph[0].sentence[0].word[4].letter[2]; assert v == 'o', v
  def test02(self): v=self.m1.paragraph[0].sentence[1].word[4]._value; assert v == 'funky', v
  def test03(self): v=self.m1.paragraph[0].sentence[2].word[4]._value; assert v == 'stylish', v
  def test04(self): v=self.m1.paragraph[1].sentence[0]._value; assert v == 'THiS is tHE FIrst uGLy seNTenCe Of tHe sEcond PaRAgrapH.', v
  def test05(self): v=self.m2.paragraph[0].sentence[2].word[4]; assert v == 'stylish', v
  def test06(self): v=self.m2.paragraph[1].sentence[0]._value; assert v == 'THiS is tHE FIrst uGLy seNTenCe Of tHe sEcond PaRAgrapH.', v
  def test07(self): v=self.m3.number; assert v == '34', v
  def test08(self): v=self.m3.word; assert v == ['cat', 'dog', 'fish'], v
  def test09(self): v=self.m4.number; assert v == '34', v
  def test10(self): v=self.m4.word; assert v == ['cat', 'dog', 'fish'], v
  # Membership is tested with 'in'; dict.has_key() is not available everywhere.
  def test11(self): v='word' in self.m4; assert v == True, v
  # The keys are sorted so the check does not depend on dictionary order.
  def test12(self): v=sorted(self.m4.keys()); assert v == ['number', 'word'], v
  def test13(self): v=self.m5.verse[0]; assert v == {'activity': 'drummers drumming', 'number': '12'}, v
  def test14(self): v=self.m5.verse[0].activity; assert v == 'drummers drumming', v
  def test15(self): v=self.m5.verse[0].number; assert v == '12', v
  def test16(self): v=dict(self.m5); assert v == {'verse': [{'number': '12', 'activity': 'drummers drumming'}, {'number': '11', 'activity': 'pipers piping'}, {'number': '10', 'activity': 'lords a-leaping'}]}, v
  # The string form is only checked when yaml is available.
  if USING_YAML:
    def test17(self): v=str(self.m5); assert v == "---\nverse:\n    -\n        activity: drummers drumming\n        number: '12'\n    -\n        activity: pipers piping\n        number: '11'\n    -\n        activity: lords a-leaping\n        number: '10'\n", v 
  def test18(self): v=dict(self.m6); assert v == {'verse': [{'number': 12.0, 'activity': 'DRUMMERS DRUMMING'}, {'number': 11.01, 'activity': 'PIPERS PIPING'}, {'number': 10.0, 'activity': 'LORDS A-LEAPING'}]}, v
  if USING_YAML:
    def test19(self): v=str(self.m6); assert v == "---\nverse:\n    -\n        activity: DRUMMERS DRUMMING\n        number: 12.0\n    -\n        activity: PIPERS PIPING\n        number: 11.01\n    -\n        activity: LORDS A-LEAPING\n        number: 10.0\n", v 

# Run the unit tests only when this file is executed as a script.  Merely
# importing the module must never start the test runner, whichever
# unittest implementation is installed.
if __name__ == '__main__':
  unittest.main()
# --------------  Unit Tests  -------------- #
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.