#!/usr/bin/env python
# -----------------------------------------------------------------------
# Copyright (C) 2003 Chris Ottrey.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
# -----------------------------------------------------------------------
#
# This code is part of the pytvgrab project:
# http://pytvgrab.sourceforge.net
#
# -----------------------------------------------------------------------
# Subversion Information, do not edit
#
# $Rev: 246 $
# $LastChangedDate: 2004-10-14 03:16:23 +1000 (Thu, 14 Oct 2004) $
# $LastChangedRevision: 246 $
# $LastChangedBy: gustavo $
#
# $Log: $
#
# TODO - Currently the following happens...
#
# >>> re2.extract('(?P<word>\w+)', 'hello')
# {'word': 'hello'}
# >>> re2.extract('(?P<word>\w+)+', 'hello')
# {'word': 'hello'}
# >>> re2.extract('(?P<word>\w+)*', 'hello')
# {'word': 'hello'}
# >>> re2.extract('(?P<a>(?P<number>\d+)(?P<word>\w+))', '34cat')
# {'a': {'word': 'cat', 'number': '34'}}
# >>> re2.extract('(?P<a>(?P<number>\d+)(?P<word>\w+)+)', '34cat')
# {'a': {'word': 'cat', 'number': '34'}}
#
# ... I'd like the output to instead be:
#
# >>> re2.extract('(?P<word>\w+)', 'hello there')
# {'word': 'hello'}
# >>> re2.extract('(?P<word>\w+)+', 'hello')
# {'word': ['hello']}
# >>> re2.extract('(?P<word>\w+)*', 'hello')
# {'word': ['hello']}
# >>> re2.extract('(?P<a>(?P<number>\d+)(?P<word>\w+))', '34cat')
# {'a': {'word': 'cat', 'number': '34'}}
# >>> re2.extract('(?P<a>(?P<number>\d+)(?P<word>\w+)+)', '34cat')
# {'a': {'word': ['cat'], 'number': '34'}}
#
# ie. ~hopefully~ that will allow lists to be returned by the '+' and '*' pattern suffixes.
#
# To do this: have a look at compress_list and Group.pattern[-1]
"""\
Compile a recursively matching regular expression and use it to
extract a hierarchical object from a string.
Usage:
>>> import re2
>>> pat=re2.compile('(?P<number>\d+)((?P<word>\w+)(,)?)+')
>>> match=pat.extract('34cat,dog,fish')
>>> match
{'word': ['cat', 'dog', 'fish'], 'number': '34'}
>>> match.number
'34'
>>> match.word
['cat', 'dog', 'fish']
>>> print match
---
number: '34'
word:
- cat
- dog
- fish
>>> pat='(?P<verse>(?P<number, float>\d+.?\d*) (?P<activity, upper>[^,]+))(,)?'
>>> buf='12 drummers drumming, 11.01 pipers piping, 10 lords a-leaping'
>>> re2.extract(pat, buf)
{'verse': [{'number': 12.0, 'activity': 'DRUMMERS DRUMMING'}, {'number': 11.01, 'activity': 'PIPERS PIPING'}, {'number': 10.0, 'activity': 'LORDS A-LEAPING'}]}
>>> print re2.extract(pat, buf)
---
verse:
-
activity: DRUMMERS DRUMMING
number: 12.0
-
activity: PIPERS PIPING
number: 11.01
-
activity: LORDS A-LEAPING
number: 10.0
>>> buf='12 drummers drumming, 11.01 pipers piping, 10 lords a-leaping'
>>> def exaggerate(n): return 3*float(n)
...
>>> pat='(?P<verse>(?P<number, exaggerate>\d+.?\d*) (?P<activity, upper>[^,]+))(,)?'
>>> print re2.extract(pat, buf)
---
verse:
-
activity: DRUMMERS DRUMMING
number: 36.0
-
activity: PIPERS PIPING
number: 33.03
-
activity: LORDS A-LEAPING
number: 30.0
Note: See unit test cases for more examples.
"""
import re
import string
import sys
USING_YAML=False
try:
import yaml
USING_YAML=True
except ImportError:
pass
# Regular-expression flags, re-exported from the standard ``re`` module
# under both their long and their one-letter names.
IGNORECASE = re.IGNORECASE  # ignore case
I = IGNORECASE
LOCALE = re.LOCALE  # assume current 8-bit locale
L = LOCALE
UNICODE = re.UNICODE  # assume unicode locale
U = UNICODE
MULTILINE = re.MULTILINE  # make anchors look for newline
M = MULTILINE
DOTALL = re.DOTALL  # make dot match newline
S = DOTALL
VERBOSE = re.VERBOSE  # ignore whitespace and comments
X = VERBOSE
# Names exported by ``from <module> import *``: only the two module-level
# entry points are listed.  The flag constants and the Group helper class
# are not exported this way and must be reached through the module object.
__all__=['compile', 'extract']
def compile(pattern, flags=0):
    """Build a reusable extractor for *pattern*.

    *pattern* is the (possibly nested) regular expression and *flags* are
    the regular-expression flags to compile it with.  The returned
    ``_Group`` object provides ``extract(buf)``.
    """
    compiled = _Group(pattern, flags)
    return compiled
# compile()
def extract(pattern, buf, flags=0):
    """Compile *pattern* with *flags* and extract from *buf* in one step.

    Returns whatever ``_Group.extract`` returns for *buf*.
    """
    group = _Group(pattern, flags)
    return group.extract(buf)
# extract()
class _Match(dict):
    """Dictionary whose entries are also stored as instance attributes.

    Instances are built by _objectify, so a result can be read either as
    match['attribute'] or as match.attribute.
    """

    def __setattr__(self, attr, value):
        # The attribute is always recorded on the instance itself ...
        self.__dict__[attr] = value
        # ... and mirrored as a dictionary item, except for '_value', which
        # has to stay out of _Match.keys().
        if attr == '_value':
            return
        self[attr] = value
    # __setattr__()

    def __repr__(self):
        # Render like a plain dict: {key: value, key: value}
        pairs = ['%s: %s' % (repr(key), repr(self[key])) for key in self.keys()]
        return '{%s}' % ', '.join(pairs)
    # __repr__()

    def __str__(self):
        # Use the YAML dump when yaml could be imported; otherwise warn on
        # stderr and fall back to the repr() form.
        if USING_YAML:
            return yaml.dump(self)
        sys.stderr.write("""
WARNING: The following string representing the matched object will be much
easier to debug if you first install yaml!
python yaml is available from http://python.yaml.org (2004/08/27)
""")
        return repr(self)
    # __str__()
# _Match
def _objectify(d):
    """Convert a hierarchical dictionary into nested _Match objects.

    Plain dicts become _Match instances (so match.attribute works as well
    as match['attribute']), plain lists are rebuilt element by element and
    every other value is returned unchanged.
    """
    kind = type(d)
    if kind is dict:
        match = _Match()
        for key in d.keys():
            # Goes through _Match.__setattr__, which keeps '_value' out of
            # the dictionary keys.
            setattr(match, key, _objectify(d[key]))
        return match
    if kind is list:
        return [_objectify(entry) for entry in d]
    return d
# _objectify()
def _compress(d):
    """Compress a hierarchical dictionary returned by _Group._match(),
    so that any '_group*' key is replaced by its normalized contents.

    Eg.
      {'d': {'_group1': [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]}}
    will become:
      {'d': {'a': [1, 2], 'b': ['x', 'y']}}

    Dictionaries and lists are processed recursively; any other value is
    returned unchanged.  A one-element list collapses to its element, and a
    two-element list whose second element is the empty string collapses to
    its (compressed) first element.
    """
    def is_anonymous(key):
        """True for the automatically generated '_group<N>' keys."""
        return key[:6] == '_group'
    # is_anonymous()

    def get_keys(entries):
        """Collect the keys of all dictionaries in a list.

        This is here in case some keys don't appear in all groups in a list.
        """
        merged = {}
        for entry in entries:
            if type(entry) is dict:
                merged.update(entry)
        return merged.keys()
    # get_keys()

    def compress_list(items):
        result = [_compress(entry) for entry in items]
        if len(result) == 1:
            result = result[0]
        elif len(result) == 2 and items[1] == '':
            # Drop the trailing empty match.  The *compressed* first element
            # is returned, so no '_group*' key or nested ['x', ''] list can
            # escape into the output.
            result = result[0]
        return result
    # compress_list()

    def compress_dict(mapping):
        result = {}
        for key in mapping.keys():
            value = mapping[key]
            if not is_anonymous(key):
                result[key] = _compress(value)
            elif type(value) is dict:
                # shuffle everything up.
                result.update(compress_dict(value))
            elif type(value) is list:
                for name in get_keys(value):
                    if name == '_value':  # the _value key is not propagated.
                        continue
                    # Entries that are not dictionaries, or that lack this
                    # key, contribute nothing.
                    collected = [entry[name] for entry in value
                                 if type(entry) is dict and name in entry]
                    result[name] = compress_list(collected)
            # Any other value stored under an anonymous key is dropped.
        # Keep compressing until no '_group*' keys are left.  Anonymous keys
        # can only reappear here when they were lifted out of a list one
        # level further down, so every extra pass works on shallower data.
        for key in result.keys():
            if is_anonymous(key):
                result = compress_dict(result)
                break
        return result
    # compress_dict()

    if type(d) is dict:
        return compress_dict(d)
    if type(d) is list:
        return compress_list(d)
    return d
# _compress
class _Group:
    """One node of a recursively matching regular expression.

    The pattern text is split into its top-level parenthesised groups.  Each
    group becomes a child _Group; the text in between is kept as a
    %-template (self.buf) with one '%s' per child.  Constructs that start
    with '(?' but are not named groups ('(?:...)', look-arounds, ...) are
    copied into the template unchanged.

    Attributes set by __init__:
      flags   -- flags passed on to re.compile
      name    -- the '(?P<name>...)' name, or '_group<N>' for unnamed groups
      action  -- optional callable applied to every matched string of a
                 group without sub-groups ('(?P<name, action>...)')
      mult    -- the '+', '*' or '?' that followed the group, or ''
      buf     -- the %-template of this group's own pattern
      group   -- list of child _Group objects, in pattern order
      pattern -- the full regular expression text for this group
      pat     -- the compiled form of pattern

    NB. A capturing group nested inside a pass-through construct is not
    accounted for by _match(), which assumes one capturing group per child.
    """
    # Recognises a pattern written as one parenthesised group and captures
    # its optional name, its optional action and the text inside the
    # parentheses.  The action stops at the first '>' so that a '>' further
    # on in the pattern is never mistaken for part of it.
    _group_pat=re.compile(r'^\s*[(](?:[?]P<(?P<name>\w+)(,\s*(?P<action>\w[^>]*))?>|)(?P<pattern>.*)[)]\s*$', re.DOTALL)

    def __init__(self, buf, flags=0, group_no=0, mult=''):
        """Parse *buf* and compile the resulting pattern.

        buf      -- pattern text, either a single group or a sequence
        flags    -- regular-expression flags
        group_no -- number used to build the default '_group<N>' name
        mult     -- multiplier character that followed the group, if any

        Raises Exception when a named action cannot be found, and whatever
        re.compile raises for an invalid pattern.
        """
        self.flags = flags
        self.name = '_group%d' % group_no
        self.action = None
        self.mult = mult
        pattern = buf
        stripped = buf.strip()
        m = self._group_pat.match(buf)
        # The regular expression alone also accepts '(a)(b)' and '(?:a)'.
        # Only unwrap when the first parenthesis really closes on the last
        # character and the group is not a pass-through construct.
        if m and not self._is_passthrough(stripped) and self._is_single_group(stripped):
            d = m.groupdict()
            if d['name']:
                self.name = d['name']
            if d['action']:
                self.action = self._find_action(d['action'].strip(), buf)
            pattern = d['pattern']
        self.buf, self.group = self._split(pattern)
        self._set_pattern()
    # __init__()

    def _find_action(self, name, buf):
        """Return the callable called *name*, or raise Exception.

        Looked up, in this order: the globals of the first calling frame
        outside this module, the builtins, the string module and finally
        the methods of str.
        """
        depth = 0
        try:
            while sys._getframe(depth).f_globals.get('__name__') == __name__:
                depth += 1
            caller = sys._getframe(depth).f_globals
        except ValueError:
            # Every frame on the stack belongs to this module.
            caller = globals()
        # Try getting the action from the calling frame.
        action = caller.get(name)
        # If that doesn't work try getting the action from __builtins__
        if not action:
            namespace = string.__builtins__
            if not isinstance(namespace, dict):
                namespace = namespace.__dict__
            action = namespace.get(name)
        # If that doesn't work try getting the action from the string module
        if not action:
            action = string.__dict__.get(name)
        # The string module lost its function forms (string.upper, ...) in
        # Python 3; the str methods accept the same single argument.
        if not action:
            action = getattr(str, name, None)
        if not action:
            raise Exception('Unknown action %s in group %s ' % (repr(name), repr(buf)))
        return action
    # _find_action()

    def _scan(self, text):
        """Yield (char, delta) for every character of *text*.

        delta is +1 for a parenthesis that opens a group, -1 for one that
        closes a group and 0 otherwise.  Parentheses that are escaped with a
        backslash or that sit inside a [...] character class yield 0.
        """
        escape = False
        square = False
        first = 0  # 2: just after '[', 1: just after '[^', 0: elsewhere
        for c in text:
            delta = 0
            if escape:
                escape = False
                first = 0
            elif c == '\\':
                escape = True
            elif square:
                if c == ']' and not first:
                    square = False
                elif c == '^' and first == 2:
                    first = 1
                else:
                    # Includes a ']' placed first in the class, which is a
                    # literal and does not close it.
                    first = 0
            elif c == '[':
                square = True
                first = 2
            elif c == '(':
                delta = 1
            elif c == ')':
                delta = -1
            yield c, delta
    # _scan()

    def _is_single_group(self, text):
        """True when the group opened by text[0] closes on the last char."""
        depth = 0
        index = 0
        last = len(text) - 1
        for c, delta in self._scan(text):
            depth += delta
            if depth <= 0:
                return index == last
            index += 1
        return False
    # _is_single_group()

    def _is_passthrough(self, text):
        """True for '(?...)' constructs other than named '(?P<...' groups."""
        return text[:2] == '(?' and text[:4] != '(?P<'
    # _is_passthrough()

    def _emit(self, text, groups, mult):
        """Return the template text for one finished top-level group.

        A pass-through construct is returned verbatim (with '%' doubled for
        the template); anything else is appended to *groups* as a child
        _Group and represented by '%s'.
        """
        if self._is_passthrough(text):
            return text.replace('%', '%%')
        groups.append(_Group(text, self.flags, len(groups), mult))
        return '%s'
    # _emit()

    def _split(self, pattern):
        """Split *pattern* into a %-template and its top-level groups.

        Returns (template, groups).  A '+', '*' or '?' directly after a
        group is stored as that group's multiplier and left out of the
        template.  Any other character after a group is processed like
        normal pattern text.
        """
        template = ''
        groups = []
        item = ''       # text of the group currently being collected
        depth = 0
        pending = None  # finished group waiting for the character after it
        for c, delta in self._scan(pattern):
            if pending is not None:
                mult = ''
                if c in ('+', '*', '?') and not self._is_passthrough(pending):
                    mult = c
                template += self._emit(pending, groups, mult)
                pending = None
                if mult:
                    continue
            if depth == 0 and delta <= 0:
                # Literal text between groups.  An unmatched ')' ends up
                # here as well and is left for re.compile to report.
                if c == '%':
                    template += '%%'
                else:
                    template += c
                continue
            item += c
            depth += delta
            if depth == 0:
                pending = item
                item = ''
        if pending is not None:
            template += self._emit(pending, groups, '')
        elif item:
            # Unterminated group: keep its text so that re.compile reports
            # the error instead of part of the pattern silently vanishing.
            template += item.replace('%', '%%')
        return template, groups
    # split()

    def _set_pattern(self):
        """Build self.pattern and compile it into self.pat."""
        self.pattern = self._get_pattern()
        self.pat = re.compile(self.pattern, self.flags)
    # _set_pattern()

    def _match(self, buf):
        """Match *buf* and return {self.name: result}, or None.

        For a group with children every match becomes a dictionary holding
        the matched text under '_value' plus the results of the children.
        For a group without children every match is the matched string,
        passed through self.action when one is set.  A single match is
        stored directly; several matches are stored as a list.
        """
        found = []
        if self.group:
            for m in self.pat.findall(buf):
                # m[0] is this group's own text; m[1:] are the texts of the
                # children, one capturing group each.
                d = {'_value': m[0]}
                index = 1
                for g in self.group:
                    gg = g._match(m[index])
                    if gg:
                        d.update(gg)
                    index += 1
                found.append(d)
        else:
            for m in self.pat.findall(buf):  # Here's the call to regex
                if self.action:
                    m = self.action(m)
                found.append(m)
        if not found:
            return None
        if len(found) == 1:
            # A single match is not wrapped in a list.
            return {self.name: found[0]}
        return {self.name: found}
    # _match()

    def _get_pattern(self):
        """Return the pattern text, wrapped in a group named self.name."""
        return '(?P<%s>%s)' % (self.name, self._get_buf(True))
    # _get_pattern()

    def _get_buf(self, top=False):
        """Fill the template with the children's pattern text.

        With *top* true every child is rendered as a capturing group.
        """
        l = []
        for g in self.group:
            l.append(g.__str__(top))
        return self.buf % tuple(l)
    # _get_buf()

    def extract(self, buf):
        """Match *buf* and return the compressed, objectified result."""
        d = self._match(buf)
        d2 = _compress(d)
        return _objectify(d2)
    # extract()

    def __str__(self, top=False):
        """Render this group, including its multiplier, as pattern text.

        With *top* true the group is capturing and its children are not;
        otherwise the whole rendering is non-capturing.
        """
        if top:
            result = '((?:%s)%s)' % (self._get_buf(), self.mult)
        else:
            buf = self.buf % tuple(self.group)
            result = '(?:%s)%s' % (buf, self.mult)
        return result
    # __str__()
# _Group
class Group:
    """Helper class for extraction.

    Subclass it, set the class attributes ``pattern`` (and optionally
    ``flags``) and call ``Subclass.extract(buf)``.  The pattern is compiled
    on first use and cached on the class in ``pat``.
    """
    pattern=''
    flags=0
    pat=None

    def extract(cls, buf):
        """Extract from *buf* using this class's pattern.

        The compiled pattern is looked up in the class's own namespace, not
        through inheritance.  A subclass that defines a different pattern
        therefore never reuses the pattern compiled for one of its bases.
        """
        compiled = cls.__dict__.get('pat')
        if not compiled:
            compiled = _Group(cls.pattern, cls.flags)
            cls.pat = compiled
        return compiled.extract(buf)
    # extract()
    extract=classmethod(extract)
# Group
# -------------- Unit Tests -------------- #
import sys
# Prefer the optional unittest2 module and remember whether it was found;
# the flag is consulted again when deciding whether to run the tests.
USING_UNITTEST2=False
try:
    import unittest2 as unittest
    USING_UNITTEST2=True
except Exception:
    # Any failure while importing the optional module falls back to the
    # standard library.  Unlike a bare ``except:``, this lets
    # KeyboardInterrupt and SystemExit through on interpreters where they
    # do not derive from Exception.
    import unittest
class _Group_UnitTest(unittest.TestCase):
    """Unit tests for compile() and extract().

    setUp() runs six extractions (m1..m6); each test method checks one value
    of one of those results.
    NOTE(review): the fixture strings depend on exact whitespace -- confirm
    them against the upstream file if any of test01..test06 fail.
    """

    def setUp(self):
        # m1: verbose pattern with paragraph / sentence / word / letter levels.
        reg1=r"""
(?P<paragraph>
(
(?P<sentence>
(
(?P<word>
(?P<letter>\w)+
)
[ \n]*
)+
[.]
)
[\n]?
(?:[ ][ ])?
)+
)+"""
        # m2: the same structure on one line, with words as leaf groups.
        reg2=r"""(?P<paragraph>((?P<sentence>((?P<word>\w+)[ \n]*)+[.])[\n]?(?:[ ][ ])?)+)+"""
        buf="""\
This is the first groovy sentence of the first paragraph.
This is the second funky sentence. And this is the stylish third.
THiS is tHE FIrst uGLy seNTenCe Of tHe sEcond PaRAgrapH. THiS iS the cruddy lASt
sEnTEnCe Of ThE SeconD pARAgraph."""
        g1=compile(reg1, re.VERBOSE)
        self.m1=g1.extract(buf)
        g2=compile(reg2)
        self.m2=g2.extract(buf)
        # m3 / m4: a number followed by a comma separated list of words.
        reg3=r"""
(?P<number>[0-9]+)
(
(?P<word>[a-z]+)
(,)?
)+"""
        buf3='34cat,dog,fish'
        g3=compile(reg3, re.VERBOSE)
        self.m3=g3.extract(buf3)
        reg4=r"""(?P<number>[0-9]+)((?P<word>[a-z]+)(?:,)?)+"""
        g4=compile(reg4)
        self.m4=g4.extract(buf3)
        # m5: repeated named group, values left as strings.
        buf5='12 drummers drumming, 11 pipers piping, 10 lords a-leaping'
        reg5=r'(?P<verse>(?P<number>\d+) (?P<activity>[^,]+))(,)?'
        self.m5=extract(reg5, buf5)
        # m6: as m5, with 'float' and 'upper' actions applied to the values.
        buf6='12 drummers drumming, 11.01 pipers piping, 10 lords a-leaping'
        reg6=r'(?P<verse>(?P<number, float>\d+.?\d*) (?P<activity, upper>[^,]+))(,)?'
        self.m6=extract(reg6, buf6)
    # setUp()

    def test01(self): v=self.m1.paragraph[0].sentence[0].word[4].letter[2]; assert v == 'o', v
    def test02(self): v=self.m1.paragraph[0].sentence[1].word[4]._value; assert v == 'funky', v
    def test03(self): v=self.m1.paragraph[0].sentence[2].word[4]._value; assert v == 'stylish', v
    def test04(self): v=self.m1.paragraph[1].sentence[0]._value; assert v == 'THiS is tHE FIrst uGLy seNTenCe Of tHe sEcond PaRAgrapH.', v
    def test05(self): v=self.m2.paragraph[0].sentence[2].word[4]; assert v == 'stylish', v
    def test06(self): v=self.m2.paragraph[1].sentence[0]._value; assert v == 'THiS is tHE FIrst uGLy seNTenCe Of tHe sEcond PaRAgrapH.', v
    def test07(self): v=self.m3.number; assert v == '34', v
    def test08(self): v=self.m3.word; assert v == ['cat', 'dog', 'fish'], v
    def test09(self): v=self.m4.number; assert v == '34', v
    def test10(self): v=self.m4.word; assert v == ['cat', 'dog', 'fish'], v
    # 'in' replaces dict.has_key(), which newer interpreters no longer have.
    def test11(self): v='word' in self.m4; assert v == True, v
    # The keys are sorted first: the order of dictionary keys is not part of
    # the contract being tested.
    def test12(self): v=list(self.m4.keys()); v.sort(); assert v == ['number', 'word'], v
    def test13(self): v=self.m5.verse[0]; assert v == {'activity': 'drummers drumming', 'number': '12'}, v
    def test14(self): v=self.m5.verse[0].activity; assert v == 'drummers drumming', v
    def test15(self): v=self.m5.verse[0].number; assert v == '12', v
    def test16(self): v=dict(self.m5); assert v == {'verse': [{'number': '12', 'activity': 'drummers drumming'}, {'number': '11', 'activity': 'pipers piping'}, {'number': '10', 'activity': 'lords a-leaping'}]}, v
    # The YAML renderings can only be checked when yaml could be imported.
    if USING_YAML:
        def test17(self): v=str(self.m5); assert v == "---\nverse:\n -\n activity: drummers drumming\n number: '12'\n -\n activity: pipers piping\n number: '11'\n -\n activity: lords a-leaping\n number: '10'\n", v
    def test18(self): v=dict(self.m6); assert v == {'verse': [{'number': 12.0, 'activity': 'DRUMMERS DRUMMING'}, {'number': 11.01, 'activity': 'PIPERS PIPING'}, {'number': 10.0, 'activity': 'LORDS A-LEAPING'}]}, v
    if USING_YAML:
        def test19(self): v=str(self.m6); assert v == "---\nverse:\n -\n activity: DRUMMERS DRUMMING\n number: 12.0\n -\n activity: PIPERS PIPING\n number: 11.01\n -\n activity: LORDS A-LEAPING\n number: 10.0\n", v
# Run the unit tests when the file is executed as a script, and also
# whenever unittest2 was importable.
# NOTE(review): the second condition runs unittest.main() on a plain import
# of this module if unittest2 is available -- confirm that this is intended.
if __name__ == '__main__' or USING_UNITTEST2:
    unittest.main()
# -------------- Unit Tests -------------- #
|