#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
import sys
import string
from copy import copy
from tvgrab.grabexceptions import ParseError
import i18n
import re
class MalformedDataError(Exception):
    """Exception type for malformed input data.

    Both attributes default to ``None`` at class level, so they can be read
    on any instance even when nothing was assigned.
    """

    # presumably the offending data -- TODO confirm against callers
    contents = None
    # presumably an underlying exception that caused this one -- TODO confirm
    exception = None
# MalformedDataError
def countInString(match, text):
    """
    Count the non-overlapping occurrences of a regular expression in a text.

    match -- compiled regular expression object
    text  -- string to scan

    Returns the number of matches found while scanning ``text`` from left to
    right. ``finditer`` is used so that patterns able to match the empty
    string always advance; a manual ``search(text, m.end(0))`` loop would
    restart at the same position forever on a zero-width match.
    """
    return sum(1 for _m in match.finditer(text))
def balanceTags(re_starttag, starttag, re_endtag, endtag, input):
    """
    Brutally balance tags by prepending start tags or appending end tags.

    re_starttag -- compiled regular expression matching a start tag
    starttag    -- literal start tag text used for padding
    re_endtag   -- compiled regular expression matching an end tag
    endtag      -- literal end tag text used for padding
    input       -- text to balance

    Returns ``input`` with ``endtag`` appended once per surplus start tag, or
    ``starttag`` prepended once per surplus end tag. Only the counts are
    compared; the nesting order of the tags is not examined.

    After padding, the tags are counted again with the same regular
    expressions that were passed in. If they still differ (for example when
    ``starttag`` is not matched by ``re_starttag``) the message
    "error in balanceTags" is printed to standard output.
    """
    opened = countInString(re_starttag, input)
    closed = countInString(re_endtag, input)
    if opened > closed:
        input = input + endtag * (opened - closed)
    elif opened < closed:
        input = starttag * (closed - opened) + input

    # Verify with the caller's own patterns, not a hard-coded tag name.
    if countInString(re_starttag, input) != countInString(re_endtag, input):
        # Parenthesised single-argument form works as a py2 statement and a py3 call.
        print("error in balanceTags")
    return input
def findExtent(lines, startIndex, reStart, reStop):
    """
    Return the extent of the scope that opens at ``startIndex``.

    lines      -- text to scan (a single string)
    startIndex -- offset at which a ``reStart`` match is expected to begin
    reStart    -- compiled regular expression matching an opening tag
    reStop     -- compiled regular expression matching a closing tag

    Returns the slice of ``lines`` from ``startIndex`` up to and including
    the closing tag that balances the opening one, taking nested scopes into
    account. If the end of the text is reached before the tags balance, the
    rest of the text from ``startIndex`` on is returned. If ``reStart`` does
    not match exactly at ``startIndex``, an empty slice is returned.
    """
    opener = reStart.search(lines, startIndex)
    if opener is None or opener.start(0) != startIndex:
        # NOTE(review): the previous code had `assert "<message>"` here, which
        # can never fail; the empty result it fell through to is kept.
        return lines[startIndex:startIndex]

    pos = opener.end(0)
    total = len(lines)
    depth = 1  # opening tags seen minus closing tags seen
    while pos < total and depth != 0:
        nextOpen = reStart.search(lines, pos)
        nextClose = reStop.search(lines, pos)
        if nextOpen is None and nextClose is None:
            # No more tags: give up and let the EOF fallback below apply
            # instead of dereferencing a missing match.
            break
        if nextClose is None or (
                nextOpen is not None
                and nextOpen.start(0) < nextClose.start(0)):
            pos = nextOpen.end(0)
            depth += 1
        else:
            pos = nextClose.end(0)
            depth -= 1

    if depth == 0:
        return lines[startIndex:pos]
    # Found EOF before the tags were balanced.
    return lines[startIndex:]
def blankHtml(lines, reStart, reStop, reTrigger):
    """
    Blank out the innermost tag sections that match a trigger.

    lines     -- html text (a single string)
    reStart   -- compiled regular expression matching an opening tag
    reStop    -- compiled regular expression matching a closing tag
    reTrigger -- compiled regular expression selecting sections to blank

    The text is scanned for sections of the form
        reStart.*(<reStart>.*<reStop>)*.*reStop
    Sections are examined as they close, so inner sections come before the
    sections enclosing them. A section in which reTrigger matches is replaced
    by the same number of spaces; the length of the text never changes.

    Example:
        reStart="<a", reStop="</a>", reTrigger="winner"
        txt = "<a>bla bla<a>bla</a><a>a winner</a></a>"
    returns "<a>bla bla<a>bla</a>" followed by 15 spaces (the length of
    "<a>a winner</a>") and the final "</a>".

    If reTrigger still matches somewhere once every closed section has been
    handled (for instance outside all sections), a string of spaces as long
    as the input is returned.

    Unbalanced input is tolerated: opening tags that are never closed are
    left alone, and closing tags without a pending opening tag are skipped.
    """
    openStarts = []  # offsets of opening tags that are not closed yet
    index = 0
    length = len(lines)
    while index < length:
        nextStart = reStart.search(lines, index)
        nextEnd = reStop.search(lines, index)
        if nextStart is None and nextEnd is None:
            break

        if nextEnd is None or (
                nextStart is not None
                and nextStart.start(0) < nextEnd.start(0)):
            openStarts.append(nextStart.start(0))
            # Always move past the tag, also when no closing tag follows;
            # otherwise the same opening tag would be found again forever.
            index = nextStart.end(0)
            continue

        index = nextEnd.end(0)
        if not openStarts:
            # Stray closing tag: nothing to pair it with.
            continue
        begin = openStarts.pop()
        if reTrigger.search(lines, begin, index):
            lines = lines[:begin] + " " * (index - begin) + lines[index:]

    if reTrigger.search(lines):
        return " " * length
    return lines
def searchHtmlDF(lines, reStart, reStop, reTrigger):
    """
    Depth-first variant of searchHtml: calls it with depth=-1.

    With that depth searchHtml returns the first section to close in which
    reTrigger matches, or "" when no section matches.
    """
    return searchHtml(lines, reStart, reStop, reTrigger, depth=-1)
def searchHtmlBF(lines, reStart, reStop, reTrigger):
    """
    Breadth-first variant of searchHtml: calls it with depth=1.

    With that depth searchHtml returns a section in which reTrigger matches
    once it closes at the top nesting level, or "" when there is none.
    """
    return searchHtml(lines, reStart, reStop, reTrigger, depth=1)
def searchHtml(lines, reStart, reStop, reTrigger, depth, matchOnRoot=False):
    """
    Search a html text for a tag section in which a trigger matches.

    The text is scanned top-down for sections of the form
        reStart.*(<reStart>.*<reStop>)*.*reStop
    and every section is tested against reTrigger at the moment it closes,
    so nested sections are tested before the sections enclosing them.

    lines       -- html text (a single string)
    reStart     -- compiled regular expression matching an opening tag
    reStop      -- compiled regular expression matching a closing tag
    reTrigger   -- compiled regular expression to look for inside a section
    depth       -- selects which matching section is returned:
                   a negative value -k returns the k-th section (counted in
                   closing order) in which reTrigger matches, so -1 gives the
                   first one to close;
                   a positive value k returns a matching section when it
                   closes at nesting level k, where 1 is the top level.
    matchOnRoot -- when true and no section was selected, the whole text is
                   returned if reTrigger matches anywhere in it.

    Returns the selected section including its tags, otherwise "" (or the
    whole text, see matchOnRoot).

    Example:
        reStart="<a", reStop="</a>", reTrigger="winner"
        txt = "<a>bla</a><a>bla bla<a>bla</a><a>a winner</a></a>"
    depth=-1 returns "<a>a winner</a>" and
    depth=1 returns "<a>bla bla<a>bla</a><a>a winner</a></a>".

    Raises ParseError (with ``contents`` set to the text) when an opening
    tag is left but no closing tag follows. A closing tag without a pending
    opening tag raises IndexError.
    """
    openStarts = []     # offsets of opening tags that are not closed yet
    pos = 0
    total = len(lines)
    startMatch = None
    endMatch = None
    hits = 0            # minus the number of matching sections seen so far

    while pos < total:
        # A cached match is reused for as long as it lies at or after pos.
        if startMatch is None or startMatch.start(0) < pos:
            startMatch = reStart.search(lines, pos)
        if endMatch is None or endMatch.start(0) < pos:
            endMatch = reStop.search(lines, pos)

        if endMatch is None:
            if startMatch is None:
                # No tags left at all.
                break
            # Un-balanced tags
            # `_` is not defined in this block; presumably a gettext-style
            # translation function installed globally -- TODO confirm.
            pe = ParseError( _( "%s could not be correctly parsed." ) % "Html" )
            pe.contents = lines
            raise pe

        if startMatch is not None and startMatch.start(0) < endMatch.start(0):
            openStarts.append(startMatch.start(0))
            pos = startMatch.end(0)
            continue

        # A section closes here.
        begin = openStarts[-1]
        del openStarts[-1]
        pos = endMatch.end(0)
        if reTrigger.search(lines, begin, pos):
            hits -= 1
            if depth == hits or depth == len(openStarts) + 1:
                return lines[begin:pos]
    # end while

    if matchOnRoot and reTrigger.search(lines):
        return lines
    return ""
# -------------- Unit Tests -------------- #
# Use the unittest2 package when it can be imported, otherwise fall back to
# the standard library unittest. Only a failed import triggers the fallback,
# so unrelated errors (or Ctrl-C) are no longer swallowed.
using_unittest2 = False
try:
    import unittest2 as unittest
    using_unittest2 = True
except ImportError:
    import unittest
class TagUtils_UnitTest(unittest.TestCase):
    """
    Tests for searchHtml, searchHtmlDF, searchHtmlBF and blankHtml.

    setUp runs every scenario against one well-formed fixture and stores the
    result as ``self.oN`` with the expected value as ``self.eN``; each test
    method compares one such pair.
    """

    def setUp(self):
        # Well-formed fixture. The indentation only illustrates the nesting;
        # all whitespace is removed before the text is used.
        self.i = """
        t011
        <d>
            t020t021
            <d>
                t1
            </d>
            t8
        </d>
        <d>
            t2
        </d>
        <d>
            t4
            <d>
                t3
                t021
            </d>
            t5
            <d>
                t6
            </d>
            t7
        </d>
        <d>
            <d>
                <d>
                    t9
                </d>
            </d>
            t034
        </d>
        t010
        t022
        """
        noWhite = re.compile(r"\s+", re.S)
        self.i = noWhite.sub("", self.i)
        start = re.compile("<d", re.S)
        end = re.compile("</d>")

        # searchHtmlDF
        trigger = re.compile("t1", re.S)
        self.o1 = searchHtmlDF(self.i, start, end, trigger)
        self.e1 = "<d>t1</d>"
        trigger = re.compile("t2", re.S)
        self.o2 = searchHtmlDF(self.i, start, end, trigger)
        self.e2 = "<d>t2</d>"
        trigger = re.compile("t3", re.S)
        self.o3 = searchHtmlDF(self.i, start, end, trigger)
        self.e3 = "<d>t3t021</d>"
        trigger = re.compile("t4", re.S)
        self.o4 = searchHtmlDF(self.i, start, end, trigger)
        self.e4 = "<d>t4<d>t3t021</d>t5<d>t6</d>t7</d>"
        trigger = re.compile("t5", re.S)
        self.o5 = searchHtmlDF(self.i, start, end, trigger)
        self.e5 = "<d>t4<d>t3t021</d>t5<d>t6</d>t7</d>"
        trigger = re.compile("t6", re.S)
        self.o6 = searchHtmlDF(self.i, start, end, trigger)
        self.e6 = "<d>t6</d>"
        trigger = re.compile("t7", re.S)
        self.o7 = searchHtmlDF(self.i, start, end, trigger)
        self.e7 = "<d>t4<d>t3t021</d>t5<d>t6</d>t7</d>"
        trigger = re.compile("t8", re.S)
        self.o8 = searchHtmlDF(self.i, start, end, trigger)
        self.e8 = "<d>t020t021<d>t1</d>t8</d>"
        trigger = re.compile("t9", re.S)
        self.o9 = searchHtmlDF(self.i, start, end, trigger)
        self.e9 = "<d>t9</d>"

        # searchHtml with and without matching on the root level
        trigger = re.compile("t010", re.S)
        self.o10 = searchHtml(self.i, start, end, trigger, -1, True)
        self.e10 = "t011<d>t020t021<d>t1</d>t8</d><d>t2</d><d>t4<d>t3t021</d>t5<d>t6</d>t7</d><d><d><d>t9</d></d>t034</d>t010t022"
        trigger = re.compile("t011", re.S)
        self.o11 = searchHtml(self.i, start, end, trigger, -1, True)
        self.e11 = "t011<d>t020t021<d>t1</d>t8</d><d>t2</d><d>t4<d>t3t021</d>t5<d>t6</d>t7</d><d><d><d>t9</d></d>t034</d>t010t022"
        trigger = re.compile("t011", re.S)
        self.o12 = searchHtml(self.i, start, end, trigger, -1, False)
        self.e12 = ""

        # blankHtml: a blanked section is replaced by as many spaces as it
        # had characters, so the widths are spelled out explicitly.
        trigger = re.compile("t020", re.S)
        self.o20 = blankHtml(self.i, start, end, trigger)
        # "<d>t020t021<d>t1</d>t8</d>" is 26 characters long
        self.e20 = ("t011" + " " * 26
                    + "<d>t2</d><d>t4<d>t3t021</d>t5<d>t6</d>t7</d>"
                    + "<d><d><d>t9</d></d>t034</d>t010t022")
        trigger = re.compile("t021", re.S)
        self.o21 = blankHtml(self.i, start, end, trigger)
        # "<d>t3t021</d>" is 13 characters long
        self.e21 = ("t011" + " " * 26
                    + "<d>t2</d><d>t4" + " " * 13 + "t5<d>t6</d>t7</d>"
                    + "<d><d><d>t9</d></d>t034</d>t010t022")
        trigger = re.compile("t022", re.S)
        self.o22 = blankHtml(self.i, start, end, trigger)
        # t022 sits outside every section, so the whole text is blanked
        self.e22 = " " * len(self.i)

        # searchHtmlBF
        trigger = re.compile("t020", re.S)
        self.o30 = searchHtmlBF(self.i, start, end, trigger)
        self.e30 = "<d>t020t021<d>t1</d>t8</d>"
        trigger = re.compile("t022", re.S)
        self.o31 = searchHtmlBF(self.i, start, end, trigger)
        self.e31 = ""
        trigger = re.compile("t1", re.S)
        self.o32 = searchHtmlBF(self.i, start, end, trigger)
        self.e32 = "<d>t020t021<d>t1</d>t8</d>"
        trigger = re.compile("t021", re.S)
        self.o33 = searchHtmlBF(self.i, start, end, trigger)
        self.e33 = "<d>t020t021<d>t1</d>t8</d>"
        trigger = re.compile("t034", re.S)
        self.o34 = searchHtmlBF(self.i, start, end, trigger)
        self.e34 = "<d><d><d>t9</d></d>t034</d>"

    # searchHtmlDF
    def test01(self): self.assertEqual(self.o1, self.e1)
    def test02(self): self.assertEqual(self.o2, self.e2)
    def test03(self): self.assertEqual(self.o3, self.e3)
    def test04(self): self.assertEqual(self.o4, self.e4)
    def test05(self): self.assertEqual(self.o5, self.e5)
    def test06(self): self.assertEqual(self.o6, self.e6)
    def test07(self): self.assertEqual(self.o7, self.e7)
    def test08(self): self.assertEqual(self.o8, self.e8)
    def test09(self): self.assertEqual(self.o9, self.e9)
    def test10(self): self.assertEqual(self.o10, self.e10)
    def test11(self): self.assertEqual(self.o11, self.e11)
    def test12(self): self.assertEqual(self.o12, self.e12)

    # blankHtml
    def test20(self): self.assertEqual(self.o20, self.e20)
    def test21(self): self.assertEqual(self.o21, self.e21)
    def test22(self): self.assertEqual(self.o22, self.e22)

    # searchHtmlBF
    def test30(self): self.assertEqual(self.o30, self.e30)
    def test31(self): self.assertEqual(self.o31, self.e31)
    def test32(self): self.assertEqual(self.o32, self.e32)
    def test33(self): self.assertEqual(self.o33, self.e33)
    def test34(self): self.assertEqual(self.o34, self.e34)
# Run the test suite when executed as a script.
# NOTE(review): because of the `using_unittest2` operand the suite is also
# started when this module is merely imported while unittest2 is available
# -- confirm that this is intended.
if __name__ == '__main__' or using_unittest2:
    unittest.main()
|