#!/usr/bin/python
# -*- coding: iso8859-15 -*-
# -----------------------------------------------------------------------
# Copyright (C) 2003 Gustavo Sverzut Barbieri.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
# -----------------------------------------------------------------------
#
# This code is part of the pytvgrab project:
# http://pytvgrab.sourceforge.net
#
# -----------------------------------------------------------------------
# Subversion Information, do not edit
#
# $Rev: 255 $
# $LastChangedDate: 2004-11-12 13:35:28 +1100 (Fri, 12 Nov 2004) $
# $LastChangedRevision: 255 $
# $LastChangedBy: ottrey $
#
#
# $Log: customizedparser.py,v $
# Rev 80 - Fixed &#xxx; and &blah; handling and spacing, thx to
# Bastiaan Van Eeckhoudt for this.
#
# Revision 1.3 2003/08/25 21:02:25 den_RDC
# fixed must_close_tag cdata parsing
#
# Revision 1.2 2003/08/24 14:19:08 ottrey
# *** empty log message ***
#
from tag import Tag
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
import message
import i18n
from grabexceptions import ParseError
class CustomizedParser( HTMLParser ):
    """
    HTML parser that builds a tree of Tag objects, keeping only the tags
    listed in 'parse_tags' and the attributes listed in 'parse_attrs'.
    Everything else is skipped; text found inside a skipped tag is
    credited to the nearest enclosing parsed tag.

    NOTE: This class tries to cope with broken HTML, but that is hard to
    get right in general. You may need to fix the HTML beforehand, for
    instance with regular expressions.

    When a closing tag does not match the innermost open tag, the parser
    assumes the open tags were left unclosed and closes them one by one
    until it reaches a matching tag. This goes wrong when the document
    closes a tag that was never opened (e.g. <table></tr>): every open tag
    gets closed. Such stray closing tags must be removed from the input.
    """
    __author__ = "Gustavo Sverzut Barbieri <gustavo@linuxdicas.com.br>"
    __revision__ = "$Rev: 255 $"

    # Class-level placeholders; __init__() sets the real per-instance values.
    verbose = 0
    queue = None            # stack of currently open Tag objects
    text = None             # stack of text buffers, one per queue entry
    skip = None             # not used anywhere in this class
    parse_tags = None
    parse_attrs = None
    keep_empty_tags = None
    must_close_tags = None

    def __init__( self, parse_tags=None, parse_attrs=None,
                  must_close_tags=None, keep_empty_tags=None,
                  verbose=0 ):
        """
        parse_tags:      a list of tag names to be parsed. Tags outside this
                         list will be skipped.
        parse_attrs:     a list of attributes to be parsed. Attributes outside
                         this list will be skipped.
        must_close_tags: a list of tags that should be closed before another
                         tag starts. Ie: many documents don't close the
                         <option> tag, so you provide "option" as a
                         must_close_tag.
        keep_empty_tags: a list of tags that should be kept even if they
                         don't have cdata or children.
        verbose:         set this to 1 to report the tag structure through
                         the 'message' module, so you can check the document
                         for mismatched closing tags and other structural
                         errors.

        Any list argument that is None (or empty) is replaced by a default.
        """
        HTMLParser.__init__( self )
        # The root entry is never popped; parsed top-level tags become its
        # children. 'text' always has exactly one buffer per queue entry.
        self.queue = [ Tag( u"html_root" ) ]
        self.text = [ u"" ]
        # Nesting depth of open parsed tags; only used to indent messages.
        self.counter = 0
        self.parse_tags = parse_tags or [ u"html", u"body", u"table",
                                          u"tr", u"td", u"a" ]
        self.parse_attrs = parse_attrs or [ u"href", u"colspan", u"cols" ]
        self.keep_empty_tags = keep_empty_tags or [ u"input" ]
        self.verbose = verbose
        self.must_close_tags = must_close_tags or [ u"option" ]
    # __init__()


    def handle_starttag( self, name, attrs ):
        """
        Handle a start tag. For tags listed in 'parse_tags' this creates a
        Tag() holding the wanted attributes and pushes it on the queue,
        together with a fresh text buffer. Other tags are ignored.

        Before pushing, tags that are commonly left open are closed:
        the innermost tag if it is one of 'must_close_tags', and an open
        <td> (plus its <tr> when a new <tr> starts).
        """
        if name not in self.parse_tags:
            return

        # Keep only the ( name, value ) pairs we were asked to parse.
        myattrs = [ a for a in attrs if a[ 0 ] in self.parse_attrs ]

        # Close must close tags:
        if self.queue[ -1 ].name in self.must_close_tags:
            self.handle_endtag( self.queue[ -1 ].name )

        # Close unclosed tags:
        if name in ( u"td", u"tr" ) and self.queue[ -1 ].name == u"td":
            self.handle_endtag( u"td" )
            if name == u"tr": # close last <tr> too!
                self.handle_endtag( u"tr" )

        # Add to queue:
        self.queue.append( Tag( name, myattrs ) )
        self._message( u" " * self.counter + _( "tag: <%s>" ) % name )
        self.counter += 1
        self.text += [ u"" ]
    # handle_starttag()


    def handle_endtag( self, name ):
        """
        Pop the tag from the queue and either attach it to its parent or
        throw it away. Tags not listed in 'parse_tags' are ignored.

        If 'name' is not the innermost open tag, the open tags are closed
        one by one until a matching tag is on top of the queue. If none
        matches, only the root is left and the closing tag is dropped.
        """
        # We only care about tags we need to parse
        if name not in self.parse_tags:
            return

        if name in self.must_close_tags and self.queue[ -1 ].name != name:
            return # it was already closed

        if name != self.queue[ -1 ].name:
            if self.verbose:
                message.error( _( "Inconsistent document structure! " \
                                  "Closed tag in wrong place! " \
                                  "Tag: '%s', Wanted Tag: '%s', "
                                  "counter: %d" ) % \
                               ( name, self.queue[ -1 ].name, self.counter ) )
            # Try to close the unclosed tags
            while ( ( len( self.queue ) > 1 ) and \
                    ( name != self.queue[ -1 ].name ) ):
                self.handle_endtag( self.queue[ -1 ].name )

        # Only the root is left? Then this closing tag was never opened.
        if len( self.queue ) < 2:
            if self.verbose:
                message.error( _( "Closing tag <%s> that doesn't " \
                                  "exists!" ) % name )
            return

        self.counter -= 1
        self._message( u" " * self.counter + _( "tag: </%s> \t"
                                                "counter=%s" ) % \
                       ( name, self.counter ) )

        # Pop from queue and add as children
        t = self.queue.pop()
        t.cdata = self.text[ -1 ].strip()

        # Eliminate empty (no children or cdata) tags:
        if t.children or t.cdata or t.name in self.keep_empty_tags:
            self.queue[ -1 ].children += [ t ]
        else:
            # The tag is dropped. Its buffer can only hold whitespace here
            # (cdata is empty after strip()); hand it over to the parent.
            self.text[ -2 ] += self.text[ -1 ]

        # The closed tag's buffer goes away in both cases, keeping 'text'
        # in step with 'queue'.
        del self.text[ -1 ]
    # handle_endtag()


    def handle_data( self, cdata ):
        """
        Append the text to the buffer of the innermost open parsed tag.
        """
        self.text[ -1 ] += cdata
        self._message( u" " * self.counter + _( "cdata: %s" ) % ( cdata ) )
    # handle_data()


    def handle_entityref( self, entityref ):
        """
        Append the text for a named entity (&name;) to the current text
        buffer. Unknown entities are reported with a warning and skipped.
        """
        # &amp; is special-cased instead of going through the entity table,
        # so it always becomes a bare "&".
        if entityref == u"amp":
            self.text[ -1 ] += u"&"
            return

        try:
            value = entitydefs[ entityref ]
        except KeyError:
            message.warning( _( "&%s; is not mapped! " \
                                "Please check the code!" ) % entityref )
            return

        # Python 2's entity table holds ISO Latin-1 byte strings. Decode
        # them explicitly: appending a non-ASCII byte string to the unicode
        # buffer would raise UnicodeDecodeError.
        if not isinstance( value, type( u"" ) ):
            value = value.decode( "iso-8859-1" )
        self.text[ -1 ] += value
    # handle_entityref()


    def handle_charref( self, charref ):
        """
        Append the character for a numeric reference to the current text
        buffer. 'charref' is the text between '&#' and ';', either decimal
        ("233") or hexadecimal with a leading 'x' ("xE9"). Invalid
        references are reported with a warning and skipped.
        """
        try:
            if charref[ :1 ] in ( u"x", u"X" ):
                code = int( charref[ 1: ], 16 )
            else:
                code = int( charref )

            if isinstance( self.text[ -1 ], unicode ):
                self.text[ -1 ] += unichr( code )
            else:
                self.text[ -1 ] += chr( code )
        except Exception as e:
            message.warning( _( "&#%s; is not valid: %s" ) % ( charref, e ) )
    # handle_charref()


    def feed( self, contents ):
        """
        Feed 'contents' to the parser. Any exception raised while parsing
        is converted to a ParseError that carries the input as 'contents'
        and the original exception as 'exception'.
        """
        try:
            HTMLParser.feed( self, contents )
        except Exception as e:
            pe = ParseError( _( "Could not parse document: %s\n\n%s\n\n" ) %
                             ( e, contents ) )
            pe.contents = contents
            pe.exception = e
            raise pe
    # feed()


    def get_structure( self ):
        """
        Get the document structure: the last child of the innermost tag
        still on the queue. Note that if your document is not well
        structured (ie, it lacks the <html>), you will only get the last
        tag! If you have that ugly document, please get all the tags using:
           CustomizedParser().queue[ -1 ].children

        Raises ParseError if the queue is empty or that tag has no children.
        """
        if not self.queue:
            raise ParseError( _( "Bad Document: no <html>?" ) )

        if not self.queue[ -1 ].children:
            raise ParseError( _( "Bad Document: empty <html>?\n\n%s\n\n" ) %
                              self.queue )

        return self.queue[ -1 ].children[ -1 ]
    # get_structure()


    def _message( self, msg ):
        """
        Emit 'msg' through the message module, but only in verbose mode.
        """
        if self.verbose:
            message.message( msg )
    # _message()
# CustomizedParser
# -------------- Unit Tests -------------- #
# Prefer the unittest2 package when it is installed and fall back to the
# standard library's unittest otherwise. 'using_unittest2' records which
# of the two ended up bound to the name 'unittest'.
try:
    import unittest2 as unittest
    using_unittest2 = True
except ImportError:
    import unittest
    using_unittest2 = False
class CustomizedParser_UnitTest( unittest.TestCase ):
    """
    Two HTML documents: a well formed one, to test that the parser works
    as expected, and a poorly formed one with missing closing tags, to
    check that the parser can work around those problems.
    Both must result in the same structure.
    """

    # Parser configuration shared by both documents.
    PARSE_TAGS = [ "html", "body", "table",
                   "tr", "td", "a", "select", "option" ]
    PARSE_ATTRS = [ "href", "colspan", "cols", "value" ]
    MUST_CLOSE_TAGS = [ "option" ]

    def _parse( self, html ):
        """
        Parse 'html' with a fresh CustomizedParser, remember the parser as
        self.cp and return the resulting structure.
        """
        self.cp = CustomizedParser( parse_tags=list( self.PARSE_TAGS ),
                                    parse_attrs=list( self.PARSE_ATTRS ),
                                    must_close_tags=list( self.MUST_CLOSE_TAGS ) )
        self.cp.feed( html )
        return self.cp.get_structure()
    # _parse()

    def setUp( self ):
        """
        Build both input documents and the expected tree, then parse the
        inputs into self.p1 and self.p2.
        """
        # well formed test:
        self.i1 = """
<html>
<head>
<title>test</title>
</head>
<body color="white">
<!-- comment -->
<a href="link.html" title="not used">link</a>
<table cols="3" cellspacing="2">
<tbody>
<tr>
<td>c1</td>
<td>c2</td>
<td>c3</td>
</tr>
<tr>
<td>c4</td>
<td>c5</td>
<td><a href="test">c6</a></td>
</tr>
</tbody>
</table>
<form>
<select>
<option value="1">op1</option>
<option value="2">op2</option>
<option value="3">op3</option>
</select>
</form>
</body>
</html>
"""
        # poorly formed test:
        self.i2 = """
<html>
<head>
<title>test</title>
</head>
<body color="white">
<!-- comment -->
<a href="link.html" title="not used">link</a>
<table cols="3" cellspacing="2">
<tbody>
<tr>
<td>c1
<td>c2
<td>c3
<tr>
<td>c4
<td>c5
<td><a href="test">c6
</tbody>
</table>
<form>
<select>
<option value="1">op1
<option value="2">op2
<option value="3">op3
</select>
</form>
</body>
</html>
"""
        # Expected tree. <head>, <title>, <tbody> and <form> are not in
        # PARSE_TAGS, so the title text ends up as cdata of <html> and the
        # children of the skipped tags attach to the enclosing parsed tag.
        self.o = Tag( "html", cdata="test",
            children=[
                Tag( "body",
                    children=[
                        Tag( "a", [ ( "href", "link.html" ) ], "link" ),
                        Tag( "table", [ ( "cols", "3" ) ],
                            children=[
                                Tag( "tr",
                                    children=[
                                        Tag( "td", cdata="c1" ),
                                        Tag( "td", cdata="c2" ),
                                        Tag( "td", cdata="c3" ) ] ),
                                Tag( "tr",
                                    children=[
                                        Tag( "td", cdata="c4" ),
                                        Tag( "td", cdata="c5" ),
                                        Tag( "td",
                                            children=[
                                                Tag( "a",
                                                     [ ( "href", "test" ) ],
                                                     "c6" ) ] ) ] )
                            ]
                        ),
                        Tag( "select",
                            children=[
                                Tag( "option",
                                     [ ( "value", "1" ) ],
                                     "op1" ),
                                Tag( "option",
                                     [ ( "value", "2" ) ],
                                     "op2" ),
                                Tag( "option",
                                     [ ( "value", "3" ) ],
                                     "op3" ) ]
                        ) ]
                )
            ]
        )

        self.p1 = self._parse( self.i1 )
        self.p2 = self._parse( self.i2 )
    # setUp()

    # assertEqual instead of a bare assert, so the checks still run when
    # Python is started with -O.
    def test01( self ):
        """The well formed document yields the expected structure."""
        self.assertEqual( str( self.p1 ), str( self.o ) )

    def test02( self ):
        """The poorly formed document yields the same structure."""
        self.assertEqual( str( self.p2 ), str( self.o ) )
# Run the unit tests when this file is executed as a script, and also
# whenever the unittest2 package was the one imported above.
if __name__ == '__main__' or using_unittest2:
    unittest.main()
# -------------- Unit Tests -------------- #
|