customizedparser.py : » Web-Services » python-xmltv » pytvgrab-lib-0.5.1 » lib » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Web Services » python xmltv
python xmltv » pytvgrab lib 0.5.1 » lib » customizedparser.py
#!/usr/bin/python
# -*- coding: iso8859-15 -*-
# -----------------------------------------------------------------------
# Copyright (C) 2003 Gustavo Sverzut Barbieri.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
# -----------------------------------------------------------------------
#
# This code is part of the pytvgrab project:
#    http://pytvgrab.sourceforge.net
#
# -----------------------------------------------------------------------
# Subversion Information, do not edit
#
# $Rev: 255 $
# $LastChangedDate: 2004-11-12 13:35:28 +1100 (Fri, 12 Nov 2004) $
# $LastChangedRevision: 255 $
# $LastChangedBy: ottrey $
#
#
# $Log: customizedparser.py,v $
# Rev 80  - Fixed &#xxx; and &blah; handling and spacing, thx to
# Bastiaan Van Eeckhoudt for this.
#
# Revision 1.3  2003/08/25 21:02:25 den_RDC
# fixed must_close_tag cdata parsing
#
# Revision 1.2  2003/08/24 14:19:08  ottrey
# *** empty log message ***
#

from tag import Tag
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
import message

import i18n

from grabexceptions import ParseError


class CustomizedParser ( HTMLParser ):
    """
    HTML parser that builds a tree of Tag objects, keeping only the tags
    listed in 'parse_tags' and the attributes listed in 'parse_attrs'.
    Everything else is skipped; the text found inside a skipped tag is handed
    over to the enclosing text buffer when the skipped tag is closed.

    NOTE: This class tries to cope with broken HTML, but that is hard to do
    in general. You may need to clean the HTML up first, using regular
    expressions or something like that.
       When an end tag does not match the innermost open tag, the parser
    assumes the open tags were left unclosed and closes them one by one until
    it reaches the wanted tag. This is a problem when the document closes a
    tag that was never opened (ie: <table></tr>): such tags must be removed
    before feeding the document.
       Every start tag pushes a text buffer that is only popped by an end
    tag, so a skipped tag that is never closed (ie: <br>) leaves its buffer
    on the stack and the text that preceded it is not included in the cdata
    of the enclosing tag.
    """
    __author__   = "Gustavo Sverzut Barbieri <gustavo@linuxdicas.com.br>"
    __revision__ = "$Rev: 255 $"

    # Class-level placeholders; the real values are set by __init__().
    verbose         = 0
    queue           = None
    text            = None
    skip            = None
    parse_tags      = None
    parse_attrs     = None
    keep_empty_tags = None
    must_close_tags = None

    def __init__( self, parse_tags=None, parse_attrs=None,
                  must_close_tags=None, keep_empty_tags=None,
                  verbose=0 ):
        """
        parse_tags:  a list of tag names to be parsed. Tags outside this
                     list will be skipped.
        parse_attrs: a list of attributes to be parsed. Attributes outside
                     this list will be skipped.
        must_close_tags: a list of tags that should be closed before another
                     tag starts. Ie: many documents don't close the <option>
                     tag, so you provide "option" as a must_close_tag.
        keep_empty_tags: a list of tags that should be kept even if they don't
                     have cdata or children.
        verbose:     set this to 1 to have the tag structure reported through
                     the 'message' module, so you can check the document
                     for mismatched closing tags and other structural errors.

        Any argument left as None (or empty) falls back to its default list.
        """
        HTMLParser.__init__( self )
        # Stack of currently open tags; the artificial root is never popped.
        self.queue = [ Tag( u"html_root" ) ]
        # Stack of text buffers; text[ 0 ] belongs to the artificial root.
        self.text = [ u"" ]
        # Nesting depth, only used to indent the verbose output.
        self.counter = 0
        self.parse_tags = parse_tags or [ u"html", u"body", u"table", u"tr", u"td", u"a" ]
        self.parse_attrs = parse_attrs or [ u"href", u"colspan", u"cols" ]
        self.keep_empty_tags = keep_empty_tags or [ u"input" ]
        self.verbose = verbose
        self.must_close_tags = must_close_tags or [ u"option" ]
    # __init__()


    def handle_starttag( self, name, attrs ):
        """
        Handle the start tag. For a tag listed in 'parse_tags' this creates a
        Tag(), holding only the attributes listed in 'parse_attrs', and pushes
        it on the queue of open tags used by the other handle_* methods.

        A fresh text buffer is pushed for every start tag, parsed or skipped.
        """
        if name in self.parse_tags:
            myattrs = [ a for a in attrs if a[ 0 ] in self.parse_attrs ]

            # Close must close tags:
            if self.queue[ -1 ].name in self.must_close_tags:
                self.handle_endtag( self.queue[ -1 ].name )

            # Close unclosed tags:
            if ( name in ( u"td", u"tr" ) and self.queue[ -1 ].name == u"td" ):
                self.handle_endtag( u"td" )

                if ( name == u"tr" ): # close last <tr> too!
                    self.handle_endtag( u"tr" )

            # Add to queue:
            self.queue.append( Tag( name, myattrs ) )
            self._message( u" " * self.counter + _( "tag: <%s>" ) % name )
            self.counter += 1

        self.text += [ u"" ]
    # handle_starttag()


    def handle_endtag( self, name ):
        """
        Handle the end tag.

        For a tag listed in 'parse_tags': pop it from the queue, store the
        stripped text of the current buffer as its cdata and attach it to its
        parent, unless it is empty (no children, no cdata) and not listed in
        'keep_empty_tags'. Open tags standing in the way are closed first.

        For a skipped tag: append its text to the enclosing buffer.

        The buffer of the artificial root is never removed, so stray end tags
        cannot empty the text stack.
        """

        #we only care about tags we need to parse
        if name in self.parse_tags:
            if name in self.must_close_tags and self.queue[ -1 ].name != name:
                return # it was already closed

            if name != self.queue[ -1 ].name:
                if self.verbose:
                    message.error( _( "Inconsistent document structure! " \
                                      "Closed tag in wrong place! " \
                                      "Tag: '%s', Wanted Tag: '%s', "
                                      "counter: %d" ) % \
                                   ( name, self.queue[ -1 ].name, self.counter ) )

                # Try to close the unclosed tag
                while ( ( len( self.queue ) > 1 ) and \
                        ( name != self.queue[ -1 ].name ) ):
                    self.handle_endtag( self.queue[ -1 ].name )

                # It's the last tag?
                if len( self.queue ) < 2:
                    if self.verbose:
                        message.error( _( "Closing tag <%s> that doesn't " \
                                          "exists!" ) % name )
                    return

            self.counter -= 1

            self._message( u" " * self.counter + _( "tag: </%s>    \t"
                                                   "counter=%s" ) % \
                           ( name, self.counter ) )

            # Pop from queue and add as children
            t = self.queue.pop()
            t.cdata = self.text[ -1 ].strip()
            # Eliminate empty (no children or cdata) tags:
            if t.children or t.cdata or t.name in self.keep_empty_tags:
                self.queue[ -1 ].children += [ t ]

        else:
            if len( self.text ) < 2:
                # Stray end tag of a skipped tag while only the root buffer
                # is left: there is no enclosing buffer to merge into, so
                # ignore it instead of failing with an IndexError.
                if self.verbose:
                    message.error( _( "Closing tag <%s> that doesn't " \
                                      "exists!" ) % name )
                return

            # If we should skip this tag, pass it's text (cdata) to
            # parent. Ie: <a><b>text</b></a>, skip <b>, then we get
            # <a>text</a>
            self.text[ -2 ] += self.text[ -1 ]

        # Never drop the root buffer, handle_data() always needs one.
        if len( self.text ) > 1:
            del self.text[ -1 ]
    # handle_endtag()


    def handle_data( self, cdata ):
        """
        Append the text to the current (innermost) text buffer.
        """
        self.text[ -1 ] += cdata
        self._message( u" " * self.counter + _( "cdata: %s" ) % ( cdata ) )

    # handle_data()


    def handle_entityref( self, entityref ):
        """
        Append the character named by the entity reference (ie: "eacute" for
        &eacute;) to the current text buffer.

        &amp; is deliberately kept as "&amp;". Unknown entities are reported
        with a warning and dropped.
        """
        # Local import: the module-level import only brings 'entitydefs'.
        from htmlentitydefs import name2codepoint

        # does not translate &amp; to & !
        if entityref == u"amp":
            self.text[ -1 ] += u"&amp;"
            return

        # Use the code point table instead of 'entitydefs': the latter holds
        # Latin-1 byte strings (which cannot be appended to a unicode buffer
        # under the default ASCII codec) and literal "&#NNNN;" strings for
        # characters above U+00FF.
        try:
            self.text[ -1 ] += unichr( name2codepoint[ entityref ] )
        except KeyError:
            message.warning( _( "&%s; is not mapped! " \
                                "Please check the code!" ) % entityref )
    # handle_entityref()


    def handle_charref( self, charref ):
        """
        Append the character given by a numeric character reference to the
        current text buffer.

        charref: the text between "&#" and ";", either decimal ("233") or
                 hexadecimal with a leading "x" or "X" ("xE9").

        Invalid or out of range references are reported with a warning and
        dropped.
        """
        try:
            if charref[ :1 ] in ( u"x", u"X" ):
                code = int( charref[ 1: ], 16 )
            else:
                code = int( charref )

            if isinstance( self.text[ -1 ], unicode ):
                self.text[ -1 ] += unichr( code )
            else:
                self.text[ -1 ] += chr( code )
        except ( ValueError, OverflowError ) as e:
            message.warning( _( "&#%s; is not valid: %s" ) % ( charref, e ) )
    # handle_charref()


    def feed( self, contents ):
        """
        Feed the document to the parser.

        Any exception raised while parsing is converted to a ParseError that
        carries the offending document in 'contents' and the original
        exception in 'exception'.
        """
        try:
            HTMLParser.feed( self, contents )
        except Exception as e:
            pe = ParseError( _( "Could not parse document: %s\n\n%s\n\n" ) %
                             ( e, contents ) )
            pe.contents = contents
            pe.exception = e
            raise pe
    # feed()


    def get_structure( self ):
        """
        Get the document structure: the last child of the innermost tag that
        is still open (the artificial root, for a well structured document).

        Note that if your document is not well structured (ie, it lacks the
        <html>), you will only get the last tag! If you have that ugly
        document, please get all the tags using:
            CustomizedParser().queue[ -1 ].children

        Raises ParseError if there is nothing to return.
        """
        if not self.queue:
            raise ParseError( _( "Bad Document: no <html>?" ) )
        if not self.queue[ -1 ].children:
            raise ParseError( _( "Bad Document: empty <html>?\n\n%s\n\n" ) %
                              self.queue )
        return self.queue[ -1 ].children[ -1 ]
    # get_structure()


    def _message( self, msg ):
        """
        Report 'msg' through the message module, only in verbose mode.
        """
        if self.verbose:
            message.message( msg )
    # _message()

# CustomizedParser


# --------------  Unit Tests  -------------- #
# Prefer unittest2 when it is installed, otherwise use the standard unittest.
# 'using_unittest2' records which one was picked.
using_unittest2 = False
try:
  import unittest2 as unittest
  using_unittest2 = True
except ImportError:
  # Only a missing module triggers the fallback; a bare except would also
  # swallow KeyboardInterrupt and SystemExit.
  import unittest

class CustomizedParser_UnitTest(unittest.TestCase):
    """
       Two HTML documents: one well formatted, to test if the parser works as
    expected, and another poorly formatted, with missing close tags, to check
    if the parser can work around those problems.
       Both must result in the same structure.
    """

    def _parse(self, html):
        """
        Parse 'html' with a fresh CustomizedParser (kept in self.cp) and
        return the resulting structure.
        """
        self.cp = CustomizedParser(
            parse_tags=[ "html", "body", "table",
                         "tr", "td", "a", "select", "option" ],
            parse_attrs=[ "href", "colspan", "cols", "value" ],
            must_close_tags=[ "option" ] )
        self.cp.feed( html )
        return self.cp.get_structure()
    # _parse()

    def setUp(self):
        # well formatted test:
        self.i1 = """
    <html>
       <head>
          <title>test</title>
       </head>
       <body color="white">
          <!-- comment -->
          <a href="link.html" title="not used">link</a>
          <table cols="3" cellspacing="2">
             <tbody>
                <tr>
                   <td>c1</td>
                   <td>c2</td>
                   <td>c3</td>
                </tr>
                <tr>
                   <td>c4</td>                    
                   <td>c5</td>
                   <td><a href="test">c6</a></td>
                </tr>
             </tbody>
          </table>
          <form>
             <select>
                <option value="1">op1</option>
                <option value="2">op2</option>
                <option value="3">op3</option>
             </select>
          </form>
       </body>
    </html>
    """

        # poorly formatted test:
        self.i2 = """
    <html>
       <head>
          <title>test</title>
       </head>
       <body color="white">
          <!-- comment -->
          <a href="link.html" title="not used">link</a>
          <table cols="3" cellspacing="2">
             <tbody>
                <tr>
                   <td>c1
                   <td>c2
                   <td>c3
                <tr>
                   <td>c4
                   <td>c5
                   <td><a href="test">c6
             </tbody>
          </table>
          <form>
             <select>
                <option value="1">op1
                <option value="2">op2
                <option value="3">op3
             </select>
          </form>
       </body>
    </html>
    """

        # Expected structure, built bottom-up. <head>, <title>, <tbody> and
        # <form> are not in parse_tags, so they do not appear in the tree.
        first_row = Tag( "tr",
                         children=[ Tag( "td", cdata="c1" ),
                                    Tag( "td", cdata="c2" ),
                                    Tag( "td", cdata="c3" ) ] )
        second_row = Tag( "tr",
                          children=[ Tag( "td", cdata="c4" ),
                                     Tag( "td", cdata="c5" ),
                                     Tag( "td",
                                          children=[ Tag( "a", [ ( "href", "test" ) ], "c6") ] ) ] )
        table = Tag( "table", [ ( "cols", "3" ) ],
                     children=[ first_row, second_row ] )
        select = Tag( "select",
                      children=[ Tag( "option", [ ( "value", "1" ) ], "op1" ),
                                 Tag( "option", [ ( "value", "2" ) ], "op2" ),
                                 Tag( "option", [ ( "value", "3" ) ], "op3" ) ] )
        body = Tag( "body",
                    children=[ Tag( "a", [ ( "href", "link.html" ) ], "link" ),
                               table,
                               select ] )
        self.o = Tag( "html", cdata="test", children=[ body ] )

        self.p1 = self._parse( self.i1 )
        self.p2 = self._parse( self.i2 )
    # setUp()

    # assertEqual instead of a bare assert: assert statements are removed
    # when Python runs with -O, which would make these tests pass vacuously.
    def test01(self):
        self.assertEqual( str( self.p1 ), str( self.o ) )

    def test02(self):
        self.assertEqual( str( self.p2 ), str( self.o ) )

# Run the test suite when this file is executed as a script, and also
# whenever unittest2 could be imported (even if this module is only being
# imported by other code).
# NOTE(review): running the tests on plain import looks unintended - confirm.
if __name__ == '__main__' or using_unittest2:
  unittest.main()
# --------------  Unit Tests  -------------- #
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.