#!/usr/bin/env python
# -----------------------------------------------------------------------
# Copyright (C) 2003 dan fritz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
# -----------------------------------------------------------------------
#
# This code is part of the pytvgrab project:
#  http://pytvgrab.sourceforge.net
#
# -----------------------------------------------------------------------
# Subversion Information, do not edit
#
# $Rev:$
#
# $Log: $
#
# TODO:
#

"""

This is a rewrite of threading code written by den_RDC.

"""

import sys
import re
import time
import string
import new
from threading import Thread,Lock
from copy import copy
from cPickle import load,dump
from urlutils import get_urlcontents,get_urlparsed
from tvgrab.customizedparser import CustomizedParser
from tvgrab.tag import Tag
from tvgrab.grab import Grab_C,Date,Time,DateTime,Option
from tvgrab.tagutils import searchHtmlBF,searchHtmlDF,searchHtml
from tvgrab import message
from tvgrab.grabexceptions import ParseError
from datetime import tzinfo
try:
  usingXMLDataAndAttr = True
  from tvgrab.xmltv import XMLDataAndAttr
except ImportError:
  # For those who have not yet upgraded the lib..
  usingXMLDataAndAttr = False

class TextFilter:
  def filter(self, contents):
    return contents

class RE_sub (TextFilter):
  """
  A container class for regular expressions,
  if anything matches on the 'reg' it will be
  substituted with the 'sub'
  """
  def __init__(self, reg, sub=" "):
    self.reg = reg
    self.sub = sub

  def filter(self, contents):
    return self.reg.sub( self.sub, contents )
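
# A minimal usage sketch of RE_sub (hypothetical pattern, not from any
# subgrabber): everything the compiled expression matches is replaced
# with 'sub'.
#
#   strip_comments = RE_sub(re.compile("<!--.*?-->", re.S), u" ")
#   strip_comments.filter(u"a<!-- x -->b")   # -> u"a b"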

class RE_cut (TextFilter):
  def __init__(self, reStart, reStop, reTrigger, depth, context):
    self.reStart = reStart
    self.reStop = reStop
    self.reTrigger = reTrigger
    self.depth = depth
    self.context = context

  def filter(self, contents):
    return self.context % searchHtml(contents,
                     self.reStart,
                     self.reStop,
                     self.reTrigger,
                     self.depth)

class MTGrab (Grab_C):
  """
  Multi threaded and multi channel provider version of Grab_C.
  The whole design is easy, it is based on 3 stages.

  subgrabber.get_channels():
    Returns a list of channel_containers
    Those contains everything worth to know about each channel

  subgrabber.get_programs(): (or get_programs)
    Will parse the structure reuturned from CustomizedParser and
    return a list of Program_Containers containing everything
    get_programs() can find out about the program

  subgrabber.get_program_info(): (or get_program_info) (optional)
    Will parse the structure returned from CustomizedParser
    (based on the info from the Program_Containers.url
     wich get_programs() might have found)
    It populates Program_Containers with more program specific data

  Large part of this code is influensed by the tv_grab_be_tvb grabber
  written by <den_rdc at users dot sourceforge dot net>
  """

  xmltv_episode = 'system="xmltv_ns"'

  # chan_list is a dictionary of lists of ChannelBlueprints, keyed by chan_id.
  # The first ChannelBlueprint in the list represents the "main"
  # subgrabber to be tried first; the rest of them are just fallback
  # subgrabbers to be used ONLY if the previous subgrabber failed.
  chan_list = {}

  ######
  # Named and pre-compiled filters:
  #  These can be reused by subgrabbers and others, so they
  #  don't have to be re-compiled and copy-pasted as much.
  #####
  class ReContainer:
    pass
  named_filters = ReContainer()
  named_re = ReContainer()

  named_filters.complete_head_tags = RE_sub(
    re.compile("<head\s*[^>]*>.*?<\/head>",re.I|re.S), u" ")
  named_filters.complete_select_tags = RE_sub(
    re.compile("<select\s*[^>]*>.*?<\/select>",re.I|re.S), u" ")
  named_filters.complete_script_tags = RE_sub(
    re.compile("<script.*?</script>",re.I|re.S), u" ")
  named_filters.complete_noscript_tags = RE_sub(
    re.compile("<noscript>.*?</noscript>",re.I|re.S), u" ")
  named_filters.complete_comment_tags = RE_sub(
    re.compile("<!--.*?-->", re.I|re.S ), u" ")
  named_filters.more_than_2_space_together = RE_sub(re.compile( " {2,}" ),u" ")
  named_filters.start_tags_with_args = RE_sub(
    re.compile("<(?:b|br|tbody|font|img|center) +[^>]*>",
    re.I ), " ")
  named_filters.start_tags_without_args = RE_sub(
    re.compile( "<(?:b|br|tbody|font|img|center)>", re.I ),
    " ")
  named_filters.end_tags = RE_sub(
    re.compile( "<\/(?:b|tbody|font|center)>", re.I), u" ")
  named_filters.br_tags = RE_sub(re.compile( "<br>", re.I ),"\n")
  named_filters.typographic_tags = \
    RE_sub(re.compile( "<b>|</b>|<strong>|</strong>", re.I ),u"")
  named_filters.stupid_javascript_stuff = \
    RE_sub(re.compile("document\.write\('[^)]*\);", re.I ),u"")

  #### Named RE #####

  named_re.twentyfour_hour_time = \
    re.compile("(?:[01]?\d|2[0-3])[\.:][0-5]\d")
  # will NOT match 0.12 am or 00.12 pm
  named_re.twelve_hour_time = \
    re.compile("(?:1[012]|[1-9])[\.:][0-5]\d (?:am|pm)", re.I)
  named_re.table_start = re.compile("<table",re.I)
  named_re.table_stop = re.compile("</table\s*>",re.I)
  named_re.anything = re.compile(".*")
  named_re.corrupted_numeric_char_entity_ref = re.compile("&amp;\#(\d\d\d+);",re.I)
  named_re.starttime = re.compile("^([\d]{2}(?:\.|:)[\d]{2})$")
  named_re.startendtime = re.compile("^([\d]{2}(?:\.|:)[\d]{2})-([\d]{2}(?:\.|:)[\d]{2})$")
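
  # Usage sketch for the named patterns above (values are illustrative):
  #
  #   m = MTGrab.named_re.startendtime.match("20.00-21.30")
  #   if m:
  #     start, stop = m.group(1), m.group(2)   # "20.00", "21.30"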

  options=Grab_C.options

  # Don't set these too high. Two threads give the best performance on my
  # ADSL line; for broadband connections the speed will be limited by the server.
  prog_threads=2
  def set_prog_threads(self, a): self.prog_threads=int(a)
  options.append(
    Option(set_prog_threads, None, 'prog_threads',
     _('use %s number of threads getting program info') + ' ' +
     ( _( '(default: %s)' ) % "2"), 'PROG_THREADS')
  )

  # Don't set these too high. Two threads give the best performance on my
  # ADSL line; for broadband connections the speed will be limited by the server.
  chan_threads=2
  def set_chan_threads(self, a): self.chan_threads=int(a)
  options.append(
    Option(set_chan_threads, None, 'chan_threads',
     _('use %s number of threads getting channel info') + ' ' +
     ( _( '(default: %s)' ) % "2"), 'CHAN_THREADS')
  )
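
  # Usage sketch (assuming the long-option handling that Grab_C.options
  # feeds; the exact grabber script name varies per grabber):
  #
  #   tv_grab_xx --chan_threads 4 --prog_threads 4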

  def __init__(self, argv):
    Grab_C.__init__(self, argv)
  # __init__()

  def get_channels(self):
    """ Returns a list of tuples of ('channel name', 'channel url', 'channel id')"""
    result = []
    for (k,v) in self.chan_list.iteritems():
      result.append((v[0].display_name, v[0].base_url, k))
    return result
  # get_channels()

  def add_subgrabber(self, sg):
    assert isinstance(sg,SubGrabber), \
      "sg must be an instance of a SubGrabber subclass"
    if isinstance(sg, SubGrabber_Re2):
      if sg._pat_channel and sg._channel_url:
        html_input = get_urlcontents( sg._channel_url, sg.clear_channel_html)
        html_input = sg._pat_channel.extract(html_input)
        self.append_to_chan_list(sg.get_channels(html_input))
      else:
        self.append_to_chan_list(sg.get_channels(None))
    elif isinstance(sg, SubGrabber_C):
      self.append_to_chan_list(sg.get_channels(None))
    else:
      assert False,"This is not supposed to happend, no operating "+ \
        "mode was selected for your subgrabber"
  # add_subgrabber()
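
  # Registration sketch (hypothetical subgrabbers): every subgrabber added
  # here contributes its channels to chan_list; when several subgrabbers
  # serve the same channel id, the later ones become fallbacks.
  #
  #   grabber.add_subgrabber(MainSiteSubGrabber(grabber))    # primary
  #   grabber.add_subgrabber(MirrorSiteSubGrabber(grabber))  # fallback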

  ############################################################################
  # XMLTV Assembler: this use the above get_channels(), get_programs() and   #
  # get_program_info() to assemble the XMLTV Guide                           #
  ############################################################################

  def get_chan_id_list_from_conf(self, conf):
    """ Why key the channels by name instead of id? """
    #rv = [ conf[n]['id'] for n in conf.keys()]
    rv = []
    for (n, v) in conf.iteritems():
      if v["grab_status"] != "no":
        rv.append(v['id'])
    return rv

  def grab_day( self, date ):

    chan_id_list = self.get_chan_id_list_from_conf(self.conf['channels'])

    assert chan_id_list, _("All channels are disabled by the configuration file.")

    #first add the channel elements to the xml file, so we can add
    #programme elements with threads in a purely random order :)
    for c in chan_id_list:
      if not self.chan_list.has_key(c):
        message.debug('chan_id_list=%s' % chan_id_list)
        message.debug('self.chan_list.keys=%s' % self.chan_list.keys())
        errMsg = _('There is an un-supported channel id ("%s") '+\
          'in the configuration file, remove it and try again.') % c
        message.error(errMsg)
        raise Exception(errMsg)
      # Always use the "main" grabber as channel information source, even
      # if it didn't/won't actually grab the data.
      chan_c = self.chan_list[c][0]

      if usingXMLDataAndAttr:
        self.xmltv.addChannel( chan_c.channel_id,
          XMLDataAndAttr(chan_c.display_name,self.xmltv_lang ))
      else:
        self.xmltv.addChannel( chan_c.channel_id,chan_c.display_name)

    threadlist = []

    # Here we start channel threads, and we start new ones as old ones
    # finish, until there are no more channels to grab.
    while (1):
      if len(threadlist) < self.chan_threads:
        for x in range(0,(self.chan_threads - len(threadlist))):
          if len(chan_id_list) > 0 :
            chan_c_list = self.chan_list[chan_id_list[0]]

            chanthread = TparseChannel(date, chan_c_list, self.prog_threads)
            if self.chan_threads>1 :
              chanthread.start()
            else :
              ## Non-concurrent mode
              # When in non-concurrent mode we must
              # ignore exceptions, just like the main level of a
              # thread does when it dies.
              # Otherwise we would propagate the exception up the
              # chain and get a completely different result
              # depending on the number of threads we are using. Unacceptable!
              # But as an infinite-loop failsafe I use the isDone variable.
              isDone = 3
              while isDone > 0:
                try:
                  chanthread.run()
                  isDone = 0
                except ParseError, e:
                  pass
                except Exception, e:
                  isDone=-1
                  message.exception( e )

            threadlist.append(chanthread)
            del chan_id_list[0]
      #hold on till 1 thread finishes
      if len(threadlist) != 0:
        if self.chan_threads>1 :
          threadlist[0].join()
        del threadlist[0]
      #exit loop if no more threads run AND there is nothing left to do...
      if len(threadlist) < 1 and len(chan_id_list) < 1:
        break
  # grab_day()

  ############################################################################
  ############################################################################
  ## Utility functions                            ##
  ############################################################################
  ############################################################################

  def append_to_chan_list(self, chan_dict):
    """
    Appends the provided dictionary of channels to the chan_list
    dictionary of lists.
    """
    for (k,v) in chan_dict.iteritems():
      if self.chan_list.has_key(k):
        self.chan_list[k].append(v)
      else:
        self.chan_list[k] = [v]
  # append_to_chan_list()

# Grabber

class TparseChannel (Thread):
  def __init__(self, date, chan_c_list, work_prog_thread):
    Thread.__init__(self)
    self.date = date
    self.work_prog_thread = work_prog_thread
    # The grabber must be the same for all subgrabbers...
    self.grabber = chan_c_list[0].sub_grabber.grabber
    self.chan_c_list = chan_c_list

    # Make sure that ALL the subgrabbers capable of parsing the current
    # channel are properly initialized (including the backup subgrabbers).
    # Note that this must be done in single threaded mode to avoid
    # nasty race conditions.
    for chan_c in self.chan_c_list:
      if not chan_c.sub_grabber.properly_instantiated:
        chan_c.sub_grabber.lazy_instantiation()
        chan_c.sub_grabber.properly_instantiated = True
  #__init__()

  def run(self):
    for chan_c in self.chan_c_list:
      datefmt = chan_c.sub_grabber.format_html_date(copy(self.date))
      if not datefmt:
        # This subgrabber didn't support that date for some reason
        # Try with a backup subgrabber
        continue
      self.chan_url = chan_c.dateless_channel_url + datefmt
      if self._run(chan_c):
        # We successfully grabbed that data, we are done.
        # Exit the fallback sub-grabber loop.
        break
      message.warning("Could NOT read %s (%s of %s)" %
        (self.chan_url, chan_c.display_name, str(self.date)))

  def _run(self, chan_c):
    sg = chan_c.sub_grabber

    prog_list = None
    if isinstance(sg,SubGrabber_C):
      html_struct = get_urlparsed( self.chan_url,\
                     sg.clear_program_html,\
                     sg.parse_program_tags,\
                     sg.parse_program_attrs)#,verbose=True)

      prog_list = sg.get_programs( html_struct, chan_c)

    elif isinstance(sg,SubGrabber_Re2):
      html_input = get_urlcontents( self.chan_url, sg.clear_program_html)
      prog_list = sg.get_programs(html_input, chan_c)
    else:
      assert False,"This is not supposed to happend, no operating "+ \
                   "mode was selected for your subgrabber"

    if not prog_list:
      return False
    message.debug(prog_list)
    threadlist = []

    prog_lock = Lock()

    for x in range(0, self.work_prog_thread):
      progthread = TaddProgram(prog_list, self.date,
                   prog_lock, chan_c)
      if self.work_prog_thread>1 :
        #raise Exception("This is not supposed to happen")
        progthread.start()
      else :
        # When in non-concurrent mode we must
        # ignore exceptions, just like the main level of a thread does.
        # Otherwise we would propagate the exception up the chain
        # and get a completely different result depending on the number
        # of threads we are using. Unacceptable!
        # But as an infinite-loop failsafe I use the isDone variable.
        isDone = 3
        while isDone > 0:
          try:
            progthread.run()
            isDone = 0
          except ParseError, e:
            pass
          except Exception, e:
            isDone-=1
            message.exception( e )
      threadlist.append(progthread)

    for progthread in threadlist:
      if self.work_prog_thread>1:
        progthread.join()
    return True
  #_run()

# TparseChannel

# Because fetching data for each program in a row is s l o w, we are going to
# do them all at once, starting a thread for each programme page to grab/parse.
# This will download and parse all program details at the same time, resulting
# in a speedup.

class TaddProgram (Thread):
  """
  This thread is responsible for fetching data on a "per program" basis
  """

  def __init__(self, prog_list, date, prog_lock, chan_c):
    Thread.__init__(self)
    self.prog_list = prog_list
    self.date = date
    self.grabber = chan_c.sub_grabber.grabber
    self.prog_lock = prog_lock
    assert isinstance(chan_c,ChannelBlueprint), \
      "chan_c must be an instance of ChannelBlueprint"
    self.chan_c = chan_c
  #__init__()

  def run(self):

    message.debug('TaddProgram: thread started')
    while (1):
      try:
        p = self.grabnextprog()
        if not p:
          message.debug('TaddProgram: No more work')
          break
        self._run(p)
      except ParseError, e:
        pass
       #except Exception, e:
       #  import traceback
       #  traceback.print_exc()
       #  raise e
  #run()

  def _run(self, p):

    self.prog  = p.title
    self.prog_info = {}
    sg = self.chan_c.sub_grabber

    starttime = Time( p.startTime, self.chan_c.tz )

    if starttime < self.chan_c.start_of_day:
      self.start = ( self.date + 1 ) + starttime
    else:
      self.start = self.date + starttime

    if p.endTime:
      # Yes, I've seen it happen :/
      if p.endTime == "24:00" or p.endTime == "24.00":
        self.stop = ( self.date + 1 ) +  Time( "00:00" )
      else:
        stoptime =  Time( p.endTime )
        if (stoptime < starttime) or (starttime < self.chan_c.start_of_day):
          self.stop = ( self.date + 1 ) + stoptime
        else:
          self.stop = self.date + stoptime
    else:
      self.stop = None
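    # Worked example of the rollover logic above: with start_of_day at 05:00,
    # a programme listed at "01:30" is assigned to date+1, and a programme
    # running "23:30"-"01:00" keeps its start on date but gets its stop
    # rolled over to date+1.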

    # chan_c.timeshift is in minutes
    if self.chan_c.timeshift != 0:
      if self.stop:
        self.stop.addSeconds(60*self.chan_c.timeshift)
      if self.start:
        self.start.addSeconds(60*self.chan_c.timeshift)

    # Grab Program Info
    # Yes, there are examples of subgrabbers that support
    # program_info while the individual channel does not.
    if p.url and self.chan_c.supports_prog_info and sg.supports_prog_info:
      message.debug(p.url)
      if isinstance(sg, SubGrabber_C):
        html_struct = get_urlparsed( p.url,
                       sg.clear_program_info_html,
                       sg.parse_program_info_tags,
                       sg.parse_program_info_attrs)
        if message.verbose==message.verbose_level.DEBUG:
          html_struct.verbose=1; print html_struct

        # Will populate p with more data
        sg.get_program_info( html_struct, p, self.chan_c)
      elif isinstance(sg, SubGrabber_Re2):

        html_input = get_urlcontents( p.url, sg.clear_program_info_html)
        # Will populate p with more data
        sg.get_program_info(html_input, p, self.chan_c)
      else:
        message.error("This is not supposed to happend, no operating "+ \
          "mode was selected, bailing out")
        sys.exit(1)

    if p.description and len(p.description)>0:
      if usingXMLDataAndAttr:
        self.prog_info["desc"]=\
          XMLDataAndAttr(p.description,self.grabber.xmltv_lang)
      else:
        self.prog_info["desc"]=p.description
    if p.episode:
      if usingXMLDataAndAttr:
        self.prog_info["episode-num"] = \
          XMLDataAndAttr(p.episode,self.grabber.xmltv_episode)
      else:
        self.prog_info["episode-num"] = p.episode
    if p.previouslyShown:
      if usingXMLDataAndAttr:
        self.prog_info["previously-shown"] =\
          XMLDataAndAttr("",'start="%s"'%(p.previouslyShown))
      else:
        self.prog_info["previously-shown"] = p.previouslyShown
    if p.sub_title:
      if usingXMLDataAndAttr:
        self.prog_info["sub-title"] =\
          XMLDataAndAttr(p.sub_title,self.grabber.xmltv_lang)
      else:
        self.prog_info["sub-title"] = p.sub_title

    if p.category:
      self.prog_info["category"] = p.category

    if p.url:
      self.prog_info["url"] = sg.markup(p.url)

    # An empty dict is false in a boolean context, so this test is cheap.
    if not self.prog_info:
      self.prog_info = None

    if self.chan_c.tz != None:
      if self.start:
        self.start.tzinfo = self.chan_c.tz
      if self.stop:
        self.stop.tzinfo = self.chan_c.tz
    if usingXMLDataAndAttr:
      self.grabber.xmltv.addProgram( self.start,
                     self.chan_c.channel_id,
                     XMLDataAndAttr(self.prog,self.grabber.xmltv_lang),
                     self.stop,
                     self.prog_info
                   )
    else:
      self.grabber.xmltv.addProgram( self.start,
                     self.chan_c.channel_id,
                     self.prog,
                     self.stop,
                     self.prog_info
                   )
    message.info(_("Grab program info complete : [%s] (%s) %s") % \
         ( self.start, self.stop, self.prog ) )
  #_run()

  def grabnextprog(self):
    self.prog_lock.acquire(1) # 1 means we block
    if (len (self.prog_list) <1 ):
      p = None

    else:
      p = self.prog_list[0]
      del self.prog_list[0]

    self.prog_lock.release()
    return p

#TaddProgram

class SubGrabber:
  """
  Abstract baseclass for subgrabbers.
  Use SubGrabber_C or SubGrabber_Re2 as a baseclass when creating your own
  implementation of a subgrabber.
  """
  page_charset = 'iso-8859-1'

  supports_prog_info = False
  # lazy_instantiation() has not yet been run
  properly_instantiated = False

  re_desc_replace = [
    # removes every totally empty line
    (re.compile("\s*\n*\s*$",re.M),"")
    ]

  re_episode_num = [
    re.compile("(?:P|p)art (?P<episode_num>\d+) ?of ?(?P<episodes_max>\d+)"),
    re.compile("(?:P|p)art (?P<episode_num>\d+)\((?P<episodes_max>\d+)\)"),
    ]

  # These should all have a match group - the year or date when previously shown
  re_rerun_identifiers = []

  re_title_clean = []

  # Optional: a compiled re matching "no service"-type placeholder titles
  # that should be dropped from the listings (used by SubGrabber_Re2).
  re_title_noservice = None

  # Set these to your re2 expressions if you are using re2 mode
  _pat_guide = None
  _pat_guide_program_info = None

  # Set these if you are using re2 and a dynamic get_channels
  _pat_channel = None
  _channel_url = None

  ############################################################################
  # Mode (CustomizedParser or Re2) independent methods:
  ############################################################################

  def __init__(self, grabber):
    self.grabber = grabber
    # These can't be class constants: appending to a shared list
    # would affect every subgrabber instance.
    self.re_channel_clean = []
    self.re_program_clean = []
    self.re_program_info_clean = []

  def format_html_date(self,aDate):
    """
    Returns a date formatted string that will be appended to the url
    to get the correct page.
    This method SHOULD/MUST be overridden by subclasses.
    None should be returned if the channel provider does not display any
    programs for this date on its site.

    This method should only be invoked with a date that isn't
    used by anything else (use copy(date) to avoid nasty side-effects).
    """
    assert False, _( "Not Implemented Yet" )
  # format_html_date

  def prettyPrintTitle(self,title):
    """
    Sometimes programme data providers add things to the title that
    simply do not belong there; this function will try to remove such
    things. It will also convert the text to unicode.
    """
    title = string.strip(title)
    for clean in self.re_title_clean:
      title = clean.sub( "", title )
      title = string.strip(title)
    return self.markup(self.clean_char(title))
  # prettyPrintTitle

  def prettyPrintDesc(self,desc):
    """
    Makes the description easier on the eyes
    It will also convert the text to unicode
    """
    desc = self.markup(desc)
    for repl in self.re_desc_replace:
      desc = repl[ 0 ].sub( repl[ 1 ], desc )
    desc = self.clean_char(desc)
    return desc
  # prettyPrintDesc

  def prettyPrintTime(self,time):
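    """
    Normalizes times like "20.30" to colon notation ("20:30").
    """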
    return time.replace(".",":")
  # prettyPrintTime

  def clean_char( self, contents ):
    """
    Convert non-unicode (page_charset encoded) strings to unicode
    """
    if not isinstance(contents,unicode):
      contents = unicode(contents, self.page_charset,'replace')
    return contents
  # clean_char()

  def markup(self,text):
    """
    Seems like the python XML parser will freak out whenever it sees a '&'
    in the cdata.
    This method will escape common problem characters.
    """
    text = text.replace('&', "&amp;")\
           .replace('&amp;amp;', "&amp;")\
           .replace('<', "&lt;")\
           .replace('&amp;lt;', "&lt;")\
           .replace('"', "&quot;")\
           .replace('&amp;quot;', "&quot;")\
           .replace('>', "&gt;")\
           .replace('&amp;gt;', "&gt;")\
           .strip()
    # Correct corruptions we just created...
    return self.grabber.named_re.corrupted_numeric_char_entity_ref.sub(
          "&#\g<1>;", text)
  # markup
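
  # Escaping sketch: already-escaped input stays stable because the
  # double-escape corrections run right after the naive replaces.
  #
  #   self.markup('Tom & Jerry')       # -> 'Tom &amp; Jerry'
  #   self.markup('Tom &amp; Jerry')   # -> 'Tom &amp; Jerry' (not &amp;amp;)
  #   self.markup('&#8211;')           # numeric refs survive via named_re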

  def _clear_html( self,data,re_clean,unicode_error="replace"):
    """
    Common implementation of clear_channel_html, clear_program_html and
    clear_program_info_html.
    """
    if self.page_charset and not isinstance(data,unicode):
      data = unicode( data, self.page_charset, unicode_error )

    for clean in re_clean:
      data = clean.filter( data )

    return data
  # _clear_html()

  def clear_channel_html( self, contents, unicode_error="replace" ):
    if not self.re_channel_clean:
      return contents
    return self._clear_html(contents,
                self.re_channel_clean,
                unicode_error)
  # clear_channel_html()

  def clear_program_html( self, contents, unicode_error="replace" ):
    """
    This is the same method as Grab_C.clear_html but since every
    subgrabber needs unique page_charset, re_clean, re_replace and what not
    i can't use that medhod.

    Clear unwanted tags, attributes and whatever is defined as a
    compiled Regular Expression in re_clean and re_replace.

    Converts to unicode if page_charset is set, using unicode error policy
    defined by 'unicode_error'.
    """

    if not self.properly_instantiated:
      self.lazy_instantiation()
      self.properly_instantiated = True

    return self._clear_html(contents,
                self.re_program_clean,
                unicode_error)
  # clear_program_html()

  def clear_program_info_html( self, contents, unicode_error="replace" ):
    """
    Clear unwanted tags, attributes and whatever is defined as a
    compiled Regular Expression in re_program_info_clean and
    re_program_info_replace
    """
    if not self.properly_instantiated:
      self.lazy_instantiation()
      self.properly_instantiated = True

    return self._clear_html(contents,
                self.re_program_info_clean,
                unicode_error)
  # clear_program_info_html()

  def lazy_instantiation (self):
    """
    Should set up the re_program_info_clean, re_program_clean and
    other variables needed when this subgrabber is actually grabbing
    stuff from the internet.
    Will be called by clear_program_html() and clear_program_info_html()
    if those variables are not set.

    The reason to use lazy instantiation is that some
    subgrabbers may be deactivated by the configuration file, and
    re and re2 are kind of slow to compile.
    """
    assert False,_( "Not Implemented Yet" )

  ###########################################################################
  # Information retrieval methods:
  # these will try to extract data from the description field
  ###########################################################################

  def get_episode_num(self,descr):
    """
    Search the descr for text resembling an episode number.
    Returns a string containing an episode-num string of
    type xmltv_ns as described in the xmltv dtd (version 0.5.35).
    Returns None if no episode number is found.
    """
    if not descr or descr == "":
      return None
    if isinstance(descr,unicode):
      #int() does not seem to be able to handle unicode
      descr = descr.encode(self.page_charset,'replace')
    for reg in self.re_episode_num:
      match = reg.search(descr)
      if match:
        rv=".%d/%s." %(int(match.group("episode_num"))-1,match.group("episodes_max"))
        return unicode(rv,self.page_charset,'replace')
    return None
  # get_episode_num
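
  # Example of the xmltv_ns mapping above (episode numbers are zero-based
  # in xmltv_ns): a description containing "part 3 of 8" yields u".2/8.".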

  def get_rerun(self, text):
    """
    Search the title and/or desc for text resembling a rerun identifier.
    Returns group(1) of the regexp match if one is found.
    """
    if not text or text == "":
      return None
    for reg in self.re_rerun_identifiers:
      match = reg.search(text)
      if match:
        return match.group(1)
    return None
  # get_rerun

  def get_duration(self, description, startTime):
    """
    Tries to figure out the endTime of this programme based
    on information found in the description, returns None if
    nothing was found
    """
    return None

# SubGrabber

class SubGrabber_Re2 (SubGrabber):
  " Subgrabber utilizing Re2 "
  def flatten_array(self, arr):
    """
    Sometimes the data is put inside an array when using re2 (for no
    apparent reason)
    """
    return "".join(arr)

  def get_channels(self, html_input):
    assert False,_( "Not Implemented Yet" )

  def get_programs( self, html_input, chan_c):
    """
    Return a list of ProgramContainer:
      ( ProgramContainer )
    from the html structure.
    """
    if (not html_input) or html_input.strip() == "":
      return None
    guide=self._pat_guide.extract(html_input)
    if not guide or not hasattr(guide, 'programme'):
      return None

    # Fall back to the standard guide handling
    return self.get_programs_from_re2_output(guide, chan_c)

  def get_programs_from_re2_output(self, re2_output, chan_c):
    """
    Converts the output of re2.extract to the same
    list of ProgramContainer the customized parser version
    of get_programs outputs
    """
    if not re2_output or not hasattr(re2_output, 'programme'):
      return None

    list = []
    for p in re2_output.programme:
      pc = ProgramContainer()
      pc.startTime = self.prettyPrintTime(p.startTime)
      pc.title = p.title
      if not (pc.startTime and pc.title):
        # title and starttime are the minimum
        continue

      if hasattr(p, "endTime"):
        pc.endTime = self.prettyPrintTime(p.endTime)
      pc.title = self.prettyPrintTitle(p.title)
      if hasattr(p, "description"):
        if type(p.description) == type([]):
          p.description = self.flatten_array(p.description)
        pc.description = self.prettyPrintDesc(p.description)
      if hasattr(p, "category"):
        pc.category = p.category
      if hasattr(p, "previouslyShown"):
        pc.previouslyShown = p.previouslyShown
      if hasattr(p, "sub_title"):
        pc.sub_title = p.sub_title
      if hasattr(p, "episode"):
        pc.episode = p.episode
      if hasattr(p, "url") and p.url.strip() != "":
        pc.url = chan_c.base_url + p.url

      if pc.description:
        if not pc.episode:
          pc.episode = self.get_episode_num(pc.description)
        if not pc.previouslyShown:
          pc.previouslyShown = self.get_rerun(pc.description)

      if len(list) > 0:
        prevPc = list[-1]
        if prevPc.endTime == None:
          prevPc.endTime = pc.startTime
        # remove duplicates
        if prevPc.startTime == pc.startTime:
          del list[-1]

      if pc.endTime == None and pc.description and pc.startTime:
        pc.endTime = self.get_duration(pc.description, pc.startTime)

      if self.re_title_noservice:
        if not self.re_title_noservice.search(pc.title):
          list.append(pc)
      else:
        list.append(pc)
    return list

  def get_program_info(self, html_input, pc, chan_c):
    """
    Populates the ProgramContainer with more data from the
    html_input
    """

    guide=self._pat_guide_program_info.extract(html_input)
    if not guide or not hasattr(guide, 'programme_info'):
      return None

    # Fall back to the standard program info handling
    self.get_program_info_from_re2_output(pc, guide, chan_c)

  def _get_program_info_from_re2_output(self, pc, p, chan_c):
    if pc.startTime == None:
      if hasattr(p, 'startTime'):
        pc.startTime = self.prettyPrintTime(p.startTime)

    if pc.endTime == None:
      if hasattr(p, 'endTime'):
        pc.endTime = self.prettyPrintTime(p.endTime)

    if pc.title == None:
      if hasattr(p, 'title'):
        pc.title = self.prettyPrintTitle(p.title)

    if pc.description == None:
      if hasattr(p, 'description'):
        if type(p.description) == type([]):
          p.description = self.flatten_array(p.description)
        pc.description = self.prettyPrintDesc(p.description)
      else:
        pc.description = ""

    if hasattr(p, 'description_addon'):
      if type(p.description_addon) == type([]):
        p.description_addon = self.flatten_array(p.description_addon)
      pc.description += self.prettyPrintDesc(p.description_addon)

    if hasattr(p, 'description_replace'):
      if type(p.description_replace) == type([]):
        p.description_replace = self.flatten_array(p.description_replace)
      pc.description = self.prettyPrintDesc(p.description_replace)

    if pc.category == None:
      if hasattr(p, "category"):
        pc.category = p.category

    if pc.previouslyShown == None:
      if hasattr(p, "previouslyShown"):
        pc.previouslyShown = p.previouslyShown

    if pc.sub_title == None:
      if hasattr(p, "sub_title"):
        pc.sub_title = p.sub_title

    if pc.episode == None:
      if hasattr(p, "episode"):
        pc.episode = p.episode

    # Can't see this ever happening, the url was the thing
    # leading us here in the first place, but better safe than sorry
    if pc.url == None:
      if hasattr(p, "url"):
        pc.url = p.url

    if pc.description:
      if not pc.episode:
        pc.episode = self.get_episode_num(pc.description)
      if not pc.previouslyShown:
        pc.previouslyShown = self.get_rerun(pc.description)

  def get_program_info_from_re2_output(self, pc, re2_output, chan_c):
    """
    Feeds the output of re2.extract into the given ProgramContainer,
    producing the same data as the customized parser version of
    get_program_info.
    """
    if not re2_output or not hasattr(re2_output, 'programme_info'):
      return None

    if type(re2_output.programme_info) == type([]):
      for p in re2_output.programme_info:
        self._get_program_info_from_re2_output(pc, p, chan_c)
    else:
      self._get_program_info_from_re2_output(pc, \
                           re2_output.programme_info, \
                           chan_c)
# SubGrabber_Re2

class SubGrabber_C (SubGrabber):
  " Subgrabber utilizing Customized Parser "

  def get_subtree(self, tree, traceArray):
    """
    Returns a subtree pointed out by the array of child indexes in
    traceArray.
    get_subtree(tree,[0,1,2]) will get
    tree.children[0].children[1].children[2] or None
    """
    if len(traceArray) == 0:
      return tree
    for t in traceArray:
      if len(tree.children) > t:
        tree = tree.children[t]
      else:
        # traceArray was faulty, could not traverse tree
        return None
    return tree

  def search_tree(self,tree,searchFunc,cutoff=0):
    """
    Searches the tree recursively, when searchFunc signals a match
    an array of child indexes pointing to the point where the match
    occurred is returned. See get_subtree().
    if cutoff > 0 that number of items will be removed from the
    result
    """
    if searchFunc(tree):
      return []
    i = 0
    for t in tree.children:
      rv = self.search_tree(t,searchFunc)
      if rv!=None: # an empty [] is also false...
        rv.insert(0,i)
        for i in range (0,cutoff):
          del rv[-1]
        return rv
      i += 1
    return None

  def search_and_get_tree(self,tree,searchFunc,cutoff=0):
    """
    combines search_tree() and get_subtree().
    Returns None if something fails
    """
    t = self.search_tree(tree,searchFunc,cutoff)
    if t!=None: # an empty [] is also false...
      return self.get_subtree(tree,t)
    else:
      return None

  def get_attr(self,tree,attr_name):
    """
    Searches for the attribute named attr_name on this Tag (tree).
    Returns the first occurrence.
    """
    for (attr_t,attr_v) in tree.attrs:
      if attr_t == attr_name:
        return attr_v
    return None
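
  # Traversal sketch (assumes the parsed Tag tree exposes .children and .attrs
  # as used above; the "class" attribute test is purely illustrative):
  #
  #   trace = self.search_tree(tree, lambda t: self.get_attr(t, "class") == "prog")
  #   if trace != None:                  # an empty [] means the root matched
  #     node = self.get_subtree(tree, trace)
  #     url  = self.get_attr(node, "href")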
# SubGrabber_C

class ProgramContainer:
  """
  Container for program data
  """
  startTime = None
  endTime = None
  title = None
  description = None
  url = None
  episode = None
  previouslyShown = None
  sub_title = None
  category = None
# ProgramContainer

class ChannelBlueprint:
  """
  Channel specific data needed by the parser
  """
  def __init__(self, ch_id, display_name, base_url,
         ch_url, sub_grabber,\
         start_of_day, ch_icon_url=None, timeshift=0,\
         supports_prog_info=True, tz=None):

    assert isinstance(sub_grabber, SubGrabber), \
      "sub_grabber must be subclass of tvgrab.mtgrab.SubGrabber"

    # Id of this channel: i.e. svt1.svt.se
    self.channel_id = ch_id
    # The display name: i.e. SVT1
    self.display_name = display_name
    # root url of the provider. i.e. "http://svt.se"
    self.base_url = base_url
    # Url for the webpage to parse, minus the date
    # i.e. "http://svt.se/svt/jsp/Crosslink.jsp?d=8764&selectedDate="
    self.dateless_channel_url = ch_url
    self.icon_url = ch_icon_url
    # ref to the subgrabber containing all the specific channel logic
    self.sub_grabber = sub_grabber
    # Anything before this time (24h notation) will be marked
    # as belonging to the previous day
    self.start_of_day = start_of_day
    # Number of minutes the shows are shifted; a positive value moves
    # the start/stop times later (TaddProgram._run adds 60*timeshift seconds).
    self.timeshift = timeshift
    # Whether we should use the get_program_info phase or not.
    self.supports_prog_info = supports_prog_info
    # Time zone of the grabbed data
    if tz != None:
      assert isinstance(tz, tzinfo),\
        "tz must be a subclass of datetime.tzinfo"
    self.tz = tz
# ChannelBlueprint
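
# Instantiation sketch (hypothetical values, taken from the comments above;
# 'sg' is a SubGrabber instance and 'start_of_day' a Time):
#
#   chan = ChannelBlueprint("svt1.svt.se", "SVT1", "http://svt.se",
#            "http://svt.se/svt/jsp/Crosslink.jsp?d=8764&selectedDate=",
#            sg, start_of_day)
#   grabber.append_to_chan_list({chan.channel_id: chan})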