#!/usr/bin/env python
# -----------------------------------------------------------------------
# Copyright (C) 2003 dan fritz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
# -----------------------------------------------------------------------
#
# This code is part of the pytvgrab project:
# http://pytvgrab.sourceforge.net
#
# -----------------------------------------------------------------------
# Subversion Information, do not edit
#
# $Rev:$
#
# $Log: $
#
# TODO:
#
"""
This is a rewrite of threading code written by den_RDC.
"""
import sys
import re
import time
import string
import new
from threading import Thread,Lock
from copy import copy
from cPickle import load,dump
from urlutils import get_urlcontents,get_urlparsed
from tvgrab.customizedparser import CustomizedParser
from tvgrab.tag import Tag
from tvgrab.grab import Grab_C,Date,Time,DateTime,Option
from tvgrab.tagutils import searchHtmlBF,searchHtmlDF,searchHtml
from tvgrab import message
from tvgrab.grabexceptions import ParseError
from datetime import tzinfo
# XMLDataAndAttr is only shipped with newer releases of the tvgrab library.
# usingXMLDataAndAttr tells the rest of this module whether text may be
# wrapped in XMLDataAndAttr or has to be handed to xmltv as plain strings.
try:
    from tvgrab.xmltv import XMLDataAndAttr
except ImportError:
    # For those who have not yet upgraded the lib.
    # Only a failing import is tolerated here; any other error is a real
    # problem and must not be silently hidden.
    usingXMLDataAndAttr = False
else:
    usingXMLDataAndAttr = True
class TextFilter:
    """
    Base class of the text filters used when cleaning downloaded pages.
    A filter is any object with a filter(contents) method; this base
    implementation hands the text back untouched.
    """

    def filter(self, contents):
        """ Returns contents exactly as it was received. """
        return contents
class RE_sub (TextFilter):
    """
    Text filter wrapping a compiled regular expression:
    every match of 'reg' in the filtered text is replaced by 'sub'
    (a single space unless something else is given).
    """

    def __init__(self, reg, sub=" "):
        self.reg, self.sub = reg, sub

    def filter(self, contents):
        """ Returns contents with all matches of self.reg substituted. """
        substituted = self.reg.sub(self.sub, contents)
        return substituted
class RE_cut (TextFilter):
    """
    Text filter that hands the text to tagutils.searchHtml() together with
    the stored reStart, reStop, reTrigger and depth arguments and inserts
    the result into the 'context' format string.
    (presumably searchHtml() cuts out the part of the page delimited by
    reStart/reStop - verify against tvgrab.tagutils)
    """

    def __init__(self, reStart, reStop, reTrigger, depth, context):
        self.context = context
        self.depth = depth
        self.reTrigger = reTrigger
        self.reStop = reStop
        self.reStart = reStart

    def filter(self, contents):
        """ Returns self.context % searchHtml(contents, ...) """
        found = searchHtml(contents,
                           self.reStart,
                           self.reStop,
                           self.reTrigger,
                           self.depth)
        return self.context % found
class MTGrab (Grab_C):
    """
    Multi threaded and multi channel provider version of Grab_C.

    The whole design is based on 3 stages:

    subgrabber.get_channels():
      Returns a dictionary, keyed by channel id, of ChannelBlueprint:s.
      Those contain everything worth knowing about each channel.

    subgrabber.get_programs():
      Will parse the downloaded channel page and return a list of
      ProgramContainer:s holding everything get_programs() can find out
      about each program.

    subgrabber.get_program_info(): (optional)
      Will parse the page found at ProgramContainer.url (which
      get_programs() might have found) and populate the ProgramContainer
      with more program specific data.

    Large parts of this code are influenced by the tv_grab_be_tvb grabber
    written by <den_rdc at users dot sourceforge dot net>
    """
    xmltv_episode = 'system="xmltv_ns"'

    # chan_list is a dictionary of lists of ChannelBlueprint:s keyed by
    # chan_id.
    # The first ChannelBlueprint in the list represents the "main"
    # subgrabber to be tried first, the rest of 'em are just fallback
    # subgrabbers to be used ONLY if the previous subgrabber failed.
    # NOTE: this is a class attribute, the dictionary is shared by every
    # MTGrab instance.
    chan_list = {}

    ######
    # Named and pre-compiled filters:
    # These can be reused by subgrabbers and others, that way they
    # don't have to be re-compiled and copy&pasted as much
    #####
    class ReContainer:
        pass

    named_filters = ReContainer()
    named_re = ReContainer()

    named_filters.complete_head_tags = RE_sub(
        re.compile(r"<head\s*[^>]*>.*?<\/head>", re.I|re.S), u" ")
    named_filters.complete_select_tags = RE_sub(
        re.compile(r"<select\s*[^>]*>.*?<\/select>", re.I|re.S), u" ")
    named_filters.complete_script_tags = RE_sub(
        re.compile(r"<script.*?</script>", re.I|re.S), u" ")
    named_filters.complete_noscript_tags = RE_sub(
        re.compile(r"<noscript>.*?</noscript>", re.I|re.S), u" ")
    named_filters.complete_comment_tags = RE_sub(
        re.compile(r"<!--.*?-->", re.I|re.S), u" ")
    named_filters.more_than_2_space_together = RE_sub(
        re.compile(r" {2,}"), u" ")
    named_filters.start_tags_with_args = RE_sub(
        re.compile(r"<(?:b|br|tbody|font|img|center) +[^>]*>", re.I), " ")
    named_filters.start_tags_without_args = RE_sub(
        re.compile(r"<(?:b|br|tbody|font|img|center)>", re.I), " ")
    named_filters.end_tags = RE_sub(
        re.compile(r"<\/(?:b|tbody|font|center)>", re.I), u" ")
    named_filters.br_tags = RE_sub(re.compile(r"<br>", re.I), "\n")
    named_filters.typographic_tags = \
        RE_sub(re.compile(r"<b>|</b>|<strong>|</strong>", re.I), u"")
    named_filters.stupid_javascript_stuff = \
        RE_sub(re.compile(r"document\.write\('[^)]*\);", re.I), u"")

    #### Named RE #####
    # Hours 0-23 (with or without a leading zero), '.' or ':' and minutes.
    # The previous pattern had '(:?' where '(?:' was meant and therefore
    # also accepted things like "0:5:30".
    named_re.twentyfour_hour_time = \
        re.compile(r"(?:[01]?\d|2[0-3])[\.:][0-5]\d")
    # will NOT match 0.12 am or 00.12 pm
    named_re.twelve_hour_time = \
        re.compile(r"(?:1[012]|[1-9])[\.:][0-5]\d (?:am|pm)", re.I)
    named_re.table_start = re.compile(r"<table", re.I)
    named_re.table_stop = re.compile(r"</table\s*>", re.I)
    named_re.anything = re.compile(r".*")
    # Matches a numeric character reference whose leading '&' has been
    # escaped once too often (an escaped ampersand followed by "#228;").
    # The entity name is spliced in with '%' so that tools which unescape
    # HTML entities in source files can not silently turn this pattern
    # into a no-op again.
    named_re.corrupted_numeric_char_entity_ref = \
        re.compile(r"&%s;\#(\d\d\d+);" % "amp", re.I)
    named_re.starttime = re.compile(r"^([\d]{2}(?:\.|:)[\d]{2})$")
    named_re.startendtime = \
        re.compile(r"^([\d]{2}(?:\.|:)[\d]{2})-([\d]{2}(?:\.|:)[\d]{2})$")

    # NOTE: this is the very same list object as Grab_C.options, the
    # options appended below are therefore added to Grab_C.options too.
    options = Grab_C.options

    # Don't set these to high. These give the best performance on my ADSL line
    # for broadband connections speed will be limited by the server
    prog_threads = 2
    def set_prog_threads(self, a): self.prog_threads = int(a)
    options.append(
        Option(set_prog_threads, None, 'prog_threads',
               _('use %s number of threads getting program info') + ' ' +
               ( _( '(default: %s)' ) % "2"), 'PROG_THREADS')
        )

    # Don't set these to high. These give the best performance on my ADSL line
    # for broadband connections speed will be limited by the server
    chan_threads = 2
    def set_chan_threads(self, a): self.chan_threads = int(a)
    options.append(
        Option(set_chan_threads, None, 'chan_threads',
               _('use %s number of threads getting channel info') + ' ' +
               ( _( '(default: %s)' ) % "2"), 'CHAN_THREADS')
        )

    def __init__(self, argv):
        Grab_C.__init__(self, argv)

    def get_channels(self):
        """
        Returns a list of tuples of
        ('channel name', 'channel url', 'channel id'),
        one for every known channel. Name and url are taken from the main
        (first) ChannelBlueprint of the channel.
        """
        return [(blueprints[0].display_name, blueprints[0].base_url, chan_id)
                for (chan_id, blueprints) in self.chan_list.items()]

    def add_subgrabber(self, sg):
        """
        Registers every channel the subgrabber sg can deliver in chan_list.

        A SubGrabber_Re2 that has both _pat_channel and _channel_url set
        gets its channel page downloaded and run through _pat_channel
        first, every other subgrabber has get_channels(None) called.
        """
        assert isinstance(sg, SubGrabber), \
               "sg must be a subclass of SubGrabber"
        if isinstance(sg, SubGrabber_Re2):
            if sg._pat_channel and sg._channel_url:
                html_input = get_urlcontents(sg._channel_url,
                                             sg.clear_channel_html)
                html_input = sg._pat_channel.extract(html_input)
                self.append_to_chan_list(sg.get_channels(html_input))
            else:
                self.append_to_chan_list(sg.get_channels(None))
        elif isinstance(sg, SubGrabber_C):
            self.append_to_chan_list(sg.get_channels(None))
        else:
            assert False, "This is not supposed to happend, no operating "+ \
                   "mode was selected for your subgrabber"

    ############################################################################
    # XMLTV Assembler: this use the above get_channels(), get_programs() and   #
    #        get_program_info() to assemble the XMLTV Guide                    #
    ############################################################################
    def get_chan_id_list_from_conf(self, conf):
        """
        Returns the 'id' of every channel in conf whose 'grab_status' is
        not "no".
        conf is the 'channels' part of the configuration; its keys are not
        used here (the original author wondered why the channels are keyed
        by name instead of id).
        """
        return [chan['id'] for chan in conf.values()
                if chan["grab_status"] != "no"]

    def grab_day(self, date):
        """
        Grabs all enabled channels for the given date.

        First every enabled channel is added to the xmltv document, then the
        channels are grabbed by TparseChannel objects. With chan_threads > 1
        those run as threads, at most chan_threads at a time; otherwise they
        are run one by one in the calling thread.

        Raises Exception if the configuration refers to a channel id that no
        subgrabber has registered.
        """
        chan_id_list = self.get_chan_id_list_from_conf(self.conf['channels'])
        assert chan_id_list, _("All channels are disabled by the configuration file.")

        # first add the channel elements to the xml file, so we can add
        # programme elements with threads in a purely random order :)
        for c in chan_id_list:
            if c not in self.chan_list:
                message.debug('chan_id_list=%s' % chan_id_list)
                message.debug('self.chan_list.keys=%s' % self.chan_list.keys())
                errMsg = _('There is an un-supported channel id ("%s") '+\
                           'in the configuration file, remove it and try again.') % c
                message.error(errMsg)
                raise Exception(errMsg)
            # Always use the "main" grabber as channel information source,
            # even if it didn't/won't actually grab the data.
            chan_c = self.chan_list[c][0]
            if usingXMLDataAndAttr:
                self.xmltv.addChannel(chan_c.channel_id,
                    XMLDataAndAttr(chan_c.display_name, self.xmltv_lang))
            else:
                self.xmltv.addChannel(chan_c.channel_id, chan_c.display_name)

        # A thread count of zero (or less) used to leave this method
        # spinning forever without grabbing anything, treat it as 1.
        max_threads = max(1, self.chan_threads)
        concurrent = max_threads > 1

        pending = list(chan_id_list)
        running = []
        # here we start channel threads, and we start new threads if old
        # ones die until there are no more channels to grab
        while pending or running:
            while pending and len(running) < max_threads:
                chan_c_list = self.chan_list[pending.pop(0)]
                # Created in this thread on purpose: TparseChannel.__init__
                # prepares the subgrabbers and must not run concurrently.
                chanthread = TparseChannel(date, chan_c_list,
                                           self.prog_threads)
                if concurrent:
                    chanthread.start()
                else:
                    self._run_channel_inline(chanthread)
                running.append(chanthread)
            # hold on till the oldest thread finishes
            oldest = running.pop(0)
            if concurrent:
                oldest.join()

    def _run_channel_inline(self, chanthread):
        """
        Non-concurrent mode: runs chanthread.run() in the calling thread.

        Exceptions must be ignored here, just like the main level of a
        thread does when it dies. Or else we would delegate the exception
        up the chain and get a completely different result depending on the
        number of threads we are using. Un-acceptable!
        """
        # Infinite-loop failsafe. A ParseError used to leave the counter
        # untouched, so a page that never parses was retried forever.
        attempts_left = 3
        while attempts_left > 0:
            try:
                chanthread.run()
                attempts_left = 0
            except ParseError:
                attempts_left -= 1
            except Exception:
                # NOTE(review): any other error gives up at once
                # (TparseChannel._run retries in this case) - kept as it was.
                attempts_left = -1
                # sys.exc_info() instead of "except Exception, e" keeps
                # this valid for old as well as new interpreters.
                message.exception(sys.exc_info()[1])

    ############################################################################
    ############################################################################
    ##                          Utility functions                             ##
    ############################################################################
    ############################################################################
    def append_to_chan_list(self, dict):
        """
        Appends the provided dictionary of channels to the chan_list
        dictionary of arrays.
        The first value registered for a channel id becomes the main entry,
        values registered later are the fallbacks.
        """
        for (k, v) in dict.items():
            self.chan_list.setdefault(k, []).append(v)
# Grabber
class TparseChannel (Thread):
    """
    Grabs one channel for one date.

    The channel is described by a list of ChannelBlueprint:s, the first one
    being the main source and the rest fallbacks that are tried, in order,
    until one of them delivers programs. The programs themselves are handled
    by TaddProgram workers.
    """

    def __init__(self, date, chan_c_list, work_prog_thread):
        """
        date             - the date to grab
        chan_c_list      - the ChannelBlueprint:s of the channel
        work_prog_thread - number of TaddProgram workers to use per page
        """
        Thread.__init__(self)
        self.date = date
        self.work_prog_thread = work_prog_thread
        # The grabber must be the same for all subgrabbers...
        self.grabber = chan_c_list[0].sub_grabber.grabber
        self.chan_c_list = chan_c_list
        # Make sure that ALL the Subgrabbers capable of parsing current
        # channel are properly initiated (including the backup subgrabbers)
        # Note that this must be done in single threaded mode to avoid
        # nasty race conditions.
        for chan_c in self.chan_c_list:
            sg = chan_c.sub_grabber
            if not sg.properly_instantiated:
                sg.lazy_instantiation()
                sg.properly_instantiated = True

    def run(self):
        """
        Tries the subgrabbers one after another until one of them has
        grabbed the channel. A warning is written for every subgrabber
        that was tried without success.
        """
        for chan_c in self.chan_c_list:
            datefmt = chan_c.sub_grabber.format_html_date(copy(self.date))
            if not datefmt:
                # This subgrabber didn't support that date for some reason
                # Try with a backup subgrabber
                continue
            self.chan_url = chan_c.dateless_channel_url + datefmt
            if self._run(chan_c):
                # We successfully grabbed that data, we are done.
                break
            message.warning("Could NOT read %s (%s of %s)" %
                            (self.chan_url, chan_c.display_name,
                             str(self.date)))

    def _run(self, chan_c):
        """
        Downloads self.chan_url, lets the subgrabber of chan_c turn it into
        a list of programs and has TaddProgram workers process that list.

        Returns False if no programs were found, True otherwise.
        """
        sg = chan_c.sub_grabber
        prog_list = None
        if isinstance(sg, SubGrabber_C):
            html_struct = get_urlparsed(self.chan_url,
                                        sg.clear_program_html,
                                        sg.parse_program_tags,
                                        sg.parse_program_attrs)
            prog_list = sg.get_programs(html_struct, chan_c)
        elif isinstance(sg, SubGrabber_Re2):
            html_input = get_urlcontents(self.chan_url, sg.clear_program_html)
            prog_list = sg.get_programs(html_input, chan_c)
        else:
            assert False, "This is not supposed to happend, no operating "+ \
                   "mode was selected for your subgrabber"
        if not prog_list:
            return False
        message.debug(prog_list)

        # With a thread count of zero (or less) no worker at all was
        # created and the programs were silently dropped, treat it as 1.
        worker_count = max(1, self.work_prog_thread)
        concurrent = worker_count > 1

        # All workers take their work from the same prog_list, guarded by
        # prog_lock.
        prog_lock = Lock()
        workers = []
        for x in range(worker_count):
            progthread = TaddProgram(prog_list, self.date, prog_lock, chan_c)
            if concurrent:
                progthread.start()
            else:
                self._run_worker_inline(progthread)
            workers.append(progthread)
        if concurrent:
            for progthread in workers:
                progthread.join()
        return True

    def _run_worker_inline(self, progthread):
        """
        Non-concurrent mode: runs progthread.run() in the calling thread.

        Exceptions must be ignored here, just like the main level of a
        thread does. Or else we would delegate the exception up the chain
        and get a completely different result depending on the number of
        threads we are using. Un-acceptable!
        """
        # Infinite-loop failsafe: every failure, a ParseError included,
        # uses up one attempt. A retry continues with the programs still
        # left in the shared list.
        attempts_left = 3
        while attempts_left > 0:
            try:
                progthread.run()
                attempts_left = 0
            except ParseError:
                attempts_left -= 1
            except Exception:
                attempts_left -= 1
                # sys.exc_info() instead of "except Exception, e" keeps
                # this valid for old as well as new interpreters.
                message.exception(sys.exc_info()[1])
# TparseChannel
# Because fetching the details of every program one after another is slow,
# a number of worker threads download and parse the program pages at the
# same time. Each worker keeps taking the next program from a list shared
# with the other workers until that list is empty.
class TaddProgram (Thread):
    """
    This thread is responsible of fetching data on a "per program" basis.

    Every worker takes ProgramContainer:s from a list shared with the other
    workers of the same channel page, optionally downloads the program info
    page and adds the program to the xmltv document of the grabber.
    """

    def __init__(self, prog_list, date, prog_lock, chan_c):
        """
        prog_list - list of ProgramContainer:s shared by all workers
        date      - the date that is being grabbed
        prog_lock - lock guarding prog_list
        chan_c    - ChannelBlueprint of the channel the programs belong to
        """
        Thread.__init__(self)
        self.prog_list = prog_list
        self.date = date
        self.grabber = chan_c.sub_grabber.grabber
        self.prog_lock = prog_lock
        assert isinstance(chan_c, ChannelBlueprint), \
               "chan_c must be instance of ChannelBlueprint"
        self.chan_c = chan_c

    def run(self):
        """
        Processes programs until the shared list is empty.
        A program that raises ParseError is skipped.
        """
        message.debug('TaddProgram: thread started')
        while 1:
            try:
                p = self.grabnextprog()
                if not p:
                    message.debug('TaddProgram: No more work')
                    break
                self._run(p)
            except ParseError:
                # one unparsable program must not stop the whole worker
                pass

    def _localized(self, text):
        """
        Wraps text in XMLDataAndAttr, tagged with the language of the
        grabber, when the installed library supports that.
        """
        if usingXMLDataAndAttr:
            return XMLDataAndAttr(text, self.grabber.xmltv_lang)
        return text

    def _run(self, p):
        """
        Adds the program p (a ProgramContainer) to the xmltv document.
        """
        self.prog = p.title
        self.prog_info = {}
        sg = self.chan_c.sub_grabber

        # Anything starting before start_of_day belongs to the night
        # after self.date.
        starttime = Time(p.startTime, self.chan_c.tz)
        if starttime < self.chan_c.start_of_day:
            self.start = (self.date + 1) + starttime
        else:
            self.start = self.date + starttime

        if p.endTime:
            # Yes, i've seen it happend :/
            if p.endTime == "24:00" or p.endTime == "24.00":
                self.stop = (self.date + 1) + Time("00:00")
            else:
                # NOTE(review): unlike starttime this Time is created
                # without the time zone of the channel - confirm that the
                # comparison below is not affected by that.
                stoptime = Time(p.endTime)
                if (stoptime < starttime) or \
                       (starttime < self.chan_c.start_of_day):
                    self.stop = (self.date + 1) + stoptime
                else:
                    self.stop = self.date + stoptime
        else:
            self.stop = None

        # chan_c.timeshift is in minutes
        if self.chan_c.timeshift != 0:
            if self.stop:
                self.stop.addSeconds(60*self.chan_c.timeshift)
            if self.start:
                self.start.addSeconds(60*self.chan_c.timeshift)

        # Grab Program Info
        # Yes, there are examples of subgrabbers who supports
        # program_info while the individual channel does not..
        if p.url and self.chan_c.supports_prog_info and sg.supports_prog_info:
            message.debug(p.url)
            if isinstance(sg, SubGrabber_C):
                html_struct = get_urlparsed(p.url,
                                            sg.clear_program_info_html,
                                            sg.parse_program_info_tags,
                                            sg.parse_program_info_attrs)
                if message.verbose == message.verbose_level.DEBUG:
                    html_struct.verbose = 1
                    # same output as the print statement used before
                    sys.stdout.write(str(html_struct) + "\n")
                # Will populate p with more data
                sg.get_program_info(html_struct, p, self.chan_c)
            elif isinstance(sg, SubGrabber_Re2):
                html_input = get_urlcontents(p.url,
                                             sg.clear_program_info_html)
                # Will populate p with more data
                sg.get_program_info(html_input, p, self.chan_c)
            else:
                message.error("This is not supposed to happend, no operating "+ \
                              "mode was selected, bailing out")
                sys.exit(1)

        if p.description:
            self.prog_info["desc"] = self._localized(p.description)
        if p.episode:
            if usingXMLDataAndAttr:
                self.prog_info["episode-num"] = \
                    XMLDataAndAttr(p.episode, self.grabber.xmltv_episode)
            else:
                self.prog_info["episode-num"] = p.episode
        if p.previouslyShown:
            if usingXMLDataAndAttr:
                self.prog_info["previously-shown"] = \
                    XMLDataAndAttr("", 'start="%s"' % (p.previouslyShown))
            else:
                self.prog_info["previously-shown"] = p.previouslyShown
        if p.sub_title:
            self.prog_info["sub-title"] = self._localized(p.sub_title)
        if p.category:
            self.prog_info["category"] = p.category
        if p.url:
            self.prog_info["url"] = sg.markup(p.url)
        if not self.prog_info:
            # xmltv gets None instead of an empty dictionary
            self.prog_info = None

        if self.chan_c.tz is not None:
            if self.start:
                self.start.tzinfo = self.chan_c.tz
            if self.stop:
                self.stop.tzinfo = self.chan_c.tz

        self.grabber.xmltv.addProgram(self.start,
                                      self.chan_c.channel_id,
                                      self._localized(self.prog),
                                      self.stop,
                                      self.prog_info)
        message.info(_("Grab program info complete : [%s] (%s) %s") % \
                     ( self.start, self.stop, self.prog ) )

    def grabnextprog(self):
        """
        Removes the first program from the shared list and returns it,
        returns None when the list is empty.
        """
        self.prog_lock.acquire(1) # 1 means we block
        try:
            if len(self.prog_list) < 1:
                return None
            return self.prog_list.pop(0)
        finally:
            # released whatever happens, a lock that is never released
            # would block all the other workers forever
            self.prog_lock.release()
#TaddProgram
class SubGrabber:
    """
    Abstract baseclass for subgrabbers.
    Use SubGrabber_C or SubGrabber_Re2 as a baseclass when creating your own
    implementation of a subgrabber
    """
    page_charset = 'iso-8859-1'
    supports_prog_info = False
    # lazy_instantiation() has not yet been run
    properly_instantiated = False

    re_desc_replace = [
        # removes every totally empty line
        (re.compile(r"\s*\n*\s*$", re.M), "")
        ]
    # Both groups have to be named groups, get_episode_num() reads them by
    # name. (the '?' of '(?P<episode_num>' used to be missing, which made
    # the patterns look for the literal text "P<episode_num>")
    re_episode_num = [
        re.compile(r"(?:P|p)art (?P<episode_num>\d+) ?of ?(?P<episodes_max>\d+)"),
        re.compile(r"(?:P|p)art (?P<episode_num>\d+)\((?P<episodes_max>\d+)\)"),
        ]
    # These should all have a match group - year or date when shown previously
    re_rerun_identifiers = []
    re_title_clean = []

    # Set this one to be your re2 expression if you are using re2 mode
    _pat_guide = None
    # Set these if you are using re2 and a dynamic get_channels
    _pat_channel = None
    _channel_url = None

    # (character, entity name) pairs used by markup(), the order matters:
    # the ampersand has to be handled first.
    # The entities are put together at run time so that tools which
    # unescape HTML entities in source files can not turn markup() into a
    # no-op.
    _markup_entities = (
        ('&', 'amp'),
        ('<', 'lt'),
        ('"', 'quot'),
        ('>', 'gt'),
        )
    # A numeric character reference (3 digits or more) whose '&' was
    # escaped by markup().
    _re_escaped_numeric_char_ref = re.compile(r"&%s;#(\d\d\d+);" % "amp",
                                              re.I)

    ############################################################################
    # Mode (CustomizedParser or Re2) independent methods:
    ############################################################################
    def __init__(self, grabber):
        self.grabber = grabber
        # These can't be put outside as class constants. (append-side-effect-problems)
        self.re_channel_clean = []
        self.re_program_clean = []
        self.re_program_info_clean = []

    def format_html_date(self, aDate):
        """
        Returns a date formatted string that will be appended to the url
        to get the correct page.
        This method SHOULD/MUST be overridden by subclasses.

        None should be returned if the channel provider does not display
        any programs of this date on his site.

        This method should only be invoked with dates that aren't
        used by anything else (use copy(date) to avoid nasty side-effects).
        """
        assert False, _( "Not Implemented Yet" )

    def prettyPrintTitle(self, title):
        """
        Sometimes programme data providers adds some stuff to the title that
        simply does not belong in there, this function will try to remove such
        things. It will also convert the text to unicode
        """
        # strings are immutable, the stripped text has to be kept
        # (string.strip(title) used to throw its result away)
        title = title.strip()
        for clean in self.re_title_clean:
            title = clean.sub("", title)
        title = title.strip()
        return self.markup(self.clean_char(title))

    def prettyPrintDesc(self, desc):
        """
        Makes the description easier on the eyes
        It will also convert the text to unicode
        """
        desc = self.markup(desc)
        for (pattern, replacement) in self.re_desc_replace:
            desc = pattern.sub(replacement, desc)
        return self.clean_char(desc)

    def prettyPrintTime(self, time):
        """ Returns the time with every '.' replaced by ':' """
        return time.replace(".", ":")

    def clean_char(self, contents):
        """
        Converts contents to unicode, decoding it as page_charset,
        unless it already is unicode.
        """
        if not isinstance(contents, unicode):
            contents = unicode(contents, self.page_charset, 'replace')
        return contents

    def markup(self, text):
        """
        Seems like the python XML parser will freak out whenever it sees a '&'
        in the cdata.
        This method will markup common problem characters (& < " >) as
        XML entities and strip surrounding whitespace. Text that already
        was marked up is left as it is, it is not marked up twice.
        """
        for (char, name) in self._markup_entities:
            entity = "&%s;" % name
            # What an entity that was present in the input looks like
            # after its '&' has been marked up by the first round.
            marked_up_twice = "&%s;%s;" % ("amp", name)
            text = text.replace(char, entity).replace(marked_up_twice, entity)
        text = text.strip()
        # Correct corruptions we just created...
        return self._re_escaped_numeric_char_ref.sub(r"&#\g<1>;", text)

    def _clear_html(self, data, re_clean, unicode_error="replace"):
        """
        Common implementation of clear_channel_html, clear_program_html and
        clear_program_info_html.
        Converts data to unicode if page_charset is set and runs it through
        every filter in re_clean.
        """
        if self.page_charset and not isinstance(data, unicode):
            data = unicode(data, self.page_charset, unicode_error)
        for clean in re_clean:
            data = clean.filter(data)
        return data

    def clear_channel_html(self, contents, unicode_error="replace"):
        """
        Runs contents through the filters in re_channel_clean.
        contents is returned untouched (not even converted to unicode)
        when there are no such filters.
        """
        if not self.re_channel_clean:
            return contents
        return self._clear_html(contents,
                                self.re_channel_clean,
                                unicode_error)

    def clear_program_html(self, contents, unicode_error="replace"):
        """
        This is the same method as Grab_C.clear_html but since every
        subgrabber needs unique page_charset, re_clean, re_replace and what not
        i can't use that method.

        Clear unwanted tags, attributes and whatever is defined by the
        filters in re_program_clean.

        Converts to unicode if page_charset is set, using unicode error policy
        defined by 'unicode_error'.
        """
        if not self.properly_instantiated:
            self.lazy_instantiation()
            self.properly_instantiated = True
        return self._clear_html(contents,
                                self.re_program_clean,
                                unicode_error)

    def clear_program_info_html(self, contents, unicode_error="replace"):
        """
        Clear unwanted tags, attributes and whatever is defined by the
        filters in re_program_info_clean.
        """
        if not self.properly_instantiated:
            self.lazy_instantiation()
            self.properly_instantiated = True
        return self._clear_html(contents,
                                self.re_program_info_clean,
                                unicode_error)

    def lazy_instantiation(self):
        """
        Should setup the re_program_info_clean, re_program_clean and
        other variables needed when this subgrabber is actually grabbing
        stuff from the internet.

        Will be called by clear_program_html() and clear_program_info_html()
        if those variables are not set.

        The reason to use lazy instantiation is that some
        subgrabbers may be inactivated by the configuration file, and
        re and re2 are kind of slow to compile.
        """
        assert False, _( "Not Implemented Yet" )

    ###########################################################################
    # Information retrieval methods,
    # Will try to generate data from the description field
    ###########################################################################
    def get_episode_num(self, descr):
        """
        Search the descr for text resembling a episode number.
        returns a string containing a episode-num string of
        type xmltv_ns described in the xmltv dtd (version 0.5.35)
        Returns None if no episode number is found
        """
        if not descr:
            return None
        if isinstance(descr, unicode):
            #int() does not seem to be able to handle unicode
            descr = descr.encode(self.page_charset, 'replace')
        for reg in self.re_episode_num:
            match = reg.search(descr)
            if match:
                # the number written out is one less than the one found
                rv = ".%d/%s." % (int(match.group("episode_num")) - 1,
                                  match.group("episodes_max"))
                return unicode(rv, self.page_charset, 'replace')
        return None

    def get_rerun(self, text):
        """
        Search the title and/or desc for text resembling a rerun identifier.
        returns group(1) of the regexp match if one is found
        """
        if not text:
            return None
        for reg in self.re_rerun_identifiers:
            match = reg.search(text)
            if match:
                return match.group(1)
        return None

    def get_duration(self, description, startTime):
        """
        Tries to figure out the endTime of this programme based
        on information found in the description, returns None if
        nothing was found.
        This base implementation never finds anything.
        """
        return None
# SubGrabber
class SubGrabber_Re2 (SubGrabber):
    " Subgrabber utilizing Re2 "

    # Optional compiled regular expression. Programs whose title matches
    # it are left out by get_programs_from_re2_output().
    re_title_noservice = None
    # Set this one to be the re2 expression for the program info pages if
    # your subgrabber supports program info.
    _pat_guide_program_info = None

    # Attributes copied as they are from the re2 output to the
    # ProgramContainer.
    _copied_attributes = ("category", "previouslyShown", "sub_title",
                          "episode")

    def flatten_array(self, arr):
        """
        Sometimes the data is put inside an array when using re2 (for no
        apparent reason)
        Returns the items of the array joined into one string.
        """
        return "".join(arr)

    def get_channels(self, html_input):
        assert False, _( "Not Implemented Yet" )

    def get_programs(self, html_input, chan_c):
        """
        Return a list of ProgramContainer:
            ( ProgramContainer )
        from the html structure.
        Returns None if there is no input or if _pat_guide finds no
        programme in it.
        """
        if (not html_input) or html_input.strip() == "":
            return None
        guide = self._pat_guide.extract(html_input)
        if not guide or not hasattr(guide, 'programme'):
            return None
        # Fall back to the standard guide handling
        return self.get_programs_from_re2_output(guide, chan_c)

    def _flattened(self, p, name):
        """
        Returns attribute 'name' of the re2 output object p.
        An attribute holding a list is replaced by the flattened list first.
        """
        if isinstance(getattr(p, name), list):
            setattr(p, name, self.flatten_array(getattr(p, name)))
        return getattr(p, name)

    def _fill_in_from_description(self, pc):
        """
        Looks for the episode number and the rerun identifier in the
        description of pc if those are not known yet.
        """
        if pc.description:
            if not pc.episode:
                pc.episode = self.get_episode_num(pc.description)
            if not pc.previouslyShown:
                pc.previouslyShown = self.get_rerun(pc.description)

    def get_programs_from_re2_output(self, re2_output, chan_c):
        """
        Converts the output of re2.extract to the same
        list of ProgramContainer the customized parser version
        of get_programs outputs
        """
        if not re2_output or not hasattr(re2_output, 'programme'):
            return None
        programs = []
        for p in re2_output.programme:
            pc = ProgramContainer()
            pc.startTime = self.prettyPrintTime(p.startTime)
            pc.title = p.title
            if not (pc.startTime and pc.title):
                # title and starttime is the minimum
                continue
            if hasattr(p, "endTime"):
                pc.endTime = self.prettyPrintTime(p.endTime)
            pc.title = self.prettyPrintTitle(p.title)
            if hasattr(p, "description"):
                pc.description = \
                    self.prettyPrintDesc(self._flattened(p, "description"))
            for name in self._copied_attributes:
                if hasattr(p, name):
                    setattr(pc, name, getattr(p, name))
            if hasattr(p, "url") and p.url.strip() != "":
                pc.url = chan_c.base_url + p.url
            self._fill_in_from_description(pc)

            if programs:
                prevPc = programs[-1]
                # a program without end time ends when the next one starts
                if prevPc.endTime is None:
                    prevPc.endTime = pc.startTime
                # remove duplicates
                # NOTE(review): assumed to apply to every previous program,
                # not only to those that had no end time - confirm.
                if prevPc.startTime == pc.startTime:
                    del programs[-1]

            if pc.endTime is None and pc.description and pc.startTime:
                pc.endTime = self.get_duration(pc.description, pc.startTime)

            # re_title_noservice is optional, subgrabbers that do not set
            # it used to end up with an AttributeError here
            if self.re_title_noservice and \
                   self.re_title_noservice.search(pc.title):
                continue
            programs.append(pc)
        return programs

    def get_program_info(self, html_input, pc, chan_c):
        """
        Populates the ProgramContainer with more data from the
        html_input
        Nothing is done if _pat_guide_program_info is not set or finds
        no programme_info in the input.
        """
        if not self._pat_guide_program_info:
            return None
        guide = self._pat_guide_program_info.extract(html_input)
        if not guide or not hasattr(guide, 'programme_info'):
            return None
        # Fall back to the standard program info handling
        self.get_program_info_from_re2_output(pc, guide, chan_c)

    def _get_program_info_from_re2_output(self, pc, p, chan_c):
        """
        Fills in the fields of the ProgramContainer pc that still are None
        with the values found in p, one programme_info of the re2 output.
        description_addon is appended to the description and
        description_replace replaces it, whether the description was known
        before or not.
        """
        if pc.startTime is None and hasattr(p, 'startTime'):
            pc.startTime = self.prettyPrintTime(p.startTime)
        if pc.endTime is None and hasattr(p, 'endTime'):
            pc.endTime = self.prettyPrintTime(p.endTime)
        if pc.title is None and hasattr(p, 'title'):
            pc.title = self.prettyPrintTitle(p.title)

        if pc.description is None:
            if hasattr(p, 'description'):
                pc.description = \
                    self.prettyPrintDesc(self._flattened(p, 'description'))
            else:
                # so that description_addon has something to be added to
                pc.description = ""
        if hasattr(p, 'description_addon'):
            pc.description += \
                self.prettyPrintDesc(self._flattened(p, 'description_addon'))
        if hasattr(p, 'description_replace'):
            pc.description = \
                self.prettyPrintDesc(self._flattened(p, 'description_replace'))

        # Can't see url ever happening, the url was the thing
        # leading us here in the first place, but better safe than sorry
        for name in self._copied_attributes + ("url",):
            if getattr(pc, name) is None and hasattr(p, name):
                setattr(pc, name, getattr(p, name))

        self._fill_in_from_description(pc)

    def get_program_info_from_re2_output(self, pc, re2_output, chan_c):
        """
        Adds the program info found in the output of re2.extract to the
        ProgramContainer pc. The programme_info of the output may be a
        single object or a list of them.
        """
        if not re2_output or not hasattr(re2_output, 'programme_info'):
            return None
        if isinstance(re2_output.programme_info, list):
            for p in re2_output.programme_info:
                self._get_program_info_from_re2_output(pc, p, chan_c)
        else:
            self._get_program_info_from_re2_output(pc,
                                                   re2_output.programme_info,
                                                   chan_c)
# SubGrabber_Re2
class SubGrabber_C (SubGrabber):
    " Subgrabber utilizing Customized Parser "

    def get_subtree(self, tree, traceArray):
        """
        Returns a subtree pointed out by the array of child indexes in
        traceArray.
        get_subtree(tree,[0,1,2]) will get
        tree.children[0].children[1].children[2] or None
        An empty traceArray gives the tree itself.
        """
        for t in traceArray:
            if len(tree.children) > t:
                tree = tree.children[t]
            else:
                # traceArray was faulty, could not traverse tree
                return None
        return tree

    def search_tree(self, tree, searchFunc, cutoff=0):
        """
        Searches the tree recursively, when searchFunc signals a match
        an array of child indexes pointing to the point where the match
        occurred is returned. See get_subtree().
        Returns None if nothing matches.

        if cutoff > 0 that number of items will be removed from the end of
        the result, the result then points at an ancestor of the match.
        A cutoff larger than the result gives an empty array, i.e. the tree
        itself (this used to end with an IndexError).
        """
        if searchFunc(tree):
            return []
        for (index, child) in enumerate(tree.children):
            # the cutoff is applied once, to the complete path, and is
            # therefore not handed down
            path = self.search_tree(child, searchFunc)
            if path is not None: # an empty [] is also false...
                path.insert(0, index)
                if cutoff > 0:
                    del path[max(0, len(path) - cutoff):]
                return path
        return None

    def search_and_get_tree(self, tree, searchFunc, cutoff=0):
        """
        combines search_tree() and get_subtree().
        Returns None if something fails
        """
        path = self.search_tree(tree, searchFunc, cutoff)
        if path is None: # an empty [] is also false...
            return None
        return self.get_subtree(tree, path)

    def get_attr(self, tree, attr_name):
        """
        Searches after the attr named attr_name of this Tag (tree).
        Returns the value of the first occurrence, None if there is none.
        """
        for (attr_t, attr_v) in tree.attrs:
            if attr_t == attr_name:
                return attr_v
        return None
# SubGrabber_C
class ProgramContainer:
    """
    Plain record holding what is known about one program.
    Every field is None until a subgrabber has filled it in.
    """
    # The minimum a program needs: when it starts and what it is called
    title = None
    startTime = None

    # Optional timing and texts
    endTime = None
    sub_title = None
    description = None
    category = None

    # Episode number and rerun information
    episode = None
    previouslyShown = None

    # Address of the page with more information about the program
    url = None
# ProgramContainer
class ChannelBlueprint:
    """
    Everything the parser has to know about one channel as delivered by
    one subgrabber.
    """

    def __init__(self, ch_id, display_name, base_url,
                 ch_url, sub_grabber,
                 start_of_day, ch_icon_url=None, timeshift=0,
                 supports_prog_info=True, tz=None):
        assert isinstance(sub_grabber, SubGrabber), \
               "sub_grabber must be subclass of tvgrab.mtgrab.SubGrabber"

        ## Identity of the channel
        # channel id, i.e. svt1.svt.se
        self.channel_id = ch_id
        # name shown to the user, i.e. SVT1
        self.display_name = display_name

        ## Where to find the channel
        # root url of the provider. i.e. "http://svt.se"
        self.base_url = base_url
        # Url of the page to parse, the date is appended to it
        # i.e. "http://svt.se/svt/jsp/Crosslink.jsp?d=8764&selectedDate="
        self.dateless_channel_url = ch_url
        self.icon_url = ch_icon_url

        ## How to grab it
        # the subgrabber holding all the channel specific logic
        self.sub_grabber = sub_grabber
        # Whether the get_program_info phase should be used or not.
        self.supports_prog_info = supports_prog_info

        ## Time handling
        # Programs starting before this time (24h notation) are put on the
        # day after the grabbed date by TaddProgram
        self.start_of_day = start_of_day
        # number of minutes the shows are shifted
        # (TaddProgram adds 60*timeshift seconds to start and stop)
        self.timeshift = timeshift
        # Time zone of the grabbed data
        if tz != None:
            assert isinstance(tz, tzinfo), \
                   "tz must be a subclass of datetime.tzinfo"
        self.tz = tz
# ChannelBlueprint
|