__init__.py :  » Media-Sound-Audio » IMDbPY » IMDbPY-4.5.1 » imdb » parser » mobile » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Media Sound Audio » IMDbPY 
IMDbPY » IMDbPY 4.5.1 » imdb » parser » mobile » __init__.py
"""
parser.mobile package (imdb package).

This package provides the IMDbMobileAccessSystem class used to access
IMDb's data for mobile systems.
the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "mobile".

Copyright 2005-2010 Davide Alberani <da@erlug.linux.it>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

import re
import logging
from urllib import unquote

from imdb import imdbURL_movie_main,imdbURL_person_main,imdbURL_character_main
from imdb.Movie import Movie
from imdb.utils import analyze_title,analyze_name,canonicalName,\
                        date_and_notes
from imdb._exceptions import IMDbDataAccessError
from imdb.parser.http import IMDbHTTPAccessSystem
from imdb.parser.http.utils import subXMLRefs,subSGMLRefs,build_person,\
                                    build_movie, re_spaces

# XXX NOTE: the first version of this module was heavily based on
#           regular expressions.  This new version replace regexps with
#           find() strings' method calls; despite being less flexible, it
#           seems to be at least as fast and, hopefully, much more
#           lightweight.  Yes: the regexp-based version was too heavyweight
#           for systems with very limited CPU power and memory footprint.
re_spacessub = re_spaces.sub
# Strip html.
re_unhtml = re.compile(r'<.+?>')
re_unhtmlsub = re_unhtml.sub
# imdb person or movie ids.
re_imdbID = re.compile(r'(?<=nm|tt|ch)([0-9]{7})\b')

# movie AKAs.
re_makas = re.compile('(<p class="find-aka">.*?</p>)')


def _unHtml(s):
    """Return a string without tags and no multiple spaces."""
    return subSGMLRefs(re_spacessub(' ', re_unhtmlsub('', s)).strip())


_inttype = type(0)

def _getTagsWith(s, cont, toClosure=False, maxRes=None):
    """Return the html tags in the 's' string containing the 'cont'
    string; if toClosure is True, everything between the opening
    tag and the closing tag is returned."""
    lres = []
    bi = s.find(cont)
    if bi != -1:
        btag = s[:bi].rfind('<')
        if btag != -1:
            if not toClosure:
                etag = s[bi+1:].find('>')
                if etag != -1:
                    endidx = bi+2+etag
                    lres.append(s[btag:endidx])
                    if maxRes is not None and len(lres) >= maxRes: return lres
                    lres += _getTagsWith(s[endidx:], cont,
                                        toClosure=toClosure)
            else:
                spaceidx = s[btag:].find(' ')
                if spaceidx != -1:
                    ctag = '</%s>' % s[btag+1:btag+spaceidx]
                    closeidx = s[bi:].find(ctag)
                    if closeidx != -1:
                        endidx = bi+closeidx+len(ctag)
                        lres.append(s[btag:endidx])
                        if maxRes is not None and len(lres) >= maxRes:
                            return lres
                        lres += _getTagsWith(s[endidx:], cont,
                                            toClosure=toClosure)
    return lres


def _findBetween(s, begins, ends, beginindx=0, maxRes=None, lres=None):
    """Return the list of strings from the 's' string which are included
    between the 'begins' and 'ends' strings."""
    if lres is None:
        lres = []
    bi = s.find(begins, beginindx)
    if bi != -1:
        lbegins = len(begins)
        if isinstance(ends, (list, tuple)):
            eset = [s.find(end, bi+lbegins) for end in ends]
            eset[:] = [x for x in eset if x != -1]
            if not eset: ei = -1
            else: ei = min(eset)
        else:
            ei = s.find(ends, bi+lbegins)
        if ei != -1:
            match = s[bi+lbegins:ei]
            lres.append(match)
            if maxRes is not None and len(lres) >= maxRes: return lres
            _findBetween(s, begins, ends, beginindx=ei, maxRes=maxRes,
                        lres=lres)
    return lres


class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
    """The class used to access IMDb's data through the web for
    mobile terminals."""

    accessSystem = 'mobile'
    _mobile_logger = logging.getLogger('imdbpy.parser.mobile')

    def __init__(self, isThin=1, *arguments, **keywords):
        self.accessSystem = 'mobile'
        IMDbHTTPAccessSystem.__init__(self, isThin, *arguments, **keywords)

    def _clean_html(self, html):
        """Normalize the retrieve html."""
        html = re_spaces.sub(' ', html)
        return subXMLRefs(html)

    def _mretrieve(self, url, size=-1):
        """Retrieve an html page and normalize it."""
        cont = self._retrieve(url, size=size)
        return self._clean_html(cont)

    def _getPersons(self, s, sep='<br/>'):
        """Return a list of Person objects, from the string s; items
        are assumed to be separated by the sep string."""
        names = s.split(sep)
        pl = []
        plappend = pl.append
        counter = 1
        for name in names:
            pid = re_imdbID.findall(name)
            if not pid: continue
            characters = _getTagsWith(name, 'class="char"',
                                        toClosure=True, maxRes=1)
            chpids = []
            if characters:
                for ch in characters[0].split(' / '):
                    chid = re_imdbID.findall(ch)
                    if not chid:
                        chpids.append(None)
                    else:
                        chpids.append(chid[-1])
            if not chpids:
                chpids = None
            elif len(chpids) == 1:
                chpids = chpids[0]
            name = _unHtml(name)
            # Catch unclosed tags.
            gt_indx = name.find('>')
            if gt_indx != -1:
                name = name[gt_indx+1:].lstrip()
            if not name: continue
            if name.endswith('...'):
                name = name[:-3]
            p = build_person(name, personID=str(pid[0]), billingPos=counter,
                            modFunct=self._defModFunct, roleID=chpids,
                            accessSystem=self.accessSystem)
            plappend(p)
            counter += 1
        return pl

    def _search_movie(self, title, results):
        ##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title})
        ##params = 'q=%s&tt=on&mx=%s' % (urllib.quote_plus(title), str(results))
        ##cont = self._mretrieve(imdbURL_search % params)
        cont = subXMLRefs(self._get_search_content('tt', title, results))
        title = _findBetween(cont, '<title>', '</title>', maxRes=1)
        res = []
        if not title:
            self._mobile_logger.error('no title tag searching for movie %s',
                                    title)
            return res
        tl = title[0].lower()
        if not tl.startswith('imdb title'):
            # a direct hit!
            title = _unHtml(title[0])
            mid = None
            midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
            if midtag:
                mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
            if not (mid and title):
                self._mobile_logger.error('no direct hit title/movieID for' \
                                            ' title %s', title)
                return res
            if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
                title += ' (mini)'
            res[:] = [(str(mid[0]), analyze_title(title))]
        else:
            # XXX: this results*3 prevents some recursion errors, but...
            #      it's not exactly understandable (i.e.: why 'results' is
            #      not enough to get all the results?)
            lis = _findBetween(cont, 'td valign="top">', '</td>',
                                maxRes=results*3)
            for li in lis:
                akas = re_makas.findall(li)
                for idx, aka in enumerate(akas):
                    aka = aka.replace('" - ', '::', 1)
                    aka = _unHtml(aka)
                    if aka.startswith('aka "'):
                        aka = aka[5:].strip()
                    if aka[-1] == '"':
                        aka = aka[:-1]
                    akas[idx] = aka
                imdbid = re_imdbID.findall(li)
                li = re_makas.sub('', li)
                mtitle = _unHtml(li)
                if not (imdbid and mtitle):
                    self._mobile_logger.debug('no title/movieID parsing' \
                                            ' %s searching for title %s', li,
                                            title)
                    continue
                mtitle = mtitle.replace('(TV mini-series)', '(mini)')
                resd = analyze_title(mtitle)
                if akas:
                    resd['akas'] = akas
                res.append((str(imdbid[0]), resd))
        return res

    def get_movie_main(self, movieID):
        cont = self._mretrieve(imdbURL_movie_main % movieID + 'maindetails')
        title = _findBetween(cont, '<title>', '</title>', maxRes=1)
        if not title:
            raise IMDbDataAccessError, 'unable to get movieID "%s"' % movieID
        title = _unHtml(title[0])
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            title += ' (mini)'
        d = analyze_title(title)
        kind = d.get('kind')
        tv_series = _findBetween(cont, 'TV Series:</h5>', '</a>', maxRes=1)
        if tv_series: mid = re_imdbID.findall(tv_series[0])
        else: mid = None
        if tv_series and mid:
            s_title = _unHtml(tv_series[0])
            s_data = analyze_title(s_title)
            m = Movie(movieID=str(mid[0]), data=s_data,
                        accessSystem=self.accessSystem,
                        modFunct=self._defModFunct)
            d['kind'] = kind = u'episode'
            d['episode of'] = m
        if kind in ('tv series', 'tv mini series'):
            years = _findBetween(cont, '<h1>', '</h1>', maxRes=1)
            if years:
                years[:] = _findBetween(years[0], 'TV series', '</span>',
                                        maxRes=1)
                if years:
                    d['series years'] = years[0].strip()
        air_date = _findBetween(cont, 'Original Air Date:</h5>', '</div>',
                                maxRes=1)
        if air_date:
            air_date = air_date[0]
            vi = air_date.find('(')
            if vi != -1:
                date = _unHtml(air_date[:vi]).strip()
                if date != '????':
                    d['original air date'] = date
                air_date = air_date[vi:]
                season = _findBetween(air_date, 'Season', ',', maxRes=1)
                if season:
                    season = season[0].strip()
                    try: season = int(season)
                    except: pass
                    if season or type(season) is _inttype:
                        d['season'] = season
                episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
                if episode:
                    episode = episode[0].strip()
                    try: episode = int(episode)
                    except: pass
                    if episode or type(season) is _inttype:
                        d['episode'] = episode
        direct = _findBetween(cont, '<h5>Director', ('</div>', '<br/> <br/>'),
                                maxRes=1)
        if direct:
            direct = direct[0]
            h5idx = direct.find('/h5>')
            if h5idx != -1:
                direct = direct[h5idx+4:]
            direct = self._getPersons(direct)
            if direct: d['director'] = direct
        if kind in ('tv series', 'tv mini series', 'episode'):
            if kind != 'episode':
                seasons = _findBetween(cont, 'Seasons:</h5>', '</div>',
                                        maxRes=1)
                if seasons:
                    d['number of seasons'] = seasons[0].count('|') + 1
            creator = _findBetween(cont, 'Created by</h5>', ('class="tn15more"',
                                                            '</div>',
                                                            '<br/> <br/>'),
                                                            maxRes=1)
            if not creator:
                # They change 'Created by' to 'Creator' and viceversa
                # from time to time...
                # XXX: is 'Creators' also used?
                creator = _findBetween(cont, 'Creator:</h5>',
                                        ('class="tn15more"', '</div>',
                                        '<br/> <br/>'), maxRes=1)
            if creator:
                creator = creator[0]
                if creator.find('tn15more'): creator = '%s>' % creator
                creator = self._getPersons(creator)
                if creator: d['creator'] = creator
        writers = _findBetween(cont, '<h5>Writer', ('</div>', '<br/> <br/>'),
                                maxRes=1)
        if writers:
            writers = writers[0]
            h5idx = writers.find('/h5>')
            if h5idx != -1:
                writers = writers[h5idx+4:]
            writers = self._getPersons(writers)
            if writers: d['writer'] = writers
        cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
        if cvurl:
            cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
            if cvurl: d['cover url'] = cvurl[0]
        genres = _findBetween(cont, 'href="/Sections/Genres/', '/')
        if genres:
            d['genres'] = list(set(genres))
        ur = _findBetween(cont, '<div class="starbar-meta">', '</div>',
                            maxRes=1)
        if ur:
            rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
            if rat:
                teni = rat[0].find('/10')
                if teni != -1:
                    rat = rat[0][:teni]
                    try:
                        rat = float(rat.strip())
                        d['rating'] = rat
                    except ValueError:
                        self._mobile_logger.warn('wrong rating: %s', rat)
            vi = ur[0].rfind('tn15more">')
            if vi != -1 and ur[0][vi+10:].find('await') == -1:
                try:
                    votes = _unHtml(ur[0][vi+10:]).replace('votes', '').strip()
                    votes = int(votes.replace(',', ''))
                    d['votes'] = votes
                except ValueError:
                    self._mobile_logger.warn('wrong votes: %s', ur)
        top250 = _findBetween(cont, 'href="/chart/top?', '</a>', maxRes=1)
        if top250:
            fn = top250[0].rfind('#')
            if fn != -1:
                try:
                    td = int(top250[0][fn+1:])
                    d['top 250 rank'] = td
                except ValueError:
                    self._mobile_logger.warn('wrong top250: %s', top250)
        castdata = _findBetween(cont, 'Cast overview', '</table>', maxRes=1)
        if not castdata:
            castdata = _findBetween(cont, 'Credited cast', '</table>', maxRes=1)
        if not castdata:
            castdata = _findBetween(cont, 'Complete credited cast', '</table>',
                                    maxRes=1)
        if not castdata:
            castdata = _findBetween(cont, 'Series Cast Summary', '</table>',
                                    maxRes=1)
        if not castdata:
            castdata = _findBetween(cont, 'Episode Credited cast', '</table>',
                                    maxRes=1)
        if castdata:
            castdata = castdata[0]
            # Reintegrate the fist tag.
            fl = castdata.find('href=')
            if fl != -1: castdata = '<a ' + castdata[fl:]
            # Exclude the 'rest of cast listed alphabetically' row.
            smib = castdata.find('<tr><td align="center" colspan="4"><small>')
            if smib != -1:
                smie = castdata.rfind('</small></td></tr>')
                if smie != -1:
                    castdata = castdata[:smib].strip() + \
                                castdata[smie+18:].strip()
            castdata = castdata.replace('/tr> <tr', '/tr><tr')
            cast = self._getPersons(castdata, sep='</tr><tr')
            if cast: d['cast'] = cast
        akas = _findBetween(cont, 'Also Known As:</h5>', '</div>', maxRes=1)
        if akas:
            # For some reason, here <br> is still used in place of <br/>.
            akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
            akas = [_unHtml(x).replace(' (','::(', 1) for x in akas]
            if 'more' in akas: akas.remove('more')
            d['akas'] = akas
        mpaa = _findBetween(cont, 'MPAA</a>:', '</div>', maxRes=1)
        if mpaa: d['mpaa'] = _unHtml(mpaa[0])
        runtimes = _findBetween(cont, 'Runtime:</h5>', '</div>', maxRes=1)
        if runtimes:
            runtimes = runtimes[0]
            runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
                    for x in runtimes.split('|')]
            d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
        if kind == 'episode':
            # number of episodes.
            epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
                                maxRes=1)
            if epsn:
                epsn = epsn[0].replace(' Episodes', '').strip()
                if epsn:
                    try:
                        epsn = int(epsn)
                    except:
                        self._mobile_logger.warn('wrong episodes #: %s', epsn)
                    d['number of episodes'] = epsn
        country = _findBetween(cont, 'Country:</h5>', '</div>', maxRes=1)
        if country:
            country[:] = country[0].split(' | ')
            country[:] = ['<a %s' % x for x in country if x]
            country[:] = [_unHtml(x.replace(' <i>', '::')) for x in country]
            if country: d['countries'] = country
        lang = _findBetween(cont, 'Language:</h5>', '</div>', maxRes=1)
        if lang:
            lang[:] = lang[0].split(' | ')
            lang[:] = ['<a %s' % x for x in lang if x]
            lang[:] = [_unHtml(x.replace(' <i>', '::')) for x in lang]
            if lang: d['languages'] = lang
        col = _findBetween(cont, '"/search/title?colors=', '</div>')
        if col:
            col[:] = col[0].split(' | ')
            col[:] = ['<a %s' % x for x in col if x]
            col[:] = [_unHtml(x.replace(' <i>', '::')) for x in col]
            if col: d['color info'] = col
        sm = _findBetween(cont, '/search/title?sound_mixes=', '</div>',
                            maxRes=1)
        if sm:
            sm[:] = sm[0].split(' | ')
            sm[:] = ['<a %s' % x for x in sm if x]
            sm[:] = [_unHtml(x.replace(' <i>', '::')) for x in sm]
            if sm: d['sound mix'] = sm
        cert = _findBetween(cont, 'Certification:</h5>', '</div>', maxRes=1)
        if cert:
            cert[:] = cert[0].split(' | ')
            cert[:] = [_unHtml(x.replace(' <i>', '::')) for x in cert]
            if cert: d['certificates'] = cert
        plotoutline = _findBetween(cont, 'Plot:</h5>', ['<a ', '</div>'],
                                    maxRes=1)
        if plotoutline:
            plotoutline = plotoutline[0].strip()
            plotoutline = plotoutline.rstrip('|').rstrip()
            if plotoutline: d['plot outline'] = _unHtml(plotoutline)
        aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
                            maxRes=1)
        if aratio:
            aratio = aratio[0].strip().replace(' (', '::(', 1)
            if aratio:
                d['aspect ratio'] = _unHtml(aratio)
        return {'data': d}

    def get_movie_plot(self, movieID):
        cont = self._mretrieve(imdbURL_movie_main % movieID + 'plotsummary')
        plot = _findBetween(cont, '<p class="plotpar">', '</p>')
        plot[:] = [_unHtml(x) for x in plot]
        for i in xrange(len(plot)):
            p = plot[i]
            wbyidx = p.rfind(' Written by ')
            if wbyidx != -1:
                plot[i] = '%s::%s' % \
                        (p[:wbyidx].rstrip(),
                    p[wbyidx+12:].rstrip().replace('{','<').replace('}','>'))
            else:
                plot[i] = '%s::Anonymous' % p
        if plot: return {'data': {'plot': plot}}
        return {'data': {}}

    def _search_person(self, name, results):
        ##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
        ##params = 'q=%s&nm=on&mx=%s' % (urllib.quote_plus(name), str(results))
        ##cont = self._mretrieve(imdbURL_search % params)
        cont = subXMLRefs(self._get_search_content('nm', name, results))
        name = _findBetween(cont, '<title>', '</title>', maxRes=1)
        res = []
        if not name:
            self._mobile_logger.warn('no title tag searching for name %s', name)
            return res
        nl = name[0].lower()
        if not nl.startswith('imdb name'):
            # a direct hit!
            name = _unHtml(name[0])
            name = name.replace('- Filmography by type' , '').strip()
            pid = None
            pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
            if pidtag:
                pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
            if not (pid and name):
                self._mobile_logger.error('no direct hit name/personID for' \
                                            ' name %s', name)
                return res
            res[:] = [(str(pid[0]), analyze_name(name, canonical=1))]
        else:
            lis = _findBetween(cont, 'td valign="top">', '</td>',
                                maxRes=results*3)
            for li in lis:
                akas = _findBetween(li, '<em>"', '"</em>')
                for sep in ['<small', '<br> aka', '<br> birth name']:
                    sepIdx = li.find(sep)
                    if sepIdx != -1:
                        li = li[:sepIdx]
                pid = re_imdbID.findall(li)
                pname = _unHtml(li)
                if not (pid and pname):
                    self._mobile_logger.debug('no name/personID parsing' \
                                            ' %s searching for name %s', li,
                                            name)
                    continue
                resd = analyze_name(pname, canonical=1)
                if akas:
                    resd['akas'] = akas
                res.append((str(pid[0]), resd))
        return res

    def get_person_main(self, personID, _parseChr=False):
        if not _parseChr:
            url = imdbURL_person_main % personID + 'maindetails'
        else:
            url = imdbURL_character_main % personID
        s = self._mretrieve(url)
        r = {}
        name = _findBetween(s, '<title>', '</title>', maxRes=1)
        if not name:
            if _parseChr: w = 'characterID'
            else: w = 'personID'
            raise IMDbDataAccessError, 'unable to get %s "%s"' % (w, personID)
        name = _unHtml(name[0])
        if _parseChr:
            name = name.replace('(Character)', '').strip()
            name = name.replace('- Filmography by type', '').strip()
        else:
            name = name.replace('- Filmography by', '').strip()
        r = analyze_name(name, canonical=not _parseChr)
        for dKind in ('birth', 'death'):
            date = _findBetween(s, '<h5>Date of %s:</h5>' % dKind.capitalize(),
                                ('<a class', '</div>', '<br/><br/>'), maxRes=1)
            if date:
                date = _unHtml(date[0])
                if date:
                    date, notes = date_and_notes(date)
                    if date:
                        r['%s date' % dKind] = date
                    if notes:
                        r['%s notes' % dKind] = notes
        akas = _findBetween(s, 'Alternate Names:</h5>', ('</div>',
                            '<br/><br/>'), maxRes=1)
        if akas:
            akas = akas[0]
            if akas.find(' | ') != -1:
                akas = _unHtml(akas).split(' | ')
            else:
                akas = _unHtml(akas).split(' / ')
            if akas: r['akas'] = akas
        hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
        if hs:
            hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
            if hs: r['headshot'] = hs[0]
        # Build a list of tuples such [('hrefLink', 'section name')]
        workkind = _findBetween(s, '<div class="strip jump">', '</div>',
                                maxRes=1)
        if workkind:
            workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
        else:
            # Assume there's only one section and/or there are no
            # section links, for some reason.
            workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
            workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
        ws = []
        for work in workkind:
            wsplit = work.split('">', 1)
            if len(wsplit) == 2:
                sect = wsplit[0]
                if '"' in sect:
                    sect = sect[:sect.find('"')]
                ws.append((sect, wsplit[1].lower()))
        # XXX: I think "guest appearances" are gone.
        if s.find('<a href="#guest-appearances"') != -1:
            ws.append(('guest-appearances', 'notable tv guest appearances'))
        #if _parseChr:
        #    ws.append(('filmography', 'filmography'))
        for sect, sectName in ws:
            raws = u''
            # Everything between the current section link and the end
            # of the <ol> tag.
            if _parseChr and sect == 'filmography':
                inisect = s.find('<div class="filmo">')
            else:
                inisect = s.find('<a name="%s' % sect)
            if inisect != -1:
                endsect = s[inisect:].find('</ol>')
                if endsect != -1: raws = s[inisect:inisect+endsect]
            if not raws: continue
            mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
            for m in mlist:
                # For every movie in the current section.
                movieID = re_imdbID.findall(m)
                if not movieID:
                    self._mobile_logger.debug('no movieID in %s', m)
                    continue
                if not _parseChr:
                    chrIndx = m.find(' .... ')
                else:
                    chrIndx = m.find(' Played by ')
                chids = []
                if chrIndx != -1:
                    chrtxt = m[chrIndx+6:]
                    if _parseChr:
                        chrtxt = chrtxt[5:]
                    for ch in chrtxt.split(' / '):
                        chid = re_imdbID.findall(ch)
                        if not chid:
                            chids.append(None)
                        else:
                            chids.append(chid[-1])
                if not chids:
                    chids = None
                elif len(chids) == 1:
                    chids = chids[0]
                movieID = str(movieID[0])
                # Search the status.
                stidx = m.find('<i>')
                status = u''
                if stidx != -1:
                    stendidx = m.rfind('</i>')
                    if stendidx != -1:
                        status = _unHtml(m[stidx+3:stendidx])
                        m = m.replace(m[stidx+3:stendidx], '')
                m = _unHtml(m)
                if not m:
                    self._mobile_logger.warn('no title fo rmovieID %s', movieID)
                    continue
                movie = build_movie(m, movieID=movieID, status=status,
                                    roleID=chids, modFunct=self._defModFunct,
                                    accessSystem=self.accessSystem,
                                    _parsingCharacter=_parseChr)
                r.setdefault(sectName, []).append(movie)
        # If available, take the always correct name from a form.
        itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
        if not itag:
            itag = _getTagsWith(s, 'name="primary"', maxRes=1)
        if itag:
            vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
            if not vtag:
                vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
            if vtag:
                try:
                    vtag = unquote(str(vtag[0]))
                    vtag = unicode(vtag, 'latin_1')
                    r.update(analyze_name(vtag))
                except UnicodeEncodeError:
                    pass
        return {'data': r, 'info sets': ('main', 'filmography')}

    def get_person_biography(self, personID):
        cont = self._mretrieve(imdbURL_person_main % personID + 'bio')
        d = {}
        spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
                                maxRes=1)
        if spouses:
            sl = []
            for spouse in spouses[0].split('</tr>'):
                if spouse.count('</td>') > 1:
                    spouse = spouse.replace('</td>', '::</td>', 1)
                spouse = _unHtml(spouse)
                spouse = spouse.replace(':: ', '::').strip()
                if spouse: sl.append(spouse)
            if sl: d['spouse'] = sl
        nnames = _findBetween(cont, '<h5>Nickname</h5>', ('<br/> <br/>','<h5>'),
                                maxRes=1)
        if nnames:
            nnames = nnames[0]
            if nnames:
                nnames = [x.strip().replace(' (', '::(', 1)
                            for x in nnames.split('<br/>')]
                if nnames:
                    d['nick names'] = nnames
        misc_sects = _findBetween(cont, '<h5>', '<br/>')
        misc_sects[:] = [x.split('</h5>') for x in misc_sects]
        misc_sects[:] = [x for x in misc_sects if len(x) == 2]
        for sect, data in misc_sects:
            sect = sect.lower().replace(':', '').strip()
            if d.has_key(sect) and sect != 'mini biography': continue
            elif sect in ('spouse', 'nickname'): continue
            if sect == 'salary': sect = 'salary history'
            elif sect == 'where are they now': sect = 'where now'
            elif sect == 'personal quotes': sect = 'quotes'
            data = data.replace('</p><p>', '::')
            data = data.replace('<br><br>', ' ') # for multi-paragraphs 'bio'
            data = data.replace('</td> <td valign="top">', '@@@@')
            data = data.replace('</td> </tr>', '::')
            data = _unHtml(data)
            data = [x.strip() for x in data.split('::')]
            data[:] = [x.replace('@@@@', '::') for x in data if x]
            if sect == 'height' and data: data = data[0]
            elif sect == 'birth name': data = canonicalName(data[0])
            elif sect == 'date of birth':
                date, notes = date_and_notes(data[0])
                if date:
                    d['birth date'] = date
                if notes:
                    d['birth notes'] = notes
                continue
            elif sect == 'date of death':
                date, notes = date_and_notes(data[0])
                if date:
                    d['death date'] = date
                if notes:
                    d['death notes'] = notes
                continue
            elif sect == 'mini biography':
                ndata = []
                for bio in data:
                    byidx = bio.rfind('IMDb Mini Biography By')
                    if byidx != -1:
                        bioAuth = bio[:byidx].rstrip()
                    else:
                        bioAuth = 'Anonymous'
                    bio = u'%s::%s' % (bioAuth, bio[byidx+23:].lstrip())
                    ndata.append(bio)
                data[:] = ndata
                if 'mini biography' in d:
                    d['mini biography'].append(ndata[0])
                    continue
            d[sect] = data
        return {'data': d}

    def _search_character(self, name, results):
        cont = subXMLRefs(self._get_search_content('char', name, results))
        name = _findBetween(cont, '<title>', '</title>', maxRes=1)
        res = []
        if not name:
            self._mobile_logger.error('no title tag searching character %s',
                                    name)
            return res
        nl = name[0].lower()
        if not (nl.startswith('imdb search') or nl.startswith('imdb  search') \
                or nl.startswith('imdb character')):
            # a direct hit!
            name = _unHtml(name[0]).replace('(Character)', '').strip()
            pid = None
            pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
            if pidtag:
                pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
            if not (pid and name):
                self._mobile_logger.error('no direct hit name/characterID for' \
                                            ' character %s', name)
                return res
            res[:] = [(str(pid[0]), analyze_name(name))]
        else:
            sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>',
                                maxRes=results*3)
            sects += _findBetween(cont, '<b>Characters', '</table>',
                                maxRes=results*3)
            for sect in sects:
                lis = _findBetween(sect, '<a href="/character/',
                                    ['<small', '</td>', '<br'])
                for li in lis:
                    li = '<%s' % li
                    pid = re_imdbID.findall(li)
                    pname = _unHtml(li)
                    if not (pid and pname):
                        self._mobile_logger.debug('no name/characterID' \
                                                ' parsing %s searching for' \
                                                ' character %s', li, name)
                        continue
                    res.append((str(pid[0]), analyze_name(pname)))
        return res

    def get_character_main(self, characterID):
        return self.get_person_main(characterID, _parseChr=True)

    def get_character_biography(self, characterID):
        cont = self._mretrieve(imdbURL_character_main % characterID + 'bio')
        d = {}
        intro = _findBetween(cont, '<div class="display">',
                            ('<span>', '<h4>'), maxRes=1)
        if intro:
            intro = _unHtml(intro[0]).strip()
            if intro:
                d['introduction'] = intro
        bios = _findBetween(cont, '<div class="display">',
                            '<div class="history">')
        if bios:
            bios = _findBetween(bios[0], '<h4>', ('<h4>', '</div>'))
        if bios:
            for bio in bios:
                bio = bio.replace('</h4>', '::')
                bio = bio.replace('\n', ' ')
                bio = bio.replace('<br>', '\n')
                bio = bio.replace('<br/>', '\n')
                bio = subSGMLRefs(re_unhtmlsub('', bio).strip())
                bio = bio.replace(' ::', '::').replace(':: ', '::')
                bio = bio.replace('::', ': ', 1)
                if bio:
                    d.setdefault('biography', []).append(bio)
        return {'data': d}


www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.