# -*- coding: utf-8 -*-
"""
Allows access to the bot account's watchlist.
The function refresh() downloads the current watchlist and saves
it to disk. It is run automatically when a bot first tries to save a page
retrieved via XML Export. The watchlist can be updated manually by running
this script. The list will also be reloaded automatically once a month.
Syntax: python watchlist.py [-all | -new] [-sysop]

Command line options:

-all   - Reloads watchlists for all wikis where a watchlist is already
         present (also accepted as -update).
-new   - Loads watchlists for all wikis with accounts configured in
         user-config.py.
-sysop - Uses the sysop account; its watchlist is stored in a separate
         file.

A short usage example for other bot scripts is given below the imports.
"""
# (C) Daniel Herding, 2005
#
# Distributed under the terms of the MIT license.
__version__='$Id: watchlist.py 8105 2010-04-18 12:16:23Z alexsh $'
import wikipedia
import re, pickle
import os.path
import time
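# A minimal usage sketch for other bot scripts (an illustration only; it
# assumes a working pywikipedia setup with user-config.py configured, and
# the page title is just an example):
#
#   import wikipedia, watchlist
#   site = wikipedia.getSite()
#   if watchlist.isWatched(u'Project:Sandbox', site=site):
#       wikipedia.output(u'The sandbox is on the watchlist.')
#
# isWatched() calls get(), which loads the pickled copy from disk and only
# contacts the wiki when no copy exists yet or the existing one is older
# than a month.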
cache = {}
def get(site=None):
    """Return the watchlist (a list of page titles) for the given site."""
if site is None:
site = wikipedia.getSite()
if site in cache:
# Use cached copy if it exists.
watchlist = cache[site]
else:
fn = wikipedia.config.datafilepath('watchlists',
'watchlist-%s-%s.dat' % (site.family.name, site.lang))
try:
# find out how old our saved dump is (in seconds)
file_age = time.time() - os.path.getmtime(fn)
# if it's older than 1 month, reload it
if file_age > 30 * 24 * 60 * 60:
wikipedia.output(u'Copy of watchlist is one month old, reloading')
refresh(site)
except OSError:
# no saved watchlist exists yet, retrieve one
refresh(site)
        f = open(fn, 'rb')
watchlist = pickle.load(f)
f.close()
# create cached copy
cache[site] = watchlist
return watchlist
def isWatched(pageName, site=None):
    """Return True if the given page title is on the account's watchlist."""
watchlist = get(site)
return pageName in watchlist
def refresh(site, sysop=False):
    """Fetch the watchlist anew via the API and store it on disk."""
    if not site.has_api() or site.versionnumber() < 10:
        # No API available (MediaWiki older than 1.10): fall back to
        # screen-scraping the watchlist special page.
        _refreshOld(site, sysop=sysop)
        return
    # The watchlist is only visible while logged in.
    if not site.loggedInAs(sysop=sysop):
        site.forceLogin(sysop=sysop)
params = {
'action': 'query',
'list': 'watchlist',
'wllimit': wikipedia.config.special_page_limit,
'wlprop': 'title',
}
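    # The request dict above corresponds to an api.php query of roughly this
    # shape (a sketch; the exact script path and output format are handled
    # by wikipedia.query.GetData() below):
    #
    #   api.php?action=query&list=watchlist&wlprop=title&wllimit=<limit>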
wikipedia.output(u'Retrieving watchlist for %s via API.' % repr(site))
#wikipedia.put_throttle() # It actually is a get, but a heavy one.
watchlist = []
while True:
data = wikipedia.query.GetData(params, site, sysop=sysop)
if 'error' in data:
raise RuntimeError('ERROR: %s' % data)
watchlist.extend([w['title'] for w in data['query']['watchlist']])
        if 'query-continue' in data:
            # More entries available; resume the listing from the
            # continuation value returned by the server.
            params['wlstart'] = data['query-continue']['watchlist']['wlstart']
else:
break
    # Save the watchlist to disk.
    # The file is stored in the watchlists subdirectory; datafilepath()
    # creates that directory if necessary.
    if sysop:
        fn = wikipedia.config.datafilepath(
            'watchlists',
            'watchlist-%s-%s-sysop.dat' % (site.family.name, site.lang))
    else:
        fn = wikipedia.config.datafilepath(
            'watchlists',
            'watchlist-%s-%s.dat' % (site.family.name, site.lang))
    f = open(fn, 'wb')
    pickle.dump(watchlist, f)
    f.close()
def _refreshOld(site, sysop=False):
    """Refresh the watchlist by screen-scraping (for pre-API wikis)."""
    # get watchlist special page's URL
    path = site.watchlist_address()
wikipedia.output(u'Retrieving watchlist for %s' % repr(site))
#wikipedia.put_throttle() # It actually is a get, but a heavy one.
watchlistHTML = site.getUrl(path, sysop=sysop)
wikipedia.output(u'Parsing watchlist')
watchlist = []
    for itemR in [re.compile(r'<li><input type="checkbox" name="id\[\]" value="(.+?)" />'),
                  re.compile(r'<li><input name="titles\[\]" type="checkbox" value="(.+?)" />')]:
for m in itemR.finditer(watchlistHTML):
pageName = m.group(1)
watchlist.append(pageName)
    # Save the watchlist to disk.
    # The file is stored in the watchlists subdirectory; datafilepath()
    # creates that directory if necessary.
    if sysop:
        fn = wikipedia.config.datafilepath(
            'watchlists',
            'watchlist-%s-%s-sysop.dat' % (site.family.name, site.lang))
    else:
        fn = wikipedia.config.datafilepath(
            'watchlists',
            'watchlist-%s-%s.dat' % (site.family.name, site.lang))
    f = open(fn, 'wb')
    pickle.dump(watchlist, f)
    f.close()
def refresh_all(new=False, sysop=False):
    """Reload the watchlists of several wikis (see command line options)."""
    if new:
        import config
        wikipedia.output(
            u'Downloading watchlists for all accounts in user-config.py')
        for family in config.usernames:
            for lang in config.usernames[family]:
                refresh(wikipedia.getSite(code=lang, fam=family), sysop=sysop)
        for family in config.sysopnames:
            for lang in config.sysopnames[family]:
                refresh(wikipedia.getSite(code=lang, fam=family), sysop=sysop)
    else:
        import dircache
        filenames = dircache.listdir(
            wikipedia.config.datafilepath('watchlists'))
        watchlist_filenameR = re.compile(r'watchlist-([a-z\-:]+)\.dat')
        for filename in filenames:
            match = watchlist_filenameR.match(filename)
            if match:
                arr = match.group(1).split('-')
                family = arr[0]
                lang = '-'.join(arr[1:])
                refresh(wikipedia.getSite(code=lang, fam=family), sysop=sysop)
def main():
all = False
new = False
sysop = False
for arg in wikipedia.handleArgs():
if arg == '-all' or arg == '-update':
all = True
elif arg == '-new':
new = True
elif arg == '-sysop':
sysop = True
if all:
refresh_all(sysop=sysop)
elif new:
refresh_all(new, sysop=sysop)
else:
refresh(wikipedia.getSite(), sysop=sysop)
watchlist = get(wikipedia.getSite())
wikipedia.output(u'%i pages in the watchlist.' % len(watchlist))
for pageName in watchlist:
        wikipedia.output(pageName, toStdout=True)
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()