Python Wikipedia Robot Framework » pywikipedia » selflink.py
#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
This bot goes over multiple pages of the home wiki, searches for selflinks, and
allows removing them.

These command line parameters can be used to specify which pages to work on:

&params;

    -xml           Retrieve information from a local XML dump (pages-articles
                   or pages-meta-current, see http://download.wikimedia.org).
                   Argument can also be given as "-xml:filename".

    -always        Always unlink, without prompting you for each replacement.
                   ATTENTION: Use this with care!

    -namespace:n   Number of the namespace to process. The parameter can be
                   given multiple times. It works in combination with all
                   other parameters except -start. If you want to iterate,
                   for example, over all categories starting at M, use
                   -start:Category:M instead.

All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
"""

__version__='$Id: selflink.py 8173 2010-05-14 21:32:30Z amir $'

import wikipedia, pagegenerators, catlib
import editarticle
import re, sys

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;':     pagegenerators.parameterHelp,
}

# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# in fixes.py.
msg = {
    'ar':u':   ',
    'cs':u'Robot odstranil odkaz na název článku',
    'de':u'Bot: Entferne Selbstlinks',
    'en':u'Robot: Removing selflinks',
    'es':u'Bot: Eliminando enlaces al mismo artículo',
    'fa':u':   ',
    'fr':u'Robot: Enlève autoliens',
    'he':u':     ',
    'hu':u'Bot: Önmagukra mutató hivatkozások eltávolítása',
    'ja':u' ',
    'ksh':u'Bot: Ene Lengk vun de Sigg op sesch sellver, erus jenumme.',
    'nl':u'Bot: verwijzingen naar pagina zelf verwijderd',
    'nn':u'robot: fjerna sjølvlenkjer',
    'no':u'robot: fjerner selvlenker',
    'pl':u'Robot automatycznie usuwa linki zwrotne',
    'pt':u'Bot: Retirando link para o próprio artigo',
    'ru':u':  -  . . ',
    'zh':u':',
}

class XmlDumpSelflinkPageGenerator:
    """
    Generator which will yield Pages that might contain selflinks.
    These pages will be retrieved from a local XML dump file
    (cur table).
    """
    def __init__(self, xmlFilename):
        """
        Arguments:
            * xmlFilename  - The dump's path, either absolute or relative
        """

        self.xmlFilename = xmlFilename

    def __iter__(self):
        import xmlreader
        mysite = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        for entry in dump.parse():
            if mysite.nocapitalize:
                title = re.escape(entry.title)
            else:
                title = '[%s%s]%s' % (re.escape(entry.title[0].lower()),
                                      re.escape(entry.title[0].upper()),
                                      re.escape(entry.title[1:]))
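            # Example (hypothetical title): for 'Foo bar' the pattern built
            # above is '[fF]oo\ bar', so the regex below matches '[[Foo bar]]'
            # and '[[foo bar|label]]', but not '[[Foo bar#Section]]'.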
            selflinkR = re.compile(r'\[\[' + title + r'(\|[^\]]*)?\]\]')
            if selflinkR.search(entry.text):
                yield wikipedia.Page(mysite, entry.title)
                continue

class SelflinkBot:

    def __init__(self, generator, always=False):
        self.generator = generator
        linktrail = wikipedia.getSite().linktrail()
        # The regular expression which finds links. Results consist of four groups:
        # group title is the target page title, that is, everything before | or ].
        # group section is the page section. It'll include the # to make life easier for us.
        # group label is the alternative link title, that's everything between | and ].
        # group linktrail is the link trail, that's letters after ]] which are part of the word.
        # note that the definition of 'letter' varies from language to language.
        self.linkR = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')
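        # For example (hypothetical wikitext, assuming the default English
        # link trail '[a-z]*'): '[[Foo|bar]]s' gives title='Foo', section=None,
        # label='bar', linktrail='s'; '[[Foo#History]] x' gives title='Foo',
        # section='#History', label=None, linktrail=''.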
        self.always = always
        self.done = False

    def handleNextLink(self, page, text, match, context = 100):
        """
        Returns a tuple (text, jumpToBeginning).
        text is the unicode string after the current link has been processed.
        jumpToBeginning is a boolean which specifies if the cursor position
        should be reset to 0. This is required after the user has edited the
        article.
        """
        # ignore interwiki links and links to sections of the same page as well
        # as section links
        if not match.group('title') \
           or page.site().isInterwikiLink(match.group('title')) \
           or match.group('section'):
            return text, False
        try:
            linkedPage = wikipedia.Page(page.site(), match.group('title'))
        except wikipedia.InvalidTitle, err:
            wikipedia.output(u'Warning: %s' % err)
            return text, False
            
        # Check whether the link found is to the current page itself.
        if linkedPage != page:
            # not a self-link
            return text, False
        else:
            # at the beginning of the link, start red color.
            # at the end of the link, reset the color to default
            if self.always:
                choice = 'a'
            else:
                wikipedia.output(
                    text[max(0, match.start() - context) : match.start()] \
                    + '\03{lightred}' + text[match.start() : match.end()] \
                    + '\03{default}' + text[match.end() : match.end() + context])
                choice = wikipedia.inputChoice(
                    u'\nWhat shall be done with this selflink?\n',
                    ['unlink', 'make bold', 'skip', 'edit', 'more context',
                     'unlink all', 'quit'],
                    ['U', 'b', 's', 'e', 'm', 'a', 'q'], 'u')
                wikipedia.output(u'')

                if choice == 's':
                    # skip this link
                    return text, False
                elif choice == 'e':
                    editor = editarticle.TextEditor()
                    newText = editor.edit(text, jumpIndex = match.start())
                    # if user didn't press Cancel
                    if newText:
                        return newText, True
                    else:
                        return text, True
                elif choice == 'm':
                    # show more context by recursive self-call
                    return self.handleNextLink(page, text, match, context = context + 100)
                elif choice == 'a':
                    self.always = True
                elif choice == 'q':
                    self.done = True
                    return text, False
            new = match.group('label') or match.group('title')
            new += match.group('linktrail')
            if choice == 'b':
                # make bold
                return text[:match.start()] + "'''" + new + "'''" + text[match.end():], False
            else:
                return text[:match.start()] + new + text[match.end():], False

    def treat(self, page):
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
        try:
            oldText = page.get()
            # Inside image maps, don't touch selflinks, as they're used
            # to create tooltip labels. See for example:
            # http://de.wikipedia.org/w/index.php?title=Innenstadt_%28Bautzen%29&diff=next&oldid=35721641
            if '<imagemap>' in oldText:
                wikipedia.output(u'Skipping page %s because it contains an image map.' % page.aslink())
                return
            text = oldText
            curpos = 0
            while curpos < len(text):
                match = self.linkR.search(text, pos = curpos)
                if not match:
                    break
                # Make sure that next time around we will not find this same hit.
                curpos = match.start() + 1
                text, jumpToBeginning = self.handleNextLink(page, text, match)
                if jumpToBeginning:
                    curpos = 0

            if oldText == text:
                wikipedia.output(u'No changes necessary.')
            else:
                wikipedia.showDiff(oldText, text)
                page.put_async(text)
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist?!" % page.aslink())
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked?!" % page.aslink())

    def run(self):
        comment = wikipedia.translate(wikipedia.getSite(), msg)
        wikipedia.setAction(comment)

        for page in self.generator:
            if self.done: break
            self.treat(page)

def main():
    #page generator
    gen = None
    # This temporary array is used to read the page title if one single
    # page to work on is specified by the arguments.
    pageTitle = []
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    always = False

    for arg in wikipedia.handleArgs():
        if arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
            gen = XmlDumpSelflinkPageGenerator(xmlFilename)
        elif arg == '-sql':
            # NOT WORKING YET
            query = """
SELECT page_namespace, page_title
FROM page JOIN pagelinks JOIN text ON (page_id = pl_from AND page_id = old_id)
WHERE pl_title = page_title
AND pl_namespace = page_namespace
AND page_namespace = 0
AND (old_text LIKE concat('%[[', page_title, ']]%')
    OR old_text LIKE concat('%[[', page_title, '|%'))
LIMIT 100"""
            gen = pagegenerators.MySQLPageGenerator(query)
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg == '-always':
            always = True
        else:
            if not genFactory.handleArg(arg):
                pageTitle.append(arg)

    if pageTitle:
        page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if not gen:
        wikipedia.showHelp('selflink')
    else:
        if namespaces != []:
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = SelflinkBot(preloadingGen, always)
        bot.run()

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()