fixing_redirects.py :  » Network » Python-Wikipedia-Robot-Framework » pywikipedia » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Network » Python Wikipedia Robot Framework 
Python Wikipedia Robot Framework » pywikipedia » fixing_redirects.py
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This script has the intention to correct all redirect
links in featured pages or only one page of each wiki.

Can be using with:
&params;

-featured         Run over featured pages

Run fixing_redirects.py -help to see all the command-line
options -file, -ref, -links, ...

"""
#
# This script based on disambredir.py and solve_disambiguation.py
#
# Distributed under the terms of the MIT license.
#
__version__='$Id: fixing_redirects.py 8229 2010-05-29 11:02:47Z xqt $'
#
import wikipedia
import pagegenerators
import re, sys

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;':     pagegenerators.parameterHelp,
}

msg = {
    'ar': u':  ',
    'cs': u'Robot opravil pesmrovn',
    'en': u'Bot: Fixing redirects',
    'fa': u':   ',
    'he': u':  ',
    'ja': u':',
    'nn': u'robot: retta omdirigeringar',
    'no': u'Robot: Retter omdirigeringer',
    'pl': u'Bot: naprawa przekierowa',
    'pt': u'Bot: Arrumando redirects',
    'sv': u'Bot: Rttar omdirigeringar',
    'vi': u'Robot: Sa i hng',
    'zh': u': ',
}

featured_articles = {
    'ar': u': ',
    'cs': u'Wikipedie:Nejlep lnky',
    'de': u'Wikipedia:Exzellente_Artikel',
    'en': u'Wikipedia:Featured_articles',
    'es': u'Wikipedia:Artculos_destacados',
    'fa': u': ',
    'fr': u'Wikipdia:Articles_de_qualit',
    'he': u':_',
    'it': u'Wikipedia:Articoli_in_vetrina',
    'ja': u'Wikipedia:',
    'nl': u'Wikipedia:Etalage',
    'nn': u'Wikipedia:Gode artiklar',
    'no': u'Wikipedia:Anbefalte artikler',
    'pl': u'Wikipedia:Artykuy_na_medal',
    'pt': u'Wikipedia:Os_melhores_artigos',
    'sv': u'Wikipedia:Utvalda_artiklar',
    'vi': u'Wikipedia:Bi_vit_chn_lc',
    'zh': u'Wikipedia:',
}

def firstcap(string):
    return string[0].upper()+string[1:]

def treat(text, linkedPage, targetPage):
    """
    Based on the method of the same name in solve_disambiguation.py
    """
    mysite = wikipedia.getSite()
    linktrail = mysite.linktrail()

    # make a backup of the original text so we can show the changes later
    linkR = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')
    curpos = 0
    # This loop will run until we have finished the current page
    while True:
        m = linkR.search(text, pos = curpos)
        if not m:
            break
        # Make sure that next time around we will not find this same hit.
        curpos = m.start() + 1
        # ignore interwiki links and links to sections of the same page
        if m.group('title').strip() == '' or mysite.isInterwikiLink(m.group('title')):
            continue
        else:
            actualLinkPage = wikipedia.Page(targetPage.site(), m.group('title'))
            # Check whether the link found is to page.
            if actualLinkPage != linkedPage:
                continue

        # how many bytes should be displayed around the current link
        context = 15
        # at the beginning of the link, start red color.
        # at the end of the link, reset the color to default
        #wikipedia.output(text[max(0, m.start() - context) : m.start()] + '\03{lightred}' + text[m.start() : m.end()] + '\03{default}' + text[m.end() : m.end() + context])
        choice = 'y'

        # The link looks like this:
        # [[page_title|link_text]]trailing_chars
        page_title = m.group('title')
        link_text = m.group('label')

        if not link_text:
            # or like this: [[page_title]]trailing_chars
            link_text = page_title
        if m.group('section') == None:
            section = ''
        else:
            section = m.group('section')
        trailing_chars = m.group('linktrail')
        if trailing_chars:
            link_text += trailing_chars

        if choice in "uU":
            # unlink - we remove the section if there's any
            text = text[:m.start()] + link_text + text[m.end():]
            continue
        replaceit = choice in "rR"

        # remove preleading ":"
        if link_text[0]==':':
            link_text = link_text[1:]
        if link_text[0].isupper():
            new_page_title = targetPage.title()
        else:
            new_page_title = targetPage.title()[0].lower() + targetPage.title()[1:]

        # remove preleading ":"
        if new_page_title[0]==':':
            new_page_title = new_page_title[1:]

        if replaceit and trailing_chars:
            newlink = "[[%s%s]]%s" % (new_page_title, section, trailing_chars)
        elif replaceit or (new_page_title == link_text and not section):
            newlink = "[[%s]]" % new_page_title
        # check if we can create a link with trailing characters instead of a pipelink
        elif len(new_page_title) <= len(link_text) and firstcap(link_text[:len(new_page_title)]) == firstcap(new_page_title) and re.sub(re.compile(linktrail), '', link_text[len(new_page_title):]) == '' and not section:
            newlink = "[[%s]]%s" % (link_text[:len(new_page_title)], link_text[len(new_page_title):])
        else:
            newlink = "[[%s%s|%s]]" % (new_page_title, section, link_text)
        text = text[:m.start()] + newlink + text[m.end():]
        continue
    return text

pageCache = []

def workon(page):
    mysite = wikipedia.getSite()
    try:
        text = page.get()
    except wikipedia.IsRedirectPage:
        wikipedia.output(u'%s is a redirect page. Skipping' % page.aslink())
        return
    except wikipedia.NoPage:
        wikipedia.output(u'%s does not exist. Skipping' % page.aslink())
        return
    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
    links = page.linkedPages()
    if len(links) > 0:
        wikipedia.getall(mysite,links)
    else:
        wikipedia.output('Nothing left to do.')
        return
    
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (wikipedia.Error,wikipedia.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = wikipedia.translate(mysite, msg)
        wikipedia.showDiff(page.get() ,text)
        try:
            page.put(text, comment)
        except (wikipedia.Error):
            wikipedia.output('Error: unable to put %s' % page.aslink())

def main():
    start = '!'
    featured = False
    namespace = None
    gen = None

    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()

    for arg in wikipedia.handleArgs():
        if arg == '-featured':
            featured = True
        elif arg.startswith('-namespace'):
            if len(arg) == 10:
                namespace = int(wikipedia.input(u'Which namespace should be processed?'))
            else:
                namespace = int(arg[11:])
        else:
            genFactory.handleArg(arg)

    gen = genFactory.getCombinedGenerator()

    mysite = wikipedia.getSite()
    if mysite.sitename() == 'wikipedia:nl':
        wikipedia.output(u'\03{lightred}There is consensus on the Dutch Wikipedia that bots should not be used to fix redirects.\03{default}')
        sys.exit()

    linktrail = mysite.linktrail()
    if featured:
        featuredList = wikipedia.translate(mysite, featured_articles)
        ref = wikipedia.Page(wikipedia.getSite(), featuredList)
        gen = pagegenerators.ReferringPageGenerator(ref)
        generator = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
        for page in generator:
            workon(page)
    elif namespace is not None:
        for page in pagegenerators.AllpagesPageGenerator(start=start, namespace=namespace, includeredirects=False):
            workon(page)
    elif gen:
        for page in pagegenerators.PreloadingGenerator(gen):
            workon(page)
    else:
        wikipedia.showHelp('fixing_redirects')

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.