disambredir.py (Python Wikipedia Robot Framework, pywikipedia)
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
Goes through the disambiguation pages, checks their links, and asks for
each link that goes to a redirect page whether it should be replaced.
"""
#
# (C) André Engels and others, 2006-2009
#
# Distributed under the terms of the MIT license.
#
__version__='$Id: disambredir.py 7861 2010-01-06 06:49:23Z xqt $'
#
import wikipedia
import pagegenerators
import re

msg = {
    'ar': u'    ',
    'en': u'Changing redirects on a disambiguation page',
    'he': u'    ',
    'ja': u': ',
    'nl': u'Verandering van redirects op een doorverwijspagina',
    'pl': u'Zmiana przekierowań na stronie ujednoznaczającej',
    'pt': u'Arrumando redirects na página de desambiguação',
    'zh': u': ',
}
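
# When saving a page, wikipedia.translate() picks the edit summary matching
# the target wiki's language from this table, falling back to English.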

def firstcap(string):
    """Return the string with its first character uppercased."""
    return string[0].upper() + string[1:]

def treat(text, linkedPage, targetPage):
    """
    Based on the method of the same name in solve_disambiguation.py.
    """
    # Build a regex matching wikilinks of the form [[title#section|label]]trail.
    linkR = re.compile(
        r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?'
        r'(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')
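    # Named groups: 'title' is the link target, 'section' an optional
    # "#anchor", 'label' the optional piped text, and 'linktrail' whatever
    # trailing characters the local wiki counts as part of the link word
    # (lowercase letters on the English Wikipedia).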
    curpos = 0
    # This loop will run until we have finished the current page
    while True:
        m = linkR.search(text, pos = curpos)
        if not m:
            break
        # Make sure that next time around we will not find this same hit.
        curpos = m.start() + 1
        # ignore interwiki links and links to sections of the same page
        if m.group('title') == '' or mysite.isInterwikiLink(m.group('title')):
            continue
        else:
            actualLinkPage = wikipedia.Page(page.site(), m.group('title'))
            # Check whether the link found is to page.
            if actualLinkPage != linkedPage:
                continue

        # how many bytes should be displayed around the current link
        context = 30
        # at the beginning of the link, start red color.
        # at the end of the link, reset the color to default
        wikipedia.output(text[max(0, m.start() - context) : m.start()]
                         + '\03{lightred}' + text[m.start() : m.end()]
                         + '\03{default}'
                         + text[m.end() : m.end() + context])
        while True:
            choice = wikipedia.input(
                u"Option (N=do not change, y=change link to "
                u"\03{lightpurple}%s\03{default}, r=change and replace text, "
                u"u=unlink)" % targetPage.title())
            try:
                choice = choice[0]
            except IndexError:
                # empty input: treat as "do not change"
                choice = 'N'
            if choice in 'nNyYrRuU':
                break
        if choice in "nN":
            continue

        # The link looks like this:
        # [[page_title|link_text]]trailing_chars
        page_title = m.group('title')
        link_text = m.group('label')

        if not link_text:
            # or like this: [[page_title]]trailing_chars
            link_text = page_title
        if m.group('section') is None:
            section = ''
        else:
            section = m.group('section')
        trailing_chars = m.group('linktrail')
        if trailing_chars:
            link_text += trailing_chars
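        # Example (on a wiki whose link trail is lowercase letters): for
        # "[[colour|color]]s", page_title is "colour" and link_text ends
        # up as "colors".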

        if choice in "uU":
            # unlink - we remove the section if there's any
            text = text[:m.start()] + link_text + text[m.end():]
            continue
        replaceit = choice in "rR"

        if link_text[0].isupper():
            new_page_title = targetPage.title()
        else:
            new_page_title = targetPage.title()[0].lower() + targetPage.title()[1:]
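        # e.g. a lowercase link such as [[foo]] whose redirect target is
        # "Bar" gets the new title "bar", preserving the original casing.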
        if replaceit and trailing_chars:
            newlink = "[[%s%s]]%s" % (new_page_title, section, trailing_chars)
        elif replaceit or (new_page_title == link_text and not section):
            newlink = "[[%s]]" % new_page_title
        # check if we can create a link with trailing characters instead of a pipelink
        elif (len(new_page_title) <= len(link_text)
              and firstcap(link_text[:len(new_page_title)]) == firstcap(new_page_title)
              and re.sub(re.compile(linktrail), '', link_text[len(new_page_title):]) == ''
              and not section):
            newlink = "[[%s]]%s" % (link_text[:len(new_page_title)],
                                    link_text[len(new_page_title):])
        else:
            newlink = "[[%s%s|%s]]" % (new_page_title, section, link_text)
        text = text[:m.start()] + newlink + text[m.end():]
    return text
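
# A hypothetical walk-through: if a disambiguation page contains
# [[Foo|Something]] and Foo redirects to Bar, then answering 'y' yields
# [[Bar|Something]], 'r' yields [[Bar]], and 'u' unlinks it, leaving just
# "Something".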

def workon(page, links):
    text = page.get()
    # Show the title of the page we're working on.
    # Highlight the title in purple.
    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (wikipedia.Error, wikipedia.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = wikipedia.translate(mysite, msg)
        page.put(text, comment)

def main():
    global mysite, linktrail, page
    start = []
    for arg in wikipedia.handleArgs():
        start.append(arg)
    if start:
        start = " ".join(start)
    else:
        start = "!"
    mysite = wikipedia.getSite()
    linktrail = mysite.linktrail()
    try:
        generator = pagegenerators.CategorizedPageGenerator(mysite.disambcategory(), start = start)
    except wikipedia.NoPage:
        print "The bot does not know the disambiguation category for your wiki."
        raise
    # only work on articles
    generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0])
    generator = pagegenerators.PreloadingGenerator(generator)
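    # PreloadingGenerator fetches upcoming pages in batches, so page.get()
    # in workon() is answered from the preloaded cache rather than with a
    # separate request per page.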
    pagestodo = []
    pagestoload = []
    for page in generator:
        if page.isRedirectPage():
            continue
        linked = page.linkedPages()
        pagestodo.append((page, linked))
        pagestoload += linked
        if len(pagestoload) > 49:
            wikipedia.getall(mysite, pagestoload)
            for page, links in pagestodo:
                workon(page, links)
            pagestoload = []
            pagestodo = []
    # Work on any remainder so the last batch of fewer than 50 pages is
    # not silently skipped.
    if pagestodo:
        if pagestoload:
            wikipedia.getall(mysite, pagestoload)
        for page, links in pagestodo:
            workon(page, links)

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
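
# Example invocation (assuming a working pywikipedia setup with a configured
# user-config.py):
#
#   python disambredir.py         # start at the beginning of the category
#   python disambredir.py Foo     # start from titles sorting at/after "Foo"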
