fixes.py :  » Network » Python-Wikipedia-Robot-Framework » pywikipedia » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Network » Python Wikipedia Robot Framework 
Python Wikipedia Robot Framework » pywikipedia » fixes.py
# -*- coding: utf-8  -*-
"""
File containing all standard fixes

"""

#
# (C) Pywikipedia team, 2008-2010
#
__version__ = '$Id: fixes.py 8228 2010-05-28 17:32:37Z xqt $'
#
# Distributed under the terms of the MIT license.
#

# Help text describing the predefined fixes, one bullet per key of the
# ``fixes`` dict defined in this module.  Bullets tagged "**) NOTE below"
# refer to the note at the end of the text: those fixes are also part of
# cosmetic_changes.py.
#
# The mailing-list URL belongs to the yu-tld bullet ("disabled, see ..."),
# so it is placed directly after it rather than under fckeditor.
help = u"""
                  * HTML        - Convert HTML tags to wiki syntax, and
                                  fix XHTML.
                                    **) NOTE below
                  * isbn        - Fix badly formatted ISBNs.
                                    **) NOTE below
                  * syntax      - Try to fix bad wiki markup. Do not run
                                  this in automatic mode, as the bot may
                                  make mistakes.
                  * syntax-safe - Like syntax, but less risky, so you can
                                  run this in automatic mode.
                                    **) NOTE below
                  * case-de     - fix upper/lower case errors in German
                  * grammar-de  - fix grammar and typography in German
                  * vonbis      - Ersetze Binde-/Gedankenstrich durch "bis"
                                  in German
                  * music       - Links auf Begriffsklärungen in German
                  * datum       - specific date formats in German
                  * correct-ar  - Corrections for Arabic Wikipedia and any
                                  Arabic wiki.
                  * yu-tld      - the yu top-level domain will soon be
                                  disabled, see
                                  http://lists.wikimedia.org/pipermail/wikibots-l/2009-February/000290.html
                  * fckeditor   - Try to convert FCKeditor HTML tags to wiki
                                  syntax.
                  * specialpages - Fix capitalisation of special page names.

                                    **) NOTE: these fixes are part of the
                                        cosmetic_changes.py. You may use
                                        that script instead.

"""

fixes = {
    # These replacements will convert HTML to wiki syntax where possible, and
    # make remaining tags XHTML compliant.
    #
    # Entry layout: 'regex' flag, 'msg' dict keyed by language code,
    # 'replacements' list of (pattern, replacement) pairs, and 'exceptions'.
    # NOTE(review): how each key is interpreted is decided by the script
    # that consumes this dict (presumably replace.py) -- confirm there.
    'HTML': {
        'regex': True,
        # NOTE(review): several non-Latin messages below (e.g. 'ar', 'ru',
        # 'zh') look as if their non-ASCII characters were stripped in this
        # copy of the file; verify against upstream before relying on them.
        'msg': {
            'ar':u': / HTML',
            'be':u':  HTML',
            'cs':u'pevod/oprava HTML',
            'en':u'Robot: Converting/fixing HTML',
            'eo':u'Bot: koredtado de HTMLa teksto',
            'fa':u':/  ',
            'de':u'Bot: konvertiere/korrigiere HTML',
            'fr':u'Robot: convertit/fixe HTML',
            'he':u': / HTML',
            'ja':u': HTML',
            'ksh':u'Bot: vun HTML en Wikikood wandelle',
            'ia':u'Robot: conversion/reparation de HTML',
            'lt':u'robotas: konvertuojamas/taisomas HTML',
            'nl':u'Bot: conversie/reparatie HTML',
            'pl':u'Robot konwertuje/naprawia HTML',
            'pt':u'Bot: Corrigindo HTML',
            'ru':u':  HTML',
            'sr':u':  HTML-',
            'sv':u'Bot: Konverterar/korrigerar HTML',
            'uk':u': i HTML',
            'zh':u': HTML',
        },
        'replacements': [
            # Everything case-insensitive (?i)
            # Keep in mind that MediaWiki automatically converts <br> to <br />
            # when rendering pages, so you might comment the next two lines out
            # to save some time/edits.
            #(r'(?i)<br>',                      r'<br />'),
            # linebreak with attributes
            #(r'(?i)<br ([^>/]+?)>',            r'<br \1 />'),
            # bold / italic tags -> wiki quote markup; `.*?` is non-greedy and
            # the patterns set no DOTALL flag, so a match stays on one line.
            (r'(?i)<b>(.*?)</b>',              r"'''\1'''"),
            (r'(?i)<strong>(.*?)</strong>',    r"'''\1'''"),
            (r'(?i)<i>(.*?)</i>',              r"''\1''"),
            (r'(?i)<em>(.*?)</em>',            r"''\1''"),
            # horizontal line without attributes in a single line
            (r'(?i)([\r\n])<hr[ /]*>([\r\n])', r'\1----\2'),
            # horizontal line without attributes with more text in the same line
            #(r'(?i) +<hr[ /]*> +',             r'\r\n----\r\n'),
            # horizontal line with attributes; can't be done with wiki syntax
            # so we only make it XHTML compliant
            (r'(?i)<hr ([^>/]+?)>',            r'<hr \1 />'),
            # a header where only spaces are in the same line; the line breaks
            # captured before and after the header are written back unchanged
            # (groups 1 and 3), and <hN> becomes N equals signs on each side.
            (r'(?i)([\r\n]) *<h1> *([^<]+?) *</h1> *([\r\n])',  r"\1= \2 =\3"),
            (r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])',  r"\1== \2 ==\3"),
            (r'(?i)([\r\n]) *<h3> *([^<]+?) *</h3> *([\r\n])',  r"\1=== \2 ===\3"),
            (r'(?i)([\r\n]) *<h4> *([^<]+?) *</h4> *([\r\n])',  r"\1==== \2 ====\3"),
            (r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])',  r"\1===== \2 =====\3"),
            (r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])',  r"\1====== \2 ======\3"),
            # TODO: maybe we can make the bot replace <p> tags with \r\n's.
        ],
        'exceptions': {
            # presumably: no replacement is made inside these kinds of tags
            'inside-tags': [
                'nowiki',
                'comment',
                'math',
                'pre'
            ],
        }
    },

    # Grammar fixes for German language
    # Do NOT run this automatically!
    #
    # NOTE(review): the patterns below look as if their non-ASCII characters
    # (umlauts, currency/degree/unit symbols, typographic quotes) were
    # stripped in this copy -- e.g. 'mintig', 'stndig', and the empty
    # alternation branches '||'.  Verify against upstream before running.
    'grammar-de': {
        'regex': True,
        'msg': {
            'de':u'Bot: korrigiere Grammatik',
        },
        'replacements': [
            #(u'([Ss]owohl) ([^,\.]+?), als auch',                                                            r'\1 \2 als auch'),
            #(u'([Ww]eder) ([^,\.]+?), noch', r'\1 \2 noch'),
            #
            # Number directly followed by an adjective suffix: insert a hyphen.
            # Careful with nouns, e.g. "3-Jaehriger"!
            (u'(\d+)(mintig|stndig|tgig|wchig|jhrig|mintlich|stndlich|tglich|wchentlich|jhrlich|fach|mal|malig|kpfig|teilig|gliedrig|geteilt|elementig|dimensional|bndig|eckig|farbig|stimmig)', r'\1-\2'),
            # Compound word (number + unit + hyphen + word): a hyphen is added
            # between number and unit, the existing hyphen is carried through.
            (u'(?<!\w)(\d+|\d+[\.,]\d+)(\$||DM|||mg|g|kg|ml|cl|l|t|ms|min|m|mm|cm|dm|m|km|ha|C|kB|MB|GB|TB|W|kW|MW|GW|PS|Nm|eV|kcal|mA|mV|kV||Hz|kHz|MHz|GHz|mol|Pa|Bq|Sv|mSv)([]?-[\w\[])',           r'\1-\2\3'),
            # Quantity without a space before the unit: insert the space.
            # Omitted because of many false positives: s, A, V, C, S, J, %
            (u'(?<!\w)(\d+|\d+[\.,]\d+)(\$||DM|||mg|g|kg|ml|cl|l|t|ms|min|m|mm|cm|dm|m|km|ha|C|kB|MB|GB|TB|W|kW|MW|GW|PS|Nm|eV|kcal|mA|mV|kV||Hz|kHz|MHz|GHz|mol|Pa|Bq|Sv|mSv)(?=\W|||$)',          r'\1 \2'),
            # Temperature with a misplaced space
            # NOTE(review): pattern and replacement read identically here,
            # presumably because the degree sign was stripped from both.
            (u'(?<!\w)(\d+|\d+[\.,]\d+) C(?=\W|||$)',          ur'\1 C'),
            # No space after a comma
            (u'([a-z](\]\])?,)((\[\[)?[a-zA-Z])',                                                                          r'\1 \3'),
            # Space and comma swapped
            (u'([a-z](\]\])?) ,((\[\[)?[a-zA-Z])',                                                                          r'\1, \3'),
            # "Plenks" (i.e. a space also in front of the comma / full stop /
            # exclamation mark / question mark)
            # Careful with French: http://de.wikipedia.org/wiki/Plenk#Sonderfall_Franz.C3.B6sisch
            # A space before a colon/semicolon can be correct, according to
            # some citation standard.
            (u'([a-z](\]\])?) ([,\.!\?]) ((\[\[)?[a-zA-Z])',                                                                          r'\1\3 \4'),
            #(u'([a-z]\.)([A-Z])',                                                                             r'\1 \2'),
        ],
        'exceptions': {
            'inside-tags': [
                'nowiki',
                'comment',
                'math',
                'pre',           # because of code examples
                'source',        # because of code examples
                'startspace',    # because of code examples
                'hyperlink',     # e.g. commas in URLs
                'gallery',       # because of filenames
                'timeline',
            ],
            'text-contains': [
                r'sic!',
                r'20min.ch',     # Swiss news site
            ],
            'inside': [
                r'<code>.*</code>', # because of code examples
                r'{{[Zz]itat\|.*?}}',
                ur'{{\|.*?}}',  # section of a law
                ur' ?\d+[a-z]',  # section of a law
                r'Ju 52/1m', # aircraft designation
                r'Ju 52/3m', # aircraft designation
                r'AH-1W',    # helicopter designation
                r'ZPG-3W',   # airship designation
                r'8mm',      # film title
                r'802.11g',  # WLAN standard
                r'DOS/4GW',  # software
                r'ntfs-3g',  # file system driver
                r'/\w(,\w)*/',     # enumeration of sounds in linguistics
                r'[xyz](,[xyz])+', # variables in mathematics (unclear whether spaces are mandatory here)
                r'(?m)^;(.*?)$', # definition lists, which often have intentional spaces before colons
                r'\d+h( |&nbsp;)\d+m', # notation for durations, mainly in film infoboxes. Not correct, but nice and short.
                r'(?i)\[\[(Bild|Image|Media):.+?\|', # skip file names
                r'{{bgc\|.*?}}',  # background colour
                r'<sup>\d+m</sup>',                   # in chemical formulas
                r'\([A-Z][A-Za-z]*(,[A-Z][A-Za-z]*(<sup>.*?</sup>|<sub>.*?</sub>|))+\)' # chemical formula, e.g. AuPb(Pb,Sb,Bi)Te. No spaces should be put after the commas here.
            ],
            'title': [
                r'Arsen',  # chemical formula
            ],
        }
    },

    # Do NOT run this automatically!
    # Recommendation: First run syntax-safe automatically, afterwards
    # run syntax manually, carefully checking that you're not breaking
    # anything.
    #
    # Terminology: where the comments below say "dash", the patterns actually
    # match the pipe character `|` (written `\|` in the regexes).
    # NOTE(review): several non-Latin 'msg' entries look as if their
    # non-ASCII characters were stripped in this copy; verify upstream.
    'syntax': {
        'regex': True,
        'msg': {
            'ar':u':   ',
            'be':u':  ii-ii',
            'cs':u'Oprava wikisyntaxe',
            'de':u'Bot: Korrigiere Wiki-Syntax',
            'en':u'Bot: Fixing wiki syntax',
            'eo':u'Bot: Korektado de vikia sintakso',
            'fa':u':  ',
            'fr':u'Bot: Corrige wiki-syntaxe',
            'he':u':   ',
            'ia':u'Robot: Reparation de syntaxe wiki',
            'ja':u': wiki',
            'lt':u'robotas: Taisoma wiki sintaks',
            'nl':u'Bot: reparatie wikisyntaxis',
            'pl':u'Robot poprawia wiki-skadni',
            'pt':u'Bot: Corrigindo sintaxe wiki',
            'ru':u':   ',
            'sr':u':   ',
            'uk':u': i ii-',
            'zh':u': wiki',
        },
        'replacements': [
            # external link in double brackets
            (r'\[\[(?P<url>https?://[^\]]+?)\]\]',   r'[\g<url>]'),
            # external link starting with double bracket
            (r'\[\[(?P<url>https?://.+?)\]',   r'[\g<url>]'),
            # external link with forgotten closing bracket
            #(r'\[(?P<url>https?://[^\]\s]+)\r\n',  r'[\g<url>]\r\n'),
            # external link ending with double bracket.
            # do not change weblinks that contain wiki links inside
            # the description
            (r'\[(?P<url>https?://[^\[\]]+?)\]\](?!\])',   r'[\g<url>]'),
            # external link and description separated by a dash (i.e. a pipe
            # `|`); the pipe is replaced by a single space.
            # ATTENTION: while this is a mistake in most cases, there are some
            # valid URLs that contain dashes!
            (r'\[(?P<url>https?://[^\|\]\s]+?) *\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]'),
            # wiki link closed by single bracket.
            # ATTENTION: There are some false positives, for example
            # Brainfuck code examples or MS-DOS parameter instructions.
            # There are also sometimes better ways to fix it than
            # just putting an additional ] after the link.
            (r'\[\[([^\[\]]+?)\](?!\])',  r'[[\1]]'),
            # wiki link opened by single bracket.
            # ATTENTION: same as above.
            (r'(?<!\[)\[([^\[\]]+?)\]\](?!\])',  r'[[\1]]'),
            # template closed by single bracket
            # ATTENTION: There are some false positives, especially in
            # mathematical context or program code.
            (r'{{([^{}]+?)}(?!})',       r'{{\1}}'),
        ],
        'exceptions': {
            'inside-tags': [
                'nowiki',
                'comment',
                'math',
                'pre',
                'source',        # because of code examples
                'startspace',    # because of code examples
            ],
            'text-contains': [
                r'http://.*?object=tx\|',               # regular dash in URL
                r'http://.*?allmusic\.com',             # regular dash in URL
                r'http://.*?allmovie\.com',             # regular dash in URL
                r'http://physics.nist.gov/',            # regular dash in URL
                r'http://www.forum-seniorenarbeit.de/', # regular dash in URL
                r'http://kuenstlerdatenbank.ifa.de/',   # regular dash in URL
                r'&object=med',                         # regular dash in URL
                r'\[CDATA\['                            # lots of brackets
            ],
        }
    },

    # The same as syntax, but restricted to replacements that should
    # be safe to run automatically.
    #
    # Terminology: where the comments below say "dash", the patterns actually
    # match the pipe character `|` (written `\|` in the regexes).
    # NOTE(review): several non-Latin 'msg' entries look as if their
    # non-ASCII characters were stripped in this copy; verify upstream.
    'syntax-safe': {
        'regex': True,
        'msg': {
            'ar':u':   ',
            'be':u':  ii-ii',
            'cs':u'Oprava wikisyntaxe',
            'de':u'Bot: Korrigiere Wiki-Syntax',
            'en':u'Bot: Fixing wiki syntax',
            'eo':u'Bot: Korektado de vikia sintakso',
            'fa':u':  ',
            'fr':u'Bot: Corrige wiki-syntaxe',
            'he':u':   ',
            'ia':u'Robot: Reparation de syntaxe wiki',
            'ja':u': wiki',
            'lt':u'robotas: Taisoma wiki sintaks',
            'nl':u'Bot: reparatie wikisyntaxis',
            'pl':u'Robot poprawia wiki-skadni',
            'pt':u'Bot: Corrigindo sintaxe wiki',
            'ru':u':   ',
            'sr':u':   ',
            'uk':u': i ii-',
            'zh':u': wiki',
        },
        'replacements': [
            # external link in double brackets
            (r'\[\[(?P<url>https?://[^\]]+?)\]\]',   r'[\g<url>]'),
            # external link starting with double bracket
            (r'\[\[(?P<url>https?://.+?)\]',   r'[\g<url>]'),
            # external link with forgotten closing bracket
            #(r'\[(?P<url>https?://[^\]\s]+)\r\n',   r'[\g<url>]\r\n'),
            # external link and description separated by a dash (a pipe `|`),
            # with whitespace in front of the dash, so that it is clear that
            # the dash is not a legitimate part of the URL.  Note the ` +`
            # before `\|`: at least one space is required.
            (r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]'),
            # dash in external link, where the correct end of the URL can
            # be detected from the file extension. It is very unlikely that
            # this will cause mistakes.
            (r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]'),
        ],
        'exceptions': {
            'inside-tags': [
                'nowiki',
                'comment',
                'math',
                'pre',
                'source',        # because of code examples
                'startspace',    # because of code examples
            ],
        }
    },

    # Capitalises the adjective in a fixed list of German proper names
    # (e.g. "deutscher Bundestag" -> "Deutscher Bundestag").  The optional
    # ending captured in group 1 (r / n / empty) is written back unchanged.
    # NOTE(review): some patterns ('dreiigjhrige', 'europische') and the
    # 'msg' text look as if non-ASCII letters were stripped in this copy;
    # verify against upstream before running.
    'case-de': { # German upper / lower case issues
        'regex': True,
        'msg': {
            'de':u'Bot: Korrigiere Gro-/Kleinschreibung',
        },
        'replacements': [
            (r'\batlantische(r|n|) Ozean', r'Atlantische\1 Ozean'),
            (r'\bdeutsche(r|n|) Bundestag\b', r'Deutsche\1 Bundestag'),
            (r'\bdeutschen Bundestags\b', r'Deutschen Bundestags'), # Careful, e.g. 'deutsche Bundestagswahl'
            (r'\bdeutsche(r|n|) Reich\b', r'Deutsche\1 Reich'),
            (r'\bdeutschen Reichs\b', r'Deutschen Reichs'), # Careful, e.g. 'deutsche Reichsgrenzen'
            # the negative lookahead keeps "dritte Weltkrieg" untouched
            (r'\bdritte(n|) Welt(?!krieg)', r'Dritte\1 Welt'),
            (r'\bdreiigjhrige(r|n|) Krieg', r'Dreiigjhrige\1 Krieg'),
            (r'\beuropische(n|) Gemeinschaft', r'Europische\1 Gemeinschaft'),
            (r'\beuropische(n|) Kommission', r'Europische\1 Kommission'),
            (r'\beuropische(n|) Parlament', r'Europische\1 Parlament'),
            (r'\beuropische(n|) Union', r'Europische\1 Union'),
            (r'\berste(r|n|) Weltkrieg', r'Erste\1 Weltkrieg'),
            (r'\bkalte(r|n|) Krieg', r'Kalte\1 Krieg'),
            (r'\bpazifische(r|n|) Ozean', r'Pazifische\1 Ozean'),
            (r'Tag der deutschen Einheit', r'Tag der Deutschen Einheit'),
            (r'\bzweite(r|n|) Weltkrieg', r'Zweite\1 Weltkrieg'),
        ],
        'exceptions': {
            'inside-tags': [
                'nowiki',
                'comment',
                'math',
                'pre',
            ],
            'text-contains': [
                r'sic!',
            ],
        }
    },

    # German year ranges: "von 1900-1950" -> "von 1900 bis 1950".
    # The separator (group 2) is dropped and replaced by " bis ".
    'vonbis': {
        'regex': True,
        'msg': {
            'de':u'Bot: Ersetze Binde-/Gedankenstrich durch "bis"',
        },
        'replacements': [
            # hyphen, en dash, em dash
            # NOTE(review): the two empty alternation branches in group 2 are
            # presumably literal dash characters that were stripped in this
            # copy.  As written, an empty branch also lets the pattern match
            # two numbers separated only by spaces -- verify against upstream.
            (u'(von \d{3,4}) *(-|&ndash;||&mdash;|) *(\d{3,4})', r'\1 bis \3'),
        ],
    },

    # some disambiguation stuff for de:
    # python replace.py -fix:music -subcat:Album
    #
    # Plain-text replacements ('regex' is False): each bare link such as
    # [[CD]] is rewritten to a piped link with the same visible label.
    'music': {
        'regex': False,
        'msg': {
            'de':u'Bot: korrigiere Links auf Begriffsklrungen',
        },
        'replacements': [
            (u'[[CD]]', u'[[Audio-CD|CD]]'),
            (u'[[LP]]', u'[[Langspielplatte|LP]]'),
            (u'[[EP]]', u'[[Extended Play|EP]]'),
            (u'[[MC]]', u'[[Musikkassette|MC]]'),
            (u'[[Single]]', u'[[Single (Musik)|Single]]'),
        ],
        'exceptions': {
            'inside-tags': [
                'hyperlink',
            ]
        }
    },

    # format of dates of birth and death, for de:
    # python replace.py -fix:datum -ref:Vorlage:Personendaten
    #
    # NOTE(review): the month alternations contain 'Mrz', presumably the
    # German month name with its umlaut stripped in this copy; verify
    # against upstream before running.
    'datum': {
        'regex': True,
        'msg': {
            'de': u'Bot: Korrigiere Datumsformat',
        },
        'replacements': [
            # space after birth sign w/ year
            #(u'\(\*(\d{3,4})', u'(* \\1'),
            ## space after death sign w/ year
            #(u'(\d{3,4})', u' \\1'),
            #(u'&dagger;(\d{3,4})', u' \\1'),
            ## space after birth sign w/ linked date
            #(u'\(\*\[\[(\d)', u'(* [[\\1'),
            ## space after death sign w/ linked date
            #(u'\[\[(\d)', u' [[\\1'),
            #(u'&dagger;\[\[(\d)', u' [[\\1'),
            # a single link "[[day. month year]]" is split into two links,
            # "[[day. month]] [[year]]"
            (u'\[\[(\d+\. (?:Januar|Februar|Mrz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)) (\d{1,4})\]\]', u'[[\\1]] [[\\2]]'),
            # No leading zero in the day (for now only in those cases where
            # the space after the dot is missing as well)
            (u'0(\d+)\.(Januar|Februar|Mrz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)', r'\1. \2'),
            # No space between day and month
            (u'(\d+)\.(Januar|Februar|Mrz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)', r'\1. \2'),
            # No dot in front of the year
            (u'(\d+)\. (Januar|Februar|Mrz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\.(\d{1,4})', r'\1. \2 \3'),
        ],
        'exceptions': {
            # Links whose target is the full date and must not be split.
            # NOTE(review): the dots after the day numbers are unescaped, so
            # in these regexes they match any character, not just '.'.
            'inside': [
                r'\[\[20. Juli 1944\]\]', # assassination attempt on Hitler
                r'\[\[17. Juni 1953\]\]', # East Berlin uprising
                r'\[\[1. April 2000\]\]', # film
                r'\[\[11. September 2001\]\]', # attacks in the USA
                r'\[\[7. Juli 2005\]\]',  # terrorist attacks (original comment said "in Spain"; presumably the London bombings -- TODO confirm)
            ],
        }
    },

    # Normalises the way ISBNs are written ("ISBN: 123" -> "ISBN 123",
    # separators unified to hyphen-minus, missing space after "ISBN").
    # NOTE(review): several non-Latin 'msg' entries look as if their
    # non-ASCII characters were stripped in this copy; verify upstream.
    'isbn': {
        'regex': True,
        'msg': {
            'ar': u':   ISBN',
            'be': u':  ISBN ',
            'cs': u'Oprava formtu ISBN',
            'de': u'Bot: Korrigiere ISBN-Format',
            'en': u'Robot: Fixing ISBN format',
            'es': u'Arreglando formato ISBN',
            'eo': u'Bot: Korekto de teksto en ISBN-formato',
            'fa': u': ',
            'he': u':   ISBN',
            'ja': u': ISBN',
            'ru': u':  ISBN ',
            'uk': u':  ISBN ',
            'zh': u': ISBN',
        },
        'replacements': [
            # colon
            (r'ISBN: (\d+)', r'ISBN \1'),
            # superfluous word "number"
            (r'ISBN( number| no\.?| No\.?|-Nummer|-Nr\.):? (\d+)', r'ISBN \2'),
            # Space, minus, dot, hyphen, en dash, em dash, etc. instead of
            # hyphen-minus as separator, or spaces between digits and separators.
            # Note that these regular expressions also match valid ISBNs, but
            # these won't be changed.
            # NOTE(review): the separator class reads `[\- \.-]` here; the
            # non-ASCII dash characters the comment above mentions appear to
            # have been stripped in this copy -- verify against upstream.
            (ur'ISBN (978|979) *[\- \.-] *(\d+) *[\- \.-] *(\d+) *[\- \.-] *(\d+) *[\- \.-] *(\d)(?!\d)', r'ISBN \1-\2-\3-\4-\5'), # ISBN-13
            (ur'ISBN (\d+) *[\- \.-] *(\d+) *[\- \.-] *(\d+) *[\- \.-] *(\d|X|x)(?!\d)', r'ISBN \1-\2-\3-\4'), # ISBN-10
            # missing space before ISBN-10 or before ISBN-13,
            # or non-breaking space.
            (r'ISBN(|&nbsp;| )((\d(-?)){12}\d|(\d(-?)){9}[\dXx])', r'ISBN \2'),
        ],
        'exceptions': {
            'inside-tags': [
                'comment',
                'hyperlink',
            ],
            'inside': [
                r'ISBN (\d(-?)){12}\d',    # matches valid ISBN-13s
                r'ISBN (\d(-?)){9}[\dXx]', # matches valid ISBN-10s
            ],
        }
    },

    #Corrections for Arabic Wikipedia and any Arabic wiki.
    #python replace.py -always -start:! -fix:correct-ar
    #
    # NOTE(review): in this copy of the file every Arabic literal is missing:
    # the patterns read `\b\b` or contain only empty alternation branches,
    # and the replacements are empty strings or bare group references.
    # As written these pairs do not express the intended corrections; the
    # entry must be restored from the upstream file before it is used.

    'correct-ar': {
        'regex': True,
        'msg': {
            'ar':u' . 528   .',
        },
        'replacements': [
            #(u' ,', u' '), #FIXME: Do not replace comma in non-Arabic text, interwiki, image links or <math> syntax.
            (ur'\b\b', u''),
            (ur'\b\b', ur''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'==[ ]? [ ]?==', u'==   =='),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            #(ur'\b\b', u''), #FIXME: Do not replace this (and all others) in interwiki links. This is an Arabic typo, but it is correct in Farsi.
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b(|)\b', ur'\1'),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b\b', u''),
            (ur'\b(|)(||)(|||||||||)\b', ur'\1\2\3'),
            (ur'\b(|)(|)(||||||||||)\b', ur'\1\2\3'),
            (ur'\b(|)(|||||)(|||||||||)\b', ur'\1\2\3'),
            (ur'\b(|)(||||)(|||||||||)\b', ur'\1\2\3'),
            (ur'\b(|)(|)(|)(|||||||||)\b', ur'\1\4'),
            (ur'\b(||||||||||||)(|||)\b', ur'\1\2'),
            (ur'\b(|||||||||)\b', ur'\1'),
        ]
    },
    # Rewrites all-lowercase special page names to their CamelCase form
    # (e.g. Special:Allpages -> Special:AllPages).  Plain-text replacements
    # ('regex' is False); only an English message is provided.
    'specialpages': {
        'regex': False,
        'msg': {
            'en': u'Robot: Fixing special page capitalisation',
        },
        'replacements': [
            (u'Special:Allpages',        u'Special:AllPages'),
            (u'Special:Blockip',         u'Special:BlockIP'),
            (u'Special:Blankpage',       u'Special:BlankPage'),
            (u'Special:Filepath',        u'Special:FilePath'),
            (u'Special:Globalusers',     u'Special:GlobalUsers'),
            (u'Special:Imagelist',       u'Special:ImageList'),
            (u'Special:Ipblocklist',     u'Special:IPBlockList'),
            (u'Special:Listgrouprights', u'Special:ListGroupRights'),
            (u'Special:Listusers',       u'Special:ListUsers'),
            (u'Special:Newimages',       u'Special:NewImages'),
            (u'Special:Prefixindex',     u'Special:PrefixIndex'),
            (u'Special:Protectedpages',  u'Special:ProtectedPages'),
            (u'Special:Recentchanges',   u'Special:RecentChanges'),
            (u'Special:Specialpages',    u'Special:SpecialPages'),
            (u'Special:Unlockdb',        u'Special:UnlockDB'),
            (u'Special:Userlogin',       u'Special:UserLogin'),
            (u'Special:Userlogout',      u'Special:UserLogout'),
            (u'Special:Whatlinkshere',   u'Special:WhatLinksHere'),
        ],
    },
    # yu top-level domain will soon be disabled,
    # see http://lists.wikimedia.org/pipermail/wikibots-l/2009-February/000290.html
    # The following are domains that are often-used.
    #
    # Plain-text host name replacements ('regex' is False); 'nocase' is set,
    # presumably making the match case-insensitive -- confirm in the consumer.
    'yu-tld': {
        'regex': False,
        'nocase': True,
        'msg': {
            'de': u'Bot: Ersetze Links auf .yu-Domains',
            'en': u'Robot: Replacing links to .yu domains',
            'fr': u'Robot: Correction des liens pointant vers le domaine .yu, qui expire en 2009',
            'ksh': u'Bot: de ahle .yu-Domains loufe us, drm ujetuusch',
         },
         # NOTE(review): if the pairs are applied one after another in list
         # order, order matters wherever one old name is a substring of a
         # later one.  u'beograd.org.yu' precedes u'www.beograd.org.yu', so
         # the latter pair would never match (its text has already been
         # turned into www.beograd.rs).  The kurir-info pairs are ordered
         # the safe way round (path-specific before generic).  Decide which
         # beograd target is intended and reorder or drop accordingly.
         'replacements': [
            (u'www.budva.cg.yu',             u'www.budva.rs'),
            (u'spc.org.yu',                  u'spc.rs'),
            (u'www.oks.org.yu',              u'www.oks.org.rs'),
            (u'www.kikinda.org.yu',          u'www.kikinda.rs'),
            (u'www.ds.org.yu',               u'www.ds.org.rs'),
            (u'www.nbs.yu',                  u'www.nbs.rs'),
            (u'www.serbia.sr.gov.yu',        u'www.srbija.gov.rs'),
            (u'eunet.yu',                    u'eunet.rs'),
            (u'www.zastava-arms.co.yu',      u'www.zastava-arms.co.rs'),
            (u'www.airportnis.co.yu',        u'www.airportnis.rs'),
            # (u'www.danas.co.yu',             u'www.danas.rs'), # Archive links don't seem to work
            (u'www.belex.co.yu',             u'www.belex.rs'),
            (u'beograd.org.yu',              u'beograd.rs'),
            (u'www.vlada.cg.yu',             u'www.vlada.me'),
            (u'webrzs.statserb.sr.gov.yu',   u'webrzs.stat.gov.rs'),
            (u'www.statserb.sr.gov.yu',      u'webrzs.stat.gov.rs'),
            (u'www.rastko.org.yu',           u'www.rastko.org.rs'),
            (u'www.reprezentacija.co.yu',    u'www.reprezentacija.rs'),
            (u'www.blic.co.yu',              u'www.blic.co.rs'),
            (u'www.beograd.org.yu',          u'www.beograd.org.rs'),
            (u'arhiva.glas-javnosti.co.yu',  u'arhiva.glas-javnosti.rs'),
            (u'www.srpsko-nasledje.co.yu',   u'www.srpsko-nasledje.co.rs'),
            (u'www.dnevnik.co.yu',           u'www.dnevnik.rs'),
            (u'www.srbija.sr.gov.yu',        u'www.srbija.gov.rs'),
            (u'www.kurir-info.co.yu/Arhiva', u'arhiva.kurir-info.rs/Arhiva'),
            (u'www.kurir-info.co.yu/arhiva', u'arhiva.kurir-info.rs/arhiva'),
            (u'www.kurir-info.co.yu',        u'www.kurir-info.rs'),
            (u'arhiva.kurir-info.co.yu',     u'arhiva.kurir-info.rs'),
            (u'www.prvaliga.co.yu',          u'www.prvaliga.rs'),
            (u'www.mitropolija.cg.yu',       u'www.mitropolija.me'),
            (u'www.spc.yu/sr',               u'www.spc.rs/sr'),
            (u'www.sk.co.yu',                u'www.sk.co.rs'),
            (u'www.ekoforum.org.yu',         u'www.ekoforum.org'),
            (u'www.svevlad.org.yu',          u'www.svevlad.org.rs'),
            (u'www.posta.co.yu',             u'www.posta.rs'),
            (u'www.glas-javnosti.co.yu',     u'www.glas-javnosti.rs'),
            (u'www.fscg.cg.yu',              u'www.fscg.co.me'),
            (u'ww1.rts.co.yu/euro',          u'ww1.rts.co.rs/euro'),
            (u'www.rtv.co.yu',               u'www.rtv.rs'),
            (u'www.politika.co.yu',          u'www.politika.rs'),
            (u'www.mfa.gov.yu',              u'www.mfa.gov.rs'),
            (u'www.drzavnauprava.sr.gov.yu', u'www.drzavnauprava.gov.rs'),
        ],
    },
    # These replacements will convert HTML tag from FCK-editor to wiki syntax.
    #
    # Only two substitutions are defined, both case-insensitive ((?i)), and
    # this entry has no 'exceptions' key.
    'fckeditor': {
        'regex': True,
        'msg': {
            'en': u'Robot: Fixing rich-editor html',
         },
         'replacements': [
            # replace <br> with a new line (only the exact form "<br>";
            # "<br/>" and "<br />" are not matched by this pattern)
            (r'(?i)<br>',                      r'\n'),
            # replace &nbsp; with a space
            (r'(?i)&nbsp;',                      r' '),
        ],
    },
}

#
# Load the user fixes file.
#
# The file "user-fixes.py", located through config.datafilepath() relative
# to config.base_dir, is executed with execfile() when this module is
# imported.  execfile() is called without explicit namespaces, so the file
# runs in this module's namespace and can presumably add to or change the
# ``fixes`` dict defined above -- confirm against the user-fixes.py template.

import config

try:
    execfile(config.datafilepath(config.base_dir, "user-fixes.py"))
except IOError:
    # Best effort: the user file is optional, so a failure to read it is
    # ignored.  Note that an IOError raised by datafilepath() or by code
    # inside user-fixes.py itself is swallowed here as well.
    pass
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.