#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script goes over multiple pages, searches for pages where <references />
is missing although a <ref> tag is present, and in that case adds a new
references section.
These command line parameters can be used to specify which pages to work on:
¶ms;
-xml Retrieve information from a local XML dump (pages-articles
or pages-meta-current, see http://download.wikimedia.org).
Argument can also be given as "-xml:filename".
-namespace:n Number or name of namespace to process. The parameter can be
used multiple times. It works in combination with all other
parameters, except for the -start parameter. If you e.g.
want to iterate over all categories starting at M, use
-start:Category:M.
-always Don't prompt you for each replacement.
All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
It is strongly recommended not to run this script over the entire article
namespace (using the -start parameter), as that would consume too much
bandwidth. Instead, use the -xml parameter, or use another way to generate
a list of affected articles.
"""
# Revision identifier of this script; the value has the shape of an expanded
# version-control "$Id$" keyword (file name, revision, date, committer).
__version__='$Id: noreferences.py 8179 2010-05-15 13:13:26Z amir $'
import wikipedia, pagegenerators, catlib
import editarticle
import re, sys
# This is required for the text that is shown when you run this script
# with the parameter -help.
# Maps a placeholder that occurs in the module docstring to the shared help
# text for the page generator parameters.
# NOTE(review): the placeholder reads '¶ms;' both here and in the module
# docstring; it looks like a mis-decoded '&params;'. The two occurrences have
# to stay identical, so fix both together or neither.
docuReplacements = {
'¶ms;': pagegenerators.parameterHelp,
}
# Edit summaries, keyed by language code. The entry for the current site is
# looked up with wikipedia.translate() and used as the edit comment.
# NOTE(review): several entries look as if their non-ASCII characters were
# lost; compare with the upstream file before relying on them.
msg = {
    'ar': u': <references /> ',
    'cs': u'Robot doplnil chybjc <references />',
    'de': u'Bot: Trage fehlendes <references /> nach',
    'en': u'Robot: Adding missing <references /> tag',
    'eo': u'Roboto: Aldono de "<references />"',
    'fa': u': ',
    'fi': u'Botti lissi puuttuvan {{viitteet}}-mallineen',
    'he': u': <references /> ',
    'hu': u'Hinyz {{Forrsok}} ptlsa',
    'it': u'Bot: Aggiungo il tag <references /> mancante',
    'ja': u': <references /> ',
    'ko': u': <references /> ',
    'lt': u'robotas: Pridedama trkstama <references /> ym',
    'nl': u'Bot: toevoeging ontbrekende <references /> tag',
    'pl': u'Robot dodaje szablon {{przypisy}}',
    'pt': u'Bot: Adicionando a tag <references />',
    'zh': u': <references /> ',
    'fr': u'Robot: Ajout de la balise <references /> manquante',
}
# References sections are usually placed before further reading / external
# link sections. This dictionary defines these sections, sorted by priority.
# For example, on an English wiki, the script would place the "References"
# section in front of the "Further reading" section, if that existed.
# Otherwise, it would try to put it in front of the "External links" section,
# or if that fails, the "See also" section, etc.
placeBeforeSections = {
'ar': [ # no explicit policy on where to put the references
u' ',
u' ',
u''
],
'cs': [
u'Reference',
u'Poznmky',
],
'de': [ # no explicit policy on where to put the references
u'Literatur',
u'Weblinks',
u'Siehe auch',
u'Weblink', # bad, but common singular form of Weblinks
],
'en': [ # no explicit policy on where to put the references
u'Further reading',
u'External links',
u'See also',
u'Notes'
],
'eo': [
u'Eksteraj ligiloj',
u'Ekstera ligilo',
u'Eksteraj ligoj',
u'Ekstera ligo',
u'Rete'
],
'es': [
u'Enlaces externos',
u'Vase tambin',
u'Notas',
],
'fa': [
u' ',
u'',
u' '
],
'fi': [
u'Kirjallisuutta',
u'Aiheesta muualla',
u'Ulkoiset linkit',
u'Linkkej',
],
'fr': [
u'Liens externes',
u'Voir aussi',
u'Notes'
],
'hu': [
u'Kls hivatkozsok',
u'Lsd mg',
],
'it': [
u'Bibliografia',
u'Voci correlate',
u'Altri progetti',
u'Collegamenti esterni',
u'Vedi anche',
],
'ja':[
u'',
u'',
u'',
],
'ko':[ # no explicit policy on where to put the references
u' ',
u'',
u' ',
u'',
u' ',
u''
u' ',
u''
],
'lt': [ # no explicit policy on where to put the references
u'Nuorodos'
],
'nl': [ # no explicit policy on where to put the references
u'Literatuur',
u'Zie ook',
u'Externe verwijzingen',
u'Externe verwijzing',
],
'pl': [
u'rda',
u'Bibliografia',
u'Zobacz te',
u'Linki zewntrzne',
],
'pt': [
u'Ligaes externas',
u'Veja tambm',
u'Ver tambm',
u'Notas',
],
'sk': [
u'Pozri aj',
],
'zh': [
u'',
u'',
],
}
# Titles of sections where a reference tag would fit into.
# The first title should be the preferred one: It's the one that
# will be used when a new section has to be created.
referencesSections = {
'ar': [ # not sure about which ones are preferred.
u'',
u'',
],
'de': [ #see [[de:WP:REF]]
u'Einzelnachweise',
u'Funoten',
u'Anmerkungen',
u'Belege',
u'Quellen',
u'Quellenangaben',
],
'en': [ # not sure about which ones are preferred.
u'References',
u'Footnotes',
u'Notes',
],
'eo': [
u'Referencoj',
],
'es': [
u'Referencias',
u'Notas',
],
'fa': [
u'',
u''
],
'fi': [
u'Lhteet',
u'Viitteet',
],
'fr': [ # [[fr:Aide:Note]]
u'Notes et rfrences',
u'Rfrences',
u'References',
u'Notes'
],
'he': [
u' ',
],
'hu': [
u'Forrsok s jegyzetek',
u'Forrsok',
u'Jegyzetek',
u'Hivatkozsok',
u'Megjegyzsek',
],
'it': [
u'Note',
u'Riferimenti',
],
'ja': [
u'',
u'',
u'',
u'',
u'',
u'',
],
'ko': [
u'',
u''
u' '
u' ',
u' '
],
'lt': [ # not sure about which ones are preferred.
u'altiniai',
u'Literatra',
],
'nl': [ # not sure about which ones are preferred.
u'Voetnoten',
u'Voetnoot',
u'Referenties',
u'Noten',
u'Bronvermelding',
],
'pl': [
u'Przypisy',
u'Oglne przypisy',
u'Notatki',
],
'pt': [
u'Referncias',
],
'sk': [
u'Referencie',
],
'zh': [
u'',
u'',
u'',
u'',
u'',
u'',
u'',
u'',
u'',
u'',
],
}
# Names of templates that already contain a <references /> tag, grouped by
# family name and then by language code. A wiki without such templates
# needs no entry here.
# NOTE(review): some names look as if their non-ASCII characters were lost
# (a few are empty); compare with the upstream file.
referencesTemplates = {
    'wikipedia': {
        'ar': [u'Reflist', u' ', u' '],
        'en': [
            u'Reflist', u'Refs', u'FootnotesSmall', u'Reference',
            u'Ref-list', u'Reference list', u'References-small', u'Reflink',
            u'Footnotes', u'FootnotesSmall',
        ],
        'eo': [u'Referencoj'],
        'es': ['Listaref', 'Reflist', 'muchasref'],
        'fa': [
            u'Reflist', u'Refs', u'FootnotesSmall', u'Reference', u'',
            u'Reflist', u' ', u' ', u'',
        ],
        'fi': [u'Viitteet', u'Reflist'],
        'fr': [u'Rfrences', u'Notes', u'References', u'Reflist'],
        'hu': [u'reflist', u'forrsok'],
        'it': [u'References'],
        'ja': [u'Reflist', u''],
        'ko': [u'', u'Reflist'],
        'lt': [u'Reflist', u'Ref', u'Litref'],
        'nl': [
            u'Reflist', u'Refs', u'FootnotesSmall', u'Reference',
            u'Ref-list', u'Reference list', u'References-small', u'Reflink',
            u'Referenties', u'Bron', u'Bronnen/noten/referenties', u'Bron2',
            u'Bron3', u'ref', u'references', u'appendix',
            u'Noot', u'FootnotesSmall',
        ],
        'pl': [u'przypisy', u'Przypisy'],
        'pt': [u'Notas', 'ref-section'],
        'zh': [u'Reflist'],
    },
}
# Wikitext that is inserted in place of a plain <references /> tag, grouped
# by family name and then by language code. Only wikis that need something
# other than the plain tag have to be listed.
# NOTE(review): the 'hu' value looks as if a non-ASCII character was lost.
referencesSubstitute = {
    'wikipedia': {
        'fi': u'{{viitteet}}',
        'hu': u'{{Forrsok}}',
    },
}
class XmlDumpNoReferencesPageGenerator:
    """
    Iterable that yields Pages which might lack a references tag.

    The candidates are read from a local XML dump file (pages-articles or
    pages-meta-current). An entry is yielded when its text, after removal
    of the disabled parts, contains a closing </ref> tag but nothing that
    looks like a <references ... /> tag.
    """

    def __init__(self, xmlFilename):
        """
        Arguments:
            * xmlFilename - The dump's path, either absolute or relative
        """
        # The references tag can contain additional spaces and a group
        # attribute, hence the lazy wildcard in front of the closing "/>".
        self.referencesR = re.compile('<references.*?/>', re.IGNORECASE)
        self.refR = re.compile('</ref>', re.IGNORECASE)
        self.xmlFilename = xmlFilename

    def __iter__(self):
        # Imported here so that the module is only needed when a dump is
        # actually iterated.
        import xmlreader
        for entry in xmlreader.XmlDump(self.xmlFilename).parse():
            cleaned = wikipedia.removeDisabledParts(entry.text)
            if not self.refR.search(cleaned):
                # No footnotes on this page at all.
                continue
            if self.referencesR.search(cleaned):
                # The footnotes already get displayed.
                continue
            yield wikipedia.Page(wikipedia.getSite(), entry.title)
class NoReferencesBot:
    """
    Bot that adds a references tag to pages which use <ref> without one.

    For every page delivered by the generator the bot checks whether the
    text contains a closing </ref> tag but neither a <references /> tag nor
    one of the references templates configured for the site. If so, the tag
    (or its site specific substitute) is inserted and the page is saved.
    """

    def __init__(self, generator, always = False):
        """
        Arguments:
            * generator - iterable that yields the pages to work on
            * always    - if True, changes are saved without asking
        """
        self.generator = generator
        self.always = always
        self.site = wikipedia.getSite()
        self.refR = re.compile('</ref>', re.IGNORECASE)
        # The references tag can contain additional spaces and a group
        # attribute.
        self.referencesR = re.compile('<references.*?/>', re.IGNORECASE)
        family = self.site.family.name
        lang = self.site.lang
        # Both tables are optional per family and per language.
        try:
            self.referencesTemplates = referencesTemplates[family][lang]
        except KeyError:
            self.referencesTemplates = []
        try:
            self.referencesText = referencesSubstitute[family][lang]
        except KeyError:
            self.referencesText = u'<references />'

    def lacksReferences(self, text, verbose = True):
        """
        Checks whether or not the page is lacking a references tag.

        Arguments:
            * text    - the wikitext of the page
            * verbose - if True, the reason for the decision is printed

        Returns True if the text (disabled parts removed) contains a
        closing </ref> tag but neither a references tag nor one of the
        configured references templates; False in every other case.
        """
        cleanedText = wikipedia.removeDisabledParts(text)
        lacking, reason = self._checkReferences(cleanedText)
        if verbose:
            wikipedia.output(reason)
        return lacking

    def _checkReferences(self, cleanedText):
        """
        Decide whether cleanedText lacks a references tag.

        Returns a tuple (lacking, reason): a boolean and the message that
        explains it. The checks are independent guard clauses so that every
        path returns an explicit boolean; the former if/elif chain could
        fall off its end and return None for a page that does need the tag.
        """
        if self.referencesR.search(cleanedText):
            return False, u'No changes necessary: references tag found.'
        if self._hasReferencesTemplate(cleanedText):
            return False, u'No changes necessary: references template found.'
        if not self.refR.search(cleanedText):
            return False, u'No changes necessary: no ref tags found.'
        return True, u'Found ref without references.'

    def _hasReferencesTemplate(self, cleanedText):
        """
        Return True if cleanedText uses one of self.referencesTemplates.
        """
        # Names are escaped because they are data, not regex syntax. Blank
        # names are dropped: an empty alternative would match every "{{".
        names = [re.escape(name) for name in self.referencesTemplates
                 if name.strip()]
        if not names:
            return False
        templateR = r'\{\{(%s)' % u'|'.join(names)
        # re.UNICODE lets IGNORECASE work for non-ASCII names on Python 2.
        flags = re.IGNORECASE | re.UNICODE
        return re.search(templateR, cleanedText, flags) is not None

    def _headingPattern(self, title, sameLevel = False):
        """
        Compile a regex that matches a section heading line for title.

        Arguments:
            * title     - the section title; it is matched literally
            * sameLevel - if True, the leading '=' run is captured as group
                          'ident' and the trailing run must be identical
        The heading has to be preceded and followed by a line break; both
        "\\r\\n" and "\\n" are accepted.
        """
        if sameLevel:
            template = r'\r?\n(?P<ident>=+) *%s *(?P=ident) *\r?\n'
        else:
            template = r'\r?\n=+ *%s *=+ *\r?\n'
        return re.compile(template % re.escape(title))

    def _findEnabledMatch(self, pattern, text, disabledMessage):
        """
        Return the first match of pattern in text that is not disabled.

        Matches for which wikipedia.isDisabled() is true are skipped, and
        disabledMessage is printed for each of them. Returns None if no
        usable match exists.
        """
        index = 0
        while index < len(text):
            match = pattern.search(text, index)
            if match is None:
                return None
            if not wikipedia.isDisabled(text, match.start()):
                return match
            wikipedia.output(disabledMessage)
            index = match.end()
        return None

    def addReferences(self, oldText):
        """
        Tries to add a references tag into an existing section where it fits
        into. If there is no such section, creates a new section containing
        the references tag.
        * Returns : The modified pagetext
        """
        # Is there an existing section where we can add the references tag?
        newText = self._addToExistingSection(oldText)
        if newText is not None:
            return newText
        # Create a new section in front of a section that usually follows.
        newText = self._addBeforeKnownSection(oldText)
        if newText is not None:
            return newText
        # Last resort: place it above the metadata at the end of the page.
        return self._addBeforeMetadata(oldText)

    def _addToExistingSection(self, oldText):
        """
        Insert the references text right below the first enabled section
        whose title is listed in referencesSections; None if there is none.
        """
        for section in wikipedia.translate(self.site, referencesSections):
            if not section.strip():
                # A blank title would match a bare line of '=' characters.
                continue
            match = self._findEnabledMatch(
                self._headingPattern(section), oldText,
                'Existing %s section is commented out, skipping.' % section)
            if match:
                wikipedia.output(u'Adding references tag to existing %s section...\n' % section)
                return oldText[:match.end()] + u'\n' + self.referencesText + u'\n' + oldText[match.end():]
        return None

    def _addBeforeKnownSection(self, oldText):
        """
        Create a references section in front of the first enabled section
        whose title is listed in placeBeforeSections, using the same heading
        level as that section; None if there is none.
        """
        for section in wikipedia.translate(self.site, placeBeforeSections):
            if not section.strip():
                # A blank title would match a bare line of '=' characters.
                continue
            match = self._findEnabledMatch(
                self._headingPattern(section, True), oldText,
                'Existing %s section is commented out, won\'t add the references in front of it.' % section)
            if match:
                wikipedia.output(u'Adding references section before %s section...\n' % section)
                return self.createReferenceSection(oldText, match.start(),
                                                   match.group('ident'))
        return None

    def _addBeforeMetadata(self, oldText):
        """
        Create a references section above the trailing metadata of the page.
        """
        # This gets complicated: we want to place the new references
        # section over the interwiki links and categories, but also
        # over all navigation bars, persondata, and other templates
        # that are at the bottom of the page. So we need some advanced
        # regex magic.
        # The strategy is: create a temporary copy of the text. From that,
        # keep removing interwiki links, templates etc. from the bottom.
        # At the end, look at the length of the temp text. That's the position
        # where we'll insert the references section.
        catNamespaces = '|'.join(self.site.category_namespaces())
        categoryPattern = r'\[\[\s*(%s)\s*:[^\n]*\]\]\s*' % catNamespaces
        interwikiPattern = r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]\s*'
        # won't work with nested templates
        # the negative lookahead assures that we'll match the last template
        # occurence in the temp text.
        templatePattern = r'{{((?!}}).)+?}}\s*'
        commentPattern = r'<!--((?!-->).)*?-->\s*'
        metadataR = re.compile(r'(\r\n)?(%s|%s|%s|%s)$' % (categoryPattern, interwikiPattern, templatePattern, commentPattern), re.DOTALL)
        tmpText = oldText
        while True:
            match = metadataR.search(tmpText)
            if not match:
                break
            # Cut the trailing element off and look at what is left.
            tmpText = tmpText[:match.start()]
        wikipedia.output(u'Found no section that can be preceeded by a new references section. Placing it before interwiki links, categories, and bottom templates.')
        index = len(tmpText)
        return self.createReferenceSection(oldText, index)

    def createReferenceSection(self, oldText, index, ident = '=='):
        """
        Insert a new references section into oldText at position index.

        Arguments:
            * oldText - the page text
            * index   - character offset where the section is inserted
            * ident   - run of '=' characters used on both sides of the title
        The title is the first (preferred) entry of referencesSections for
        the site.
        * Returns : The modified pagetext
        """
        newSection = u'\n%s %s %s\n%s\n' % (ident, wikipedia.translate(self.site, referencesSections)[0], ident, self.referencesText)
        return oldText[:index] + newSection + oldText[index:]

    def save(self, page, newText):
        """
        Saves the page to the wiki, if the user accepts the changes made.
        """
        wikipedia.showDiff(page.get(), newText)
        if not self.always:
            choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'Always yes'], ['y', 'N', 'a'], 'Y')
            if choice == 'n':
                return
            elif choice == 'a':
                self.always = True

        if self.always:
            # Nobody is watching: save synchronously and report problems.
            try:
                page.put(newText)
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict' % (page.title(),))
            except wikipedia.SpamfilterError as e:
                wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
            except wikipedia.LockedPage:
                wikipedia.output(u'Skipping %s (locked page)' % (page.title(),))
        else:
            # Save the page in the background. No need to catch exceptions.
            page.put_async(newText)
        return

    def run(self):
        """
        Process every page of the generator: load it, check it and, when a
        references tag is missing, add one and save the page.
        """
        comment = wikipedia.translate(self.site, msg)
        wikipedia.setAction(comment)

        for page in self.generator:
            # Show the title of the page we're working on.
            # Highlight the title in purple.
            wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
            try:
                text = page.get()
            except wikipedia.NoPage:
                wikipedia.output(u"Page %s does not exist?!" % page.aslink())
                continue
            except wikipedia.IsRedirectPage:
                wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
                continue
            except wikipedia.LockedPage:
                wikipedia.output(u"Page %s is locked?!" % page.aslink())
                continue
            if self.lacksReferences(text):
                newText = self.addReferences(text)
                self.save(page, newText)
def main():
    """
    Parse the command line, build the page generator and run the bot.

    Recognised here are -xml, -namespace: and -always; every other argument
    is first offered to the generator factory and, if the factory does not
    handle it, treated as a part of a single page title.
    """
    # Page generator; stays None until one of the sources below sets it.
    generator = None
    # Collects the words of the page title if one single page to work on
    # is specified by the arguments.
    titleParts = []
    # Namespaces to restrict the run to; an empty list means no filtering.
    namespaceFilter = []
    # Never ask before changing a page
    alwaysSave = False
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    factory = pagegenerators.GeneratorFactory()

    for argument in wikipedia.handleArgs():
        if argument.startswith('-xml'):
            if len(argument) > 4:
                # Form "-xml:filename": skip the option name and separator.
                dumpPath = argument[5:]
            else:
                dumpPath = wikipedia.input(u'Please enter the XML dump\'s filename:')
            generator = XmlDumpNoReferencesPageGenerator(dumpPath)
        elif argument.startswith('-namespace:'):
            value = argument[len('-namespace:'):]
            # Namespaces may be given by number or by name.
            try:
                namespaceFilter.append(int(value))
            except ValueError:
                namespaceFilter.append(value)
        elif argument == '-always':
            alwaysSave = True
        elif not factory.handleArg(argument):
            titleParts.append(argument)

    if titleParts:
        singlePage = wikipedia.Page(wikipedia.getSite(), ' '.join(titleParts))
        generator = iter([singlePage])
    if not generator:
        generator = factory.getCombinedGenerator()
    if not generator:
        # Nothing to work on: show the usage text instead.
        wikipedia.showHelp('noreferences')
        return

    if namespaceFilter:
        generator = pagegenerators.NamespaceFilterPageGenerator(generator, namespaceFilter)
    preloading = pagegenerators.PreloadingGenerator(generator)
    NoReferencesBot(preloading, alwaysSave).run()
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always called, even when main() raises.
        wikipedia.stopme()
|