# -*- coding: utf-8 -*-
"""
This bot uploads text from djvu files onto pages in the "Page"
namespace. It is intended to be used for Wikisource.

The following parameters are supported:

    -dry           If given, doesn't do any real changes, but only shows
                   what would have been changed.
    -ask           Ask for confirmation before uploading each page.
                   (Default: ask when overwriting pages)
    -djvu:...      Filename of the djvu file
    -index:...     Name of the index page
                   (Default: the djvu filename)
    -pages:<start>-<end> Page range to upload; <end> is optional

All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
"""
# (C) Pywikipedia bot team, 2008-2010
# Distributed under the terms of the MIT license.
__version__ = '$Id: djvutext.py 8051 2010-04-04 15:33:15Z mfarag $'
import wikipedia
import os, sys
import config, codecs
# This is required for the text that is shown when you run this script
# with the parameter -help.
# No replacement entries are defined for this script, so the mapping is empty.
docuReplacements = {}
class DjVuTextBot:
    """Bot that copies the text layer of a DjVu file onto wiki pages.

    For every page number in the requested range the text is extracted with
    the external ``djvutxt`` tool and written to the wiki page
    ``<Page namespace>:<index title>/<page number>``.
    """

    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        # NOTE(review): the Arabic wording appears to have been lost from this
        # literal (only the ASCII remainder is left) -- restore from upstream.
        'ar': u': DjVu',
        'en': u'Robot: creating page with text extracted from DjVu',
        'fr': u'Bot: Creating page with texte extracted from DjVu',
        'nl': u'Bot: pagina aangemaakt met tekst geëxtraheerd uit DjVu-bestand',
        'pt': u'Bot: criando página com texto extraído do DjVu',
    }

    # On English Wikisource, {{blank page}} is used to track blank pages.
    # It may be omitted by adding an empty string like has been done for 'fr'.
    blank = {
        'en': u'{{blank page}}',
        'fr': u'',
        'pt': u'',
    }

    def __init__(self, djvu, index, pages, ask=False, debug=False):
        """
        Constructor. Parameters:
            djvu  : filename of the DjVu file
            index : name of the index page
            pages : page range, "<start>", "<start>-" or "<start>-<end>";
                    a false value means all pages
            ask   : if True, ask for confirmation before saving every page
            debug : if True, run in dry mode (show the diff, save nothing)
        """
        self.djvu = djvu
        self.index = index
        self.pages = pages
        self.dry = debug
        self.ask = ask

    def _encode_cmd(self, cmd):
        """Encode a unicode shell command line for os.system / os.popen."""
        # sys.stdout.encoding is None when stdout is not a terminal (piped or
        # redirected); str.encode(None) would raise TypeError, so fall back.
        return cmd.encode(sys.stdout.encoding or 'utf-8')

    def NoOfImages(self):
        """Return the page count that ``djvused -e 'n'`` prints for the file.

        Raises ValueError if the tool prints nothing that parses as an integer.
        """
        cmd = u"djvused -e 'n' \"%s\"" % (self.djvu)
        pipe = os.popen(self._encode_cmd(cmd))
        try:
            count = int(pipe.readline().rstrip())
        finally:
            # Always release the pipe, even when the output cannot be parsed.
            pipe.close()
        wikipedia.output("page count = %d" % count)
        return count

    def PagesGenerator(self):
        """Return the list of page numbers to process.

        Without a page range every page from 1 to NoOfImages() is returned.
        "<start>-<end>" selects that inclusive range, "<start>-" runs from
        <start> to the last page, and a bare "<start>" selects one page.
        """
        start = 1
        end = self.NoOfImages()
        if self.pages:
            pos = self.pages.find('-')
            if pos != -1:
                start = int(self.pages[:pos])
                # Only override the end when something follows the dash.
                if pos < len(self.pages) - 1:
                    end = int(self.pages[pos + 1:])
            else:
                start = int(self.pages)
                end = start
        wikipedia.output(u"Processing pages %d-%d" % (start, end))
        return range(start, end + 1)

    def run(self):
        """Set the edit summary, derive the title prefix and treat each page."""
        # Set the edit summary message
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))

        linkingPage = wikipedia.Page(wikipedia.getSite(), self.index)
        self.prefix = linkingPage.titleWithoutNamespace()
        # Drop a leading 'Liber:' that is still part of the title.
        if self.prefix[0:6] == 'Liber:':
            self.prefix = self.prefix[6:]
        wikipedia.output(u"Using prefix %s" % self.prefix)
        gen = self.PagesGenerator()

        site = wikipedia.getSite()
        self.username = config.usernames[site.family.name][site.lang]

        for pageno in gen:
            wikipedia.output("Processing page %d" % pageno)
            # NOTE(review): the per-page call was missing from the damaged
            # source; treat() is the only method that handles a single page.
            self.treat(pageno)

    def has_text(self):
        """Return True if the ``djvudump`` output of the file contains 'TXTz'.

        The dump is written to ``<djvu>.out`` and read back from there.
        """
        cmd = u"djvudump \"%s\" > \"%s\".out" % (self.djvu, self.djvu)
        os.system(self._encode_cmd(cmd))

        f = codecs.open(u"%s.out" % self.djvu, 'r',
                        config.textfile_encoding, 'replace')
        try:
            s = f.read()
        finally:
            f.close()
        return s.find('TXTz') >= 0

    def get_page(self, pageno):
        """Return the text that ``djvutxt`` extracts for page ``pageno``.

        The tool writes to ``<djvu>.out``; the file is decoded with
        config.textfile_encoding, replacing undecodable bytes.
        """
        wikipedia.output(u"fetching page %d" % (pageno))
        cmd = u"djvutxt --page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu,
                                                       self.djvu)
        os.system(self._encode_cmd(cmd))

        f = codecs.open(u"%s.out" % self.djvu, 'r',
                        config.textfile_encoding, 'replace')
        try:
            djvu_text = f.read()
        finally:
            f.close()
        return djvu_text

    def treat(self, pageno):
        """
        Loads the given page, does some changes, and saves it.

        Builds the wiki text for page ``pageno`` from the DjVu text layer,
        shows a diff against the current content and saves the result unless
        running in dry mode or the user declines.
        """
        site = wikipedia.getSite()
        # 104 is the namespace number used for the target pages
        # (presumably the "Page" namespace -- confirm against the family file).
        page_namespace = site.family.namespaces[104][site.lang]
        page = wikipedia.Page(site, u'%s:%s/%d'
                              % (page_namespace, self.prefix, pageno))
        exists = page.exists()

        djvutxt = self.get_page(pageno)
        if not djvutxt:
            # No text on this page: use the site's blank-page marker instead.
            djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank)

        text = u'<noinclude>{{PageQuality|1|%s}}<div class="pagetext">\n\n\n</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,djvutxt)

        # convert to wikisyntax
        # this adds a second line feed, which makes a new paragraph
        # NOTE(review): the search string was empty in the damaged source,
        # which would insert a newline between every character. The unit
        # separator (0x1F) is presumed to be the lost character -- confirm.
        text = text.replace(u'\x1f', u"\n")

        # only save if something was changed
        # automatically ask if overwriting an existing page
        ask = self.ask
        if exists:
            ask = True
            old_text = page.get()
            if old_text == text:
                wikipedia.output(u"No changes were needed on %s"
                                 % page.aslink())
                return
        else:
            old_text = ''

        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        wikipedia.showDiff(old_text, text)

        if self.dry:
            wikipedia.inputChoice(u'Dry mode... Press enter to continue',
                                  [], [], 'dummy')
            return

        if ask:
            choice = wikipedia.inputChoice(
                u'Do you want to accept these changes?',
                ['Yes', 'No'], ['y', 'N'], 'N')
        else:
            choice = 'y'

        if choice == 'y':
            try:
                # Save the page
                # NOTE(review): the save call was missing from the damaged
                # source; page.put(text) is the presumed call -- confirm.
                page.put(text)
            except wikipedia.LockedPage:
                wikipedia.output(u"Page %s is locked; skipping."
                                 % page.aslink())
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict'
                                 % (page.title()))
            except wikipedia.SpamfilterError as error:
                wikipedia.output(
                    u'Cannot change %s because of spam blacklist entry %s'
                    % (page.title(), error.url))
def main():
    """Parse the command line and upload the text layer of a DjVu file.

    Recognised arguments: -dry, -ask, -djvu:<file>, -index:<page> and
    -pages:<range>. The index page name defaults to the base name of the
    DjVu file. Any other argument is reported as unknown.

    Raises:
        OSError: if the DjVu file does not exist.
        wikipedia.PageNotFound: if the site family is not 'wikisource'.
        wikipedia.NoPage: if the index page cannot be found.
        ValueError: if the DjVu file has no text layer.
    """
    import os.path

    index = None
    djvu = None
    pages = None
    # If dry is True, no real changes are made; the bot only shows
    # what would have been changed.
    dry = False
    ask = False

    # Parse command line arguments
    for arg in wikipedia.handleArgs():
        if arg.startswith("-dry"):
            dry = True
        elif arg.startswith("-ask"):
            ask = True
        elif arg.startswith("-djvu:"):
            djvu = arg[6:]
        elif arg.startswith("-index:"):
            index = arg[7:]
        elif arg.startswith("-pages:"):
            pages = arg[7:]
        else:
            wikipedia.output(u"Unknown argument %s" % arg)

    # Check the djvu file exists
    if djvu:
        # os.stat raises OSError for a missing file, which stops the run
        # before anything is sent to the wiki.
        os.stat(djvu)

        if not index:
            index = os.path.basename(djvu)

    if djvu and index:
        site = wikipedia.getSite()
        index_page = wikipedia.Page(site, index)

        if site.family.name != 'wikisource':
            raise wikipedia.PageNotFound(
                u"Found family '%s'; Wikisource required." % site.family.name)

        if not index_page.exists() and index_page.namespace() == 0:
            # The name was given without a namespace: retry inside the index
            # namespace whose name is read from the wiki's MediaWiki page.
            index_namespace = wikipedia.Page(
                site, 'MediaWiki:Proofreadpage index namespace').get()

            index_page = wikipedia.Page(wikipedia.getSite(),
                                        u"%s:%s" % (index_namespace, index))
            if not index_page.exists():
                raise wikipedia.NoPage(u"Page '%s' does not exist" % index)

        wikipedia.output(u"uploading text from %s to %s"
                         % (djvu, index_page.aslink()))

        bot = DjVuTextBot(djvu, index, pages, ask, dry)
        if not bot.has_text():
            raise ValueError("No text layer in djvu file")

        # NOTE(review): the call that starts the bot was missing from the
        # damaged source; without it the bot is built but never does anything.
        bot.run()
    else:
        # NOTE(review): presumed fallback when no -djvu file was given;
        # confirm that wikipedia.showHelp() is the intended behaviour.
        wikipedia.showHelp()
if __name__ == "__main__":