# -*- coding: utf-8 -*-
"""
Reads a cur SQL dump and offers a generator over SQLentry objects which can be
used by other bots. Each SQLentry object represents a page.
Can also be run directly from the command line to retrieve page lists from
an SQL dump.
Syntax:
python sqldump.py -sql:filename.sql action
Where action can be one of these:
* find - List pages which contain a certain text
* findr - List pages containing text matching a regular expression
* shortpages - List pages with short contents
* unmountedcats - List categories that don't have a supercategory
* percentnames - List pages that contain internal links where special
characters are encoded as hexadecimal codes, e.g. %F6
* baddisambiguations - Created for de.wikipedia to fix primary topic
    disambiguations (Begriffsklärung nach Modell 2).
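
Example (dump.sql being a placeholder for your local dump file):

python sqldump.py -sql:dump.sql find

The keyword to search for is then asked for interactively.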
"""
#
# (C) Daniel Herding, 2004
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
from __future__ import generators
import re, time, codecs
import wikipedia, config
class SQLentry(object):
'''
    Represents a wiki page, read from an SQL dump.
An instance of this class will have the following attributes:
* self.id is the page ID (integer)
* self.namespace is the namespace ID (integer)
* self.title is the page title without namespace (unicode)
* self.text is the text on that page (unicode)
* self.comment is the last edit summary (unicode)
* self.userid is the last editor's ID (integer)
* self.username is the last editor's username (unicode)
* self.timestamp is the time of the last edit (time tuple)
* self.restrictions is True if the page is locked (boolean)
* self.counter is the # of page views, disabled on Wikimedia wikis (integer)
* self.redirect is True if the page is a redirect (boolean)
* self.minor is True if the last edit was marked as minor (boolean)
* self.new is True if the last edit was the first one (boolean)
* self.random is a random number used for the 'Random Page' function (float)
* self.touched is the date of the last cache update (time tuple)
See http://meta.wikimedia.org/wiki/Cur_table for details.
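
    A short sketch of typical attribute access (the page used here is
    invented, not from a real dump):

        if entry.namespace == 14 and not entry.redirect:
            print entry.full_title()    # e.g. u'Category:Foo'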
'''
def __init__(self, id, namespace, title, text, comment, userid, username, timestamp, restrictions, counter, redirect, minor, new, random, inversetimestamp, touched):
'''
        Constructor. All parameters should be strings, as read from the SQL
        dump. This function will convert them to formats which are more
        appropriate for the data types.
'''
self.id = int(id)
self.namespace = int(namespace)
self.title = title
self.text = text
self.comment = comment
self.userid = int(userid)
self.username = username
        # convert to a 9-element time tuple, see http://python.org/doc/2.3.4/lib/module-time.html
self.timestamp = time.strptime(timestamp, '%Y%m%d%H%M%S')
# convert to boolean
self.restrictions = (restrictions != '')
self.counter = int(counter)
self.redirect = (redirect == '1')
self.minor = (minor == '1')
self.new = (new == '1')
self.random = float(random)
# Inversetimestamp is obsolete, so we ignore it.
#self.inversetimestamp = inversetimestamp
        # Ideally, touched would be converted to a time tuple, just like
        # timestamp above. But there is a problem: in the nds: dump, touched
        # comes before inversetimestamp, and that would make strptime fail.
        # So we simply leave touched as a string and hope that this is the
        # only case where the columns are mixed up. If you find other such
        # cases, please report them.
#self.touched = time.strptime(touched, '%Y%m%d%H%M%S')
self.touched = touched
# MediaWiki escapes apostrophes, backslashes and quotes with
# backslashes. We need to unescape them again.
        # This regular expression matches a backslash followed by a group,
        # where the group matches either an apostrophe, a backslash or a
        # quote.
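        # For example, the escaped dump text it\'s a \"test\" becomes
        # it's a "test" after unescaping (an illustrative sample, not taken
        # from a real dump).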
escapedR = re.compile(r'\\([\\\"\'])')
# The group \1 is the character we really want, while the leading
# backslash is only escape information we don't need.
self.title = escapedR.sub(r"\1", self.title)
self.text = escapedR.sub(r"\1", self.text)
self.comment = escapedR.sub(r"\1", self.comment)
self.username = escapedR.sub(r"\1", self.username)
        # convert \r and \n escape sequences to carriage returns and newlines.
self.text = self.text.replace('\\r', '\r')
self.text = self.text.replace('\\n', '\n')
# comments can also contain newline characters
self.comment = self.comment.replace('\\r', '\r')
self.comment = self.comment.replace('\\n', '\n')
# I hope that titles and usernames can't :-)
def full_title(self, underline = True):
'''
        Returns the full page title in the form 'namespace:title', using the
        localized namespace titles defined in your family file.
        If underline is True, returns the page title with underscores
        instead of spaces; otherwise underscores are replaced by spaces.
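
        For example, an entry with namespace 14 and title u'Foo_Bar' would
        give u'Category:Foo_Bar' on an English wiki (u'Kategorie:Foo_Bar'
        on a German one).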
'''
        if not underline:
            title = self.title.replace('_', ' ')
        else:
            title = self.title
        namespace_title = wikipedia.getSite().namespace(self.namespace)
        if namespace_title is None:
            return title
        else:
            if underline:
                namespace_title = namespace_title.replace(' ', '_')
            return namespace_title + ':' + title
def age(self):
'''
Returns the time passed since the last edit, in relation to the current
system time, in seconds (floating point number).
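
        For example, an entry whose last edit happened an hour ago returns
        roughly 3600.0.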
'''
return time.time() - time.mktime(self.timestamp)
# Represents one SQL dump file. The local file is not read until the
# entries() generator is used; entries() parses the dump with a regular
# expression and yields the resulting SQLentry objects one page at a time.
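#
# Typical usage (a sketch; 'dump.sql' stands for your local dump file):
#
#     sqldump = SQLdump('dump.sql', 'utf-8')
#     for entry in sqldump.entries():
#         print entry.full_title()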
class SQLdump(object):
def __init__(self, filename, encoding):
self.filename = filename
self.encoding = encoding
def entries(self):
'''
        Generator which reads one line at a time from the SQL dump file and
        parses it to create SQLentry objects. Stops when the end of file is
        reached.
'''
# This regular expression will match one SQL database entry (i.e. a
# page), and each group represents an attribute of that entry.
# NOTE: We don't need re.DOTALL because newlines are escaped.
        pageR = re.compile(r"\((\d+),"  # cur_id (page ID number)
            + r"(\d+),"                 # cur_namespace (namespace number)
            + r"'(.*?)',"               # cur_title (page title w/o namespace)
            + r"'(.*?)',"               # cur_text (page contents)
            + r"'(.*?)',"               # cur_comment (last edit's summary text)
            + r"(\d+),"                 # cur_user (user ID of last contributor)
            + r"'(.*?)',"               # cur_user_text (user name)
            + r"'(\d{14})',"            # cur_timestamp (time of last edit)
            + r"'(.*?)',"               # cur_restrictions (protected pages have 'sysop' here)
            + r"(\d+),"                 # cur_counter (view counter, disabled on WP)
            + r"([01]),"                # cur_is_redirect
            + r"([01]),"                # cur_minor_edit
            + r"([01]),"                # cur_is_new
            + r"([\d\.]+?),"            # cur_random (for random page function)
            + r"'(\d{14})',"            # inverse_timestamp (obsolete)
            + r"'(\d{14})'\)")          # cur_touched (cache update timestamp)
print 'Reading SQL dump'
        # Open the file, read it using the given encoding, and replace
        # characters that can't be decoded with replacement markers.
        f = codecs.open(self.filename, 'r', encoding=self.encoding,
                        errors='replace')
eof = False
while not eof:
# Read only one (very long) line because we would risk out of memory
# errors if we read the entire file at once
line = f.readline()
if line == '':
print 'End of file.'
eof = True
for id, namespace, title, text, comment, userid, username, timestamp, restrictions, counter, redirect, minor, new, random, inversetimestamp, touched in pageR.findall(line):
new_entry = SQLentry(id, namespace, title, text, comment, userid, username, timestamp, restrictions, counter, redirect, minor, new, random, inversetimestamp, touched)
yield new_entry
f.close()
def query_percentnames(self):
'''
yields pages that contain internal links where special characters are
encoded as hexadecimal codes, e.g. %F6
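        For example, a link written as [[D%F6sen]] instead of [[Dösen]]
        would match.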
'''
        Rpercentlink = re.compile(r'\[\[[^\]]*?%[A-F0-9]{2}[^\]]*?\]\]')
for entry in self.entries():
text = wikipedia.removeLanguageLinks(entry.text)
if Rpercentlink.search(text):
yield entry
def query_shortpages(self, minsize):
'''
        yields articles that have fewer than minsize characters of text
'''
for entry in self.entries():
if entry.namespace == 0 and not entry.redirect and len(entry.text) < minsize:
yield entry
def query_find(self, keyword):
'''
yields pages which contain keyword
'''
for entry in self.entries():
if entry.text.find(keyword) != -1:
yield entry
def query_findr(self, regex, namespace = None):
'''
yields pages which contain a string matching the given regular expression
'''
r = re.compile(regex)
for entry in self.entries():
            if r.search(entry.text) and (namespace is None or entry.namespace == namespace):
yield entry
def query_unmountedcats(self):
'''
yields categories which don't have any supercategory
'''
for entry in self.entries():
if entry.namespace == 14:
has_supercategory = False
for ns in wikipedia.getSite().category_namespaces():
if entry.text.find('[[%s:' % ns) != -1:
has_supercategory = True
break
if not has_supercategory:
yield entry
def query(sqldump, action):
if action == 'percentnames':
for entry in sqldump.query_percentnames():
yield entry
elif action == 'shortpages':
minsize = int(wikipedia.input(u'Minimum size:'))
for entry in sqldump.query_shortpages(minsize):
yield entry
elif action == 'find':
keyword = wikipedia.input(u'Search for:')
for entry in sqldump.query_find(keyword):
yield entry
elif action == 'findr':
keyword = wikipedia.input(u'Search for:')
for entry in sqldump.query_findr(keyword):
yield entry
elif action == 'unmountedcats':
for entry in sqldump.query_unmountedcats():
yield entry
    elif action == 'baddisambiguations':
for entry in sqldump.entries():
if entry.namespace == 0 and entry.title.endswith(')') and entry.text.startswith("''") and not entry.text.startswith("'''"):
yield entry
if __name__=="__main__":
wikipedia.stopme() # No need to have me on the stack, as I'm not contacting the wiki
import sys
action = None
filename = None
for arg in sys.argv[1:]:
arg = wikipedia.argHandler(arg, 'sqldump')
if arg:
if arg.startswith('-sql'):
if len(arg) == 4:
filename = wikipedia.input(u'Please enter the SQL dump\'s filename: ')
else:
filename = arg[5:]
else:
action = arg
if not filename or not action:
wikipedia.output(__doc__, 'utf-8')
else:
sqldump = SQLdump(filename, wikipedia.myencoding())
for entry in query(sqldump, action):
wikipedia.output(u'*[[%s]]' % entry.full_title())