# -*- coding: utf-8 -*-
"""
This bot checks the external links on Wikipedia pages. It works through the
pages in batches (pulling up to 480 titles ahead and retrieving their texts
60 at a time), reports the error code for every external link that causes
problems, and counts the number of links with and without problems.

It accepts all general wikipedia bot arguments as well as:

-start:xxx  Check all pages, starting at 'xxx'.
-nolog      Do not log to a file; only write output to the screen.

Any other argument is taken as the title of a page to check. Spaces in page
titles have to be replaced by underscores, otherwise the bot assumes the
parts are separate pages. If no page has been specified and no -start
argument has been given either, the bot acts as if -start:! had been
specified, starting at the beginning.

The bot reports every link that has some problem, together with the error
code returned by the server, the artificial error code -1 if the server
could not be reached at all, or -2 if no host could be determined from the
URL. Output is sent both to the screen and to the file check_extern.log.
"""
#
# (C) Andre Engels, 2004
#
# Distributed under the terms of the MIT license.
#
__version__='$Id: check_extern.py,v 1.16 2005/12/21 17:51:26 wikipedian Exp $'
import wikipedia, urllib, re, sys, httplib

class URLerrorFinder(urllib.FancyURLopener):
    version = "RobHooftWikiRobot/1.0"

    # Largely follows urllib.URLopener.open_http, but returns only the
    # numeric HTTP status code instead of a file-like response object.
    def open_http(self, url):
        """Open an http URL and return the HTTP status code."""
        if isinstance(url, str):
            host, selector = urllib.splithost(url)
            if host:
                user_passwd, host = urllib.splituser(host)
                host = urllib.unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = urllib.splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = urllib.splithost(rest)
                if realhost:
                    user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if urllib.proxy_bypass(realhost):
                    host = realhost
        if not host:
            return -2
        h = httplib.HTTP(host)
        h.putrequest('GET', selector)
        if realhost:
            h.putheader('Host', realhost)
        for args in self.addheaders:
            h.putheader(*args)
        h.endheaders()
        errcode, errmsg, headers = h.getreply()
        return errcode
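
# A minimal usage sketch (illustrative, not part of the original script): for a
# plain http:// URL, URLerrorFinder().open() dispatches to open_http() above and
# therefore returns a numeric status code rather than a file object, e.g.
#
#   code = URLerrorFinder().open('http://www.example.org/')
#   # code == 200 if the server answered normally, -2 if no host could be parsed
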
# Which error codes do we not consider errors?
allowederrorcodes = [100,101,200,201,202,203,205,304]
errname = {
    -1: 'No contact to server',
    -2: 'No host found',
    100: 'Continue',
    101: 'Switching Protocols',
    200: 'OK',
    201: 'Created',
    202: 'Accepted',
    203: 'Non-Authoritative Information',
    204: 'No Content',
    205: 'Reset Content',
    206: 'Partial Content',
    300: 'Multiple Choices',
    301: 'Moved Permanently',
    302: 'Moved Temporarily',
    303: 'See Other',
    304: 'Not Modified',
    305: 'Use Proxy',
    307: 'Temporary Redirect',
    400: 'Bad Request',
    401: 'Unauthorized',
    402: 'Payment Required',
    403: 'Forbidden',
    404: 'Not Found',
    405: 'Method Not Allowed',
    406: 'Not Acceptable',
    407: 'Proxy Authentication Required',
    408: 'Request Timeout',
    409: 'Conflict',
    410: 'Gone',
    411: 'Length Required',
    412: 'Precondition Failed',
    413: 'Request Entity Too Large',
    414: 'Request-URI Too Large',
    415: 'Unsupported Media Type',
    416: 'Requested Range not satisfiable',
    417: 'Expectation Failed',
    500: 'Internal Server Error',
    501: 'Not Implemented',
    502: 'Bad Gateway',
    503: 'Service Unavailable',
    504: 'Gateway Timeout',
    505: 'HTTP Version not supported',
    8181: 'Certificate Expired',
    12002: 'Timeout',
    12007: 'No such host',
    12029: 'No connection',
    12031: 'Connection Reset'
}

def errorname(error):
    """Return a readable name for a numeric HTTP (or internal) error code."""
    if error in errname:
        return errname[error]
    elif 300 < error < 400:
        return 'Unknown Redirection Response'
    else:
        return 'Unknown Error'
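
# For example (illustrative only): errorname(404) returns 'Not Found', while an
# unlisted code in the 3xx range, say errorname(399), falls back to
# 'Unknown Redirection Response'.
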
start = '!'
log = True
todo = []
do_all = False

for arg in sys.argv[1:]:
    arg = wikipedia.argHandler(arg, 'check_extern')
    if arg:
        if arg.startswith('-start:'):
            start = arg[7:]
            do_all = True
        elif arg == '-nolog':
            log = False
        else:
            mysite = wikipedia.getSite()
            todo.append(wikipedia.Page(mysite, arg))

# Make sure we have the final site
mysite = wikipedia.getSite()
if todo == []:
    # No pages have been given; if no start page has been given either, we
    # start at the beginning.
    do_all = True

if log:
    import logger
    sys.stdout = logger.Logger(sys.stdout, filename='check_extern.log')

cont = True
checked = 0
working = 0
nonworking = 0
totalchecked = 0

try:
    while cont:
        print
        i = 0
        if len(todo) < 61 and do_all:
            # Fetch up to 480 more titles from the list of all pages.
            for pl in wikipedia.allpages(start=start):
                todo.append(pl)
                i += 1
                if i == 480:
                    break
            # Continue the listing just after the last title fetched
            # ('_0' makes the next start sort directly after it).
            start = todo[-1].title() + '_0'
        # todo is the queue of pages still to be done; donow holds the pages
        # handled in this run (at most 60).
        if len(todo) > 60:
            # Take the first 60.
            donow = todo[0:60]
            todo = todo[60:]
        else:
            donow = todo
            # If there were more pages to do, the 'if len(todo) < 61' branch
            # above would already have extended todo beyond this size, so this
            # is the last batch.
            cont = False
        try:
            wikipedia.getall(mysite, donow)
        except wikipedia.SaxError:
            # Ignore this error and fetch each page the traditional way instead.
            pass
        checked += len(donow)
        # Rough match for external http:// links: read up to whitespace or wiki
        # markup, and do not let the link end in punctuation.
        R = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]')
        for pl in donow:
            try:
                for url in R.findall(pl.get()):
                    url = wikipedia.unicode2html(url, 'ascii')
                    try:
                        error = URLerrorFinder().open(url)
                    except IOError:
                        # The server could not be reached at all.
                        error = -1
                    if error in allowederrorcodes:
                        working += 1
                    else:
                        nonworking += 1
                        print
                        wikipedia.output(u'Page "%s" links to:' % pl.title())
                        wikipedia.output(url)
                        wikipedia.output(u'Which gave error: %s %s' % (error, errorname(error)))
            except (wikipedia.NoPage, wikipedia.IsRedirectPage, wikipedia.LockedPage):
                # If anything is wrong with the wiki page itself, just skip it.
                pass
        if checked > 499 or not cont:
            # Print a progress report every 500 pages and after the last batch.
            totalchecked += 500
            checked -= 500
            print
            print '======================================================================'
            wikipedia.output(u'%s pages checked, last was [[%s]]' % (totalchecked + checked, donow[-1]))
            print 'In those pages there were %s correct and %s problematic external links.' % (working, nonworking)
except:
    wikipedia.stopme()
    raise
wikipedia.stopme()