"""Check for broken links in an XHTML file.
Demonstrates several Pyana features:
1. how to accept input from both a string and a URL
2. how to pass a top-level parameter
3. how to have the output of a transformation sent to a writer object
4. how to extend XPath with Python functions
Sample output:
> check_links.py http://pyana.sourceforge.net/
Checking for broken links in "http://pyana.sourceforge.net/"
Checking image "http://sourceforge.net/sflogo.php?group_id=28142&type=1".ok
Checking link "http://sourceforge.net".................................ok
Checking link "http://sourceforge.net/projects/pyana/".................ok
Checking link "http://sourceforge.net/project/showfiles.php?group_id=28142".ok
Checking link "build.html".............................................failed
Checking link "examples"...............................................ok
Checking link "http://pirxx.sourceforge.net"...........................ok
Checking link "mailto:brian@sweetapp.com"..............................maybe
"""
import Pyana
import urllib2
import urlparse
import sys
import urlparse
def checkBroken(base, frag):
"""checkBroken('http://www.python.org/', 'img.gif') => 'failed'
Checks to see if a URL that can be accessed. If a URL can be accessed,
returns 'ok', if not returns 'failed'. If a URL uses a protocol that
cannot be chacked, returns 'maybe'."""
# Encode the Unicode URIs as UTF-8. If they are ASCII then there
# content will be unchanged.
url = urlparse.urljoin(base.encode('utf-8'), frag.encode('utf-8'))
if urlparse.urlparse(url)[0] not in ['ftp', 'http', 'https', 'gopher']:
return 'maybe' # urllib2 is going to have problems
try:
urllib2.urlopen(urlparse.urljoin(base.encode('utf-8'), url.encode('utf-8')))
return 'ok'
except IOError:
return 'failed'
def format(str, width=55, char='.'):
    """format("image.gif") => '"image.gif"' followed by 46 dots

    Returns `str` wrapped in double quotes and followed by a run of `char`.
    The run is `width - len(str)` repetitions long, but never shorter than
    one, so longer strings still get a visible separator."""
    # Use the caller-supplied fill character rather than a hard-coded '.'.
    return '"' + str + '"' + char * max(width - len(str), 1)
# Install the Python functions above as XPath extension functions. Both are
# registered in the 'pyNS' namespace; the stylesheet below binds that
# namespace to the 'py' prefix and calls them as py:checkBroken(...) and
# py:format(...).
Pyana.installGlobalExtension('pyNS', checkBroken, 'checkBroken')
Pyana.installGlobalExtension('pyNS', format, 'format')
# XSLT stylesheet, kept as a string, that produces the plain-text report.
# It takes a top-level "url" parameter, selects every xhtml:img element with
# a src attribute and every xhtml:a element with an href attribute, and for
# each one emits the output of the py:format and py:checkBroken extension
# functions.
checkURLsXSL = r'''
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:py="pyNS"
xmlns:xhtml="http://www.w3.org/1999/xhtml"
version="1.0">
<xsl:param name="url"/>
<xsl:output method="text"/>
<xsl:template match="/">Checking for broken links in "<xsl:value-of select="$url"/>"
<xsl:apply-templates select="//xhtml:img[@src]"/>
<xsl:apply-templates select="//xhtml:a[@href]"/>
</xsl:template>
<xsl:template match="xhtml:img[@src]">
Checking image <xsl:value-of select="py:format(string(@src))"/><xsl:value-of select="py:checkBroken($url, string(@src))"/>
</xsl:template>
<xsl:template match="xhtml:a[@href]">
Checking link <xsl:value-of select="py:format(string(@href))"/><xsl:value-of select="py:checkBroken($url, string(@href))"/>
</xsl:template>
</xsl:stylesheet>
'''
def printBrokenLinksForURL(url):
# Send the output to a writer so we don't have to wait until the entire transformation is complete before
# seeing output. Any object with a "write" method that takes a string is fine, so just use sys.stdout
# instead of creating our own object.
try:
Pyana.transform2Writer(source=Pyana.URI(url), style=checkURLsXSL, params={'url': repr(url)}, writer=sys.stdout)
except Pyana.SAXError:
print 'The source does not seem to be valid XHTML'
if __name__ == '__main__':
if len(sys.argv) == 1:
print 'usage: %s url1 [url2] .. [urln]\n'\
'e.g. %s http://pyana.sourceforge.net/' % (sys.argv[0], sys.argv[0])
else:
for url in sys.argv[1:]:
printBrokenLinksForURL(url)
|