BeautifulSoupTests.py :  » Web-Services » Beautiful-Soup » BeautifulSoup-3.1.0.1 » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Web Services » Beautiful Soup 
Beautiful Soup » BeautifulSoup 3.1.0.1 » BeautifulSoupTests.py
# -*- coding: utf-8 -*-
"""Unit tests for Beautiful Soup.

These tests make sure the Beautiful Soup works as it should. If you
find a bug in Beautiful Soup, the best way to express it is as a test
case like this that fails."""

import unittest
from BeautifulSoup import *

class SoupTest(unittest.TestCase):

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
                         encoding=None):
        """Parse the given text and make sure its string rep is the other
        given text."""
        if rep == None:
            rep = toParse
        obj = c(toParse)
        if encoding is None:
            rep2 = obj.decode()
        else:
            rep2 = obj.encode(encoding)
        self.assertEqual(rep2, rep)

class FollowThatTag(SoupTest):

    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        ml = """
        <a id="x">1</a>
        <A id="a">2</a>
        <b id="b">3</a>
        <b href="foo" id="x">4</a>
        <ac width=100>4</ac>"""
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        matching = self.soup.findAll(id='x')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching[1].name, 'b')

        matching2 = self.soup.findAll(attrs={'id' : 'x'})
        self.assertEqual(matching, matching2)

        strainer = SoupStrainer(attrs={'id' : 'x'})
        self.assertEqual(matching, self.soup.findAll(strainer))

        self.assertEqual(len(self.soup.findAll(id=None)), 1)

        self.assertEqual(len(self.soup.findAll(width=100)), 1)
        self.assertEqual(len(self.soup.findAll(junk=None)), 5)
        self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)

        self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)

        self.assertEqual(len(self.soup.findAll(junk=True)), 0)
        self.assertEqual(len(self.soup.findAll(href=True)), 1)

    def testFindallByClass(self):
        soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
        self.assertEqual(soup.find('a', '1').string, "Bar")

    def testFindAllByList(self):
        matching = self.soup(['a', 'ac'])
        self.assertEqual(len(matching), 3)

    def testFindAllByHash(self):
        matching = self.soup({'a' : True, 'b' : True})
        self.assertEqual(len(matching), 4)

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>")
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testFindAllByRE(self):
        import re
        r = re.compile('a.*')
        self.assertEqual(len(self.soup(r)), 3)

    def testFindAllByMethod(self):
        def matchTagWhereIDMatchesName(tag):
            return tag.name == tag.get('id')

        matching = self.soup.findAll(matchTagWhereIDMatchesName)
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')

    def testParents(self):
        soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
        b = soup.b
        self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2)
        self.assertEquals(b.findParent('ul')['a'], 'b')

    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 2})
        self.assertEquals(b.findNext('b')['id'], '3')
        self.assertEquals(b.findNext('b')['id'], '3')
        self.assertEquals(len(b.findAllNext('b')), 2)
        self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1)

    def testPrevious(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 3})
        self.assertEquals(b.findPrevious('b')['id'], '2')
        self.assertEquals(b.findPrevious('b')['id'], '2')
        self.assertEquals(len(b.findAllPrevious('b')), 2)
        self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1)


    SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

    def testNextSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 2})
        self.assertEquals(b.findNext(tag)['id'], '2.1')
        self.assertEquals(b.findNextSibling(tag)['id'], '3')
        self.assertEquals(b.findNextSibling(tag)['id'], '3')
        self.assertEquals(len(b.findNextSiblings(tag)), 2)
        self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1)

    def testPreviousSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 3})
        self.assertEquals(b.findPrevious(tag)['id'], '2.1')
        self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
        self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
        self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
        self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1)

    def testTextNavigation(self):
        soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
        baz = soup.find(text='Baz')
        self.assertEquals(baz.findParent("i")['id'], '1')
        self.assertEquals(baz.findNext(text='Blee'), 'Blee')
        self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee')
        self.assertEquals(baz.findNextSibling(text='Blargh'), None)
        self.assertEquals(baz.findNextSibling('hr')['id'], '1')

class SiblingRivalry(SoupTest):
    "Tests the nextSibling and previousSibling navigation."

    def testSiblings(self):
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        secondLI = soup.find('li').nextSibling
        self.assert_(secondLI.name == 'li' and secondLI.string == '2')
        self.assertEquals(soup.find(text='1').nextSibling.name, 'p')
        self.assertEquals(soup.find('p').nextSibling, 'B')
        self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')

class TagsAreObjectsToo(SoupTest):
    "Tests the various built-in functions of Tag objects."

    def testLen(self):
        soup = BeautifulSoup("<top>1<b>2</b>3</top>")
        self.assertEquals(len(soup.top), 3)

class StringEmUp(SoupTest):
    "Tests the use of 'string' as an alias for a tag's only content."

    def testString(self):
        s = BeautifulSoup("<b>foo</b>")
        self.assertEquals(s.b.string, 'foo')

    def testLackOfString(self):
        s = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assert_(not s.b.string)

class ThatsMyLimit(SoupTest):
    "Tests the limit argument."

    def testBasicLimits(self):
        s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        self.assertEquals(len(s.findAll('br')), 4)
        self.assertEquals(len(s.findAll('br', limit=2)), 2)
        self.assertEquals(len(s('br', limit=2)), 2)

class OnlyTheLonely(SoupTest):
    "Tests the parseOnly argument to the constructor."
    def setUp(self):
        x = []
        for i in range(1,6):
            x.append('<a id="%s">' % i)
            for j in range(100,103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        strainer = SoupStrainer("b")
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 15)

        strainer = SoupStrainer(id=re.compile("100.*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 5)

        strainer = SoupStrainer(text=re.compile("10[01].*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 10)

        strainer = SoupStrainer(text=lambda(x):x[8]=='3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 3)

class PickleMeThis(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""

        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        import pickle
        dumped = pickle.dumps(self.soup, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), self.soup.decode())

    def testDeepcopy(self):
        from copy import deepcopy
        deepcopy(BeautifulSoup("<a></a>"))
        copied = deepcopy(self.soup)
        self.assertEqual(copied.decode(), self.soup.decode())

    def testUnicodePickle(self):
        import cPickle as pickle
        html = "<b>" + chr(0xc3) + "</b>"
        soup = BeautifulSoup(html)
        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.decode(), soup.decode())


class WriteOnlyCode(SoupTest):
    "Testing the modification of the tree."

    def testModifyAttributes(self):
        soup = BeautifulSoup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), '<a id="2"></a>')
        del(soup.a['id'])
        self.assertEqual(soup.decode(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        a['href'] = 'http://foo.com/'
        self.assertRaises(KeyError, lambda : ol['href'])

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEquals(soup.decode(), text)

        # A very simple case
        soup = BeautifulSoup("<b>Argh!</b>")
        soup.find(text="Argh!").replaceWith("Hooray!")
        newText = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # A more complex case
        soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        soup.b.insert(1, "Hooray!")
        newText = soup.find(text="Hooray!")
        self.assertEqual(newText.previous, "Argh!")
        self.assertEqual(newText.previous.next, newText)

        self.assertEqual(newText.previousSibling, "Argh!")
        self.assertEqual(newText.previousSibling.nextSibling, newText)

        self.assertEqual(newText.nextSibling, None)
        self.assertEqual(newText.next, soup.c)

        text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        soup = BeautifulSoup(text)
        no, show = soup.findAll('b')
        show.replaceWith(no)
        self.assertEquals(soup.decode(), "<html>There's  business like <b>no</b> business</html>")

        # Even more complex
        soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        tag = Tag(soup, 'magictag')
        tag.insert(0, "the")
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text="Find")

        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)

        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Aand... incredibly complex.
        soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        f = soup.f
        a = soup.a
        c = soup.c
        e = soup.e
        weText = a.find(text="We")
        soup.b.replaceWith(soup.f)
        self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testAppend(self):
       doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
       soup = BeautifulSoup(doc)
       second_para = soup('p')[1]
       bold = soup.find('b')
       soup('p')[1].append(soup.find('b'))
       self.assertEqual(bold.parent, second_para)
       self.assertEqual(soup.decode(),
                        "<p>Don't leave me .</p> "
                        "<p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        extracted = soup.find("div", id="nav").extract()
        self.assertEqual(soup.decode(), "<html>Real content here.</html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # A simple case, a more complex test.
        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        numbers, roman, letters = soup("a")

        self.assertEqual(roman.parent, doc)
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(endOfThisTag, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        roman.extract()
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # A more complex case.
        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        soup = BeautifulStoneSoup(text)
        one = soup.find(text="1")
        three = soup.find(text="3")
        toExtract = soup.b
        soup.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)

class TheManWithoutAttributes(SoupTest):
    "Test attribute access"

    def testHasKey(self):
        text = "<foo attr='bar'>"
        self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))

class QuoteMeOnThat(SoupTest):
    "Test quoting"
    def testQuotedAttributeValues(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.decode(), text)

        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.decode(), newText)

        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up &amp; stuff"></this>')



class YoureSoLiteral(SoupTest):
    "Test literal mode."
    def testLiteralMode(self):
        text = "<script>if (i<imgs.length)</script><b>Foo</b>"
        soup = BeautifulSoup(text)
        self.assertEqual(soup.script.contents[0], "if (i<imgs.length)")
        self.assertEqual(soup.b.contents[0], "Foo")

    def testTextArea(self):
        text = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        soup = BeautifulSoup(text)
        self.assertEqual(soup.textarea.contents[0],
                         "<b>This is an example of an HTML tag</b><&<&")

class OperatorOverload(SoupTest):
    "Our operators do it all! Call now!"

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        self.assert_(not soup.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)

class NestableEgg(SoupTest):
    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""

    def testParaInsideBlockquote(self):
        soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        self.assertEqual(soup.blockquote.p.b.string, 'Foo')
        self.assertEqual(soup.blockquote.b.string, 'Foo')
        self.assertEqual(soup.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        text = """<table id="1"><tr><td>Here's another table:
        <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.table.table.td.string, 'Juicy text')
        self.assertEquals(len(soup.findAll('table')), 2)
        self.assertEquals(len(soup.table.findAll('table')), 1)
        self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name,
                          'table')

        text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
        soup = BeautifulSoup(text)
        self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo")

        text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
        <tfoot><tr>Baz</tr></tfoot></table>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.table.thead.tr.contents[0], "Foo")

    def testBadNestedTables(self):
        soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
        self.assertEquals(soup.table.tr.table.tr['id'], 'nested')

class CleanupOnAisleFour(SoupTest):
    """Here we test cleanup of text that breaks HTMLParser or is just
    obnoxious."""

    def testSelfClosingtag(self):
        self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        text = "<item><link>http://foo.com/</link></item>"
        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
        self.assertEqual(BeautifulSoup(text).decode(),
                         '<item><link />http://foo.com/</item>')

    def testBooleanAttributes(self):
        text = "<td nowrap>foo</td>"
        self.assertSoupEquals(text, text)

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        r = re.compile(".*foo.*bar")
        soup = BeautifulSoup(xml)
        text = "DOCTYPE foobar"
        self.assertEquals(soup.find(text=r).string, text)
        self.assertEquals(soup.find(text=text).__class__, Declaration)

        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEquals(soup.contents[0],
                          'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEquals(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        text = "&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;"
        soup = BeautifulStoneSoup(text)
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(soup.decode(), u"<<sacr\xe9 bleu!>>")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        text = "&lt;&trade;&apos;"
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), u"<&trade;'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(soup.decode(), u"<\u2122&apos;")

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEquals(soup.decode(), u"<\u2122'")

    def testNonBreakingSpaces(self):
        soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c')

    def testEntityReplacement(self):
        self.assertSoupEquals('<b>hello&nbsp;there</b>')

    def testEntitiesInAttributeValues(self):
        self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')
        self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')

        soup = BeautifulSoup('<x t="&gt;&trade;">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u'<x t="&gt;\u2122"></x>')

        uri = "http://crummy.com?sacr&eacute;&amp;bleu"
        link = '<a href="%s"></a>' % uri

        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(),
                          link.replace("&eacute;", u"\xe9"))

        uri = "http://crummy.com?sacr&eacute;&bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.a['href'],
                          uri.replace("&eacute;", u"\xe9"))

    def testNakedAmpersands(self):
        html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
        soup = BeautifulStoneSoup("AT&T ", **html)
        self.assertEquals(soup.decode(), 'AT&amp;T ')

        nakedAmpersandInASentence = "AT&T was Ma Bell"
        soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
        self.assertEquals(soup.decode(), \
               nakedAmpersandInASentence.replace('&','&amp;'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&','&amp;')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEquals(soup.decode(), validURL)

        soup = BeautifulStoneSoup(validURL)
        self.assertEquals(soup.decode(), validURL)


class EncodeRed(SoupTest):
    """Tests encoding conversion, Unicode conversion, and Microsoft
    smart quote fixes."""

    def testUnicodeDammitStandalone(self):
        markup = "<foo>\x92</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")

        hebrew = "\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
        self.assertEquals(dammit.originalEncoding, 'iso-8859-8')

    def testGarbageInGarbageOut(self):
        ascii = "<foo>a</foo>"
        asciiSoup = BeautifulStoneSoup(ascii)
        self.assertEquals(ascii, asciiSoup.decode())

        unicodeData = u"<foo>\u00FC</foo>"
        utf8 = unicodeData.encode("utf-8")
        self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')

        unicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEquals(unicodeData, unicodeSoup.decode())
        self.assertEquals(unicodeSoup.foo.string, u'\u00FC')

        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
        self.assertEquals(utf8, utf8Soup.encode('utf-8'))
        self.assertEquals(utf8Soup.originalEncoding, "utf-8")

        utf8Soup = BeautifulStoneSoup(unicodeData)
        self.assertEquals(utf8, utf8Soup.encode('utf-8'))
        self.assertEquals(utf8Soup.originalEncoding, None)


    def testHandleInvalidCodec(self):
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            soup = BeautifulSoup(u"Rksmrgs".encode("utf-8"),
                                 fromEncoding=bad_encoding)
            self.assertEquals(soup.originalEncoding, 'utf-8')

    def testUnicodeSearch(self):
        html = u'<html><body><h1>Rksmrgs</h1></body></html>'
        soup = BeautifulSoup(html)
        self.assertEqual(soup.find(text=u'Rksmrgs'),u'Rksmrgs')

    def testRewrittenXMLHeader(self):
        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
        soup = BeautifulStoneSoup(euc_jp)
        if soup.originalEncoding != "euc-jp":
            raise Exception("Test failed when parsing euc-jp document. "
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it.")

        self.assertEquals(soup.originalEncoding, "euc-jp")
        self.assertEquals(soup.renderContents('utf-8'), utf8)

        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
        self.assertSoupEquals(old_text, new_text)

    def testRewrittenMetaTag(self):
        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
        soup = BeautifulSoup(no_shift_jis_html)

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
        self.assertEquals(soup.contents[0].name, 'pre')

        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type" />')
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>') % meta_tag
        soup = BeautifulSoup(shift_jis_html)
        if soup.originalEncoding != "shift-jis":
            raise Exception("Test failed when parsing shift-jis document "
                            "with meta tag '%s'."
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it." % meta_tag)
        self.assertEquals(soup.originalEncoding, "shift-jis")

        content_type_tag = soup.meta['content']
        self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
                          'charset=%SOUP-ENCODING%')
        content_type = str(soup.meta)
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=utf8')+1],
                         'charset=utf-8')
        content_type = soup.meta.encode('shift-jis')
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=shift-jis')],
                         'charset=shift-jis'.encode())

        self.assertEquals(soup.encode('utf-8'), (
                '<html><head>\n'
                '<meta content="text/html; charset=utf-8" '
                'http-equiv="Content-type" />\n'
                '<meta http-equiv="Content-language" content="ja" />'
                '</head><body><pre>\n'
                '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
                '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
                '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
                '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
                '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
                '</pre></body></html>'))
        self.assertEquals(soup.encode("shift-jis"),
                          shift_jis_html.replace('x-sjis'.encode(),
                                                 'shift-jis'.encode()))

        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
        soup = BeautifulSoup(isolatin)

        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
        utf8 = utf8.replace("\xe9", "\xc3\xa9")
        self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')

    def testHebrew(self):
        iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
        self.assertEquals(soup.encode('utf-8'), utf8)

    def testSmartQuotesNotSoSmartAnymore(self):
        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
                              '&lsquo;Foo&rsquo; <!--blah-->')

    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
        smartQuotes = "Il a dit, \x8BSacr&eacute; bl&#101;u!\x9b"
        soup = BeautifulSoup(smartQuotes)
        self.assertEquals(soup.decode(),
                          'Il a dit, &lsaquo;Sacr&eacute; bl&#101;u!&rsaquo;')
        soup = BeautifulSoup(smartQuotes, convertEntities="html")
        self.assertEquals(soup.encode('utf-8'),
                          'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')

    def testDontSeeSmartQuotesWhereThereAreNone(self):
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        self.assertSoupEquals(utf_8, encoding='utf-8')


class Whitewash(SoupTest):
    """Test whitespace preservation."""

    def testPreservedWhitespace(self):
        self.assertSoupEquals("<pre>   </pre>")
        self.assertSoupEquals("<pre> woo  </pre>")

    def testCollapsedWhitespace(self):
        self.assertSoupEquals("<p>   </p>", "<p> </p>")


if __name__ == '__main__':
    unittest.main()
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.