import HTMLParser
import urllib
import sys
class parseAttrs(HTMLParser.HTMLParser):
    """Re-serialise parsed HTML with every attribute value double-quoted.

    Each handler turns the event it receives back into markup and appends it
    to ``self.pieces``; ``parsed()`` joins the pieces into the output text.
    """

    def reset(self):
        """Reset the parser state and discard any collected output.

        The base-class constructor calls ``reset()``, so ``pieces`` exists on
        every instance even when ``init_parser()`` is never called.
        """
        HTMLParser.HTMLParser.reset(self)
        self.pieces = []

    def init_parser(self):
        """Start a fresh, empty output buffer (kept for existing callers)."""
        self.pieces = []

    def _formatStartTag(self, tag, attrs, selfClosing=False):
        """Return the markup for an opening tag.

        ``attrs`` is the list of ``(name, value)`` pairs given to
        ``handle_starttag``; ``selfClosing`` selects the ``<tag ... />`` form.
        """
        parts = [tag]
        for name, value in attrs:
            if value is None:
                # Attribute written without a value, e.g. <input disabled>.
                parts.append(name)
            else:
                # The parser delivers values with quotes removed and
                # references decoded, so '&' and '"' have to be escaped again
                # to keep the double-quoted output well formed.
                escaped = value.replace("&", "&amp;").replace('"', "&quot;")
                parts.append('%s="%s"' % (name, escaped))
        # Joining avoids the stray space before '>' when there are no
        # attributes or after the last one.
        return "<%s%s>" % (" ".join(parts), " /" if selfClosing else "")

    def handle_starttag(self, tag, attrs):
        """Emit an opening tag with normalised attributes."""
        self.pieces.append(self._formatStartTag(tag, attrs))

    def handle_startendtag(self, tag, attrs):
        """Emit ``<tag ... />`` as one piece instead of an open/close pair."""
        self.pieces.append(self._formatStartTag(tag, attrs, True))

    def handle_charref(self, name):
        """Emit a numeric character reference such as ``&#62;``."""
        self.pieces.append("&#%s;" % (name))

    def handle_endtag(self, tag):
        """Emit a closing tag."""
        self.pieces.append("</%s>" % (tag))

    def handle_entityref(self, ref):
        """Emit a named entity reference such as ``&amp;``."""
        # The terminating ';' is part of the reference and must be restored.
        self.pieces.append("&%s;" % (ref))

    def handle_data(self, text):
        """Emit text content unchanged."""
        self.pieces.append(text)

    def handle_comment(self, text):
        """Emit a comment wrapped in ``<!--`` and ``-->``."""
        self.pieces.append("<!--%s-->" % (text))

    def handle_pi(self, text):
        """Emit a processing instruction wrapped in ``<?`` and ``>``."""
        self.pieces.append("<?%s>" % (text))

    def handle_decl(self, text):
        """Emit a declaration (e.g. a doctype) wrapped in ``<!`` and ``>``."""
        self.pieces.append("<!%s>" % (text))

    def parsed(self):
        """Return everything emitted so far as a single string."""
        return "".join(self.pieces)
# Demo driver: print an HTML file, then print it again after a round trip
# through parseAttrs so the two versions can be compared.
# The file defaults to test2.html; an optional first command-line argument
# overrides it.
sourcePath = sys.argv[1] if len(sys.argv) > 1 else "test2.html"

# Read the file once so the echoed source and the parsed input are guaranteed
# to be the same text, and so the handle is closed promptly.
with open(sourcePath) as sourceFile:
    sourceHtml = sourceFile.read()

attrParser = parseAttrs()
attrParser.init_parser()
attrParser.feed(sourceHtml)
# close() makes the parser process any input it is still buffering, so it has
# to run before the result is read back.
attrParser.close()

# Single-argument print(...) produces the same output with the print
# statement and the print function.
print(sourceHtml)
print(attrParser.parsed())