# A parser for HTML documents
# HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
# describe hypertext documents
#
# SGML: Standard Generalized Markup Language
#
# WWW: World-Wide Web; a distributed hypertext system developed at CERN
#
# CERN: European Particle Physics Laboratory in Geneva, Switzerland
# This file is only concerned with parsing and formatting HTML
# documents, not with the other (hypertext and networking) aspects of
# the WWW project. (It does support highlighting of anchors.)
import os
import sys
import regex
import string
import sgmllib
class HTMLParser(sgmllib.SGMLParser):
# Copy base class entities and add some
entitydefs = {}
for key in sgmllib.SGMLParser.entitydefs.keys():
entitydefs[key] = sgmllib.SGMLParser.entitydefs[key]
entitydefs['bullet'] = '*'
# Provided -- handlers for tags introducing literal text
def start_listing(self, attrs):
self.setliteral('listing')
self.literal_bgn('listing', attrs)
def end_listing(self):
self.literal_end('listing')
def start_xmp(self, attrs):
self.setliteral('xmp')
self.literal_bgn('xmp', attrs)
def end_xmp(self):
self.literal_end('xmp')
def do_plaintext(self, attrs):
self.setnomoretags()
self.literal_bgn('plaintext', attrs)
# To be overridden -- begin/end literal mode
def literal_bgn(self, tag, attrs): pass
def literal_end(self, tag): pass
# Next level of sophistication -- collect anchors, title, nextid and isindex
class CollectingParser(HTMLParser):
    """HTML parser that collects anchors, title, nextid and isindex.

    Attributes initialised by __init__ and filled in by the handlers:

    title       -- text collected between <title> and </title>
    nextid      -- attribute list of the last <nextid> tag ('' if none)
    isindex     -- 1 once an <isindex> tag has been seen, else 0
    anchors     -- href of each recorded anchor ('' when it has none)
    anchornames -- name of each recorded anchor ('' when it has none)
    anchortypes -- lower-cased type of each recorded anchor ('' if none)
    inanchor    -- 0 outside an anchor; inside one, its 1-based position
                   in the lists above, negated when the anchor has no href
    savetext    -- None, or the title text accumulated so far
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.savetext = None
        self.nextid = ''
        self.isindex = 0
        self.title = ''
        self.inanchor = 0
        self.anchors = []
        self.anchornames = []
        self.anchortypes = []

    def start_a(self, attrs):
        """Record an <a> tag that carries an href and/or a name.

        attrs is a sequence of (attrname, value) pairs.  Anchors with
        neither href nor name are ignored and leave inanchor at 0.
        """
        self.inanchor = 0
        href = ''
        name = ''
        anchortype = ''
        for attrname, value in attrs:
            # Attribute names are compared without a trailing '=', the
            # same way 'href' is; the former 'name=' and 'type=' tests
            # could never match, so names and types were never recorded.
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchortype = string.lower(value)
        if not (href or name):
            return
        self.anchors.append(href)
        self.anchornames.append(name)
        self.anchortypes.append(anchortype)
        self.inanchor = len(self.anchors)
        if not href:
            # A negative index marks a name-only anchor
            self.inanchor = -self.inanchor

    def end_a(self):
        """Close the current anchor, emitting a '[n]' reference marker.

        The marker is only produced for anchors that have an href, and
        not for those whose href starts with '#'.
        """
        if self.inanchor > 0:
            # Don't show anchors pointing into the current document
            if self.anchors[self.inanchor-1][:1] != '#':
                self.handle_data('[' + repr(self.inanchor) + ']')
        self.inanchor = 0

    def start_header(self, attrs):
        """Accept <header> without doing anything."""
        pass

    def end_header(self):
        """Accept </header> without doing anything."""
        pass

    # (head is the same as header)
    def start_head(self, attrs):
        """Accept <head> without doing anything."""
        pass

    def end_head(self):
        """Accept </head> without doing anything."""
        pass

    def start_body(self, attrs):
        """Accept <body> without doing anything."""
        pass

    def end_body(self):
        """Accept </body> without doing anything."""
        pass

    def do_nextid(self, attrs):
        """Remember the attribute list of a <nextid> tag."""
        self.nextid = attrs

    def do_isindex(self, attrs):
        """Note that the document contains an <isindex> tag."""
        self.isindex = 1

    def start_title(self, attrs):
        """Begin collecting title text."""
        self.savetext = ''

    def end_title(self):
        """Store the collected text as the title and stop collecting."""
        if self.savetext is not None:
            self.title = self.savetext
            self.savetext = None

    def handle_data(self, text):
        """Append text to the title being collected; otherwise drop it."""
        if self.savetext is not None:
            self.savetext = self.savetext + text
# Formatting parser -- takes a formatter and a style sheet as arguments
# XXX The use of style sheets should change: for each tag and end tag
# there should be a style definition, and a style definition should
# encompass many more parameters: font, justification, indentation,
# vspace before, vspace after, hanging tag...
# Matches a (possibly empty) run of characters other than space, tab
# and newline
wordprog = regex.compile('[^ \t\n]*')
# Matches a (possibly empty) run of spaces, tabs and newlines
spaceprog = regex.compile('[ \t\n]*')
class FormattingParser(CollectingParser):
    """HTML parser that drives a formatter according to a style sheet.

    formatter  -- object receiving the output; the methods called on it
                  here are setfont, setleftindent, setjust, needvspace,
                  addword, flush and hrule, and its 'nospace' attribute
                  is assigned
    stylesheet -- object supplying the font sets (stdfontset, h1fontset,
                  h2fontset, h3fontset) and the indents (stdindent,
                  h1indent, h2indent, ulindent, ddindent, literalindent)
    """

    def __init__(self, formatter, stylesheet):
        CollectingParser.__init__(self)
        self.fmt = formatter
        self.stl = stylesheet
        self.savetext = None
        self.compact = 0    # set inside compact lists and <address>
        self.nofill = 0     # nesting depth of <pre>/<typewriter>
        self.resetfont()
        self.setindent(self.stl.stdindent)

    # Font and style bookkeeping

    def resetfont(self):
        """Empty both stacks and select the standard roman font."""
        self.fontstack = []
        self.stylestack = []
        self.fontset = self.stl.stdfontset
        self.style = ROMAN
        self.passfont()

    def passfont(self):
        """Pass the font for the current font set and style to the formatter."""
        font = self.fontset[self.style]
        self.fmt.setfont(font)

    def pushstyle(self, style):
        """Save the current style and switch to the given one.

        A style index beyond the end of the current font set is clamped
        to the last entry of that set.
        """
        self.stylestack.append(self.style)
        self.style = min(style, len(self.fontset)-1)
        self.passfont()

    def popstyle(self):
        """Restore the style saved by the matching pushstyle()."""
        self.style = self.stylestack[-1]
        del self.stylestack[-1]
        self.passfont()

    def pushfontset(self, fontset, style):
        """Save the current font set, switch to fontset, then push style."""
        self.fontstack.append(self.fontset)
        self.fontset = fontset
        self.pushstyle(style)

    def popfontset(self):
        """Restore the font set and style saved by pushfontset()."""
        self.fontset = self.fontstack[-1]
        del self.fontstack[-1]
        self.popstyle()

    # Thin wrappers around the formatter

    def flush(self):
        """Flush the formatter."""
        self.fmt.flush()

    def setindent(self, n):
        """Set the formatter's left indent to n."""
        self.fmt.setleftindent(n)

    def needvspace(self, n):
        """Ask the formatter for n units of vertical space."""
        self.fmt.needvspace(n)

    def close(self):
        """Finish parsing and flush the formatter."""
        CollectingParser.close(self)
        self.fmt.flush()

    # Text handling

    def handle_literal(self, text):
        """Send literal text to the formatter, one line per flush.

        Tabs are expanded (tab size 8) on every line but the first
        (presumably because the first piece may continue a line whose
        column is not known here).  The piece after the last newline is
        added without flushing.
        """
        lines = string.splitfields(text, '\n')
        for i in range(1, len(lines)):
            lines[i] = string.expandtabs(lines[i], 8)
        for line in lines[:-1]:
            self.fmt.addword(line, 0)
            self.fmt.flush()
            self.fmt.nospace = 0
        for line in lines[-1:]:
            self.fmt.addword(line, 0)

    def handle_data(self, text):
        """Dispatch text to the title buffer, literal output or word fill.

        In the normal case the text is split into words, each passed to
        the formatter together with the amount of white space following
        it.  While nofill is set, a newline in that white space forces a
        flush and scanning resumes just after the first such newline.
        """
        if self.savetext is not None:
            self.savetext = self.savetext + text
            return
        if self.literal:
            self.handle_literal(text)
            return
        i = 0
        n = len(text)
        while i < n:
            j = i + wordprog.match(text, i)
            word = text[i:j]
            i = j + spaceprog.match(text, j)
            self.fmt.addword(word, i-j)
            if self.nofill and '\n' in text[j:i]:
                self.fmt.flush()
                self.fmt.nospace = 0
                # Back up to just past the first newline so that white
                # space following it is seen by the next iteration.
                i = j+1
                while text[i-1] != '\n':
                    i = i+1

    def literal_bgn(self, tag, attrs):
        """Start a literal section: fixed font at the literal indent."""
        if tag == 'plaintext':
            self.flush()
        else:
            self.needvspace(1)
        self.pushfontset(self.stl.stdfontset, FIXED)
        self.setindent(self.stl.literalindent)

    def literal_end(self, tag):
        """End a literal section, restoring font and standard indent."""
        self.needvspace(1)
        self.popfontset()
        self.setindent(self.stl.stdindent)

    def start_title(self, attrs):
        """Flush pending output, then begin collecting title text."""
        self.flush()
        self.savetext = ''

    # NB end_title is unchanged

    # Paragraphs and rules

    def do_p(self, attrs):
        """Start a paragraph: a flush in compact mode, else vertical space."""
        if self.compact:
            self.flush()
        else:
            self.needvspace(1)

    def do_hr(self, attrs):
        """Draw a horizontal rule."""
        self.fmt.hrule()

    # Headings

    def start_h1(self, attrs):
        """Start a level 1 heading: centered, bold, h1 font set."""
        self.needvspace(2)
        self.setindent(self.stl.h1indent)
        self.pushfontset(self.stl.h1fontset, BOLD)
        self.fmt.setjust('c')

    def end_h1(self):
        """End a level 1 heading and return to left justification."""
        self.popfontset()
        self.needvspace(2)
        self.setindent(self.stl.stdindent)
        self.fmt.setjust('l')

    def start_h2(self, attrs):
        """Start a level 2 heading: bold, h2 font set and indent."""
        self.needvspace(1)
        self.setindent(self.stl.h2indent)
        self.pushfontset(self.stl.h2fontset, BOLD)

    def end_h2(self):
        """End a level 2 heading."""
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def start_h3(self, attrs):
        """Start a level 3 heading: bold, h3 font set, standard indent."""
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.pushfontset(self.stl.h3fontset, BOLD)

    def end_h3(self):
        """End a level 3 heading."""
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def start_h4(self, attrs):
        """Start a level 4 heading: bold in the standard font set."""
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.pushfontset(self.stl.stdfontset, BOLD)

    def end_h4(self):
        """End a level 4 heading."""
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    # Deeper headings are formatted like level 4
    start_h5 = start_h4
    end_h5 = end_h4
    start_h6 = start_h5
    end_h6 = end_h5
    start_h7 = start_h6
    end_h7 = end_h6

    # Lists

    def start_ul(self, attrs):
        """Start a list; a 'compact' attribute selects indent 0."""
        self.needvspace(1)
        for attrname, value in attrs:
            if attrname == 'compact':
                self.compact = 1
                self.setindent(0)
                break
        else:
            # No 'compact' attribute was found
            self.setindent(self.stl.ulindent)

    start_dir = start_menu = start_ol = start_ul

    do_li = do_p

    def end_ul(self):
        """End a list, leaving compact mode."""
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    end_dir = end_menu = end_ol = end_ul

    def start_dl(self, attrs):
        """Start a definition list; 'compact' turns on compact mode."""
        for attrname, value in attrs:
            if attrname == 'compact':
                self.compact = 1
        self.needvspace(1)

    def end_dl(self):
        """End a definition list, leaving compact mode."""
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def do_dt(self, attrs):
        """Start a definition term at the standard indent."""
        if self.compact:
            self.flush()
        else:
            self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def do_dd(self, attrs):
        """Start a definition body at the dd indent."""
        self.fmt.addword('', 1)
        self.setindent(self.stl.ddindent)

    # Other block-level elements

    def start_address(self, attrs):
        """Start an address: compact mode, right justified."""
        self.compact = 1
        self.needvspace(1)
        self.fmt.setjust('r')

    def end_address(self):
        """End an address, restoring left justification."""
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.fmt.setjust('l')

    def start_pre(self, attrs):
        """Start preformatted text: fixed style, one more nofill level."""
        self.needvspace(1)
        self.nofill = self.nofill + 1
        self.pushstyle(FIXED)

    def end_pre(self):
        """End preformatted text."""
        self.popstyle()
        self.nofill = self.nofill - 1
        self.needvspace(1)

    start_typewriter = start_pre
    end_typewriter = end_pre

    def do_img(self, attrs):
        """Replace an image by the placeholder word '(image)'."""
        self.fmt.addword('(image)', 0)

    # Physical styles

    def start_tt(self, attrs):
        self.pushstyle(FIXED)

    def end_tt(self):
        self.popstyle()

    def start_b(self, attrs):
        self.pushstyle(BOLD)

    def end_b(self):
        self.popstyle()

    def start_i(self, attrs):
        self.pushstyle(ITALIC)

    def end_i(self):
        self.popstyle()

    def start_u(self, attrs):
        self.pushstyle(ITALIC)  # Underline???

    def end_u(self):
        self.popstyle()

    def start_r(self, attrs):
        self.pushstyle(ROMAN)  # Not official

    def end_r(self):
        self.popstyle()

    # Logical styles

    start_em = start_i
    end_em = end_i
    start_strong = start_b
    end_strong = end_b
    start_code = start_tt
    end_code = end_tt
    start_samp = start_tt
    end_samp = end_tt
    start_kbd = start_tt
    end_kbd = end_tt
    start_file = start_tt  # unofficial
    end_file = end_tt
    start_var = start_i
    end_var = end_i
    start_dfn = start_i
    end_dfn = end_i
    start_cite = start_i
    end_cite = end_i
    start_hp1 = start_i
    # Was bound to start_i, which expects an attrs argument and pushes
    # a style instead of popping the one pushed by start_hp1.
    end_hp1 = end_i
    start_hp2 = start_b
    end_hp2 = end_b

    # Unknown tags are reported on standard output

    def unknown_starttag(self, tag, attrs):
        """Report a start tag for which no handler exists."""
        print('*** unknown <' + tag + '>')

    def unknown_endtag(self, tag):
        """Report an end tag for which no handler exists."""
        print('*** unknown </' + tag + '>')
# An extension of the formatting parser which formats anchors differently.
class AnchoringParser(FormattingParser):
    """Formatting parser that brackets anchors with formatter calls.

    Instead of the inherited end-of-anchor handling, the formatter's
    bgn_anchor() and end_anchor() are called with the anchor's inanchor
    value around the anchor text.
    """

    def start_a(self, attrs):
        """Record the anchor, then tell the formatter if one was recorded."""
        FormattingParser.start_a(self, attrs)
        anchor = self.inanchor
        if anchor:
            self.fmt.bgn_anchor(anchor)

    def end_a(self):
        """Tell the formatter the current anchor (if any) has ended."""
        anchor = self.inanchor
        if anchor:
            self.fmt.end_anchor(anchor)
        self.inanchor = 0
# Style sheet -- this is never instantiated, but the attributes
# of the class object itself are used to specify fonts to be used
# for various paragraph styles.
# A font set is a non-empty list of fonts, in the order:
# [roman, italic, bold, fixed].
# When a style is not available the nearest lower style is used
# Style numbers; each is used as an index into a font set
ROMAN = 0
ITALIC = 1
BOLD = 2
FIXED = 3
class NullStylesheet:
    """Style sheet without real fonts; every font set holds only None.

    The indent values defined here are inherited by the other style
    sheets unless they override them.
    """

    # Fonts -- none (each set is its own one-element list)
    stdfontset = [None]
    h1fontset = [None]
    h2fontset = [None]
    h3fontset = [None]

    # Indents
    stdindent = 2       # ordinary text
    ddindent = 25       # definition bodies (<dd>)
    ulindent = 4        # non-compact lists
    h1indent = 0        # level 1 headings
    h2indent = 0        # level 2 headings
    literalindent = 0   # literal sections
class X11Stylesheet(NullStylesheet):
    """Style sheet whose fonts are X11 font name patterns.

    Only the standard set has a fourth (fixed, Courier) entry; the
    heading sets list roman, italic and bold only.
    """

    stdfontset = [
        '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
    ]

    h1fontset = [
        '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
    ]

    h2fontset = [
        '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
    ]

    h3fontset = [
        '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
    ]

    # Wider than the inherited indent for definition bodies
    ddindent = 40
class MacStylesheet(NullStylesheet):
    """Style sheet whose fonts are (name, style, size) tuples.

    Each set lists roman, italic, bold and fixed, using Geneva for the
    first three and Monaco for the fixed font.
    """

    stdfontset = [
        ('Geneva', 'p', 10),
        ('Geneva', 'i', 10),
        ('Geneva', 'b', 10),
        ('Monaco', 'p', 10),
    ]

    h1fontset = [
        ('Geneva', 'p', 18),
        ('Geneva', 'i', 18),
        ('Geneva', 'b', 18),
        ('Monaco', 'p', 18),
    ]

    # This 14-point set used to be assigned to h3fontset as well, where
    # the 12-point set below overwrote it, leaving h2fontset at the
    # inherited [None].
    h2fontset = [
        ('Geneva', 'p', 14),
        ('Geneva', 'i', 14),
        ('Geneva', 'b', 14),
        ('Monaco', 'p', 14),
    ]

    h3fontset = [
        ('Geneva', 'p', 12),
        ('Geneva', 'i', 12),
        ('Geneva', 'b', 12),
        ('Monaco', 'p', 12),
    ]
# Style sheet for stdwin: the Mac fonts when running on a Mac, the X11
# fonts everywhere else
StdwinStylesheet = X11Stylesheet
if os.name == 'mac':
    StdwinStylesheet = MacStylesheet
class GLStylesheet(NullStylesheet):
    """Style sheet whose fonts are 'Name size' strings.

    Each set lists roman, italic, bold and fixed, using Helvetica for
    the first three and Courier for the fixed font.
    """

    stdfontset = [
        'Helvetica 10',
        'Helvetica-Italic 10',
        'Helvetica-Bold 10',
        'Courier 10',
    ]

    h1fontset = [
        'Helvetica 18',
        'Helvetica-Italic 18',
        'Helvetica-Bold 18',
        'Courier 18',
    ]

    h2fontset = [
        'Helvetica 14',
        'Helvetica-Italic 14',
        'Helvetica-Bold 14',
        'Courier 14',
    ]

    h3fontset = [
        'Helvetica 12',
        'Helvetica-Italic 12',
        'Helvetica-Bold 12',
        'Courier 12',
    ]
# Test program -- produces no output but times how long it takes
# to send a document to a null formatter, exclusive of I/O
def test():
    """Format a document to standard output and report the time taken.

    The document is named by the first command-line argument and
    defaults to 'test.html'.  It is read with urllib before the clock
    is started, so the reported time excludes reading the input.
    """
    import fmt
    import time
    import urllib
    if sys.argv[1:]:
        source = sys.argv[1]
    else:
        source = 'test.html'
    data = urllib.urlopen(source).read()
    started = time.time()
    formatter = fmt.WritingFormatter(sys.stdout, 79)
    parser = FormattingParser(formatter, NullStylesheet)
    parser.feed(data)
    parser.close()
    finished = time.time()
    elapsed = round(finished - started, 3)
    # An empty line, then the timing report
    sys.stdout.write('\n')
    sys.stdout.write('*** Formatting time: ' + str(elapsed) + ' seconds.\n')
# Test program using stdwin
def testStdwin():
    """Display a document in a stdwin window until the window is closed.

    The document is named by the first command-line argument and
    defaults to 'test.html'.  It is formatted on the first draw event;
    later draw events only redraw the saved result.
    """
    import stdwin
    import fmt
    import stdwinevents
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    data = open(filename, 'r').read()
    window = stdwin.open('testStdwin')
    backend = None
    while 1:
        etype, ewin, edetail = stdwin.getevent()
        if etype == stdwinevents.WE_CLOSE:
            break
        elif etype == stdwinevents.WE_SIZE:
            window.setdocsize(0, 0)
            window.setorigin(0, 0)
            window.change((0, 0), (10000, 30000))  # XXX
        elif etype == stdwinevents.WE_DRAW:
            if not backend:
                backend = fmt.StdwinBackEnd(window, 1)
                formatter = fmt.BaseFormatter(backend.d, backend)
                # Use the style sheet selected for this platform; this
                # was hard-coded to MacStylesheet, whose fonts only
                # suit the Mac.
                parser = FormattingParser(formatter, StdwinStylesheet)
                parser.feed(data)
                parser.close()
                backend.finish()
            else:
                backend.redraw(edetail)
    window.close()
# Test program using GL
def testGL():
    """Display a document in a 600x600 GL window for five seconds.

    The document is named by the first command-line argument and
    defaults to 'test.html'.
    """
    import gl
    import GL
    import fmt
    import time
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    data = open(filename, 'r').read()
    width = 600
    height = 600
    gl.foreground()
    gl.prefsize(width, height)
    wid = gl.winopen('testGL')
    # Coordinates run from (0, 0) at the top left to (width, height)
    gl.ortho2(0, width, height, 0)
    # White background, black drawing color
    gl.color(GL.WHITE)
    gl.clear()
    gl.color(GL.BLACK)
    backend = fmt.GLBackEnd(wid)
    formatter = fmt.BaseFormatter(backend.d, backend)
    parser = FormattingParser(formatter, GLStylesheet)
    parser.feed(data)
    parser.close()
    backend.finish()
    # Keep the window up for a while before returning
    time.sleep(5)
# Run the timing test when executed as a script
if __name__ == '__main__':
    test()