htmllib.py : » Mobile » Python-for-PalmOS » Python-1.5.2+reduced-1.0 » Demo » tkinter » www » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Mobile » Python for PalmOS
Python for PalmOS » Python 1.5.2 reduced 1.0 » Demo » tkinter » www » htmllib.py
# A parser for HTML documents


# HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
# describe hypertext documents
#
# SGML: Standard Generalized Markup Language
#
# WWW: World-Wide Web; a distributed hypertext system developed at CERN
#
# CERN: European Particle Physics Laboratory in Geneva, Switzerland


# This file is only concerned with parsing and formatting HTML
# documents, not with the other (hypertext and networking) aspects of
# the WWW project.  (It does support highlighting of anchors.)


import os
import sys
import regex
import string
import sgmllib


class HTMLParser(sgmllib.SGMLParser):
  #
  # Thin HTML layer on top of sgmllib.SGMLParser.  It adds one entity
  # and handlers for the tags that introduce literal text (<listing>,
  # <xmp>, <plaintext>).  Subclasses hook into literal mode by
  # overriding literal_bgn() and literal_end().

  # Private copy of the base class entity table, so that the addition
  # below does not modify sgmllib's own dictionary.  Using copy()
  # instead of a class-level loop also avoids leaving a stray loop
  # variable behind as a class attribute.
  entitydefs = sgmllib.SGMLParser.entitydefs.copy()
  entitydefs['bullet'] = '*'

  # Provided -- handlers for tags introducing literal text

  # <listing>: switch the scanner to literal mode, then notify subclass
  def start_listing(self, attrs):
    self.setliteral('listing')
    self.literal_bgn('listing', attrs)

  def end_listing(self):
    self.literal_end('listing')

  # <xmp>: handled exactly like <listing>
  def start_xmp(self, attrs):
    self.setliteral('xmp')
    self.literal_bgn('xmp', attrs)

  def end_xmp(self):
    self.literal_end('xmp')

  # <plaintext>: has no end tag, so there is no matching literal_end()
  # call; setnomoretags() is inherited from the base class
  def do_plaintext(self, attrs):
    self.setnomoretags()
    self.literal_bgn('plaintext', attrs)

  # To be overridden -- begin/end literal mode
  # tag is 'listing', 'xmp' or 'plaintext'; attrs is the attribute
  # list that was passed to the tag handler
  def literal_bgn(self, tag, attrs): pass
  def literal_end(self, tag): pass


# Next level of sophistication -- collect anchors, title, nextid and isindex
class CollectingParser(HTMLParser):
  #
  # Parser that records document metadata while parsing:
  #   title        -- text found between <title> and </title>
  #   nextid       -- attribute list of the last <nextid> tag
  #   isindex      -- 1 once an <isindex> tag has been seen
  #   anchors      -- HREF of every recorded <a> ('' if it had none)
  #   anchornames  -- NAME of every recorded <a> ('' if it had none)
  #   anchortypes  -- lower-cased TYPE of every recorded <a>
  # The three anchor lists are parallel (same index, same anchor).
  #
  def __init__(self):
    HTMLParser.__init__(self)
    # While not None, handle_data() appends text here instead of
    # treating it as document text (used to capture the title)
    self.savetext = None
    self.nextid = ''
    self.isindex = 0
    self.title = ''
    # 0 outside an anchor; otherwise the 1-based index of the current
    # anchor, negated when the anchor has a NAME but no HREF
    self.inanchor = 0
    self.anchors = []
    self.anchornames = []
    self.anchortypes = []
  #
  # attrs is a list of (attrname, value) pairs.  The names are compared
  # without a trailing '='; comparing against 'name=' and 'type=' never
  # matched, so names and types were silently lost and anchors that had
  # only a NAME were not recorded at all.
  def start_a(self, attrs):
    self.inanchor = 0
    href = ''
    name = ''
    atype = ''
    for attrname, value in attrs:
      if attrname == 'href':
        href = value
      elif attrname == 'name':
        name = value
      elif attrname == 'type':
        atype = string.lower(value)
    # An anchor with neither HREF nor NAME is not recorded
    if not (href or name):
      return
    self.anchors.append(href)
    self.anchornames.append(name)
    self.anchortypes.append(atype)
    self.inanchor = len(self.anchors)
    if not href:
      # Name-only anchor: negative index, so end_a() adds no marker
      self.inanchor = -self.inanchor
  #
  def end_a(self):
    if self.inanchor > 0:
      # Don't show anchors pointing into the current document
      if self.anchors[self.inanchor-1][:1] != '#':
        # Emit a reference marker such as '[3]' after the anchor text
        self.handle_data('[' + repr(self.inanchor) + ']')
    self.inanchor = 0
  #
  # Structural tags are accepted but need no action
  def start_header(self, attrs): pass
  def end_header(self): pass
  #
  # (head is the same as header)
  def start_head(self, attrs): pass
  def end_head(self): pass
  #
  def start_body(self, attrs): pass
  def end_body(self): pass
  #
  def do_nextid(self, attrs):
    # The raw attribute list is stored, not a single value
    self.nextid = attrs
  #
  def do_isindex(self, attrs):
    self.isindex = 1
  #
  def start_title(self, attrs):
    # Start capturing text; see handle_data()
    self.savetext = ''
  #
  def end_title(self):
    if self.savetext is not None:
      self.title = self.savetext
      self.savetext = None
  #
  # Text is kept only while a title is being captured; any other text
  # is discarded at this level
  def handle_data(self, text):
    if self.savetext is not None:
      self.savetext = self.savetext + text

# Formatting parser -- takes a formatter and a style sheet as arguments

# XXX The use of style sheets should change: for each tag and end tag
# there should be a style definition, and a style definition should
# encompass many more parameters: font, justification, indentation,
# vspace before, vspace after, hanging tag...

# Matches a (possibly empty) run of characters other than blank, tab
# and newline, i.e. one word
wordprog = regex.compile('[^ \t\n]*')
# Matches a (possibly empty) run of blanks, tabs and newlines, i.e. the
# white space between two words
spaceprog = regex.compile('[ \t\n]*')

class FormattingParser(CollectingParser):

  def __init__(self, formatter, stylesheet):
    CollectingParser.__init__(self)
    self.fmt = formatter
    self.stl = stylesheet
    self.savetext = None
    self.compact = 0
    self.nofill = 0
    self.resetfont()
    self.setindent(self.stl.stdindent)

  def resetfont(self):
    self.fontstack = []
    self.stylestack = []
    self.fontset = self.stl.stdfontset
    self.style = ROMAN
    self.passfont()

  def passfont(self):
    font = self.fontset[self.style]
    self.fmt.setfont(font)

  def pushstyle(self, style):
    self.stylestack.append(self.style)
    self.style = min(style, len(self.fontset)-1)
    self.passfont()

  def popstyle(self):
    self.style = self.stylestack[-1]
    del self.stylestack[-1]
    self.passfont()

  def pushfontset(self, fontset, style):
    self.fontstack.append(self.fontset)
    self.fontset = fontset
    self.pushstyle(style)

  def popfontset(self):
    self.fontset = self.fontstack[-1]
    del self.fontstack[-1]
    self.popstyle()

  def flush(self):
    self.fmt.flush()

  def setindent(self, n):
    self.fmt.setleftindent(n)

  def needvspace(self, n):
    self.fmt.needvspace(n)

  def close(self):
    HTMLParser.close(self)
    self.fmt.flush()

  def handle_literal(self, text):
    lines = string.splitfields(text, '\n')
    for i in range(1, len(lines)):
      lines[i] = string.expandtabs(lines[i], 8)
    for line in lines[:-1]:
      self.fmt.addword(line, 0)
      self.fmt.flush()
      self.fmt.nospace = 0
    for line in lines[-1:]:
      self.fmt.addword(line, 0)

  def handle_data(self, text):
    if self.savetext is not None:
      self.savetext = self.savetext + text
      return
    if self.literal:
      self.handle_literal(text)
      return
    i = 0
    n = len(text)
    while i < n:
      j = i + wordprog.match(text, i)
      word = text[i:j]
      i = j + spaceprog.match(text, j)
      self.fmt.addword(word, i-j)
      if self.nofill and '\n' in text[j:i]:
        self.fmt.flush()
        self.fmt.nospace = 0
        i = j+1
        while text[i-1] <> '\n': i = i+1

  def literal_bgn(self, tag, attrs):
    if tag == 'plaintext':
      self.flush()
    else:
      self.needvspace(1)
    self.pushfontset(self.stl.stdfontset, FIXED)
    self.setindent(self.stl.literalindent)

  def literal_end(self, tag):
    self.needvspace(1)
    self.popfontset()
    self.setindent(self.stl.stdindent)

  def start_title(self, attrs):
    self.flush()
    self.savetext = ''
  # NB end_title is unchanged

  def do_p(self, attrs):
    if self.compact:
      self.flush()
    else:
      self.needvspace(1)

  def do_hr(self, attrs):
    self.fmt.hrule()

  def start_h1(self, attrs):
    self.needvspace(2)
    self.setindent(self.stl.h1indent)
    self.pushfontset(self.stl.h1fontset, BOLD)
    self.fmt.setjust('c')

  def end_h1(self):
    self.popfontset()
    self.needvspace(2)
    self.setindent(self.stl.stdindent)
    self.fmt.setjust('l')

  def start_h2(self, attrs):
    self.needvspace(1)
    self.setindent(self.stl.h2indent)
    self.pushfontset(self.stl.h2fontset, BOLD)

  def end_h2(self):
    self.popfontset()
    self.needvspace(1)
    self.setindent(self.stl.stdindent)

  def start_h3(self, attrs):
    self.needvspace(1)
    self.setindent(self.stl.stdindent)
    self.pushfontset(self.stl.h3fontset, BOLD)

  def end_h3(self):
    self.popfontset()
    self.needvspace(1)
    self.setindent(self.stl.stdindent)

  def start_h4(self, attrs):
    self.needvspace(1)
    self.setindent(self.stl.stdindent)
    self.pushfontset(self.stl.stdfontset, BOLD)

  def end_h4(self):
    self.popfontset()
    self.needvspace(1)
    self.setindent(self.stl.stdindent)

  start_h5 = start_h4
  end_h5 = end_h4

  start_h6 = start_h5
  end_h6 = end_h5

  start_h7 = start_h6
  end_h7 = end_h6

  def start_ul(self, attrs):
    self.needvspace(1)
    for attrname, value in attrs:
      if attrname == 'compact':
        self.compact = 1
        self.setindent(0)
        break
    else:
      self.setindent(self.stl.ulindent)

  start_dir = start_menu = start_ol = start_ul

  do_li = do_p

  def end_ul(self):
    self.compact = 0
    self.needvspace(1)
    self.setindent(self.stl.stdindent)

  end_dir = end_menu = end_ol = end_ul

  def start_dl(self, attrs):
    for attrname, value in attrs:
      if attrname == 'compact':
        self.compact = 1
    self.needvspace(1)

  def end_dl(self):
    self.compact = 0
    self.needvspace(1)
    self.setindent(self.stl.stdindent)

  def do_dt(self, attrs):
    if self.compact:
      self.flush()
    else:
      self.needvspace(1)
    self.setindent(self.stl.stdindent)

  def do_dd(self, attrs):
    self.fmt.addword('', 1)
    self.setindent(self.stl.ddindent)

  def start_address(self, attrs):
    self.compact = 1
    self.needvspace(1)
    self.fmt.setjust('r')

  def end_address(self):
    self.compact = 0
    self.needvspace(1)
    self.setindent(self.stl.stdindent)
    self.fmt.setjust('l')

  def start_pre(self, attrs):
    self.needvspace(1)
    self.nofill = self.nofill + 1
    self.pushstyle(FIXED)

  def end_pre(self):
    self.popstyle()
    self.nofill = self.nofill - 1
    self.needvspace(1)

  start_typewriter = start_pre
  end_typewriter = end_pre

  def do_img(self, attrs):
    self.fmt.addword('(image)', 0)

  # Physical styles

  def start_tt(self, attrs): self.pushstyle(FIXED)
  def end_tt(self): self.popstyle()

  def start_b(self, attrs): self.pushstyle(BOLD)
  def end_b(self): self.popstyle()

  def start_i(self, attrs): self.pushstyle(ITALIC)
  def end_i(self): self.popstyle()

  def start_u(self, attrs): self.pushstyle(ITALIC) # Underline???
  def end_u(self): self.popstyle()

  def start_r(self, attrs): self.pushstyle(ROMAN) # Not official
  def end_r(self): self.popstyle()

  # Logical styles

  start_em = start_i
  end_em = end_i

  start_strong = start_b
  end_strong = end_b

  start_code = start_tt
  end_code = end_tt

  start_samp = start_tt
  end_samp = end_tt

  start_kbd = start_tt
  end_kbd = end_tt

  start_file = start_tt # unofficial
  end_file = end_tt

  start_var = start_i
  end_var = end_i

  start_dfn = start_i
  end_dfn = end_i

  start_cite = start_i
  end_cite = end_i

  start_hp1 = start_i
  end_hp1 = start_i

  start_hp2 = start_b
  end_hp2 = end_b

  def unknown_starttag(self, tag, attrs):
    print '*** unknown <' + tag + '>'

  def unknown_endtag(self, tag):
    print '*** unknown </' + tag + '>'


# An extension of the formatting parser which formats anchors differently.
class AnchoringParser(FormattingParser):
  #
  # Instead of appending a '[n]' marker after the anchor text, this
  # variant brackets the anchor text with calls to the formatter's
  # bgn_anchor() and end_anchor() methods.

  def start_a(self, attrs):
    # Let the inherited handler record the anchor and set self.inanchor
    FormattingParser.start_a(self, attrs)
    anchor = self.inanchor
    if not anchor:
      return
    self.fmt.bgn_anchor(anchor)

  def end_a(self):
    # The inherited end_a() is deliberately not called here
    anchor = self.inanchor
    if not anchor:
      return
    self.fmt.end_anchor(anchor)
    self.inanchor = 0


# Style sheet -- this is never instantiated, but the attributes
# of the class object itself are used to specify fonts to be used
# for various paragraph styles.
# A font set is a non-empty list of fonts, in the order:
# [roman, italic, bold, fixed].
# When a style is not available the nearest lower style is used

# Style numbers; each one is the index of that style in a font set
ROMAN, ITALIC, BOLD, FIXED = 0, 1, 2, 3

class NullStylesheet:
  # Base style sheet: supplies the indents and font sets that contain
  # only None.  The other style sheets inherit from it and override
  # what they need.

  # Indents that differ from zero
  stdindent = 2
  ulindent = 4
  ddindent = 25
  # Headings and literal text start at the left margin
  h1indent = h2indent = literalindent = 0

  # Fonts -- none (each font set is a separate one-element list)
  stdfontset = [None]
  h1fontset = [None]
  h2fontset = [None]
  h3fontset = [None]


class X11Stylesheet(NullStylesheet):
  # Font sets given as X11 font name patterns (Helvetica, plus Courier
  # as the fixed font of the standard set).  The heading sets have no
  # fixed entry.

  # Body text
  stdfontset = [
    '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
    '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
    '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
    '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
    ]

  # <h1>
  h1fontset = [
    '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
    '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
    '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
    ]

  # <h2>
  h2fontset = [
    '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
    '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
    '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
    ]

  # <h3>
  h3fontset = [
    '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
    '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
    '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
    ]

  # Overrides the inherited indent for <dd> text
  ddindent = 40


class MacStylesheet(NullStylesheet):
  # Font sets given as (font name, style letter, size) tuples, in the
  # order [roman, italic, bold, fixed].

  # Body text
  stdfontset = [
    ('Geneva', 'p', 10),
    ('Geneva', 'i', 10),
    ('Geneva', 'b', 10),
    ('Monaco', 'p', 10),
    ]

  # <h1>
  h1fontset = [
    ('Geneva', 'p', 18),
    ('Geneva', 'i', 18),
    ('Geneva', 'b', 18),
    ('Monaco', 'p', 18),
    ]

  # <h2> -- this 14-point set used to be assigned to h3fontset, where
  # the 12-point set below replaced it, so <h2> fell back to the
  # inherited [None] font set
  h2fontset = [
    ('Geneva', 'p', 14),
    ('Geneva', 'i', 14),
    ('Geneva', 'b', 14),
    ('Monaco', 'p', 14),
    ]

  # <h3>
  h3fontset = [
    ('Geneva', 'p', 12),
    ('Geneva', 'i', 12),
    ('Geneva', 'b', 12),
    ('Monaco', 'p', 12),
    ]


# Style sheet for stdwin: the Mac fonts when os.name is 'mac', the X11
# fonts everywhere else
if os.name != 'mac':
  StdwinStylesheet = X11Stylesheet
else:
  StdwinStylesheet = MacStylesheet


class GLStylesheet(NullStylesheet):
  # Font sets given as 'name size' strings, in the order
  # [roman, italic, bold, fixed].

  # Body text
  stdfontset = [
    'Helvetica 10',
    'Helvetica-Italic 10',
    'Helvetica-Bold 10',
    'Courier 10',
    ]

  # <h1>
  h1fontset = [
    'Helvetica 18',
    'Helvetica-Italic 18',
    'Helvetica-Bold 18',
    'Courier 18',
    ]

  # <h2>
  h2fontset = [
    'Helvetica 14',
    'Helvetica-Italic 14',
    'Helvetica-Bold 14',
    'Courier 14',
    ]

  # <h3>
  h3fontset = [
    'Helvetica 12',
    'Helvetica-Italic 12',
    'Helvetica-Bold 12',
    'Courier 12',
    ]


# Test program -- formats a document to standard output and reports how
# long parsing and formatting took, exclusive of reading the document

def test():
  import fmt
  import time
  import urllib
  if sys.argv[1:]: file = sys.argv[1]
  else: file = 'test.html'
  data = urllib.urlopen(file).read()
  t0 = time.time()
  fmtr = fmt.WritingFormatter(sys.stdout, 79)
  p = FormattingParser(fmtr, NullStylesheet)
  p.feed(data)
  p.close()
  t1 = time.time()
  print
  print '*** Formatting time:', round(t1-t0, 3), 'seconds.'


# Test program using stdwin

def testStdwin():
  # Show a document in a stdwin window until the window is closed.
  # The document is the file named by the first command line argument,
  # or 'test.html' when there is none.
  import stdwin, fmt
  from stdwinevents import *
  if sys.argv[1:]: file = sys.argv[1]
  else: file = 'test.html'
  data = open(file, 'r').read()
  window = stdwin.open('testStdwin')
  # Back end; created (and the document formatted) on the first draw
  # event only
  b = None
  while 1:
    etype, ewin, edetail = stdwin.getevent()
    if etype == WE_CLOSE:
      break
    if etype == WE_SIZE:
      window.setdocsize(0, 0)
      window.setorigin(0, 0)
      window.change((0, 0), (10000, 30000)) # XXX
    if etype == WE_DRAW:
      if not b:
        # First draw event: format the whole document once
        b = fmt.StdwinBackEnd(window, 1)
        f = fmt.BaseFormatter(b.d, b)
        # NOTE(review): MacStylesheet is hard-coded here although
        # StdwinStylesheet is selected per platform elsewhere in this
        # file -- confirm which one is intended
        p = FormattingParser(f, \
                  MacStylesheet)
        p.feed(data)
        p.close()
        b.finish()
      else:
        # Later draw events: let the back end redraw the given area
        b.redraw(edetail)
  window.close()


# Test program using GL

def testGL():
  # Show a document in a 600x600 GL window for five seconds.
  # The document is the file named by the first command line argument,
  # or 'test.html' when there is none.
  import gl
  import GL
  import fmt
  args = sys.argv[1:]
  if args:
    source = args[0]
  else:
    source = 'test.html'
  data = open(source, 'r').read()
  width = 600
  height = 600
  # Open the window
  gl.foreground()
  gl.prefsize(width, height)
  wid = gl.winopen('testGL')
  # NOTE(review): presumably maps coordinates so that y grows
  # downward -- confirm against the GL documentation
  gl.ortho2(0, width, height, 0)
  # Clear to white, then draw in black
  gl.color(GL.WHITE)
  gl.clear()
  gl.color(GL.BLACK)
  # Format the document into the window
  backend = fmt.GLBackEnd(wid)
  formatter = fmt.BaseFormatter(backend.d, backend)
  parser = FormattingParser(formatter, GLStylesheet)
  parser.feed(data)
  parser.close()
  backend.finish()
  #
  # Keep the result on screen for a while before returning
  import time
  time.sleep(5)

# Run the plain-text timing test when this file is executed as a script
if __name__ == '__main__':
  test()
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.