PSParser.py : » Network » Grail-Internet-Browser » grail-0.6 » printing » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Network » Grail Internet Browser
Grail Internet Browser » grail 0.6 » printing » PSParser.py
"""HTML parser for printing."""

__version__ = '$Revision: 1.13 $'

import os
import string
import types
import urlparse

from formatter import AbstractFormatter
from formatter import AS_IS
from sgml.HTMLParser import HTMLParser
from sgml.utils import *

import epstools                         # in package
import utils


SIZE_STEP = 0.2


class PrintingHTMLParser(HTMLParser):

    """Class to override HTMLParser's default methods.

    Special support is provided for anchors, BASE, images, subscripts,
    and superscripts.

    Image loading is controlled by the `imageflag' setting.  When it
    is true, images are fetched with utils.image_loader, a function
    which resolves a URL to raw image data.  The image data should be
    returned as a string.

    When images are loaded, the `greyscale' setting is used to
    determine how the image should be converted to postscript.

    The interpretation of anchor tags is controlled by two settings,
    `footnoteflag' and `underflag.'  If footnoteflag is true, anchors
    are assigned footnote numbers and the target URL is printed in a
    list appended following the body of the document.  The underflag
    setting controls whether anchor text is underlined in the main
    document.
    """
    _inited = 0
    _image_loader = None

    def __init__(self, writer, settings, context):
        """Set up the parser for one print job.

        `writer' is wrapped in an AbstractFormatter, `settings' supplies
        the printing options read throughout this class, and `context'
        supplies the document and base URLs.
        """
        if not self._inited:
            # One-time population of the class-wide dingbat table; font
            # dingbats and Greek entities are the same for grey and color.
            table = self.dingbats
            for name, glyph in self.fontdingbats.items():
                for cog in ('grey', 'color'):
                    table[(name, cog)] = glyph
            import Greek
            for name, char in Greek.entitydefs.items():
                glyph = (char, 'Symbol')
                for cog in ('grey', 'color'):
                    table[(name, cog)] = glyph
            PrintingHTMLParser._inited = 1
        HTMLParser.__init__(self, AbstractFormatter(writer))
        if settings.strict_parsing:
            self.sgml_parser.restrict(0)
        self._baseurl = context.get_baseurl()
        self.context = context
        self.settings = settings
        if settings.imageflag:
            self._image_loader = utils.image_loader
        self._image_cache = {}
        # None is pre-registered so a cancelled footnote is never numbered.
        self._anchors = {None: None}
        self._anchor_sequence = []
        self._anchor_xforms = []
        if settings.footnoteflag:
            xform = disallow_self_reference(context.get_url())
        else:
            xform = disallow_anchor_footnotes
        self.add_anchor_transform(xform)
        self.__fontsize = [3]

    def close(self):
        """Write any collected footnotes, then close the base parser."""
        if self._anchor_sequence:
            self.write_footnotes()
        HTMLParser.close(self)

    def get_devicetypes(self):
        """Return sequence of device type names.

        The result is always the tuple ('postscript', 'writer').
        """
        return ('postscript', 'writer')

    def add_anchor_transform(self, xform):
        """Register an anchor transform unless it is already registered.

        New transforms go to the front of the list, so the most recently
        added one is applied first.
        """
        if xform not in self._anchor_xforms:
            self._anchor_xforms.insert(0, xform)

    def remove_anchor_transform(self, xform):
        """Unregister an anchor transform; unknown transforms are ignored."""
        if xform in self._anchor_xforms:
            self._anchor_xforms.remove(xform)

    def register_id(self, name):
        """Record the current page number for the fragment `name'.

        The key is the document URL with its host part lowercased and
        its fragment replaced by `name'; the stored title is empty.
        """
        parts = list(urlparse.urlparse(self.context.get_url()))
        parts[1] = string.lower(parts[1])       # netloc
        parts[5] = name                         # fragment
        url = urlparse.urlunparse(tuple(parts))
        pageno = self.formatter.writer.ps.get_pageno()
        self._set_docinfo(url, pageno, '')

    def do_base(self, attrs):
        """Handle BASE; propagate a resulting base URL to the context."""
        HTMLParser.do_base(self, attrs)
        if self.base:
            self.context.set_baseurl(self.base)

    def __footnote_anchor(self, href, attrs):
        """Run `href' through the anchor transforms.

        Each transform may rewrite or cancel the URL; a rewritten URL is
        stored back into attrs['href'].  Returns the surviving URL, or
        None when a transform (or the final data: scheme filter)
        cancels it.
        """
        for xform in self._anchor_xforms:
            href = xform(href, attrs)
            if not href:
                return None
            attrs['href'] = href
        # data: URLs are never footnoted, whatever the transforms decided.
        return disallow_data_scheme(href, attrs) or None

    FOOTNOTE_DIV_ATTRIBUTES = {'align': 'left'}
    FOOTNOTE_LIST_ATTRIBUTES = {'type': '1.', 'compact': 'compact'}
    FOOTNOTE_INDICATOR_FORMAT = "[%d]"
    FOOTNOTE_HEADER = "URLs referenced in this document:"

    def write_footnotes(self):
        """Append the numbered list of referenced URLs to the document.

        Emits a rule, the FOOTNOTE_HEADER paragraph, and then one list
        item per entry in self._anchor_sequence.  Each item shows the
        anchor's title (if one can be found) as a citation followed by
        the URL itself.
        """
        import copy
        self.close_paragraph()
        self.formatter.end_paragraph(1)
        self.do_hr({})
        # The attribute dictionaries are copied so the class-level
        # templates are never handed to the tag handlers directly.
        self.start_div(copy.copy(self.FOOTNOTE_DIV_ATTRIBUTES))
        self.para_bgn({})
        self.handle_data(self.FOOTNOTE_HEADER)
        self.para_end()
        self.start_small({})
        self.start_ol(copy.copy(self.FOOTNOTE_LIST_ATTRIBUTES))
        history = self.context.app.global_history
        for anchor, title in self._anchor_sequence:
            self.do_li({})
            # Title lookup order: TITLE attribute of the anchor, then the
            # global history, then page info recorded by this parser.
            if not title and history:
                title, when = history.lookup_url(anchor)
            if not title:
                # Try getting this from our internal database if we haven't
                # already found it.
                pageno, title = self.get_docinfo(anchor)
            if title:
                # Set the title as a citation:
                self.start_cite({})
                self.handle_data(title)
                self.end_cite()
                self.handle_data(', ')
            self.handle_data(anchor)
        self.end_ol()
        self.end_small()
        self.end_div()

    _inanchor = 0
    def start_a(self, attrs):
        """Handle <A>: set up underlining, footnote numbering and NAME ids.

        The HREF is resolved against the base URL and remembered in
        self.anchor so that end_a() can print the footnote indicator.
        A URL seen for the first time is passed through the anchor
        transforms and, if it survives, is assigned the next footnote
        number.  A NAME attribute is always registered with
        register_id(), including when the HREF is cancelled by a
        transform or already has a footnote number.
        """
        href = extract_keyword('href', attrs)
        if href:
            href = self.context.get_baseurl(href)
        self.anchor = href
        if href:
            if self.settings.underflag:
                self.formatter.push_style('underline')
                self._inanchor = 1
            if not self._anchors.has_key(href):
                href = self.anchor = self.__footnote_anchor(href, attrs)
                # The transforms may cancel the URL (None is pre-registered
                # in _anchors) or rewrite it to one that is already
                # numbered; only a genuinely new URL gets a footnote.
                if not self._anchors.has_key(href):
                    self._anchors[href] = len(self._anchor_sequence) + 1
                    title = extract_keyword('title', attrs, '')
                    # Collapse runs of whitespace in the title.
                    title = string.join(string.split(title))
                    self._anchor_sequence.append((href, title))
        else:
            self._inanchor = 0
        # Previously an early return above skipped this for cancelled or
        # rewritten-to-known URLs, losing the NAME of <A HREF=.. NAME=..>.
        name = extract_keyword('name', attrs, conv=conv_normstring)
        if name:
            self.register_id(name)

    def end_a(self):
        """Handle </A>: stop underlining and print the footnote indicator."""
        if self.settings.underflag and self._inanchor:
            self.formatter.pop_style()
        if not self.anchor:
            return
        anchor, self.anchor = self.anchor, None
        # The indicator is set two steps smaller and shifted upwards
        # relative to the surrounding text.
        size_before = self.formatter.writer.ps.get_fontsize()
        self.start_small({}, steps=2)
        size_after = self.formatter.writer.ps.get_fontsize()
        yshift = size_before - ((1.0 + SIZE_STEP / 2) * size_after)
        self.formatter.push_font((AS_IS, 0, 0, 0))
        self.formatter.writer.ps.push_yshift(yshift)
        self.handle_data(
            self.FOOTNOTE_INDICATOR_FORMAT % self._anchors[anchor])
        self.formatter.writer.ps.pop_yshift()
        self.formatter.pop_font()
        self.end_small()

    def start_p(self, attrs):
        """Begin a paragraph, sending first-line indentation if wanted.

        Indentation is sent only when paragraph indentation or skipping
        is configured, the tag does not carry INDENT=no, and the
        formatter has no pending label.
        """
        settings = self.settings
        indentable = (
            (settings.paragraph_indent or settings.paragraph_skip)
            and extract_keyword(
                'indent', attrs, conv=conv_normstring) != "no")
        self.para_bgn(attrs)
        if indentable and not self.formatter.have_label:
            self.formatter.writer.send_indentation(settings.paragraph_indent)
        self.require_vspace(2)

    def end_p(self):
        """End a paragraph and reset the writer's indentation state.

        No paragraph break is requested when paragraphs are indented.
        """
        if self.settings.paragraph_indent:
            parbreak = 0
        else:
            parbreak = 1
        self.para_end(parbreak=parbreak)
        writer = self.formatter.writer
        writer.send_indentation(None)
        writer.suppress_indentation(0)

    def do_basefont(self, attrs):
        """Handle <BASEFONT> by starting a font change that is never ended.

        Only the SIZE attribute is honoured; it is passed to start_font().
        """
        if attrs.has_key("size"):
            self.start_font({"size": attrs["size"]})

    def start_font(self, attrs):
        """Handle <FONT>; very simple: only supports SIZE="....".

        Pushes exactly one font on the formatter and one entry on the
        HTML font size stack, so end_font() can undo both.
        """
        spec = extract_keyword('size', attrs, conv=conv_normstring)
        current = self.__fontsize[-1]
        op, diff = self.parse_fontsize(spec)
        if diff:
            if op == "-":
                target = current - diff
                self.start_small({}, steps=diff)
            else:
                target = current + diff
                self.start_big({}, steps=diff)
        else:
            # No size change: push a no-op font to keep the stacks balanced.
            target = current
            self.formatter.push_font((AS_IS, AS_IS, AS_IS, AS_IS))
        self.__fontsize.append(target)

    def parse_fontsize(self, spec):
        if not spec:
            return "+", 0
        op = ""
        if spec[0] in "-+":
            op = spec[0]
            spec = spec[1:]
        try:
            spec = string.atoi(spec)
        except ValueError:
            return "+", 0
        if op:
            return op, spec
        if spec < self.__fontsize[-1]:
            diff = self.__fontsize[-1] - spec
            return "-", diff
        diff = spec - self.__fontsize[-1]
        return "+", diff

    def end_font(self):
        del self.__fontsize[-1]
        self.formatter.pop_font()

    def end_title(self):
        """Handle </TITLE>: pass the document title on to the PS writer."""
        HTMLParser.end_title(self)
        self.formatter.writer.ps.set_title(self.title)
        self.formatter.writer.ps.prune_titles()

    def start_small(self, attrs, steps=1):
        """Push a font shrunk by SIZE_STEP, `steps' times over.

        A step count of zero or less pushes the current size unchanged.
        """
        font_size = self.formatter.writer.ps.get_fontsize()
        for _step in range(steps):
            font_size = (1.0 - SIZE_STEP) * font_size
        self.formatter.push_font((font_size, AS_IS, AS_IS, AS_IS))

    def end_small(self):
        """Pop the font pushed by start_small()."""
        self.formatter.pop_font()

    def start_big(self, attrs, steps=1):
        """Push a font enlarged by SIZE_STEP, `steps' times over.

        A step count of zero or less pushes the current size unchanged.
        """
        font_size = self.formatter.writer.ps.get_fontsize()
        for _step in range(steps):
            font_size = (1.0 + SIZE_STEP) * font_size
        self.formatter.push_font((font_size, AS_IS, AS_IS, AS_IS))

    def end_big(self):
        """Pop the font pushed by start_big() (same action as end_small())."""
        self.end_small()

    def start_sup(self, attrs):
        """Begin superscript: one step smaller font plus a vertical shift."""
        font_size = self.formatter.writer.ps.get_fontsize()
        self.start_small(attrs)
        new_font_size = self.formatter.writer.ps.get_fontsize()
        # The shift is derived from the size before and after shrinking.
        yshift = font_size - ((1.0 - SIZE_STEP / 2) * new_font_size)
        self.formatter.writer.ps.push_yshift(yshift)

    def start_sub(self, attrs):
        """Begin subscript: one step smaller font plus a negative shift."""
        self.start_small(attrs)
        new_font_size = self.formatter.writer.ps.get_fontsize()
        self.formatter.writer.ps.push_yshift(-(SIZE_STEP / 2) * new_font_size)

    def end_sup(self):
        """End superscript: pop the vertical shift, then the small font."""
        self.formatter.writer.ps.pop_yshift()
        self.end_small()

    def end_sub(self):
        """End subscript; the teardown is the same as for superscripts."""
        self.end_sup()

    def start_pre(self, attrs):
        """Begin <PRE>, shrinking the font if WIDTH columns would not fit.

        The required width is WIDTH times the width of a space in the
        current font; when that exceeds the page width the font size is
        scaled down proportionally.  A font is always pushed so that
        end_pre() can pop it.
        """
        HTMLParser.start_pre(self, attrs)
        font_size = AS_IS
        columns = extract_keyword('width', attrs, 0, conv=conv_integer)
        if columns > 0:
            ps = self.formatter.writer.ps
            space_width = ps._font.text_width(' ')
            page_width = ps.get_pagewidth()
            needed = space_width * columns
            if needed > page_width:
                scale = page_width / needed
                font_size = ps.get_fontsize() * scale
        self.formatter.push_font((font_size, AS_IS, AS_IS, AS_IS))

    def end_pre(self):
        """End <PRE>: pop the font pushed by start_pre()."""
        self.formatter.pop_font()
        HTMLParser.end_pre(self)

    __docinfo = None
    def _set_docinfo(self, url, pageno, title):
        """Record (pageno, title) for `url' in the per-instance database."""
        # The class attribute starts as None; the dictionary is created
        # on the instance at first use so instances do not share it.
        if self.__docinfo is None:
            self.__docinfo = {}
        self.__docinfo[url] = (pageno, title)

    def get_docinfo(self, url):
        if self.__docinfo and self.__docinfo.has_key(url):
            return self.__docinfo[url]
        return None, None

    # These are really hackish, but improve some things just a little:
    def start_tr(self, attrs):
        """Treat a table row as a plain DIV; the row attributes are ignored."""
        self.start_div({})

    def end_tr(self):
        """Close the DIV opened by start_tr()."""
        self.end_div()

    def start_table(self, attrs):
        """Begin a table as a paragraph without a paragraph break."""
        self.para_bgn({}, parbreak=0)

    def end_table(self):
        """End the paragraph opened by start_table()."""
        self.para_end(parbreak=0)

    def start_td(self, attrs):
        """Ignore table cell start tags."""
        pass

    def start_th(self, attrs):
        """Begin a table header cell by pushing a font with the third
        field set to 1 (presumably bold -- confirm against the formatter's
        font tuple layout)."""
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))

    def end_th(self):
        """Pop the font pushed by start_th()."""
        self.formatter.pop_font()

    def start_caption(self, attrs):
        """Begin a caption as a centered DIV; the tag attributes are ignored."""
        self.start_div({"align": "center"})
        self.formatter.writer.suppress_indentation()

    def end_caption(self):
        """Close the DIV opened by start_caption()."""
        self.end_div()

    def handle_image(self, src, alt, ismap, align, width,
                     height, border=2, *args, **kw):
        """Print an image, or its ALT text when it cannot be printed.

        The ALT text is used when image printing is disabled or when the
        image fails to load.  Load failures and images with less than
        10240 bytes of data are cached by resolved URL.
        """
        if not self.settings.imageflag:
            self.handle_data(alt)
            return
        utils.debug("handle_image('%s', ...)" % src)
        imageurl = self.context.get_baseurl(src)
        cache = self._image_cache
        if cache.has_key(imageurl):
            image = cache[imageurl]
        else:
            try:
                image = self.load_image(imageurl)
            except epstools.EPSError:
                # Remember the failure so the load is not retried.
                image = None
                cache[imageurl] = None
            else:
                if len(image.data) < 10240:
                    cache[imageurl] = image
        if image:
            self.print_image(image, width, height, align)
        else:
            #  this load or a previous one resulted in failure:
            self.handle_data(alt)

    def print_image(self, image, width, height, align=None):
        """Size `image' as requested and send it to the writer.

        Whichever of `width' and `height' is given is applied; when
        neither is given the image keeps the size it has after reset().
        """
        image.reset()                   # restart scaling calculations
        if width:
            if height:
                image.set_size(width, height)
            else:
                image.set_width(width)
        elif height:
            image.set_height(height)
        alignment = string.lower(align or '')
        self.formatter.writer.send_eps_data(image, alignment)
        self.formatter.assert_line_data()

    def header_bgn(self, tag, level, attrs):
        """Begin a header, honouring the DINGBAT and SRC attributes.

        A DINGBAT takes precedence over SRC; whichever is rendered is
        followed by a single space.
        """
        HTMLParser.header_bgn(self, tag, level, attrs)
        dingbat = extract_keyword('dingbat', attrs)
        if dingbat:
            self.unknown_entityref(dingbat, '')
        elif attrs.has_key('src'):
            self.do_img(attrs)
        else:
            return
        self.formatter.add_flowing_data(' ')

    def header_end(self, tag, level):
        """End a header, then call the writer's suppress_indentation()."""
        HTMLParser.header_end(self, tag, level)
        self.formatter.writer.suppress_indentation()

    def header_number(self, tag, level, attrs):
        """Check for room on the page, then number the header as usual."""
        # make sure we have at least 3*fontsize vertical space available:
        self.require_vspace(3)
        # now call the base class:
        HTMLParser.header_number(self, tag, level, attrs)

    def require_vspace(self, factor):
        """Force a page break unless `factor' font heights remain available."""
        ps = self.formatter.writer.ps
        fontsize = ps._font.font_size()
        # NOTE(review): adding _ypos to the page height suggests _ypos is
        # zero or negative, measured down from the top -- confirm in the
        # PS writer.
        available = ps.get_pageheight() + ps._ypos
        if available < (factor * fontsize):
            ps.push_page_break()

    def pi_page_break(self, arglist):
        """Break the line and start a new page; `arglist' is ignored.

        Presumably invoked for a page-break processing instruction --
        the dispatch is not visible here.
        """
        self.formatter.add_line_break()
        self.formatter.writer.ps.push_page_break()

    def pi_debugging_on(self, arglist):
        """Turn debugging on for the subsystems named in `arglist'."""
        self.__do_debugging(1, arglist)

    def pi_debugging_off(self, arglist):
        """Turn debugging off for the subsystems named in `arglist'."""
        self.__do_debugging(0, arglist)

    def __do_debugging(self, flag, arglist):
        """Call utils.set_debugging(flag, ...) once per named subsystem.

        An empty `arglist' results in a single call with subsystem None.
        """
        arglist = arglist or (None,)
        for subsystem in arglist:
            utils.set_debugging(flag, subsystem)

    # List attribute extensions:

    def start_ul(self, attrs, *args, **kw):
        """Begin an unordered list, honouring a DINGBAT attribute.

        Extra positional and keyword arguments are passed through to the
        base class unchanged.
        """
        self.list_check_dingbat(attrs)
        apply(HTMLParser.start_ul, (self, attrs) + args, kw)
        self.formatter.writer.suppress_indentation()

    def end_ul(self):
        """End an unordered list and call suppress_indentation(0)."""
        HTMLParser.end_ul(self)
        self.formatter.writer.suppress_indentation(0)

    def start_dl(self, attrs):
        """Begin a definition list and call suppress_indentation()."""
        HTMLParser.start_dl(self, attrs)
        self.formatter.writer.suppress_indentation()

    def end_dl(self):
        """End a definition list and call suppress_indentation(0)."""
        HTMLParser.end_dl(self)
        self.formatter.writer.suppress_indentation(0)

    def start_ol(self, attrs):
        """Begin an ordered list and call suppress_indentation()."""
        HTMLParser.start_ol(self, attrs)
        self.formatter.writer.suppress_indentation()

    def end_ol(self):
        """End an ordered list and call suppress_indentation(0)."""
        HTMLParser.end_ol(self)
        self.formatter.writer.suppress_indentation(0)

    def do_li(self, attrs):
        """Handle a list item, honouring a DINGBAT attribute."""
        self.list_check_dingbat(attrs)
        HTMLParser.do_li(self, attrs)
        self.formatter.writer.suppress_indentation()

    def do_dd(self, attrs):
        """Handle <DD>, then call the writer's suppress_indentation()."""
        HTMLParser.do_dd(self, attrs)
        self.formatter.writer.suppress_indentation()

    def do_dt(self, attrs):
        """Handle <DT>, then call the writer's suppress_indentation()."""
        HTMLParser.do_dt(self, attrs)
        self.formatter.writer.suppress_indentation()

    def list_check_dingbat(self, attrs):
        """Replace attrs['type'] with the dingbat named by attrs['dingbat'].

        `attrs' is modified in place, and only when the dingbat loads.
        """
        name = attrs.has_key('dingbat') and attrs['dingbat']
        if not name:
            return
        img = self.load_dingbat(name)
        if img:
            attrs['type'] = img

    # Override make_format():
    # This allows disc/circle/square to be mapped to images.

    def make_format(self, format, default='disc', listtype = None):
        """Return the bullet format for a list, preferring dingbats.

        For unordered lists the disc/circle/square bullets are mapped to
        their dingbats when those can be loaded; everything else is left
        to the base class.
        """
        fmt = format or default
        if fmt in ('disc', 'circle', 'square') and listtype == 'ul':
            img = self.load_dingbat(fmt)
            if img:
                return img
            # NOTE(review): unlike the call below, this fallback does not
            # pass listtype on to the base class -- confirm that is intended.
            return HTMLParser.make_format(self, format, default)
        return HTMLParser.make_format(self, format, default,
                                      listtype = listtype)

    def unknown_entityref(self, entname, terminator):
        """Render an entity as a dingbat where one is available.

        A dingbat is either a (string, font) tuple, which is pushed as a
        font string, or an image object, which is restricted to 0.9 times
        the current font size and the page width before being sent to the
        writer.  Entities without a dingbat go to the base class.
        """
        dingbat = self.load_dingbat(entname)
        if type(dingbat) is types.TupleType:
            apply(self.formatter.writer.ps.push_font_string, dingbat)
            self.formatter.assert_line_data()
        elif dingbat:
            dingbat.restrict(0.9 * self.formatter.writer.ps.get_fontsize(),
                             self.formatter.writer.ps.get_pagewidth())
            self.formatter.writer.send_eps_data(dingbat, 'absmiddle')
            self.formatter.assert_line_data()
        else:
            HTMLParser.unknown_entityref(self, entname, terminator)


    dingbats = {}                       # (name, cog) ==> EPSImage
                                        #                 | (string, font)
                                        #                 | None

    fontdingbats = {'disc': ('\x6c', 'ZapfDingbats'),
                    'circle': ('\x6d', 'ZapfDingbats'),
                    'square': ('\x6f', 'ZapfDingbats'),
                    'sp': (' ', None),
                    'thinsp': ('\240', None),
                    'endash': ('-', None),
                    'ndash': ('-', None),
                    'emdash': ('--', None),
                    'mdash': ('--', None),
                    }

    def load_dingbat(self, entname):
        """Return the dingbat for an entity, or None if there is none.

        With the greyscale setting only the grey version is tried;
        otherwise the color version is preferred and the grey version is
        the fallback.
        """
        if self.settings.greyscale:
            return self.load_dingbat_cog(entname, 'grey')
        return (self.load_dingbat_cog(entname, 'color')
                or self.load_dingbat_cog(entname, 'grey'))

    def load_dingbat_cog(self, entname, cog):
        """Load EPSImage object for an entity with a specified conversion.

        `cog' is 'color' or 'grey'.  The conversion is not downgraded to
        grey if 'color' fails.  If the image is not available or
        convertible, returns None.

        Each directory on the application's icon path is searched for a
        ready-made EPS file first and a GIF to convert second; the first
        directory that yields a usable image wins.  Results, including
        failures, are cached in the class-wide `dingbats' table.
        """
        key = (entname, cog)
        if self.dingbats.has_key(key):
            return self.dingbats[key]
        gifname = entname + '.gif'
        epsname = os.path.join('eps.' + cog, entname + '.eps')
        # Record the miss up front so a failed search is not repeated.
        self.dingbats[key] = None
        for p in self.context.app.iconpath:
            epsp = os.path.join(p, epsname)
            gifp = os.path.join(p, gifname)
            if os.path.exists(epsp):
                self.load_dingbat_eps(key, epsp)
                if self.dingbats[key] is not None:
                    # Stop here; otherwise a later directory could
                    # replace (or reset to None) the image just loaded.
                    break
            elif os.path.exists(gifp):
                try:
                    newepsp = epstools.convert_gif_to_eps(cog, gifp, epsp)
                except KeyboardInterrupt:
                    raise
                except:
                    # Conversion is best-effort: any failure just means
                    # there is no dingbat for this entity.
                    pass
                else:
                    try:
                        self.load_dingbat_eps(key, newepsp)
                    finally:
                        # The converter may have written to a temporary
                        # location; remove it even if loading blew up.
                        if newepsp != epsp:
                            os.unlink(newepsp)
                break
        return self.dingbats[key]

    def load_dingbat_eps(self, key, epsfile):
        """Load the EPS file and store the result in the dingbat cache.

        None is stored when the file cannot be loaded as EPS.
        """
        try:
            img = epstools.load_eps(epsfile)
        except epstools.EPSError:
            #  no bounding box
            img = None
        self.dingbats[key] = img

    def load_image(self, imageurl):
        """Load image and return EPS data and bounding box.

        The raw image data is fetched with the configured image loader,
        written to a temporary file, and handed to
        epstools.load_image_file() together with the greyscale setting.

        If the image cannot be fetched or written out, or the conversion
        from raster data to EPS fails, then EPSError is raised.
        """
        try:
            image = self._image_loader(imageurl)
        except KeyboardInterrupt:
            raise
        except:
            raise epstools.EPSError('Image could not be loaded.')
        if not image:
            raise epstools.EPSError('Image could not be loaded.')
        import tempfile
        # NOTE(review): mktemp() only returns a name and is open to a
        # race with other processes; the file is also never removed here.
        # Confirm whether load_image_file() still needs it afterwards
        # before adding cleanup.
        img_fn = tempfile.mktemp()
        try:
            # open() sits inside the handler too, so an unwritable temp
            # directory is reported as EPSError like any other failure.
            fp = open(img_fn, 'wb')
            try:
                fp.write(image)
            finally:
                # Always release the handle, even when the write fails.
                fp.close()
        except KeyboardInterrupt:
            raise
        except:
            raise epstools.EPSError('Failed to write image to external file.')
        return epstools.load_image_file(img_fn, self.settings.greyscale)


# These functions and classes are "filters" which can be used as anchor
# transforms with the PrintingHTMLParser class.


def disallow_data_scheme(href, attrs):
    """Cancel data: URLs; any other URL is returned unchanged."""
    scheme = urlparse.urlparse(href)[0]
    if scheme == 'data':
        return None
    return href


def disallow_anchor_footnotes(href, attrs):
    """Cancel all anchor footnotes.

    Both arguments are ignored; the result is always None.
    """
    return None


class disallow_self_reference:
    """Cancel all anchor footnotes which refer to the current document.

    Instances are callable as anchor transforms.  Two URLs refer to the
    same document when they are equal once their fragments are ignored.
    """
    def __init__(self, baseurl):
        self.__baseref = self.__strip_fragment(baseurl)

    def __strip_fragment(self, url):
        # Parsed URL with the fragment (last component) blanked out.
        return urlparse.urlparse(url)[:-1] + ('',)

    def __call__(self, href, attrs):
        if self.__strip_fragment(href) == self.__baseref:
            return None
        return href
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.