xmlinfo.py : » Network » Grail-Internet-Browser » grail-0.6 » bookmarks » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Network » Grail Internet Browser
Grail Internet Browser » grail 0.6 » bookmarks » xmlinfo.py
#! /usr/bin/env python
#  -*- python -*-

"""Support for retrieving useful information about XML data, including the
public and system IDs and the document type name.

There are parts of this module which assume the native character encoding is
ASCII or a superset; this should be fixed.
"""

__version__ = "$Revision: 1.5 $"

import copy
import os
import re
import string
import struct
import sys

BOM_BE = "\xfe\xff"
BOM_LE = "\xff\xfe"

BIG_ENDIAN = "big-endian"
LITTLE_ENDIAN = "little-endian"

if struct.pack('h', 1) == struct.pack('>h', 1):
    NATIVE_ENDIANNESS = BIG_ENDIAN
else:
    NATIVE_ENDIANNESS = LITTLE_ENDIAN


class Error(Exception):
    """Base class for xmlinfo exceptions."""
    def __init__(self, *args, **kw):
        self.message = args[0]
        apply(Exception.__init__, (self,) + args, kw)

class ConversionError(Error):
    """Raised when an encoding conversion fails."""
    pass

class ParseError(Error):
    pass

class EncodingMismatchError(ParseError):
    """Raised when an extractor thinks it's reading from a stream of the
    wrong encoding.  The exception parameter is the name of a suggested
    encoding to try, or None.
    """
    def __init__(self, encoding=None):
        self.encoding = encoding
        ParseError.__init__(self, encoding)


class Record:
    public_id = None
    system_id = None
    doc_elem = None
    standalone = None
    xml_version = None
    encoding = None
    byte_order = None

    def __init__(self, **kw):
        self.__dict__.update(kw)


FieldLabels = Record(
    system_id="System ID",
    public_id="Public ID",
    doc_elem="Document Element",
    standalone="Standalone",
    xml_version="XML Version",
    encoding="Encoding",
    byte_order="Byte Order",
    )


FieldNames = dir(Record)
for _name in FieldNames[:]:
    if _name[:2] == "__":
        FieldNames.remove(_name)


def get_xml_info(buffer):
    values = Record()
    byte_order, encoding, bom = guess_byte_order_and_encoding(buffer)
    values.byte_order = byte_order
    values.encoding = encoding
    return extract(values.encoding, buffer[len(bom):], values)


def get_byte_order_mark(buffer):
    bom = buffer[:2]
    if bom in (BOM_BE, BOM_LE):
        return bom
    else:
        return ''


def guess_byte_order_and_encoding(buffer):
    """Guess the byte-order and encoding."""
    byte_order = None
    encoding = "utf-8"
    #
    bom = get_byte_order_mark(buffer)
    buffer = buffer[len(bom):]
    if bom == BOM_BE:
        return BIG_ENDIAN, "utf-16", BOM_BE
    elif bom == BOM_LE:
        return LITTLE_ENDIAN, "utf-16", BOM_LE
    elif bom == '':
        pass
    else:
        raise RuntimeError, \
              "unexpected internal condition: bad byte-order mark"
    #
    # no byte-order mark
    #
    prefix = buffer[:4]
    if prefix == "\0\0\0\x3c":
        byte_order = BIG_ENDIAN
        encoding = "ucs-4"
    elif prefix == "\x3c\0\0\0":
        byte_order = LITTLE_ENDIAN
        encoding = "ucs-4"
    elif prefix == "\0\x3c\0\x3f":
        byte_order = BIG_ENDIAN
        encoding = "utf-16"
    elif prefix == "\x3c\0\x3f\0":
        byte_order = LITTLE_ENDIAN
        encoding = "utf-16"
    elif prefix == "\x3c\x3f\x78\x6d":
        # good enough to parse the encoding declaration
        encoding = "utf-8"
    elif prefix == "\x4c\x6f\xa7\x94":
        encoding = "ebcdic"
    #
    return byte_order, encoding, ""


def extract(encoding, buffer, values, best_effort=0):
    tried = {}
    while not tried.has_key(encoding):
        tried[encoding] = 1
        v2 = copy.copy(values)
        extractor = new_extractor(encoding, buffer, v2)
        try:
            v2 = extractor.extract()
        except EncodingMismatchError, e:
            encoding = e.encoding
        except:
            if best_effort:
                # in case there's anything there
                return v2
            raise
        else:
            return v2
    raise ParseError("could not determine encoding")


_extractor_map = {}
def new_extractor(encoding, buffer, values):
    encoding = string.lower(encoding)
    klass = _extractor_map.get(encoding, Extractor)
    return klass(buffer, values)

def add_extractor_class(klass):
    for enc in klass.Encodings:
        _extractor_map[enc] = klass


class Extractor:
    VERSION_CHARS = string.letters + string.digits + "_.:-"

    Encodings = ()

    def __init__(self, buffer, values):
        self.buffer = buffer
        self.values = values

    def extract(self):
        self.parse_declaration()
        if self.values.encoding not in self.Encodings:
            raise EncodingMismatchError(self.values.encoding)
        self.skip_to_doctype()
        self.parse_doctype()
        return self.values

    def parse_declaration(self):
        try:
            self.require_ascii("<?xml", "XML declation")
        except ParseError:
            # OK to drop this for UTF-8
            return
        self.parse_VersionInfo()
        attrname, encoding = self.get_opt_pseudo_attr()
        if attrname == "encoding":
            self.values.encoding = string.lower(encoding)
            attrname, standalone = self.get_opt_pseudo_attr()
            if attrname == "standalone":
                if standalone not in ("yes", "no"):
                    raise ParseError(
                        "illegal standalone value in XML declaration: "
                         + value)
                self.values.standalone = standalone
                attrname = None
        if attrname is not None:
            raise ParseError(
                "unknown or out-of-order XML declaration attribute: "
                + attrname)
        self.skip_whitespace()
        self.require_ascii("?>", "XML declaration")

    def parse_VersionInfo(self):
        attr, verno = self.get_pseudo_attr()
        if attr != 'version':
            raise ParseError(
               "first pseudo-attribute in XML declaration must be version")
        if not verno:
            raise ParseError("version number cannot be empty")
        version_chars = self.VERSION_CHARS
        for c in verno:
            if not c in version_chars:
                raise ParseError(
                    "illegal character in XML version declaration: " + `c`)
        self.values.xml_version = verno

    def get_pseudo_attr(self):
        """Return attr/value pair using the XML declaration's idea of a
        pseudo-attribute."""
        attrname = ''
        value = ''
        self.require_whitespace("pseudo-attribute")
        while 1:
            c = self.get_ascii(1)
            if c in string.letters:
                attrname = attrname + c
                self.discard_chars(1)
            else:
                break
        if not attrname:
            raise ParseError("could not extract pseudo-attribute name")
        self.skip_whitespace()
        self.require_ascii("=", "pseudo-attribute")
        self.skip_whitespace()
        open_quote = self.get_ascii(1)
        if open_quote not in ('"', "'"):
            raise ParseError("pseudo-attribute values must be quoted")
        self.discard_chars(1)
        while 1:
            c = self.get_ascii(1)
            if not c:
                raise ParseError("could not complete pseudo-attribute value")
            self.discard_chars(1)
            if c == open_quote:
                break
            value = value + c
        return attrname, value

    def get_opt_pseudo_attr(self):
        buffer = self.buffer
        try:
            return self.get_pseudo_attr()
        except ParseError:
            self.buffer = buffer
            return None, None

    def parse_doctype(self):
        self.require_ascii("<!DOCTYPE", "doctype declaration")
        self.require_whitespace("doctype declaration")
        self.values.doc_elem = self.parse_Name("doctype declaration")
        wscount = self.skip_whitespace()
        c = self.get_ascii(1)
        if c in "]>":
            return
        self.parse_ExternalID()


    def _make_set_predicate(L):
        d = {}
        for o in L:
            d[o] = o
        return d.has_key

    BASE_CHARS = tuple(
        range(0x41, 0x5A+1) + range(0x61, 0x7A+1)
        + range(0xC0, 0xD6+1) + range(0xD8, 0xF6+1)
        + range(0xF8, 0xFF+1) + range(0x100, 0x131+1)
        + range(0x134, 0x13E+1) + range(0x141, 0x148+1)
        + range(0x14A, 0x17E+1) + range(0x180, 0x1C3+1)
        + range(0x1CD, 0x1F0+1) + range(0x1F4, 0x1F5+1)
        + range(0x1FA, 0x217+1) + range(0x250, 0x2A8+1)
        + range(0x2BB, 0x2C1+1) + [0x386]
        + range(0x388, 0x38A+1) + [0x38C]
        + range(0x38E, 0x3A1+1) + range(0x3A3, 0x3CE+1)
        + range(0x3D0, 0x3D6+1) + [0x3DA, 0x3DC, 0x3DE, 0x3E0]
        + range(0x3E2, 0x3F3+1) + range(0x401, 0x40C+1)
        + range(0x40E, 0x44F+1) + range(0x451, 0x45C+1)
        + range(0x45E, 0x481+1) + range(0x490, 0x4C4+1)
        + range(0x4C7, 0x4C8+1) + range(0x4CB, 0x4CC+1)
        + range(0x4D0, 0x4EB+1) + range(0x4EE, 0x4F5+1)
        + range(0x4F8, 0x4F9+1) + range(0x531, 0x556+1) + [0x559]
        + range(0x561, 0x586+1) + range(0x5D0, 0x5EA+1)
        + range(0x5F0, 0x5F2+1) + range(0x621, 0x63A+1)
        + range(0x641, 0x64A+1) + range(0x905, 0x939+1) + [0x93D]
        + range(0x9AA, 0x9B0+1) + [0x9B2]
        + range(0xA05, 0xA0A+1) + range(0xA35, 0xA36+1)
        + range(0xA8F, 0xA91+1) + [0xAE0]
        + range(0xB05, 0xB0C+1) + range(0xB36, 0xB39+1) + [0xB3D]
        + range(0xB5F, 0xB61+1) + range(0xB85, 0xB8A+1)
        + range(0xB8E, 0xB90+1) + range(0xB92, 0xB95+1)
        + range(0xB99, 0xB9A+1) + [0xB9C]
        + range(0xB9E, 0xB9F+1) + range(0xBA3, 0xBA4+1)
        + range(0xBA8, 0xBAA+1) + range(0xBAE, 0xBB5+1)
        + range(0xBB7, 0xBB9+1) + range(0xC05, 0xC0C+1)
        + range(0xC0E, 0xC10+1) + range(0x10A0, 0x10C5+1)
        + range(0x10D0, 0x10F6+1) + [0x1100]
        + range(0x1102, 0x1103+1) + range(0x1105, 0x1107+1) + [0x1109]
        + range(0x110B, 0x110C+1) + range(0x110E, 0x1112+1)
        + [0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150]
        + range(0x1154, 0x1155+1) + [0x1159]
        + range(0x115F, 0x1161+1) + [0x1163, 0x1165, 0x1167, 0x1169]
        + range(0x116D, 0x116E+1) + range(0x1172, 0x1173+1)
        + [0x1175, 0x119E, 0x11A8, 0x11AB]
        + range(0x1F5F, 0x1F7D+1) + range(0x1F80, 0x1FB4+1)
        + range(0x1FB6, 0x1FBC+1) + [0x1FBE]
        + range(0x1FC2, 0x1FC4+1) + range(0x1FC6, 0x1FCC+1)
        + range(0x1FD0, 0x1FD3+1))

    COMBINING_CHARS = tuple(
        range(0x300, 0x345+1) + range(0x360, 0x361+1)
        + range(0x483, 0x486+1) + range(0x591, 0x5A1+1)
        + range(0x5A3, 0x5B9+1) + range(0x5BB, 0x5BD+1) + [0x5BF]
        + range(0x5C1, 0x5C2+1) + [0x5C4]
        + range(0x64B, 0x652+1) + [0x670]
        + range(0x6D6, 0x6DC+1) + range(0x6DD, 0x6DF+1)
        + range(0x6E0, 0x6E4+1) + range(0x6E7, 0x6E8+1)
        + range(0x6EA, 0x6ED+1) + range(0x901, 0x903+1) + [0x93C]
        + range(0x93E, 0x94C+1) + [0x94D]
        + range(0x951, 0x954+1) + range(0x962, 0x963+1)
        + range(0x981, 0x983+1) + [0x9BC, 0x9BE, 0x9BF]
        + range(0x9C0, 0x9C4+1) + range(0x9C7, 0x9C8+1)
        + range(0x9CB, 0x9CD+1) + [0x9D7]
        + range(0x9E2, 0x9E3+1) + [0xA02, 0xA3C, 0xA3E, 0xA3F]
        + range(0xA40, 0xA42+1) + range(0xA47, 0xA48+1)
        + range(0xA4B, 0xA4D+1) + range(0xA70, 0xA71+1)
        + range(0xA81, 0xA83+1) + [0xABC]
        + range(0xABE, 0xAC5+1) + range(0xAC7, 0xAC9+1)
        + range(0xACB, 0xACD+1) + range(0xB01, 0xB03+1) + [0xB3C]
        + range(0xB3E, 0xB43+1) + range(0xB47, 0xB48+1)
        + range(0xB4B, 0xB4D+1) + range(0xB56, 0xB57+1)
        + range(0xB82, 0xB83+1) + range(0xBBE, 0xBC2+1)
        + range(0xBC6, 0xBC8+1) + range(0xBCA, 0xBCD+1) + [0xBD7]
        + range(0xC01, 0xC03+1) + range(0xC3E, 0xC44+1)
        + range(0xC46, 0xC48+1) + range(0xC4A, 0xC4D+1)
        + range(0xC55, 0xC56+1) + range(0xC82, 0xC83+1)
        + range(0xCBE, 0xCC4+1) + range(0xCC6, 0xCC8+1)
        + range(0xCCA, 0xCCD+1) + range(0xCD5, 0xCD6+1)
        + range(0xD02, 0xD03+1) + range(0xD3E, 0xD43+1)
        + range(0xD46, 0xD48+1) + range(0xD4A, 0xD4D+1) + [0xD57, 0xE31]
        + range(0xE34, 0xE3A+1) + range(0xE47, 0xE4E+1) + [0xEB1]
        + range(0xEB4, 0xEB9+1) + range(0xEBB, 0xEBC+1)
        + range(0xEC8, 0xECD+1) + range(0xF18, 0xF19+1)
        + [0xF35, 0xF37, 0xF39, 0xF3E, 0xF3F]
        + range(0xF71, 0xF84+1) + range(0xF86, 0xF8B+1)
        + range(0xF90, 0xF95+1) + [0xF97]
        + range(0xF99, 0xFAD+1) + range(0xFB1, 0xFB7+1) + [0xFB9]
        + range(0x20D0, 0x20DC+1) + [0x20E1]
        + range(0x302A, 0x302F+1) + [0x3099, 0x309A])

    DIGIT_CHARS = tuple(
        range(0x30, 0x3A+1) + range(0x660, 0x669+1)
        + range(0x6F0, 0x6F9+1) + range(0x966, 0x96F+1)
        + range(0x9E6, 0x9EF+1) + range(0xA66, 0xA6F+1)
        + range(0xAE6, 0xAEF+1) + range(0xB66, 0xB6F+1)
        + range(0xBE7, 0xBEF+1) + range(0xC66, 0xC6F+1)
        + range(0xCE6, 0xCEF+1) + range(0xD66, 0xD6F+1)
        + range(0xE50, 0xE59+1) + range(0xED0, 0xED9+1)
        + range(0xF20, 0xF29+1))
    is_digit_char = _make_set_predicate(DIGIT_CHARS)

    EXTENDING_CHARS = tuple(
        [0xB7, 0x2D0, 0x2D1, 0x387, 0x640, 0xE46, 0xEC6, 0x3005]
        + range(0x3031, 0x3035+1) + range(0x309D, 0x309E+1)
        + range(0x30FC, 0x30FE+1))
    is_extending_char = _make_set_predicate(EXTENDING_CHARS)

    IDEOGRAPHIC_CHARS = tuple(
        range(0x4E00, 0x9FA5+1) + range(0x3021, 0x3029+1))
    is_ideographic_char = _make_set_predicate(IDEOGRAPHIC_CHARS)

    LETTER_CHARS = BASE_CHARS + IDEOGRAPHIC_CHARS
    is_letter_char = _make_set_predicate(LETTER_CHARS)

    NAME_CHARS = LETTER_CHARS + DIGIT_CHARS + (46, 45, 95, 58) \
                 + COMBINING_CHARS + EXTENDING_CHARS
    is_name_char = _make_set_predicate(NAME_CHARS)

    del _make_set_predicate

    def parse_Name(self, where):
        s, u = self.get_char_and_unicode()
        if not self.is_name_char(u):
            raise ParseError("illegal character in name: %s (%d)" % (`s`, u))
        i = 1
        while 1:
            c, u = self.get_char_and_unicode(i)
            if u not in self.NAME_CHARS:
                break
            i = i + 1
            s = s + c
        self.discard_chars(i)
        return s

    def parse_ExternalID(self):
        str = self.get_ascii(6)
        if str == "PUBLIC":
            # public system id w/ optional system id
            self.discard_chars(len(str))
            self.require_whitespace("ExternalID")
            id = self.get_quoted_string()
            if not id:
                raise ParseError("could not parse doctype declaration:"
                                 " bad public id")
            self.values.public_id = id
            self.require_whitespace("ExternalID")
            self.values.system_id = self.get_quoted_string()
        elif str == "SYSTEM":
            #  system id
            self.discard_chars(len(str))
            self.require_whitespace("ExternalID")
            id = self.get_quoted_string()
            if not id:
                raise ParseError("could not parse doctype declaration:"
                                 " bad system id")
            self.values.system_id = id
        else:
            raise ParseError("illegal external ID")

    def get_quoted_string(self):
        c, u = self.get_char_and_unicode()
        if u not in (34, 39):
            raise ParseError("illegal quoted string")
        self.discard_chars(1)
        quote_mark = u
        s = ''
        while 1:
            c, u = self.get_char_and_unicode()
            if not c:
                raise ParseError("could not find end of quoted string")
            self.discard_chars(1)
            if u == quote_mark:
                break
            s = s + c
        return s

    def skip_comment(self):
        self.require_ascii("<!--", "comment")
        self.skip_past_ascii("-->", "comment")

    def skip_pi(self):
        self.require_ascii("<?", "processing instruction")
        self.skip_past_ascii("?>", "processing instruction")

    def skip_to_doctype(self):
        # This should probably be implemented by any extractor for which we
        # care about performance.
        while 1:
            self.skip_whitespace()
            try:
                c = self.get_ascii(1)
            except ConversionError:
                self.discard_chars(1)
            else:
                if not c:
                    break
                if c == "<":
                    # might be something interesting
                    try:
                        prefix = self.get_ascii(4)
                    except ConversionError:
                        # If this fails, assume there's something non-white in
                        # there; allow the exception to be raised since there's
                        # probably illegal data before the document element.
                        prefix = self.get_ascii(2)
                    if prefix == "<!--":
                        self.skip_comment()
                    elif prefix[:2] == "<?":
                        self.skip_pi()
                    else:
                        break
                else:
                    # way bad!
                    raise ParseError("could not locate doctype declaration"
                                     " or start of document element")

    def skip_whitespace(self):
        """Trim leading whitespace, returning the number of characters
        stripped.

        The default implementation is slow; subclasses should override it.
        """
        count = 0
        try:
            while 1:
                c, u = self.get_char_and_unicode(count)
                if not c:
                    break
                if u not in (0x9, 0xA, 0xD, 0x20):
                    break
                count = count + 1
        except ConversionError:
            pass
        if count:
            self.discard_chars(count)
        return count

    def require_whitespace(self, where):
        """Trim leading whitespace, returning the number of characters
        stripped or raising ParseError is no whitespace was present."""
        numchars = self.skip_whitespace()
        if not numchars:
            raise ParseError("required whitespace in " + where)

    def get_ascii(self, count):
        raise NotImplementedError

    def get_char_and_unicode(self, index=0):
        raise NotImplementedError

    def require_ascii(self, str, where):
        width = len(str)
        data = self.get_ascii(width)
        if data != str:
            raise ParseError("required text '%s' missing in %s" % (str, where))
        self.discard_chars(width)

    def skip_past_ascii(self, str, what):
        width = len(str)
        initchar = str[0]
        subs = range(1, width)
        while 1:
            try:
                data = self.get_ascii(width)
            except ConversionError:
                self.discard_chars(1)
            else:
                if len(data) < width:
                    raise ParseError("could not locate end of " + what)
                if data == str:
                    self.discard_chars(width)
                    return
                for i in subs:
                    if data[i] == initchar:
                        self.discard_chars(i)
                else:
                    self.discard_chars(width)

    def discard_chars(self, count):
        raise NotImplementedError


class ISO8859Extractor(Extractor):
    __declattr_rx = re.compile(
        "([a-z]*)=\"((?:[^?\"]|\?[^?>\"]|\?(?=\?))*)\"", re.MULTILINE)

    __gi_rx = re.compile("[a-zA-Z_:][-a-zA-Z_:0-9.]*")
    __id_rx = re.compile(r"""(?:'[^']*'|\"[^\"]*\")""",
                         re.MULTILINE | re.VERBOSE)

    def yank_id(self):
        self.require_whitespace("doctype declaration: ExternalID")
        m = self.__id_rx.match(self.buffer)
        if not m:
            return None
        self.buffer = self.buffer[m.end():]
        return string.lstrip(m.group())[1:-1]

    def parse_doctype(self):
        self.require_ascii("<!DOCTYPE", "doctype declaration")
        self.require_whitespace("doctype declaration")
        m = self.__gi_rx.match(self.buffer)
        if not m:
            raise ParseError("could not parse doctype declaration: no name")
        self.values.doc_elem = m.group()
        self.discard_chars(len(self.values.doc_elem))
        whitechars = self.skip_whitespace()
        if not self.buffer:
            raise ParseError("could not parse doctype declaration:"
                             " insufficient data")
        if self.get_ascii(1) in ">[":
            # reached internal subset or end of declaration; we're done
            return
        if not whitechars:
            raise ParseError("whitespace required between document type and"
                             " document type declaration")
        self.parse_ExternalID()
    
    def skip_to_doctype(self):
        while self.buffer:
            self.buffer = string.lstrip(self.buffer)
            if self.buffer[:4] == "<!--":
                self.skip_comment()
            elif self.buffer[:2] == "<?":
                self.skip_pi()
            else:
                break

    def skip_pi(self):
        pos = string.find(self.buffer, "?>", 2)
        if pos < 0:
            raise ParseError("could not scan over processing instruction")
        self.buffer = self.buffer[pos + 2:]

    def skip_comment(self):
        pos = string.find(self.buffer, "-->", 4)
        if pos < 0:
            raise ParseError("could not scan over comment")
        self.buffer = self.buffer[pos + 4:]

    def skip_whitespace(self):
        old_buffer = self.buffer
        self.buffer = string.lstrip(old_buffer)
        return len(old_buffer) - len(self.buffer)

    def get_ascii(self, count):
        # not quite right, but good enough for now
        return self.buffer[:count]

    def get_char_and_unicode(self, index=0):
        # really only good for iso-8859-1
        c = self.buffer[index:index + 1]
        if c:
            return c, ord(c)
        else:
            return c, None

    def discard_chars(self, count):
        self.buffer = self.buffer[count:]

    def lower(self, str):
        return string.lower(str)


class ISO8859_1_Extractor(ISO8859Extractor):
    Encodings = ("iso-8859-1", "iso-latin-1", "latin-1")

    def get_ascii(self, count):
        return self.buffer[:count]

    def get_char_and_unicode(self, index=0):
        c = self.buffer[index:index + 1]
        if c:
            return c, ord(c)
        else:
            return c, None

add_extractor_class(ISO8859_1_Extractor)


for c in "23456789":
    class _Extractor(ISO8859Extractor):
        Encodings = ("iso-8859-" + c,)
    try:
        _Extractor.__name__ = "ISO8859_%s_Extractor" % c
    except TypeError:
        # older Python versions wouldn't allow __name__ to be set on a class
        pass
    exec "ISO8859_%s_Extractor = _Extractor" % c
    add_extractor_class(_Extractor)
del _Extractor


class UTF8Extractor(ISO8859Extractor):
    Encodings = ("utf-8",)

    def get_char_and_unicode(self, index=0):
        raise NotImplementedError

add_extractor_class(UTF8Extractor)


class EBCDICExtractor(Extractor):
    Encodings = ("ebcdic",)

    # This table was taken from the source code of GNU recode 3.4.
    __ASCII_TO_EBCDIC = [
          0,   1,   2,   3,  55,  45,  46,  47,   #   0 -   7
         22,   5,  37,  11,  12,  13,  14,  15,   #   8 -  15
         16,  17,  18,  19,  60,  61,  50,  38,   #  16 -  23
         24,  25,  63,  39,  28,  29,  30,  31,   #  24 -  31
         64,  79, 127, 123,  91, 108,  80, 125,   #  32 -  39
         77,  93,  92,  78, 107,  96,  75,  97,   #  40 -  47
        240, 241, 242, 243, 244, 245, 246, 247,   #  48 -  55
        248, 249, 122,  94,  76, 126, 110, 111,   #  56 -  63
        124, 193, 194, 195, 196, 197, 198, 199,   #  64 -  71
        200, 201, 209, 210, 211, 212, 213, 214,   #  72 -  79
        215, 216, 217, 226, 227, 228, 229, 230,   #  80 -  87
        231, 232, 233,  74, 224,  90,  95, 109,   #  88 -  95
        121, 129, 130, 131, 132, 133, 134, 135,   #  96 - 103
        136, 137, 145, 146, 147, 148, 149, 150,   # 104 - 111
        151, 152, 153, 162, 163, 164, 165, 166,   # 112 - 119
        167, 168, 169, 192, 106, 208, 161,   7,   # 120 - 127
         32,  33,  34,  35,  36,  21,   6,  23,   # 128 - 135
         40,  41,  42,  43,  44,   9,  10,  27,   # 136 - 143
         48,  49,  26,  51,  52,  53,  54,   8,   # 144 - 151
         56,  57,  58,  59,   4,  20,  62, 225,   # 152 - 159
         65,  66,  67,  68,  69,  70,  71,  72,   # 160 - 167
         73,  81,  82,  83,  84,  85,  86,  87,   # 168 - 175
         88,  89,  98,  99, 100, 101, 102, 103,   # 176 - 183
        104, 105, 112, 113, 114, 115, 116, 117,   # 184 - 191
        118, 119, 120, 128, 138, 139, 140, 141,   # 192 - 199
        142, 143, 144, 154, 155, 156, 157, 158,   # 200 - 207
        159, 160, 170, 171, 172, 173, 174, 175,   # 208 - 215
        176, 177, 178, 179, 180, 181, 182, 183,   # 216 - 223
        184, 185, 186, 187, 188, 189, 190, 191,   # 224 - 231
        202, 203, 204, 205, 206, 207, 218, 219,   # 232 - 239
        220, 221, 222, 223, 234, 235, 236, 237,   # 240 - 247
        238, 239, 250, 251, 252, 253, 254, 255,   # 248 - 255
        ]

    _m = [None] * 256
    for _i in range(len(__ASCII_TO_EBCDIC)):
        _e = __ASCII_TO_EBCDIC[_i]
        __ASCII_TO_EBCDIC[_i] = chr(_e)
        _m[_e] = chr(_i)
    for i in range(len(_m)):
        if _m[_i] is None:
            print "No EBCDIC character for ASCII", `chr(i)`

    __EBCDIC_TO_ASCII = tuple(_m)

    __translation = string.maketrans(string.join(__ASCII_TO_EBCDIC, ''),
                                     string.join(__EBCDIC_TO_ASCII, ''))

    def get_ascii(self, count):
        buffer = self.buffer[:count]
        return string.translate(buffer, self.__translation)

add_extractor_class(EBCDICExtractor)


def ascii_to_ucs2be(s):
    L = map(None, s)
    L.insert(0, '')
    return string.join(L, '\0')


def ascii_to_ucs2le(s):
    L = map(None, s)
    L.append('')
    return string.join(L, '\0')


def ascii_to_ucs4be(s):
    L = map(None, s)
    L.insert(0, '')
    return string.join(L, '\0\0\0')


def ascii_to_ucs4le(s):
    L = map(None, s)
    L.append('')
    return string.join(L, '\0\0\0')


class UCS2Extractor(Extractor):
    Encodings = ("ucs-2", "utf-16", "iso-10646-ucs-2")

    __WHITESPACE_BE = map(ascii_to_ucs2be, string.whitespace)
    __WHITESPACE_LE = map(ascii_to_ucs2le, string.whitespace)

    def __init__(self, buffer, values):
        Extractor.__init__(self, buffer, values)
        if values.byte_order not in (BIG_ENDIAN, LITTLE_ENDIAN):
            raise ValueError, \
                  "UCS-2 encoded strings must have determinable byte order"
        self.__byte_order = values.byte_order
        if values.byte_order == BIG_ENDIAN:
            self.__whitespace = self.__WHITESPACE_BE
            self.__from_ascii = ascii_to_ucs2be
        else:
            self.__whitespace = self.__WHITESPACE_LE
            self.__from_ascii = ascii_to_ucs2le

    def skip_whitespace(self):
        buffer = self.buffer
        pos = 0
        whitespace = self.__whitespace
        while buffer[pos:pos+2] in whitespace:
            pos = pos + 2
        self.buffer = buffer[pos:]
        return pos / 2

    def get_ascii(self, count):
        data = self.buffer[:count*2]
        if self.__byte_order == BIG_ENDIAN:
            zero_offset = 0
            char_offset = 1
        else:
            zero_offset = 1
            char_offset = 0
        s = ''
        try:
            for i in range(0, count*2, 2):
                if data[i+zero_offset] != '\0':
                    raise ConversionError("cannot convert %s to ASCII"
                                          % `data[i:i+2]`)
                s = s + data[i+char_offset]
        except IndexError:
            # just didn't have enough; somebody else's problem
            pass
        return s

    def get_char_and_unicode(self, index=0):
        if len(self.buffer) >= 2:
            offset = index * 2
            c = self.buffer[offset:offset + 2]
            return c, ordwc(c, self.__byte_order)
        else:
            return None, None

    def discard_chars(self, count):
        self.buffer = self.buffer[count*2:]

add_extractor_class(UCS2Extractor)


def ordwc(wc, byte_order=None):
    """Return the ord() for a wide character."""
    if byte_order is None:
        byte_order = NATIVE_ENDIANNESS
    width = len(wc)
    if width == 2:
        o1, o2 = map(ord, wc)
        if byte_order == BIG_ENDIAN:
            o = (o1 << 8) | o2
        else:
            o = (o2 << 8) | o1
    elif width == 4:
        o1, o2, o3, o4 = map(ord, wc)
        if byte_order == BIG_ENDIAN:
            o = (((((o1 << 8) | o2) << 8) | o3) << 8) | o4
        else:
            o = (((((o4 << 8) | o3) << 8) | o2) << 8) | o1
    else:
        raise ValueError, "wide-character string has bad length"
    return o


def ordwstr(wstr, byte_order=None, charsize=2):
    assert charsize in (2, 4), "wide character size must be 2 or 4"
    ords = []
    for i in range(0, len(wstr), charsize):
        ords.append(ordwc(wstr[i:i+charsize], byte_order))
    return ords


def dump_info(values, labels=None):
    if labels is None:
        labels = FieldLabels
    format = "%%%ds: %%s" % max(map(len, FieldLabels.__dict__.values()))
    for field_name in FieldNames:
        value = getattr(values, field_name)
        label = getattr(FieldLabels, field_name)
        if value is not None:
            print format % (label, value)


def main():
    import getopt
    #
    reqs = Record()                     # required values (for output)
    #
    get_defaults = 1
    full_report = 0
    debugging = 0
    program = os.path.basename(sys.argv[0])
    opts, args = getopt.getopt(sys.argv[1:], "ad",
                               ["all", "docelem", "encoding", "public-id",
                                "standalone", "system-id", "version"])
    if opts:
        get_defaults = 0
    for opt, arg in opts:
        if opt in ("-a", "--all"):
            full_report = 1
        elif opt == "-d":
            debugging = debugging + 1
        elif opt == "--docelem":
            reqs.doc_elem = 1
        elif opt == "--encoding":
            reqs.encoding = 1
        elif opt == "--public-id":
            reqs.publib_id = 1
        elif opt == "--standalone":
            reqs.standalone = 1
        elif opt == "--system-id":
            reqs.system_id = 1
        elif opt == "--version":
            reqs.xml_version = 1
    if get_defaults:
        full_report = 1
    #
    if len(args) > 1:
        sys.stderr.write(program + ": too many input sources specified")
        sys.exit(2)
    if args:
        if os.path.exists(args[0]):
            fp = open(args[0])
        else:
            import urllib
            fp = urllib.urlopen(args[0])
    else:
        fp = sys.stdin
    #
    buffer = fp.read(10240)
    fp.close()
    try:
        values = get_xml_info(buffer)
    except Error, e:
        sys.stderr.write("parse failed: %s\n" % e.args[0])
        if debugging:
            raise
        sys.exit(1)
    #
    # Make the report:
    #
    if full_report:
        dump_info(values)
    else:
        for field_name in FieldNames:
            if getattr(reqs, field_name):
                value = getattr(values, field_name)
                if value is None:
                    print
                else:
                    print value


if __name__ == "__main__":
    main()
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.