# Create Python codecs from Unicode mapping files : Programming « Utility




"""



PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2

--------------------------------------------



1. This LICENSE AGREEMENT is between the Python Software Foundation

("PSF"), and the Individual or Organization ("Licensee") accessing and

otherwise using this software ("Python") in source or binary form and

its associated documentation.



2. Subject to the terms and conditions of this License Agreement, PSF

hereby grants Licensee a nonexclusive, royalty-free, world-wide

license to reproduce, analyze, test, perform and/or display publicly,

prepare derivative works, distribute, and otherwise use Python

alone or in any derivative version, provided, however, that PSF's

License Agreement and PSF's notice of copyright, i.e., "Copyright (c)

2001, 2002, 2003, 2004 Python Software Foundation; All Rights Reserved"

are retained in Python alone or in any derivative version prepared

by Licensee.



3. In the event Licensee prepares a derivative work that is based on

or incorporates Python or any part thereof, and wants to make

the derivative work available to others as provided herein, then

Licensee hereby agrees to include in any such work a brief summary of

the changes made to Python.



4. PSF is making Python available to Licensee on an "AS IS"

basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR

IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND

DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS

FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT

INFRINGE ANY THIRD PARTY RIGHTS.



5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON

FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS

A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,

OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.



6. This License Agreement will automatically terminate upon a material

breach of its terms and conditions.



7. Nothing in this License Agreement shall be deemed to create any

relationship of agency, partnership, or joint venture between PSF and

Licensee.  This License Agreement does not grant permission to use PSF

trademarks or trade name in a trademark sense to endorse or promote

products or services of Licensee, or any third party.



8. By copying, installing or otherwise using Python, Licensee

agrees to be bound by the terms and conditions of this License

Agreement.

"""





""" Unicode Mapping Parser and Codec Generator.



This script parses Unicode mapping files as available from the Unicode

site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec

modules from them. The codecs use the standard character mapping codec

to actually apply the mapping.



Synopsis: gencodec.py dir codec_prefix



All files in dir are scanned and those producing non-empty mappings

will be written to <codec_prefix><mapname>.py with <mapname> being the

first part of the map's filename ('a' in a.b.c.txt) converted to

lowercase with hyphens replaced by underscores.



The tool also writes marshalled versions of the mapping tables to the

same location (with .mapping extension).



Written by Marc-Andre Lemburg (mal@lemburg.com).



(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

(c) Copyright Guido van Rossum, 2000.



"""#"



import re,os,time,marshal



# Create numeric tables or character based ones ?
# (true: the repr helpers emit hex integer literals; false: they emit
# character/string literals instead)
numeric = 1

# Matches one line of a mapping file:
#
#   group 1: one or more '+'-joined 0x... codes (the encoded side)
#   group 2: zero or more '+'-joined 0x... codes or <NAME> meta-codes
#            (the Unicode side; may be empty)
#   group 3: an optional trailing '#' comment, including the '#'
#
# Raw string literals are used so that \+ and \s reach the regex engine
# without relying on Python passing unknown string escapes through.
#
# NOTE(review): the digit class in group 2 is 'a-fA-Z', which is wider
# than hexadecimal ('a-fA-F' as in group 1) -- possibly a typo.  It is
# left unchanged here so that the set of matching lines stays the same.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')



def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of '+'-separated hexadecimal codes such as
        '0x41' or '0x41+0x0301'.

        An empty codes string is returned as None.

        If codes has several '+'-separated parts, every part that is
        not a valid hexadecimal number (meta-codes in angular brackets,
        e.g. <LR> and <RL>, or the empty part left by a trailing '+')
        is ignored.  If exactly one part remains it is returned as an
        integer, otherwise the remaining parts are returned as a tuple
        (which may be empty).

        If codes consists of a single part that is not a valid
        hexadecimal number, ValueError is raised.

        The len, filter and range parameters are kept only so that
        existing calls keep working; filter and range are not used.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        # A lone code is converted directly, so a malformed one raises
        # ValueError instead of being dropped.
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code or empty part: skip it.
            continue
    # values is a real list, so len() works whether or not the filter
    # builtin returns a list (it returns an iterator on Python 3).
    if len(values) == 1:
        return values[0]
    return tuple(values)



def readmap(filename):

    """ Reads the mapping file filename and returns a dictionary
        mapping encoded codes to (unicode, comment) tuples.

        Blank lines, lines starting with '#' and lines not matched by
        mapRE are skipped.  Codes are converted with parsecodes(); the
        comment is stored without its leading '#'.

        Single codes below 256 are tracked separately: if at least as
        many of them map to themselves as are left unmapped, the
        identity mappings are left out of the result, every unmapped
        code is mapped to (None, "") and the special key 'IDENTITY' is
        set to 256.  Otherwise the identity mappings are stored in the
        result like all other entries.

        Raises ValueError if a code cannot be parsed or if a single
        code below 256 is mapped more than once.

    """
    with open(filename,'r') as f:
        lines = f.readlines()
    enc2uni = {}
    # Identity-mapped codes below 256, kept aside together with their
    # comments until it is known whether the identity shortcut is used.
    identity = {}
    # A real list is needed because entries are removed one by one.
    unmapped = list(range(256))
    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print '* not matched: %s' % repr(line)
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            comment = comment[1:]
        # Multi-code keys come back as tuples; they are never part of
        # the 0..255 table and must not be compared with an integer.
        if not isinstance(enc, tuple) and enc < 256:
            # list.remove() raises ValueError for a code that was
            # already seen.
            unmapped.remove(enc)
            if enc == uni:
                identity[enc] = (uni,comment)
            else:
                enc2uni[enc] = (uni,comment)
        else:
            enc2uni[enc] = (uni,comment)
    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity)>=len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256
    else:
        # No identity dictionary will be generated, so the identity
        # mappings have to be listed explicitly or they would be lost.
        enc2uni.update(identity)

    return enc2uni



def hexrepr(t):

    """ Returns t formatted as Python source text using hex literals.

        None gives 'None'.  An object without a length is formatted
        as a single literal with at least four hex digits ('0x0041');
        an object with a length is formatted item by item as a
        parenthesized, comma separated list ('(0x0041, 0x0301)').

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: a single code.
        return '0x%04x' % t
    return '(' + ', '.join(['0x%04x' % code for code in t]) + ')'



def unicoderepr(t):

    """ Returns the source text used for the value side of a mapping.

        None gives 'None'.  If the module level flag numeric is true,
        the result is hexrepr(t).  Otherwise a single code is rendered
        as the repr of its character and a sequence of codes as the
        repr of the joined characters.

        NOTE: the character based branch uses unichr, which is a
        builtin on Python 2 only.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length: a single code.
        return repr(unichr(t))
    return repr(''.join(map(unichr, t)))



def keyrepr(t):

    """ Returns the source text used for the key side of a mapping.

        None gives 'None'.  If the module level flag numeric is true,
        the result is hexrepr(t).  Otherwise a single code below 256
        is rendered as the repr of chr(t), a larger single code as
        the repr of unichr(t), and a sequence of codes as the repr of
        the joined chr() characters.

        NOTE: unichr is a builtin on Python 2 only.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length: a single code.
        if t < 256:
            return repr(chr(t))
        return repr(unichr(t))
    return repr(''.join(map(chr, t)))



def codegen(name,map,comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.

        map is a dictionary whose keys are rendered with keyrepr() and
        whose values are either (unicode, comment) pairs or a bare
        value used without comment; the unicode part is rendered with
        unicoderepr().  If map has the special key 'IDENTITY', the
        generated decoding map starts out as
        codecs.make_identity_dict(range(<value>)) and the other
        entries are applied on top of it.

        Comments are included in the source, if comments is true (default).

        NOTE: the 'IDENTITY' key is deleted from map, i.e. the caller's
        dictionary is modified.  marshalmap() unpacks every value as a
        pair and therefore depends on the key being gone.

    """
    l = [
        '''\

""" Python Character Mapping Codec generated from '%s' with gencodec.py.



"""#"



import codecs



### Codec APIs



class Codec(codecs.Codec):



    def encode(self,input,errors='strict'):



        return codecs.charmap_encode(input,errors,encoding_map)



    def decode(self,input,errors='strict'):



        return codecs.charmap_decode(input,errors,decoding_map)



class StreamWriter(Codec,codecs.StreamWriter):

    pass



class StreamReader(Codec,codecs.StreamReader):

    pass



### encodings module API



def getregentry():



    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)



### Decoding Map

''' % name,
        ]

    # splits counts the statements opened so far beyond the initial
    # dictionary literal; it decides whether an open block is closed
    # with '}' (literal) or '})' (update call).
    if "IDENTITY" in map:
        l.append("decoding_map = codecs.make_identity_dict(range(%d))"
                 % map["IDENTITY"])
        l.append("decoding_map.update({")
        splits = 1
        del map["IDENTITY"]
    else:
        l.append("decoding_map = {")
        splits = 0

    # Sort by key only: single codes first, then tuple keys.  The
    # explicit sort key avoids comparing integers with tuples.
    mappings = sorted(map.items(),
                      key=lambda item: (isinstance(item[0], tuple), item[0]))
    append = l.append
    i = 0
    for e,value in mappings:
        try:
            (u,c) = value
        except TypeError:
            # Bare value without a comment
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
        else:
            append('\t%s: %s,' % (key,unicoderepr(u)))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')
    append('''

### Encoding Map



encoding_map = codecs.make_encoding_map(decoding_map)

''')
    return '\n'.join(l)



def pymap(name,map,pyfile,comments=1):

    """ Writes the source generated by codegen(name, map, comments)
        to the file pyfile, replacing any existing content.

        The source is generated before the file is opened, and the
        file is closed even if writing fails.

    """
    code = codegen(name,map,comments)
    with open(pyfile,'w') as f:
        f.write(code)



def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map must unpack into a (unicode, comment) pair;
        the pairs are stored as tuples.  name is accepted but not used.
        The file is closed even if marshalling fails.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)



def convertdir(dir,prefix='',comments=1):

    """ Converts every entry of the directory dir into a codec module
        and a marshalled mapping.

        The output base name is the entry's name with hyphens replaced
        by underscores, cut at the first dot and converted to
        lowercase.  The module is written to prefix + <name>.py and
        the marshalled mapping to prefix + <name>.mapping.  Entries
        producing an empty map are skipped; a ValueError raised during
        the conversion is reported and the next entry is processed.
        Progress is printed to stdout.

        comments is passed on to pymap().

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        # print is called with one parenthesized argument so that the
        # line works both as a statement and as a function call.
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(os.path.join(dir,mapname))
            if not mapping:
                print('* map is empty; skipping')
            else:
                # Order matters: codegen() (called by pymap) removes
                # the 'IDENTITY' key, which marshalmap() cannot unpack.
                pymap(mapname, mapping, prefix + codefile,comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
        except ValueError:
            print('* conversion failed')



def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates codec modules from the marshalled mappings in dir.

        Every entry of dir whose name ends in '.mapping' is loaded
        with marshal and written with pymap() to prefix + <name>.py,
        <name> being the entry name without the '.mapping' suffix.
        Empty maps are skipped; a ValueError raised during the
        conversion is reported and the next entry is processed.
        Progress is printed to stdout.

        comments is passed on to pymap().

        NOTE: the marshal module is not meant to be safe against
        erroneous or maliciously constructed data -- only use this on
        .mapping files from a trusted source.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # Close the input file as soon as the map has been loaded.
            with open(os.path.join(dir,mapname),'rb') as f:
                mapping = marshal.load(f)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)



if __name__ == '__main__':

    import sys

    # At least the directory argument is required.
    if len(sys.argv) < 2:
        sys.exit('Synopsis: gencodec.py dir codec_prefix')

    # The command line arguments are passed on positionally, i.e. as
    # (dir, prefix, comments).
    # Manual switch: change the condition to regenerate the modules
    # from existing .mapping files instead of converting mapping files.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
# Create Python codecs from Unicode mapping files : Programming « Utility « Python