DTLexer.py :  » Database » PyDO » skunkweb-3.4.4 » pylibs » DT » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Database » PyDO 
PyDO » skunkweb 3.4.4 » pylibs » DT » DTLexer.py
#  
#  Copyright (C) 2001 Andrew T. Csillag <drew_csillag@geocities.com>
#  
#      You may distribute under the terms of either the GNU General
#      Public License or the SkunkWeb License, as specified in the
#      README file.
#   
import string
import sys
import re
import ErrorHandler
import SkunkExcept
from DTExcept import DTLexicalError
import StringIO

#regex for a quoted string (no backslash escaping)
quotedStr="('(.*?)' | \"(.*?)\" | `(.*?)`)"

#plain thing
plain='[^\s="\'`]+'

plainre=re.compile(plain)

#regex for parsing contents of a tag
tagContents='''(
     (?P<plain> %(plain)s ) (?=($|\s)) | 
     (?P<plaineq> %(plain)s ) \s*
               = \s* (?P<plaineqval> %(plain)s ) (?=($|\s)) |
     (?P<plaineqq> %(plain)s ) \s*
               = \s* (?P<plaineqqval> %(quotedStr)s ) (?=($|\s)) |
     (?P<quotonly> %(quotedStr)s (?=($|\s))) |
     (?P<whitespace> \s)
         )''' % {'plain': plain,
                 'quotedStr': quotedStr
                 }
tagContentsRe=re.compile(tagContents, re.VERBOSE | re.DOTALL)
PLAIN='plain'
PLAINEQ='plaineq'
PLAINEQVAL='plaineqval'
PLAINEQQ='plaineqq'
PLAINEQQVAL='plaineqqval'
QUOTONLY='quotonly'
WHITESPACE='whitespace'

wsre = re.compile(r'\s+')
def dequote(s):
    if s[0]==s[-1]=='"':
        return s[1:-1]
    elif s[0]==s[-1]=="'":
        return s[1:-1]
    return s

class DTToken:
    def __init__(self, text, name, (start, end, tagname, tup, dict)):
        self.text=text
        self._name=name
        self.start=start
        self.end=end
        self.tagname=tagname
        self.tupargs=tuple(tup)
        self.dictargs=dict

        if self.tagname[0]=='/':
            self.isclose=1
            self.tagname=self.tagname[1:]
        else:
            self.isclose=0

    def filename(self):
        return self._name

    def filelineno(self):
        return "%s:%s" % (self._name, self.getLineno())

    def lineno(self):
        return int(self.getLineno())
    
    def getLineno(self):
        return getLineno(self.text, self.start)

    def tagText(self):
        return self.text[self.start:self.end]

    def __repr__(self):
        tupvals=' '.join(map(repr, self.tupargs))
        if tupvals:
            tupvals=' '+tupvals
        dl=[]
        for k,v in self.dictargs.items():
            dl.append('%s=%s' % (k, repr(v)))
        dictvals=' '.join(dl)
        if dictvals:
            dictvals=' '+dictvals
        close=self.isclose and '/' or ''
        return '<:%s%s%s%s:>' % (close,
                                 self.tagname,
                                 tupvals,
                                 dictvals)

(INIT, GOTLT, PLAINTAGTEXT, SQUOT, BQUOT, DQUOT, GOTCCOL, BACKSLASH,
 BEGINCOMMENTCHECK, INCOMMENT, INCOMMENTSTAR, INCOMMENTSTARCOL) = range(12)
_statenumtostr = {
    INIT:'INIT',
    GOTLT:'GOTLT',
    PLAINTAGTEXT:'PLAINTAGTEXT',
    SQUOT:'SQUOT',
    BQUOT:'BQUOT',
    DQUOT:'DQUOT',
    GOTCCOL:'GOTCCOL',
    BACKSLASH:'BACKSLASH',
    BEGINCOMMENTCHECK:'BEGINCOMMENTCHECK',
    INCOMMENT:'INCOMMENT',
    INCOMMENTSTAR:'INCOMMENTSTAR',
    INCOMMENTSTARCOL:'INCOMMENTSTARCOL',
}

def findTags(text):
    ret = []

    PTTSTATESWITCH = {
        ":": GOTCCOL,
        "'": SQUOT,
        '"': DQUOT,
        "`": BQUOT
        }
    VALID_PTTEXT = (string.letters + string.digits + string.whitespace
                    + '[]_=/\\#.*')
    QUOTE_CLOSE = {
        SQUOT: "'",
        DQUOT: '"',
        BQUOT: '`'
        }
        
    offset = 0
    lentext = len(text)
    state = INIT

    backslash_save_state = 0

    textoffset = 0
    tagoffset = 0
    while offset < lentext:
        c = text[offset]
        if state == INIT:
            if c == '<':
                tagoffset = offset
                state = GOTLT
        elif state == GOTLT:
            if c == ':':
                state = BEGINCOMMENTCHECK
                #put in text token
                ret.append( (textoffset, offset - 1, 0, 
                             text[textoffset:offset - 1]) )
            elif c == '<':
                tagoffset = offset
            else:
                state = INIT
        elif state == BEGINCOMMENTCHECK:
            if c != '*':
                state = PLAINTAGTEXT
                continue
            state = INCOMMENT
        elif state == INCOMMENT:
            if c == '*':
                state = INCOMMENTSTAR
        elif state == INCOMMENTSTAR:
            if c == ':':
                state = INCOMMENTSTARCOL
            else:
                state = INCOMMENT
        elif state == INCOMMENTSTARCOL:
            if c == '>':
                state = INIT
                textoffset=offset+1
            else:
                state = INCOMMENT
        elif state == PLAINTAGTEXT:
            nstate = PTTSTATESWITCH.get(c)
            if nstate:
                state = nstate
            elif c == '\\':
                backslash_save_state = state
                state = BACKSLASH
            elif c not in VALID_PTTEXT:

                if c != '<': #not beginning of next tag?
                    raise DTLexicalError( lineno = getLineno ( text, offset), 
                                    msg = "invalid character |%s| in tag" % c)
                else:
                    raise DTLexicalError( lineno = getLineno ( text, offset ),
                                      msg = "found < in tag" )
        elif state == GOTCCOL:
            if c != ">":
                state=PLAINTAGTEXT
            else:    
                textoffset = offset + 1
                # put in tag token
                ret.append(
                    [tagoffset, offset + 1, 1, text[tagoffset + 2: offset - 1]]
                    )
                state = INIT
            
        elif state in (SQUOT, DQUOT, BQUOT):
            if c == '\\':
                backslash_save_state = state
                state = BACKSLASH

            elif c == QUOTE_CLOSE[state]:
                state = PLAINTAGTEXT

        elif state == BACKSLASH:
            state = backslash_save_state

        offset = offset + 1
    # GOTLT should be an acceptable state for the file
    # to terminate in
    if state != INIT and state != GOTLT:
        md = {PLAINTAGTEXT: "text",
              SQUOT:    "in single quoted area",
              DQUOT:    "in double quoted area",
              BQUOT:    "in expression area",
              GOTCCOL:  "after closing colon",
              BACKSLASH: "after backslash",
        INCOMMENT: "in comment"
              }
        raise DTLexicalError(msg = "hit end of file %s in tag, beginning of "
                             "tag %s at" % (md.get(state, "[unknown state]"),
                                            text[tagoffset:tagoffset+10]+"..."),
                             lineno = getLineno ( text, tagoffset ))
    else:
        ret.append([textoffset, offset, 0, text[textoffset:offset]])
    return ret

def processTag(tagString, text, st, end):
    tupargs=[]
    dictargs={}
    tagname=''
    off=0

    tnmatch = None
    while not tnmatch:
        wsmatch = wsre.match(tagString, off, len(tagString))
        if not wsmatch:
            tnmatch = plainre.match(tagString, off, len(tagString))
            if not tnmatch:
                raise DTLexicalError( msg = 'invalid tag name', 
                                      tagtext = tagString,
                                      lineno = getLineno ( text, st ) )
            else:
                break
        off = wsmatch.end()
                                
    tagname=tnmatch.group()
    off=tnmatch.end()
    lts=len(tagString)
    while off<lts:
        match=tagContentsRe.match(tagString, off, lts)
        if match is None:
            raise DTLexicalError( msg = 'invalid tag text', tagtext = tagString,
                                  lineno = getLineno ( text, st ) )
        g=match.groupdict()
        
        tmp=g[WHITESPACE]
        if tmp is not None:
            off=match.end(WHITESPACE)
            continue
        
        tmp=g[PLAIN]
        if tmp is not None:
            tupargs.append(tmp)
            off=match.end(PLAIN)
            continue
        
        tmp=g[PLAINEQ]
        tmp2=g[PLAINEQVAL]
        if tmp is not None:
            assert tmp2 is not None
            dictargs[tmp]=tmp2
            off=match.end(PLAINEQVAL)
            continue

        tmp=g[PLAINEQQ]
        tmp2=g[PLAINEQQVAL]
        if tmp is not None:
            assert tmp2 is not None
            dictargs[tmp]=dequote(tmp2)
            off=match.end(PLAINEQQVAL)
            continue

        tmp=g[QUOTONLY]
        if tmp is not None:
            tupargs.append(dequote(tmp))
            off=match.end(QUOTONLY)
            continue


        raise DTLexicalError( msg = 'unknown text in tag', 
                              tagtext = tagString[off:off+10], 
                              lineno=getLineno(text, st) )

    return tagname, tupargs, dictargs

def getLineno(text, offset):
    f=StringIO.StringIO(text)
    bread=0
    lineno=0
    line=f.readline()
    while line:
        lineno=lineno+1
        bread=bread+len(line)
        if bread>offset:
            break
        line=f.readline()
    return lineno

def doTag(text, name):
    tl=[]
    tags=findTags(text)
    for tag in tags:
        if tag[2]:
            n,t,d=processTag(tag[3], text, tag[0], tag[1])
            tl.append(DTToken(text, name, (tag[0], tag[1], n, t, d)))
        else:
            tl.append(tag[3])
    return tl
        
if __name__=='__main__':
    import time
    
    text=open(sys.argv[1]).read()

    a=time.time()
    tl=doTag(text, '')
    print 'time:', time.time()-a
    print '-------------------'
    for i in tl:
        if type(i)==type(''):
            print 'S: |%s|' % i
        else:
            print 'tag:', i
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.