parser.py :  » Web-Server » Snakelets » Snakelets-1.50 » snakeserver » ypage » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Web Server » Snakelets 
Snakelets » Snakelets 1.50 » snakeserver » ypage » parser.py
#############################################################################
#
#  $Id: parser.py,v 1.41 2006/05/18 10:56:41 irmen Exp $
#  Ypage parser
#
#  This is part of "Snakelets" - Python Web Application Server
#  which is (c) Irmen de Jong - irmen@users.sourceforge.net
#
#############################################################################

#
#   This parser takes tokens and converts them to an Abstract Syntax Tree
#   of the Ypage. It also checks the syntax of the document.
#   The parser is a hand crafted recursive-descent parser.
#

import types
import os
import codecs
import re
import tokenizer    # do not confuse this with tokenize from the std lib


class ParserError(Exception):
    def __init__(self, msg, linenum=None):
        if linenum:
            msg = msg+" @"+str(linenum)
        Exception.__init__(self, unicode(msg).encode("unicode-escape") )

    
def isWhitespace(text):
    return type(text) in types.StringTypes and (len(text)==0 or text.isspace())

def stringInQuotes(txt):
    # Tells whether txt starts and ends with a quote character (' or ").
    # Returns True when both ends are quotes (the two quote characters are not
    # required to be the same), False when neither end is a quote or when txt
    # is empty/None/shorter than two characters.
    # Raises ParserError when only one of the two ends is a quote.
    quoteChars = ( '\'', '"')
    if not txt or len(txt)<2:
        return False
    startsQuoted = txt[0] in quoteChars
    endsQuoted = txt[-1] in quoteChars
    if startsQuoted and endsQuoted:
        return True
    if startsQuoted or endsQuoted:
        raise ParserError("invalidly quoted string: "+txt)
    return False

# the objects of the AST

class AbstractSyntaxElement:
    # Common base of all AST node classes.
    # Both repr() and str() give "<ClassName>", using the name of the
    # concrete class of the node.
    def __repr__(self):
        return "<%s>" % self.__class__.__name__
    def __str__(self):
        return repr(self)
        
class Declaration(AbstractSyntaxElement):
    # A page declaration of the form name=value.
    # Both parts are stored with surrounding whitespace removed.
    def __init__(self, name, value):
        strippedName=name.strip()
        strippedValue=value.strip()
        self.name=strippedName
        self.value=strippedValue
        
class TextBlock(AbstractSyntaxElement):
    # A piece of literal page text.
    def __init__(self, text):
        self.text=text
    def mergeText(self,text):
        # append more text to the text already held by this block
        self.text=self.text+text

class Whitespace(TextBlock):
    # A TextBlock holding only whitespace; the parser creates these in
    # processWhitespaceAndComments for whitespace tokens.
    pass

class InsertPageBody(AbstractSyntaxElement):
    # Marker node without data; the parser adds it for the
    # "insertpagebody" processing instruction.
    pass


class Comment(AbstractSyntaxElement):   # NO TextBlock!
    # A comment node carrying its text; the parser uses it to mark the
    # begin and end of included content.
    def __init__(self, text):
        self.text=text
    
class URLCall(AbstractSyntaxElement):   # NO TextBlock!
    # Node for the "call" processing instruction; url is the (unquoted) argument.
    def __init__(self, url):
        self.url=url

class URLForward(AbstractSyntaxElement):   # NO TextBlock!
    # Node for the "redirect" processing instruction; url is the (unquoted) argument.
    def __init__(self, url):
        self.url=url

class HTTPRedirect(AbstractSyntaxElement):   # NO TextBlock!
    # Node for the "httpredirect" processing instruction; url is the (unquoted) argument.
    def __init__(self, url):
        self.url=url

class Expression(AbstractSyntaxElement):
    # An expression tag; the expression text is stored with surrounding
    # whitespace removed.
    def __init__(self, text):
        self.text=text.strip()

class Script(AbstractSyntaxElement, list):
    # A script tag. The node itself is a list: its items are the child nodes
    # that make up the body of the block opened by this script (if any).
    #   text        -- the script source; trailing whitespace is removed
    #   lst         -- optional initial child nodes
    #   stripIndent -- when true, the common leading indent is removed from text
    def __init__(self, text, lst=None, stripIndent=False):
        list.__init__(self,lst or [])
        source=text.rstrip()
        if stripIndent:
            source=self.stripIndent(source)
        self.text=source
    def stripIndent(self, text):
        # Removes the indentation of the first line from every line.
        # A single empty first line is dropped before measuring the indent.
        # Raises ParserError if a line has non-whitespace inside the
        # region that would be stripped.
        rows=text.split('\n')
        if rows and not rows[0]:
            rows.pop(0)
        if not rows:
            return ''   # no script content at all
        firstRow=rows[0]
        margin=len(firstRow)-len(firstRow.lstrip())
        if margin==0:
            return '\n'.join(rows)
        dedented=[]
        for row in rows:
            if row[:margin].strip():
                raise ParserError("script block indentation error")
            dedented.append(row[margin:])
        return '\n'.join(dedented)
    def __repr__(self):
        children=''.join([","+repr(child) for child in self])
        return ("<Script @%d:" % id(self))+children+'/Script>'
    def isStartOfBlock(self):
        # a block start ends with a ':' or with a ':' followed by a comment
        if self.text.endswith(':'):
            return True
        return re.match("[^:#]*:[ \t]*(#.*)?$", self.text) is not None
    def isEndScript(self):
        return "end"==self.text
    def isEndAndNewBlock(self):
        # else:/elif close the current block and open a new one.
        # Gives the regex match object when the text starts that way, else None.
        for pattern in (r"^else:", r"^elif\s.+:"):
            found=re.match(pattern,self.text)
            if found:
                return found
        return None
        
class ScriptEndBlock(AbstractSyntaxElement,list):
    # List-based node that starts out empty or holding the single given
    # content item. NOTE(review): nothing in this file creates it;
    # presumably used by another module -- confirm.
    def __init__(self, content=None):
        if content is not None:
            self.append(content)
        
class Document(AbstractSyntaxElement, list):
    # Root node of the AST: a list of the top-level nodes of the page.
    def __repr__(self):
        # "[DOCUMENT:" followed by ",<repr>" for every child, then "]"
        children=''.join([","+repr(child) for child in self])
        return "[DOCUMENT:"+children+']'
    

# Scans the file for input and output encoding declarations.
def determineInputOutputEncodings(filename):
    # Looks in the first lines of the file (at most 10, read with a size hint
    # of 1000 bytes) for declarations of the form
    #     <%@inputencoding=...%>   and   <%@outputencoding=...%>
    # at the end of a line. Names are matched case-insensitively and
    # surrounding quotes are removed from the value.
    # Returns the tuple (inputEncoding, outputEncoding); an encoding that is
    # not declared is None. If one is declared more than once, the last wins.
    # Raises ParserError (from stringInQuotes) for a half-quoted value, and
    # whatever open() raises if the file cannot be read.
    inputEnc=outputEnc=None
    source=open(filename,"r")
    try:
        firstLines=source.readlines(1000)[:10]
    finally:
        source.close()      # do not leave the file open until garbage collection
    for line in firstLines:
        # Non-greedy groups: the name ends at the first '=' and neither group
        # keeps surrounding whitespace, so "<%@ inputencoding = 'x' %>" is
        # recognised just like the form without spaces.
        m=re.match(r".*<%@\s*(.+?)\s*=\s*(.+?)\s*%>\s*$", line)
        if not m:
            continue
        (name,value)=m.groups()
        name=name.lower()
        if name not in ('inputencoding','outputencoding'):
            continue
        if stringInQuotes(value):
            value=value[1:-1]
        if name=='inputencoding':
            inputEnc=value
        else:
            outputEnc=value
    return (inputEnc, outputEnc)


# The parser.
# Parses a tokenized stream into an AST, with a few
# syntax checks along the way.

# <ypage> --> DeclarationBlock DocBody <None>
# DeclarationBlock --> (WS Declaration)*
# DocBody --> (Text | Comment | Expression ) *

class Parser(object):
    def __init__(self, stream, filename, inputEncoding):
        # Sets up a parser for the given page source.
        #   stream        -- the open page source, handed to the tokenizer
        #   filename      -- name of the page; its directory is remembered as
        #                    the base location for included files
        #   inputEncoding -- input encoding of the page (stored as-is)
        # The first token is fetched right away, so currentToken is valid
        # as soon as the constructor returns.
        self.inputEncoding=inputEncoding
        self.outputEncoding_fromChild = False
        self.scriptNesting=0
        self.declarations=[]
        self.includedDeclarations=[]
        self.tokenizer=tokenizer.Tokenizer(stream,filename)
        self.fileLocation = os.path.dirname(filename)
        self.tokens=self.tokenizer.getTokens()
        self.previousToken=None
        self.currentToken=None
        self.nextToken()
        
    def nextToken(self):
        # Advances to the next token: the current token becomes the previous
        # one and the new token is stored in currentToken and returned.
        # At the end of the input currentToken becomes None (and None is returned).
        self.previousToken=self.currentToken
        try:
            self.currentToken=self.tokens.next()
        except StopIteration:
            # end of input
            self.currentToken=None
        return self.currentToken
    def undoToken(self):
        # Makes the previous token current again and forgets the previous one.
        # Only these two attributes change; the token iterator is not touched.
        self.currentToken=self.previousToken
        self.previousToken=None
        
    def parseDeclarationsOnly(self):
        # Only read and parse the declarations at the top of the file.
        # Returns a list with just the Declaration nodes (the whitespace nodes
        # that parseDeclarationBlock also produces are filtered out).
        # Raises ParserError on a malformed declaration or unclosed comment.
        if self.currentToken==tokenizer.TOK_COMMENTOPEN:   # if ypage starts with a comment, skip it.
            self.skipComment() 
        return [d for d in self.parseDeclarationBlock() if isinstance(d, Declaration)]
        
    def parse(self):
        # Fully parses the ypage source.
        # Returns a Document holding the page's declarations, then the import
        # declarations collected from included files, then the body nodes.
        # The Document gets an 'endlocation' attribute with the tokenizer's
        # location at the end of parsing.
        # Raises ParserError on syntax errors, or when tokens remain after the body.
        tree=Document()
        if self.currentToken==tokenizer.TOK_COMMENTOPEN:   # if ypage starts with a comment, skip it.
            self.skipComment()
        self.declarations = self.parseDeclarationBlock()
        tree.extend(self.parseDocBody())
        if self.currentToken is not None:
            raise ParserError("trailing garbage or wrong script tag", self.tokenizer.getLocationStr())
        self.declarations.extend(self.includedDeclarations)
        # inserting the reversed list one by one at index 0 puts the
        # declarations at the front in their original order
        self.declarations.reverse()
        for decl in self.declarations:
            tree.insert(0,decl)
        tree.endlocation=self.tokenizer.getLocation()
        return tree

    def processWhitespaceAndComments(self):
        # Consumes a run of whitespace tokens and comments.
        # Returns a list of Whitespace nodes for the whitespace tokens;
        # comments are skipped without producing a node.
        # Stops at the first token that is neither whitespace nor a comment.
        collected=[]
        while True:
            tok=self.currentToken
            if not (isWhitespace(tok) or tok==tokenizer.TOK_COMMENTOPEN):
                break
            self.skipComment()      # does nothing unless we are at a comment
            if not isWhitespace(self.currentToken):
                break
            collected.append(Whitespace(self.currentToken))
            self.nextToken()
        return collected
    
    def skipComment(self):
        # If the current token opens a comment, skips everything up to and
        # including the comment closing tag; otherwise does nothing.
        # Raises ParserError when the input ends before the comment is closed.
        if self.currentToken==tokenizer.TOK_COMMENTOPEN:
            while self.currentToken!=tokenizer.TOK_COMMENTCLOSE:
                self.nextToken()
                # nextToken signals end of input with None. Test for exactly
                # that, so that an empty text token inside the comment is not
                # mistaken for the end of the input.
                if self.currentToken is None:
                    raise ParserError("missing comment closing tag", self.tokenizer.getLocationStr())
            self.nextToken() # skip the comment closing tag
                
                
    # Returns the parsed declarations in the DeclarationBlock at the
    # beginning of the document. On exit, currentToken is the first
    # non-whitespace token (or None at EOF).
    # (the parsed declaration AST also contains embedded whitespace!)
    def parseDeclarationBlock(self):
        # Parses the declarations at the start of the document.
        # Returns a list of Declaration nodes mixed with the Whitespace nodes
        # found between them. Comments are skipped.
        # Raises ParserError for a declaration that is not "name=value" with
        # both parts non-empty, or that is not properly closed.
        nodes=[]
        while True:
            tok=self.currentToken
            if not (tok==tokenizer.TOK_DECLARATIONOPEN or type(tok) in types.StringTypes):
                break
            nodes.extend(self.processWhitespaceAndComments())
            if not (self.currentToken==tokenizer.TOK_DECLARATIONOPEN):
                # it is not a decl, we must be finished
                break
            self.nextToken()
            if type(self.currentToken) not in types.StringTypes:
                raise ParserError("text expected", self.tokenizer.getLocationStr())
            parts=self.currentToken.split('=',1)
            if len(parts)!=2:
                raise ParserError("invalid declaration, must be decl=value", self.tokenizer.getLocationStr())
            declaration=Declaration(parts[0].strip(), parts[1].strip())
            if not declaration.name or not declaration.value:
                raise ParserError("invalid declaration, must be decl=value", self.tokenizer.getLocationStr())
            nodes.append(declaration)
            closed=(self.nextToken()==tokenizer.TOK_SCRIPTCLOSE)
            if not closed:
                raise ParserError("close token expected", self.tokenizer.getLocationStr())
            self.nextToken()
        return nodes


    
    # DocBody        --> (Text | Comment | Expression | Script | Instruction ) *
    # DocBodyInBlock --> (Text | Comment | Expression | No_EndBlock_Script ) *
    def parseDocBody(self, inBlock=False):
        # Parses a run of text, comments, expressions, scripts and processing
        # instructions into a list of AST nodes.
        #
        # NOTE: the return value has two shapes.
        #  - When the body simply runs out of tokens it can handle (end of
        #    input at the top level), the plain list of nodes is returned.
        #  - When inBlock is true and the block is terminated by a script tag,
        #    a tuple (nodes, script) is returned: script is None if the
        #    terminator was 'end', or the parsed script list if it was an
        #    else:/elif that also opens a new block.
        # parseScriptBlock relies on this difference to detect a missing end tag.
        #
        # Raises ParserError on syntax errors, including 'end', 'else:' or
        # 'elif' script tags that appear outside of any block.
        ast=[]
        while True:
            if type(self.currentToken) in types.StringTypes:        # Text
                ast.append(TextBlock(self.currentToken))
                self.nextToken()
            elif self.currentToken==tokenizer.TOK_COMMENTOPEN:
                self.skipComment()
            elif self.currentToken==tokenizer.TOK_EXPRESSIONOPEN:
                ast.append(self.parseExpression())
            elif self.currentToken==tokenizer.TOK_SCRIPTOPEN:
                script=self.parseScript()
                if inBlock:
                    if script[0].isEndScript(): 
                        return ast,None
                    elif script[0].isEndAndNewBlock():
                        return ast,script
                else:
                    # Not inside a block: a script that starts with end, else:
                    # or elif has nothing to close. Only the first node needs
                    # checking; any further nodes are continuations of the
                    # block that the first one opened.
                    scripttxt=script[0].text.lower()
                    if scripttxt=="end":
                        raise ParserError("block ended while not in block", self.tokenizer.getLocationStr())
                    # a real elif is followed by its condition, never directly
                    # by a colon, so match the keyword itself
                    if scripttxt.startswith("else:") or re.match(r"elif\b", scripttxt):
                        raise ParserError("else or elif outside if", self.tokenizer.getLocationStr())
                ast.extend(script)
            elif self.currentToken==tokenizer.TOK_INSTRUCTIONOPEN:
                # processing instruction
                self.nextToken()
                if type(self.currentToken) in types.StringTypes:
                    if '=' in self.currentToken:
                        instruction, value = self.currentToken.split('=',1)
                    else:
                        instruction, value = self.currentToken,''
                    self.processInstruction(instruction.lower(), value, ast)
                    if self.nextToken()==tokenizer.TOK_SCRIPTCLOSE:
                        self.nextToken()
                    else:
                        raise ParserError("close token expected", self.tokenizer.getLocationStr())
                else:
                    raise ParserError("text expected", self.tokenizer.getLocationStr())
            elif self.currentToken and type(self.currentToken) not in types.StringTypes:
                raise ParserError("normal text expected instead of tag", self.tokenizer.getLocationStr())
            else:
                # end of docbody
                break
        return ast     # yes, this is correct (pychecker)
    
    # Expression -> ExpressionOpen TextBlock ExpressionClose
    def parseExpression(self):
        # Parses one expression tag: open token, text, close token.
        # Returns the Expression node and leaves currentToken just past the
        # close token. Raises ParserError if any of the three parts is missing.
        if not (self.currentToken==tokenizer.TOK_EXPRESSIONOPEN):
            raise ParserError("expression expected", self.tokenizer.getLocationStr())
        self.nextToken()
        if type(self.currentToken) not in types.StringTypes:
            raise ParserError("text expected", self.tokenizer.getLocationStr())
        node=Expression(self.currentToken)
        self.nextToken()
        if not (self.currentToken==tokenizer.TOK_SCRIPTCLOSE):
            raise ParserError("expression close expected", self.tokenizer.getLocationStr())
        self.nextToken()
        return node

    
    # Script --> ScriptFragment | ScriptBlock
    # ScriptFragment --> ScriptOpen text-without-: ScriptClose
    def parseScript(self):
        if self.currentToken==tokenizer.TOK_SCRIPTOPEN:
            self.nextToken()
            if type(self.currentToken) in types.StringTypes:
                try:
                    scriptNode=Script(self.currentToken, stripIndent=True)
                except ParserError,px:
                    raise ParserError(str(px), self.tokenizer.getLocationStr())
                closeTok = self.nextToken()
                if closeTok in (tokenizer.TOK_SCRIPTCLOSE, tokenizer.TOK_SCRIPTCLOSEKEEPB):
                    self.nextToken()
                    if closeTok==tokenizer.TOK_SCRIPTCLOSEKEEPB or scriptNode.isStartOfBlock():
                        newscript=self.parseScriptBlock(scriptNode)
                        if newscript is not None:
                            ast=[scriptNode]
                            ast.extend(newscript)
                            return ast
                    return [scriptNode]
                else:
                    raise ParserError("script close expected", self.tokenizer.getLocationStr())
            else:
                raise ParserError("text expected", self.tokenizer.getLocationStr())
        else:
            raise ParserError("script expected", self.tokenizer.getLocationStr())
        
    # ScriptBlock --> ScriptOpen text-with-: ScriptClose DocBody ScriptOpen 'end' ScriptClose
    def parseScriptBlock(self, scriptNode):
        # Parses the body of the block opened by scriptNode (which is already
        # parsed and passed in) and adds the body nodes to scriptNode.
        # Returns None when the block was closed by 'end', or the parsed
        # script list of the else:/elif that closed it.
        # Raises ParserError when the body ended without a terminating script
        # tag (parseDocBody then gives a plain list instead of a tuple).
        outcome=self.parseDocBody(inBlock=True)
        if type(outcome) is not types.TupleType:
            raise ParserError("syntax error, script end tag missing?", self.tokenizer.getLocationStr())
        body,continuation=outcome
        if body:
            scriptNode.extend(body)
        return continuation

    def processInstruction(self, instr, value, ast):
        value=value.strip()
        if instr=="include":
            if not stringInQuotes(value):
                raise ParserError("argument for include must be a quoted string",self.tokenizer.getLocationStr())
            value=value[1:-1]

            if value.startswith("http://") or value.startswith("HTTP://"):
                raise ParserError("included URLs must be local (relative to the page itself)")

            filename=os.path.join(self.fileLocation, value)
    
            try:
                (inputenc, outputenc) = determineInputOutputEncodings(filename)
                includeStream = self.openIncludedFile(filename, encoding=inputenc)
            except Exception,x:
                raise ParserError("error including file '%s': %s" % (value, x), self.tokenizer.getLocationStr())

            parsed=Parser(includeStream, filename, inputenc).parse()
            includeStream.close()
            # only add import declarations!!
            for item in parsed:
                if isinstance(item, Declaration):
                    if item.name=="import":
                        self.includedDeclarations.append(item)
                    elif item.name=="outputencoding":
                        self.checkUseChildEncoding(item.value)
            ast.append(Comment("BEGIN INCLUDE %s" % value ))
            ast.extend(parsed)
            ast.append(Comment("END INCLUDE %s" % value ))
        elif instr=="call":
            if not stringInQuotes(value):
                raise ParserError("argument for call must be a quoted string",self.tokenizer.getLocationStr())
            value=value[1:-1]
            ast.append(URLCall(value))
        elif instr=="redirect":
            if not stringInQuotes(value):
                raise ParserError("argument for redirect must be a quoted string",self.tokenizer.getLocationStr())
            value=value[1:-1]
            ast.append(URLForward(value))
        elif instr=="httpredirect":
            if not stringInQuotes(value):
                raise ParserError("argument for httpredirect must be a quoted string",self.tokenizer.getLocationStr())
            value=value[1:-1]
            ast.append(HTTPRedirect(value))
        elif instr=="insertpagebody":
            ast.append(InsertPageBody())
        else:
            raise ParserError("unknown instruction", self.tokenizer.getLocationStr())

    def checkUseChildEncoding(self, childenc):
        # Adopts the output encoding of an included (child) page, but only if
        # this page has no outputencoding declaration of its own. When adopted,
        # a matching Declaration is added and outputEncoding_fromChild is set.
        ownEncodings=[decl for decl in self.declarations
                      if isinstance(decl, Declaration) and decl.name=="outputencoding"]
        if ownEncodings:
            return  # we already have our own encoding, do not change it!
        # we don't have an encoding, use the child's encoding
        self.declarations.append( Declaration("outputencoding", childenc) )
        self.outputEncoding_fromChild = True
    
    def openIncludedFile(self, filename, encoding=None, raw=False):
        # Opens a file that is to be included and returns the open file object:
        #  - raw: binary mode, no decoding
        #  - encoding given: a codecs stream that decodes with that encoding
        #  - otherwise: a plain file opened in text mode
        if raw:
            return file(filename,"rb")
        if encoding:
            return codecs.open(filename, mode="rb", encoding=encoding)
        return file(filename, "r")


# test methods      
                
def syntaxTreeToStr(c):         # JUST FOR TESTING!! Not used by compiler.py
    # Renders an AST node (and, for Document and Script, its children) as a
    # readable string for debugging.
    # Raises ValueError for a node type that is not known here.
    if isinstance(c, Document):
        return ''.join([syntaxTreeToStr(child) for child in c])
    if isinstance(c,  Declaration):
        return "<%%@%s=%s@%%>" % (c.name, c.value)
    if isinstance(c,URLCall):
        return '\n----> CALL URL HERE: '+c.url+'\n'
    if isinstance(c,URLForward):
        return '\n----> REDIRECT URL HERE: '+c.url+'\n'
    if isinstance(c,HTTPRedirect):
        # the parser produces these for the httpredirect instruction
        return '\n----> HTTP REDIRECT URL HERE: '+c.url+'\n'
    if isinstance(c,Comment):
        return '\n# '+c.text+'\n'
    if isinstance(c,TextBlock):
        # Whitespace is a TextBlock too; show it in [] and other text in {}
        if isinstance(c,Whitespace):
            return '['+c.text+']'
        return '{'+c.text+'}'
    if isinstance(c, Expression):
        return "<%%=%s%%>" % c.text
    if isinstance(c, Script):
        parts=["<%SCRIPTID:"+repr(id(c))+"\t"+c.text+"%>"]
        parts.extend([syntaxTreeToStr(child) for child in c])
        parts.append("<%end SCRIPTID:"+repr(id(c))+"%>")
        return ''.join(parts)
    if isinstance(c, InsertPageBody):
        return "\n----> TEMPLATED PAGE BODY HERE\n"
    raise ValueError("invalid syntax "+repr(c))
    
def main(args):
    if len(args)>2:
        inputstream = codecs.open(args[1], mode="rb", encoding=args[2])
        print "Reading with input encoding ",args[2]
        inputenc = args[2]
    else:
        inputstream = open(args[1], "r")
        print "Reading with default encoding"
        inputenc=None

    parser = Parser(inputstream, args[1], inputenc)
    
    #decls = parser.parseDeclarationsOnly()
    #print "DECLS=",decls   # XXX
    
    syntaxtree=parser.parse()
    print syntaxtree
    print '*'*70
    
    string = syntaxTreeToStr(syntaxtree)
    if len(args)>2:
        print string.encode("iso-8859-1", "xmlcharrefreplace")
    else:
        print string


# When run as a script, start the test driver with the command line arguments.
if __name__=="__main__":
    import sys
    main(sys.argv)
    
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.