parsers.py :  » IDE » PyPE » PyPE-2.9.1 » plugins » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » IDE » PyPE 
PyPE » PyPE 2.9.1 » plugins » parsers.py
#!/usr/bin/python

'''
This software is licensed under the GPL (GNU General Public License) version 2
as it appears here: http://www.gnu.org/copyleft/gpl.html
It is also included with this archive as `gpl.txt <gpl.txt>`_.
'''

import bisect
import compiler
import compiler.ast
import os
import parser
import pprint
import re
import time
import token
import traceback
import symbol

from plugins import exparse

# Matches "<label>:<content>" where the label is letters, digits and spaces,
# optionally prefixed with '>'.  DOTALL lets the content group span newlines.
todoexp = re.compile('(>?[a-zA-Z0-9 ]+):(.*)', re.DOTALL)

# First words of a label that mark the match as a statement ("if x:") rather
# than a todo; _shared_parse() rejects matches whose first word is listed here.
_bad_todo = dict.fromkeys('if elif else def cdef class try except finally for while lambda with'.split())
# URL schemes; is_url() uses these so "http://..." is not read as "http: ...".
_bad_urls = dict.fromkeys('http ftp mailto news gopher telnet file'.split())

try:
    _pype
except NameError:
    # No _pype object is visible from this module: fall back to defaults.
    # Only a failed name lookup is expected here, so nothing else is caught.
    class _pype:
        STRICT_TODO = 0

def is_url(left, right, ml=0):
    """Return 1 when a "left:right" split must not be taken as a todo, else 0.

    Two cases yield 1:
    * `left` (left-stripped, lowercased) is one of the schemes in _bad_urls
      and `right` begins with '//' -- the text is a URL;
    * `ml` is false, _pype.STRICT_TODO is set, and `left` does not begin
      with '>'.
    """
    scheme = left.lstrip().lower()
    if scheme in _bad_urls and right[:2] == '//':
        return 1
    strict = _pype.STRICT_TODO and not ml
    if strict and left[:1] != '>':
        return 1
    return 0

def detectLineEndings(text):
    """Guess the dominant line ending of `text`.

    Returns '\\r\\n', '\\n' or '\\r'; os.linesep when `text` contains no line
    break at all.  CRLF wins when the number of CRLF pairs is at least half
    of the larger of the total LF and total CR counts (both totals include
    the characters that are part of CRLF pairs).
    """
    crlf_ = text.count('\r\n')
    lf_ = text.count('\n')
    cr_ = text.count('\r')
    mx = max(lf_, cr_)
    if not mx:
        return os.linesep
    # Compare as crlf_*2 >= mx rather than crlf_ >= mx/2: integer division
    # floors 1/2 to 0, which made a text with a single '\n' report CRLF.
    if crlf_ * 2 >= mx:
        return '\r\n'
    # Value comparison; identity ("is") on ints is not a reliable equality test.
    if lf_ == mx:
        return '\n'
    return '\r'

def detectLineIndent(lines, use_tabs, spaces_per_tab, spaces_per_indent):
    """Guess the indentation settings used by `lines`.

    lines             -- iterable of text lines (trailing CR/LF is ignored)
    use_tabs          -- fallback returned when nothing can be inferred
    spaces_per_tab    -- fallback; also favours candidates with this tab width
    spaces_per_indent -- fallback; also favours candidates with this indent

    Returns a (use_tabs, spaces_per_tab, spaces_per_indent) tuple.

    The function counts transitions between consecutive, differing indent
    levels.  Each transition votes for a (tab width, indent delta) pair; when
    tabs are involved every tab width from 1 to 8 is tried.  The pair with
    the best score is selected.
    """
    tablines = 0
    lc = 0
    prev = 0, 0
    deltas = {}
    for line in lines:
        line = line.rstrip('\r\n')
        tl = 0
        indent = len(line) - len(line.lstrip())
        lc += 1
        leading = line[:indent]
        if '\t' in leading:
            # we've definitely got tab indents,
            # but is it mixed, or straight up
            # tabs, or are the tabs a mistake?
            leading = leading.replace('\t', '')
            tl = indent - len(leading)
            tablines += 1
        ind = len(leading)
        this = tl, ind
        if prev == this:
            # multiple lines of the same indent
            # level isn't very interesting, we
            # only care about transitions
            continue
        x = (0,)
        if tl or prev[0]:
            # Let's just check all of the indents up to 8.
            x = tuple(range(1, 9))
        for i in x:
            d = abs((ind + i*tl) - (prev[1] + i*prev[0]))
            deltas[i, d] = deltas.get((i, d), 0) + 1
        prev = this
    # We have counts of transitions between different indent levels.
    # The correct one will generally dominate, and will also divide
    # the other major indent levels evenly.
    if deltas:
        # if more than 1/8 of your lines have tabs...you probably used tabs
        need_tabs = tablines > lc // 8
        dts = []
        for key, count in deltas.items():
            # Bonus of sqrt(count) for agreeing with each caller-supplied hint.
            score = count + count**.5 * (spaces_per_tab == key[0]) + count**.5 * (spaces_per_indent == key[1])
            dts.append((score, key))
        dts.sort(reverse=True)
        for score, (sppt, delta) in dts:
            # Unless something funky is going on, the maximum is the likely
            # correct answer, so we'll use it unless we are supposed to use tabs...
            if not sppt and need_tabs:
                continue
            if not delta:
                # A tab line and a space line that land on the same column
                # give a delta of 0 for that tab width.  That says nothing
                # about the indent size, and 0 is never a usable indent.
                continue
            spaces_per_indent = delta
            use_tabs = bool(sppt)
            if use_tabs:
                spaces_per_tab = sppt
            break
    return use_tabs, spaces_per_tab, spaces_per_indent

def leading(line):
    """Return the number of leading whitespace characters in `line`."""
    stripped = line.lstrip()
    return len(line) - len(stripped)

def line_info(lineno):
    """Build an exparse.Info for `lineno` with placeholder values elsewhere.

    NOTE(review): the '\\xff' strings and 999-style numbers look like
    "sorts last" filler for comparisons against real entries -- confirm
    against exparse.Info's field order.
    """
    filler = '\xff'
    info = exparse.Info(lineno, filler, filler, 999, 999, (), (), "", 999999999)
    return info

#------------------------------- C/C++ parser --------------------------------

# Regex templates for the C/C++ scanner.  Single letters in the templates are
# placeholders; the loop below expands them with str.replace() using `rep`.
# NOTE(review): `defn` is neither expanded nor compiled in the code below.
defn = '(?:is+)*(?:is+\*+s+)?(?:is*::s*)?cs*\(a\)\s*{'
# (placeholder, expansion) pairs.  Order matters: str.replace() substitutes
# every occurrence of the letter in the current pattern text, including
# letters that were introduced by earlier expansions.
# NOTE(review): these are not raw strings, so '\b' in the 'a' entry is a
# backspace character, not a regex word boundary -- confirm what was intended.
rep = [('a', '(?:\b|b|b(?:,s*b)*)'),
       ('b', '(?:i?[ \t\*&]*is*(?:\[[^\]]*\])*)'),
       ('c', '(?:i|operator[^\w]+)'),
       ('d', '(?:(?:is+)*(?:is+\*+s+)?is*;f*)'),
       ('i', '(?:[a-zA-Z_]\w*)'),
       ('s', '[ \t]'),
       ('f', '\s'),
       ('y', '(?:[dD][eE][fF][iI][nN][eE])')]

# fcn: group 1 is a function-like "#define name(args)"; group 2 is
# "name(...)" followed, after non-'{', non-';', non-')' text, by ';' or '{'.
fcn = '(#ys+i\(i(?:,s*i)*\))|(?:(cs*\([^\)]*\))[^{;\)]*[;{])'
# sdef: captures the name that directly precedes an opening parenthesis.
sdef = '(c)s*\('

for i,j in rep:
    # Compile each expansion on its own first, so a broken fragment is
    # printed before the exception propagates.
    try:
        _ = re.compile(j)
    except:
        print j
        raise
    fcn = fcn.replace(i,j)
    sdef = sdef.replace(i,j)

fcnre = re.compile(fcn)
sdefre = re.compile(sdef)

# Prefixes (keyword followed by '(', space or tab) that c_parser() uses to
# discard matches that are control statements rather than functions.
badstarts = []
for i in 'if for while switch case return'.split():
    badstarts.append(i+'(')
    badstarts.append(i+' ')
    badstarts.append(i+'\t')

# Characters whose presence around a match makes c_parser() treat it as part
# of an expression and skip it.
ops = '+-=<>?%!~^&(|/"\''

def _get_tags(out, todo):
    """Map tag words from "tags:" todo entries to the definitions they mark.

    out  -- nested (defn, (lower, line_no, name), indent, children) entries
    todo -- (kind, line_no, exclamations, content) tuples; only those whose
            kind is 'tags' are used

    Returns {tag: {line_no: defn}}.  The '' key holds every definition except
    the "-- label --" pseudo entries.  A tag line is attached to the
    definition on the same line; otherwise to the nearest definition above it
    when the content contains '^'; otherwise to the nearest definition below
    it, or to a "?" placeholder when there is none below.

    NOTE(review): the matching loop stops as soon as the definition list is
    exhausted, so tag lines located above the first definition are dropped --
    confirm that this is intended.
    """
    #tags: ^tag parser
    defs = [pair for pair in _flatten2(out) if not pair[1].startswith('-- ')] #line_no, defn
    rtags = {'': dict(defs)}
    pending = [item for item in todo if item[0] == 'tags'] #tag, line_no, excl, content
    tagged = []
    following = None
    # Both lists are consumed from the end, i.e. from the bottom of the file.
    while pending and defs:
        nearest = defs[-1]
        tag_line = pending[-1][1]
        if nearest[0] > tag_line:
            # definition lies below the tag line; remember it and move up
            following = defs.pop()
            continue
        item = pending.pop()
        if nearest[0] == tag_line or '^' in item[3]:
            # same line, or the tag explicitly points upward
            target = nearest
        elif following:
            target = following
        else:
            target = (item[1], "?")
        tagged.append((target[0], item[-1], target[1]))

    for line, content, defn in tagged:
        for tag in re.split("[^0-9a-z]+", content.lower()):
            if tag:
                rtags.setdefault(tag, {})[line] = defn

    return rtags

def _shared_parse(ls, todo, line_no, bad_todo=(), start=1, texp=todoexp, ml=0):
    """Try to read a "label: content" todo from `ls` and record it in `todo`.

    ls       -- comment text; matching begins at index `start`
    todo     -- list that receives (label, line_no, '!' count, content)
    bad_todo -- labels whose first word is in this container are rejected
    texp     -- compiled regex whose first two groups are label and content
    ml       -- passed through to is_url()

    Returns 1 when an entry was appended, 0 otherwise.
    """
    found = texp.match(ls, start)
    if found is None:
        return 0
    groups = found.groups()
    label, content = groups[0], groups[1]
    words = label.split()
    first_word = words[0] if words else ''
    if first_word in bad_todo or is_url(label, content, ml):
        return 0
    if label[:1] == '>':
        # the '>' marker is not part of the stored label
        label = label[1:]
    entry = (label.strip().lower(), line_no, content.count('!'), content.strip())
    todo.append(entry)
    return 1

def c_parser(source, line_ending, flat, wxYield):
    """Regex-based outline scanner for C/C++ `source`.

    Finds function-like macros and function definitions/declarations with
    fcnre, collects todos from lines starting with '//' and labels from
    single-line '/* --- label --- */' comments.

    Returns (out, docs.keys(), docs, todo), where out and docs are the values
    produced by exparse.translate_old_to_new().  The line_ending, flat and
    wxYield parameters are not used by the body.

    NOTE(review): labels are collected with _label(), which stores 2-tuples,
    while the USE_NEW branch of add_labels() unpacks three values per label --
    verify the label path for C sources.
    """
    posn = 0
    lc = 1
    post = 0
    out = []
    docs = {}
    for i in fcnre.finditer(source):
        # collapse a multi-line match onto one line
        fcn = i.group(0).replace('\n', ' ')
        
        #update line count
        # lc becomes the line on which this match starts: newlines inside the
        # previous match (post) plus newlines between the matches
        lc += post + source.count('\n', posn, i.start())
        post = 0
        post = source.count('\n', i.start(), i.end())
        posn = i.end()
        
        # short is the bare name in front of the first '('
        sm = sdefre.search(fcn)
        short = sm.group(1)
        
        #check for function-like macros
        if fcn.lower().startswith('#define'):
            out.append((fcn, (short.lower(), lc, short), 0, []))
            docs.setdefault(short, []).append(fcn[sm.start():])
            continue
        
        #handle the 'badstarts'
        # control statements such as "if (...) {" also match fcnre
        cont = 0
        for j in badstarts:
            if fcn.startswith(j):
                cont = 1
                break
        if cont:
            continue
        
        #handle function calls
        pp = fcn.rfind(')')
        if fcn.endswith(';'):
            # text between the last ')' and the ';'
            xx = fcn[pp+1:-1]
            if not xx.strip():
                # "name(...);" with nothing after the ')' is skipped
                continue
            for j in ops:
                if j in xx:
                    cont = 1
                    break
            if cont:
                continue
        
        #get the start of the definition
        linestart = source.rfind('\n', 0, i.start()) + 1 #yes, I really want this
        
        # text on the same line in front of the match (return type, qualifiers)
        fcns = source[linestart:i.start()]
        # set-like dict of the characters that occur in that prefix
        dfcns = dict.fromkeys(fcns)
        
        #check for operators in the beginning; for things like...
        #x = fcncall(...) * X;
        for j in ops:
            if j in dfcns:
                cont = 1
                break
        if cont:
            continue
        
        if '[' not in short:
            docs.setdefault(short, []).append(fcn[sm.start():pp+1])
        #use the entire definition
        # whitespace-normalized prefix plus the match up to its last ')'
        fcn = ' '.join(fcns.split() + fcn[:pp+1].split())
        out.append((fcn, (short.lower(), lc, short), 0, []))
    
    texp = todoexp
    todo = []
    _sp = _shared_parse
    labels = []
    # normalize every line-ending style before splitting into lines
    lines = source.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    for line_no, line in enumerate(lines):
        ls = line.strip()
        if ls[:2] == '//':
            # start=2 skips the '//' itself; line numbers are 1-based
            _sp(ls, todo, line_no+1, start=2)
        elif ls[:2] == '/*' and ls[-2:] == '*/':
            _label(ls.strip('/* '), labels, line_no+1)
    
    # the conversion is done regardless of USE_NEW
    out, docs = exparse.translate_old_to_new(out, docs, len(lines))
    
    if labels:
        add_labels(out, labels)
    
    return out, docs.keys(), docs, todo

#-------------------------------- misc stuff ---------------------------------

def _flatten(out, seq=None):
    """Flatten a nested outline into a depth-first list of insertion points.

    Each result item is (line_no, index, containing_list, indent), where
    index is the entry's position inside containing_list.  The outermost
    call (seq is None) also appends a (0x7fffffff, len(seq), out, 0)
    terminator.  When `seq` is given, items are appended to it.
    """
    #used for:
    #------ labels like this one ------
    outermost = seq is None
    if outermost:
        seq = []

    for index, node in enumerate(out):
        seq.append((node[1][1], index, out, node[2]))
        children = node[-1]
        if children:
            _flatten(children, seq)

    if outermost:
        seq.append((0x7fffffff, len(seq), out, 0))
    return seq

def _flatten2(out, seq=None):
    """Flatten a nested outline into depth-first (line_no, defn) pairs.

    Pairs are appended to `seq` when it is given; otherwise a new list is
    created.  The list is returned.
    """
    #used for:
    #tags: like this one^
    #tags: ^tag parser
    if seq is None:
        seq = []
    for node in out:
        seq.append((node[1][1], node[0]))
        children = node[-1]
        if children:
            _flatten2(children, seq)
    return seq

def add_labels(out, labels):
    """Insert "-- label --" pseudo entries into the outline `out`, in place.

    With USE_NEW set, `out` is a flat list of exparse.Info entries and each
    label is unpacked as (line_no, indent, text) -- the shape produced by
    _new_label().  Otherwise `out` is the nested tuple outline and each label
    is unpacked as (line_no, text) -- the shape produced by _label(); `labels`
    is reversed and emptied in that case.

    NOTE(review): in the USE_NEW branch a 2-tuple label fails to unpack, and
    an empty `out` makes out[posn-1] raise IndexError -- check callers that
    can reach this with either.
    """
    if USE_NEW:
        for label in labels:
            # position of the first entry that compares greater than the label
            posn = bisect.bisect_right(out, label)
            line_no, indent, text = label
            dl = '-- ' + text + ' --'
            if len(out) > posn and out[posn-1].depth <= out[posn].depth:
                # no children on the previous guy
                # or there are children on the previous guy
                # either way, take the depth of the next guy...
                out.insert(posn, exparse.Info(line_no, text, dl, out[posn].depth, indent, (), None, None, 1, dl, dl))
                continue
            # We should iterate up the set of contexts to try to find the
            # proper indent level...but screw that; we'll just toss it in as a
            # sibling to make navigation easier.
            out.insert(posn, exparse.Info(line_no, text, dl, out[posn-1].depth, indent, (), None, None, 1, dl, dl))
    else:
        # Reverse both sequences so that pop() hands them out front-to-back.
        labels.reverse()
        seq = _flatten(out)
        seq.reverse()
        _ = seq[-1]
        while labels:
            #'seq and' portion semantically unnecessary
            line, label = labels.pop()
            # skip insertion points whose line lies above the label; the
            # 0x7fffffff terminator added by _flatten() is never skipped
            while seq and line > seq[-1][0]:
                _ = seq.pop()
            __, posn, entry, indent = seq[-1]
            #normalize the label
            # NOTE(review): posn was recorded before any insert; an earlier
            # insert into the same list shifts later entries -- verify the
            # placement when several labels land in one list.
            entry.insert(posn, ('-- %s --'%label, (label.lower(), line, label), indent, []))

def _label(lss, labels, line_no, indent=None):
    """Append (line_no, text) to `labels` if `lss` looks like "--- text ---".

    `lss` qualifies when it is longer than four characters and both starts
    and ends with '-'.  The stored text has surrounding whitespace and dashes
    removed.  `indent` is accepted for call compatibility and ignored.
    """
    #we may have a label of the form...
    # ----- label -----
    if len(lss) <= 4:
        return
    if lss[:1] != '-' or lss[-1:] != '-':
        return
    text = lss.strip('\t\n\x0b\x0c\r -')
    labels.append((line_no, text))

def _new_label(lss, labels, line_no, indent):
    """Append (line_no, indent, text) to `labels` if `lss` is "--- text ---".

    `lss` qualifies when it is longer than four characters and both starts
    and ends with '-'.  The stored text has surrounding whitespace and dashes
    removed.
    """
    #we may have a label of the form...
    # ----- label -----
    if len(lss) <= 4:
        return
    if lss[:1] != '-' or lss[-1:] != '-':
        return
    text = lss.strip('\t\n\x0b\x0c\r -')
    labels.append((line_no, indent, text))

#------------------------------ Python parsers -------------------------------

def slower_parser(source, _1, flat, _2):
    """Outline Python `source` with exparse, then collect todos and labels.

    source -- text of the module; lines are split on '\\n'
    _1     -- ignored
    flat   -- only forwarded to faster_parser() on fallback
    _2     -- only forwarded to faster_parser() on fallback

    Returns (out, docstring.keys(), docstring, todo).  If exparse fails, the
    result of faster_parser(source, '\\n', flat, _2) is returned instead.

    NOTE(review): the fallback result has a different shape unless `flat` is
    something other than 0, 1 or 2 -- confirm what callers pass.
    """
    try:
        if USE_NEW:
            out, docstring = exparse._parse(source)
        else:
            out, docstring = exparse.parse(source)
    except Exception:
        # Parse error, defer to the faster parser.  Exception rather than a
        # bare except, so KeyboardInterrupt/SystemExit are not swallowed.
        return faster_parser(source, '\n', flat, _2)

    bad_todo = _bad_todo
    todo = []
    _sp = _shared_parse
    labels = []
    _l = _label
    if USE_NEW:
        _l = _new_label
    for line_no, line in enumerate(source.split('\n')):
        if '#' not in line:
            continue
        p = line.find('#')
        # start is 2 for a '##' comment, else 1; a comment that is not a todo
        # may still be a "--- label ---"
        if not _sp(line[p:], todo, line_no+1, bad_todo, 1+(line[p+1:p+2]=='#')):
            _l(line[p+1:].lstrip('#>'), labels, line_no+1, len(line)-len(line.lstrip()))

    if labels:
        add_labels(out, labels)

    return out, docstring.keys(), docstring, todo
#
def faster_parser(source, line_ending, flat, wxYield):
    """Line-based Python outline parser that needs no valid syntax tree.

    source      -- text of the module
    line_ending -- separator used to split `source` into lines
    flat        -- selects the return shape:
                   0 -> (out, docstring.keys())
                   1 -> docstring
                   2 -> (out, docstring.keys(), docstring)
                   anything else -> (out, docstring.keys(), docstring, todo)
    wxYield     -- unused

    Lines whose stripped text starts with 'def ', 'cdef ' or 'class ' become
    outline entries, nested by indentation.  Other lines containing '#' are
    checked for todos and "--- label ---" comments.  All recorded line
    numbers are 1-based.
    """
    bad_todo = _bad_todo
    lines = source.split(line_ending)
    docstring = {} #new_kwl()
    todo = []

    out = []
    stk = []
    if USE_NEW:
        # root entry that stays at the bottom of the stack
        stk.append(exparse.Info(-1, '', '', 0, -1, (), None, None, len(lines)))
    line_no = 0
    _len = len

    # FIL extracts the name of a stack entry for dotted paths
    FIL = lambda A:A[1][2]
    if USE_NEW:
        FIL = lambda A:A[1]

    def fun(i, line, ls, line_no, stk):
        # i is the keyword prefix ('def ', 'cdef ' or 'class '), ls the
        # left-stripped line.
        ## try: wxYield()
        ## except: pass
        na = ls.find('(')
        ds = ls.find(':')
        if na == -1:
            na = ds
        if na != -1:
            if ds == -1:
                ds = na
            # fn: text between the keyword and the first ':'
            fn = ls[_len(i):ds].strip()
            if fn:
                lead = _len(line)-_len(ls)
                # close every open entry indented at least as far as this one
                if USE_NEW:
                    while stk and (stk[-1][4] >= lead):
                        stk[-1].lines = line_no - stk[-1].lineno
                        out.append(stk.pop())
                else:
                    while stk and (stk[-1][2] >= lead):
                        prev = stk.pop()
                        if stk: stk[-1][-1].append(prev)
                        else:   out.append(prev)
                nam = i+fn
                nl = nam.lower()
                # f: bare name, i.e. the text up to '(' (or ':' without one)
                f = ls[_len(i):na].strip()

                if f in ('__init__', '__new__') and _len(stk):
                    # also file the constructor under the enclosing entry
                    key = stk[-1][1]
                    if not USE_NEW:
                        key = key[2]
                    docstring.setdefault(key, []).append("%s %s.%s"%(fn, '.'.join(map(FIL, stk)), f))
                if USE_NEW:
                    stk.append(exparse.Info(line_no, f, nam, len(stk), lead, (), None, None, -1))
                else:
                    stk.append((nam, (f.lower(), line_no, f), lead, []))
                docstring.setdefault(f, []).append("%s %s"%(fn, '.'.join(map(FIL, stk))))

    _sp = _shared_parse
    _l = _label
    if USE_NEW:
        _l = _new_label
    labels = []
    for line in lines:
        line_no += 1
        ls = line.lstrip()

        if ls[:4] == 'def ':
            fun('def ', line, ls, line_no, stk)
        elif ls[:5] == 'cdef ':
            fun('cdef ', line, ls, line_no, stk)
        elif ls[:6] == 'class ':
            fun('class ', line, ls, line_no, stk)
        elif '#' in line:
            p = line.find('#')
            # line_no is already 1-based here (incremented above), so it is
            # used as is; adding 1 reported todos and labels one line late.
            if not _sp(line[p:], todo, line_no, bad_todo, 1+(line[p+1:p+2]=='#')):
                _l(line[p+1:].lstrip('#>'), labels, line_no, _len(line)-_len(ls))

    if not USE_NEW:
        # fold the still-open entries into their parents
        while _len(stk) > 1:
            a = stk.pop()
            stk[-1][-1].append(a)
        out.extend(stk)
    else:
        for i in stk:
            i.lines = line_no - i.lineno + (i.lineno >= 0)
        out.extend(stk)
        out.sort()
        exparse._fixup_extra(out)

    if labels:
        add_labels(out, labels)

    if flat == 0:
        return out, docstring.keys()
    elif flat==1:
        return docstring
    elif flat==2:
        return out, docstring.keys(), docstring
    else:
        return out, docstring.keys(), docstring, todo

def fast_parser(*args, **kwargs):
    """Alias for slower_parser(); all arguments are forwarded unchanged."""
    result = slower_parser(*args, **kwargs)
    return result

#-------------------------- spitfire/cheetah parser --------------------------

def cheetah_parser(source, line_ending, flat, _):
    """Outline parser for spitfire/cheetah templates.

    source      -- template text; lines are split on '\\n'
    line_ending -- unused
    flat        -- selects the return shape:
                   0 -> (out, docs.keys())
                   1 -> docs
                   2 -> (out, docs.keys(), docs)
                   anything else -> (out, docs.keys(), docs, todo)
    _           -- unused

    Lines starting with the block or def directive open an entry, a matching
    '#end def' / '#end block' closes the innermost open entry.  Other lines
    containing '#' are checked for todos and "--- label ---" comments.
    """
    bad_todo = _bad_todo
    _sp = _shared_parse
    _l = _label
    if USE_NEW:
        _l = _new_label
    _len = len
    # because of start/end stuff, for the new parser, we can generate good
    # line count information...we'll do that later
    new_blocks = set('#' + i for i in ('block', 'def')) # to bypass the cheetah parser detection
    todo = []
    labels = []
    docs = {}
    out = []
    stk = []
    lines = source.split('\n')
    for i, line in enumerate(lines):
        ls = line.lstrip()
        if not ls:
            continue
        lead = line.split(None, 1)
        if lead[0] in new_blocks:
            defn = ls.rstrip()
            if len(lead) < 2:
                name = ''
            elif lead[0][0] == '#' and lead[0][1:] == 'block': # to bypass the cheetah parser detection
                name = lead[1].split()[0]
            else:
                name = lead[1].split('#', 1)[0]
            # dotted path of the enclosing entries plus this name
            docname = '.'.join(i[1][2] for i in stk)
            if docname:
                docname += '.'
            docname += name
            docs.setdefault(docname, []).append(defn)
            stk.append((defn, (name.lower(), i+1, name), _len(line)-_len(ls), []))
        elif lead[0] == '#end' and len(lead) > 1 and (lead[1][:3] == 'def' or lead[1][:5] == 'block'):
            if not stk:
                # Unbalanced "#end": nothing is open, so there is nothing to
                # close.  Ignore it instead of failing on an empty stack.
                continue
            o = stk.pop()
            if stk:
                stk[-1][-1].append(o)
            else:
                out.append(o)
        elif '#' in line:
            # try to find a todo as best we can...
            pp = 0
            while line.find('#', pp) >= 0:
                pp = line.find('#', pp)
                if _sp(line[pp:], todo, i+1, bad_todo, 1+(line[pp+1:pp+2]=='#')):
                    break
                pp += 1+(line[pp+1:pp+2]=='#')
            else:
                # otherwise try to find a label of the form # --- label ---
                if '---' in line:
                    p = line.find('#')
                    _l(line[p+1:].lstrip('#>'), labels, i+1, _len(line)-_len(ls))
    # fold entries that were never closed into their parents
    while len(stk) > 1:
        o = stk.pop()
        stk[-1][-1].append(o)
    out.extend(stk)

    if USE_NEW:
        out, docs = exparse.translate_old_to_new(out, docs, len(lines))

    if labels:
        add_labels(out, labels)

    if flat == 0:
        return out, docs.keys()
    elif flat==1:
        return docs
    elif flat==2:
        return out, docs.keys(), docs
    else:
        return out, docs.keys(), docs, todo

#------------------------------- latex parser --------------------------------

def latex_parser(source, line_ending, flat, _):
    """Outline parser for LaTeX sources.

    source      -- document text
    line_ending -- separator used to split `source` into lines
    flat        -- selects the return shape:
                   0 -> (out, [])
                   1 -> {}
                   2 -> (out, [], {})
                   anything else -> (out, [], {}, todo)
    _           -- unused

    \\chapter, \\section, \\subsection and \\subsubsection lines form the
    nested outline; \\label lines are added below the innermost open section
    (or at top level when none is open).  Lines starting with '%' are checked
    for todos and "--- label ---" comments.
    """
    lines = source.split(line_ending)
    todo = []
    out = []
    stk = []
    line_no = 0
    sections = ('\\chapter', '\\section', '\\subsection', '\\subsubsection')

    def f(which, line, ls, line_no, stk):
        # Nesting level: chapter 0, section 1, subsection 2, subsubsection 3;
        # a \label gets 3 inside a section and -1 (top level) otherwise.
        if which in sections:
            ind = which.count('sub') + which.endswith('section')
        elif stk:
            ind = 3
        else:
            ind = -1
        # close every open entry at the same or a deeper level
        while stk and stk[-1][2] >= ind:
            it = stk.pop()
            if stk:
                stk[-1][-1].append(it)
            else:
                out.append(it)
        na = ls.find('{')
        ds = ls.find('}')
        if na > 0 and ds > 0:
            name = ls[na+1:ds].strip()
            if ind >= 0:
                stk.append((ls.rstrip(), (name.lower(), line_no, name), ind, []))
            else:
                out.append((ls.rstrip(), (name.lower(), line_no, name), 0, []))

    _sp = _shared_parse
    # add_labels() unpacks (line_no, indent, text) when USE_NEW is set, so
    # the label collector has to match, as in the Python and cheetah parsers.
    _l = _label
    if USE_NEW:
        _l = _new_label
    labels = []
    for line in lines:
        line_no += 1
        ls = line.lstrip()

        if ls[:1] == '%':
            if not _sp(ls, todo, line_no, start=1):
                _l(ls.strip('%>'), labels, line_no, len(line)-len(ls))
            continue
        elif ls[:6] == '\\label':
            f('\\label', line, ls, line_no, stk)
        for i in sections:
            if ls[:len(i)] == i:
                f(i, line, ls, line_no, stk)
                break

    # fold the still-open sections into their parents
    while len(stk)>1:
        a = stk.pop()
        stk[-1][-1].append(a)
    out.extend(stk)

    if USE_NEW:
        out, _ = exparse.translate_old_to_new(out, {}, len(lines))

    if labels:
        add_labels(out, labels)

    if flat == 0:
        return out, []
    elif flat==1:
        return {}
    elif flat==2:
        return out, [], {}
    else:
        return out, [], {}, todo

#---------------------------- [ht|x|sg]ml parser -----------------------------

#Are there any other non-opening tags?
# Tag prefixes (name followed by ' ' or '>', with and without a leading '/')
# for elements that are written without a closing tag.
no_ends = []
for i in ('br p input img area base basefont '
          'col frame hr isindex meta param').split():
    no_ends.extend([i+' ', i+'>', '/'+i+' ', '/'+i+'>'])

def ml_parser(source, line_ending, flat, _):
    """Collect todos and labels from single-line <!-- ... --> comments.

    source      -- markup text
    line_ending -- separator used to split `source` into lines
    flat        -- selects the return shape:
                   0 -> (out, [])
                   1 -> {}
                   2 -> (out, [], {})
                   anything else -> (out, [], {}, todo)
    _           -- unused

    Only the first '<!-- ' on a line is considered, and only when a ' -->'
    follows it on the same line.  No element outline is built: `out` holds
    nothing but the labels that add_labels() inserts.

    NOTE(review): labels are stored as 2-tuples by _label() and `out` starts
    empty; the USE_NEW branch of add_labels() expects neither -- verify the
    label path for markup files.
    """
    todo = []
    texp = todoexp
    bad_todo = _bad_todo
    _sp = _shared_parse
    labels = []
    for line_no, line in enumerate(source.split(line_ending)):
        posn1 = line.find('<!-- ')
        if posn1 == -1:
            continue
        # -1 when no terminator follows the opener; caught by the test below.
        posn2 = line.find(' -->', posn1)
        if posn1 > posn2:
            continue

        # match only inside the comment body
        r = texp.match(line, posn1+5, posn2)

        if not r:
            _label(line[posn1+5:posn2], labels, line_no+1)
        else:
            _sp(r.group(), todo, line_no+1, bad_todo, 0, ml=1)

    out = []
    if labels:
        add_labels(out, labels)

    if flat == 0:
        return out, []
    elif flat==1:
        return {}
    elif flat==2:
        return out, [], {}
    else:
        return out, [], {}, todo

#--------------------------- other misc functions ----------------------------

def preorder(h):
    """Yield (name, entry) for every entry of outline list `h`, depth first.

    An entry is (defn, (lower, line_no, name), indent, children); parents are
    yielded before their children.  Recursion depth equals nesting depth.
    """
    #uses call stack; do we care?
    for entry in h:
        name = entry[1][2]
        yield name, entry
        for nested in preorder(entry[3]):
            yield nested

def _preorder(h):
    """Yield the names of entry `h` and all of its descendants, depth first.

    Unlike preorder(), `h` is a single entry, not a list of entries, and only
    the names are yielded.
    """
    #uses explicit stack, may be slower, no limit to depth
    pending = [h]
    while pending:
        entry = pending.pop()
        yield entry[1][2]
        # push children reversed so the first child is popped first
        pending.extend(reversed(entry[3]))

# Set-like dicts: characters that may start a name, and characters that may
# appear anywhere in a name.
_name_start = dict.fromkeys(iter('abcdefghijklmnopqrstuvwxyzABCDEFGHIJLKMNOPQRSTUVWXYZ_'))
_name_characters = dict(_name_start)
_name_characters.update(dict.fromkeys(iter('0123456789')))

def get_last_word(line):
    """Return `line` if the whole of it is a single name, else ''.

    A name consists only of letters, digits and '_' and does not start with
    a digit.  An empty `line` yields '' (it used to raise because the scan
    loop never ran and its index variable was never bound).

    NOTE(review): despite the function name, a trailing word that is preceded
    by other text ("x = foo") yields ''; only the whole-line case was ever
    implemented.  That behaviour is kept as is -- confirm whether callers
    want the trailing word instead.
    """
    if not line:
        return ''
    nch = _name_characters
    for ch in line:
        if ch not in nch:
            # some character cannot be part of a name
            return ''
    if line[0] in _name_start:
        #handles a word that is the whole line
        return line
    return ''

'''
([('def foo(x, y=6, *args, **kwargs)', ('foo', 5, 'foo'), 0, []),
  ('class bar',
   ('bar', 9, 'bar'),
   0,
   [('def __init__(self, foo=a, bar={1:2})',
     ('__init__', 10, '__init__'),
     4,
     [])]),
  ('class Baz(object, int)',
   ('baz', 13, 'Baz'),
   0,
   [('def __init__(self, bar=(lambda:None))',
     ('__init__', 14, '__init__'),
     4,
     [('def goo()', ('goo', 16, 'goo'), 8, [])])])],
 '''

if __name__ == '__main__':
    # Smoke test: outline a small sample module and pretty-print the result.
    sample = '''import a, b, c

#todo: hello world

def foo(x, y=6, *args,
        **kwargs):
    return None

class bar:
    #--- this is also a label ---
    def __init__(self, foo=a, bar={1:2}):
        #--- this is a label! ---
        """blah!"""

class Baz(object, int):
    def __init__(self, bar=(lambda:None)):
        """blah 2"""
        def goo():
            pass
'''
    parsed = slower_parser(sample, '\n', 3, lambda:None)
    pprint.pprint(parsed)
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.