BuildMatcher.py :  » Development » Frowns » frowns » build » lib » frowns » smiles_parsers » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Development » Frowns 
Frowns » frowns » build » lib » frowns » smiles_parsers » BuildMatcher.py
"""
SMARTS Primitives taken from daylight's web site
http://www.daylight.com

SMARTS Atomic primitives

Symbol   Symbol name            Atomic property            Default
                                requirements
======   ===========            ============                ===========                                
*        wildcard               any atom                   (no default) 
a        aromatic               aromatic                   (no default) 
A        aliphatic              aliphatic                  (no default) 
D<n>     degree <n>             explicit connections       (no default) 
H<n>     total-H-count <n>      attached hydrogens         exactly one 
h<n>     implicit-H-count <n>   implicit hydrogens         exactly one 
R<n>     ring membership in <n> SSSR rings                 any ring atom 
r<n>     ring size              in smallest SSSR ring      any ring atom 
                                of size <n>   
v<n>     valence                total bond order <n>       (no default) 
X<n>     connectivity <n>       total connections          (no default) 
- <n>    negative charge -<n>   charge                     -1 charge (-- is -2, etc) 
+<n>     positive charge +<n>   formal charge              +1 charge (++ is +2, etc) 
#n       atomic number          atomic number <n>          (no default) 
@        chirality              anticlockwise              anticlockwise, default class 
@@       chirality              clockwise                  clockwise, default class 
@<c><n>  chirality              chiral class <c>           chirality <n> (no default)
@<c><n>? chiral or              <c><n> or unspecified      (no default)
         unspec chirality 
<n>      atomic mass            explicit atomic mass       unspecified mass

SMARTS Bond Primitives
Symbol   Atomic property           requirements 
======   ===============           ============
-        single bond               (aliphatic) 
/        directional single bond   "up" 
\        directional single bond   "down" 
/?       directional bond          "up or unspecified" 
\?       directional bond          "down or unspecified" 
=        double bond 
#        triple bond 
:        aromatic bond 
~        any bond (wildcard) 
@        any ring bond
"""

# Build a simple Molecule Matcher object given the events from the
# Smarts tokenizer.

# Notes:
#  - A lot of optimization could be done on the graph
#  - The "match" functions don't work and shouldn't be done this way
#  - I think the recursive SMARTS need to be done first, once, and
#      the results used for the higher-level matches
#  - I'm not sure about the zero-level SMARTS, but what I have does
#      match the Daylight toolkit behaviour
#  - This is a first-draft, proof of principle parser.

import string
import Handler

class PropertyMatch:
    """Base matcher: compare one named attribute of an object to a value.

    Subclasses set the class attribute ``property`` to the name of the
    attribute to read; ``value`` is the expected value given at construction.
    """
    property = None

    def __init__(self, value):
        self.value = value

    def match(self, atom):
        """Return whether ``atom``'s ``property`` attribute equals ``value``."""
        actual = getattr(atom, self.property)
        return self.value == actual

    def __str__(self):
        description = (self.property, self.value)
        return "%s == %s" % description

class BooleanMatch(PropertyMatch):
    """Property matcher whose expected value is a 0/1 flag (default 1)."""

    def __init__(self, value = 1):
        assert value in (0, 1)
        PropertyMatch.__init__(self, value)

    def __str__(self):
        # Same wording as before, with the false case handled first.
        if not self.value:
            return "%s is false" % (self.property, )
        return "%s is true" % (self.property, )

class SymbolMatch(PropertyMatch):
    """Match on the ``symbol`` attribute of the object being tested."""
    property = "symbol"

class AromaticMatch(PropertyMatch):
    """Match on the ``aromatic`` attribute (callers in this file pass 0 or 1)."""
    property = "aromatic"

class AtomicNumberMatch(PropertyMatch):
    """Match on the ``number`` attribute of the object being tested."""
    property = "number"

class WeightMatch(PropertyMatch):
    """Match on the ``weight`` attribute of the object being tested."""
    property = "weight"

class ChargeMatch(PropertyMatch):
    """Match on the ``charge`` attribute of the object being tested."""
    property = "charge"

class TotalHMatch(PropertyMatch):
    """Match on the ``explicit_hcount`` attribute.

    NOTE(review): the class is named for the total-H-count primitive but
    compares ``explicit_hcount`` -- confirm this is the intended attribute.
    """
    property = "explicit_hcount"

class ImplicitHMatch(PropertyMatch):
    """Match on the ``imp_hcount`` attribute of the object being tested."""
    property = "imp_hcount"

class DegreeMatch(PropertyMatch):
    """Match on the ``degree`` attribute of the object being tested."""
    property = "degree"

class RingMembershipMatch(PropertyMatch):
    """Matcher for a ring-membership count.

    NOTE(review): "XXX RingMembership" is not a usable attribute name, so
    ``match`` cannot succeed on an ordinary object; presumably a placeholder
    until the real attribute is chosen -- TODO confirm.
    """
    property = "XXX RingMembership"

class BooleanRingMembershipMatch(BooleanMatch):
    """Boolean matcher for "is a ring atom".

    NOTE(review): "XXX2 BooleanRingMembership" is not a usable attribute
    name; presumably a placeholder -- TODO confirm.
    """
    property = "XXX2 BooleanRingMembership"

class RingSizeMatch(PropertyMatch):
    """Matcher for a ring size.

    NOTE(review): "XXX RingSize" is not a usable attribute name; presumably
    a placeholder -- TODO confirm.
    """
    property = "XXX RingSize"

class ValenceMatch(PropertyMatch):
    """Matcher for a valence value.

    NOTE(review): "XXX Valence" is not a usable attribute name; presumably
    a placeholder -- TODO confirm.
    """
    property = "XXX Valence"

class ConnectivityMatch(PropertyMatch):
    """Matcher for a connectivity value.

    NOTE(review): "XXX Connectivity" is not a usable attribute name;
    presumably a placeholder -- TODO confirm.
    """
    property = "XXX Connectivity"

class ChiralClassMatch(PropertyMatch):
    """Matcher for a chiral class.

    NOTE(review): "XXX ChiralClass" is not a usable attribute name;
    presumably a placeholder -- TODO confirm.
    """
    property = "XXX ChiralClass"

class ChiralCountMatch(PropertyMatch):
    """Matcher for a chirality count.

    NOTE(review): "XXX ChiralCount" is not a usable attribute name;
    presumably a placeholder -- TODO confirm.
    """
    property = "XXX ChiralCount"

class RecursiveMatcher:
    """Matcher holding the query molecule of a recursive SMARTS.

    ``mol`` must provide a ``dump()`` method returning a multi-line string;
    it is only used for display.  Matching itself is not implemented.
    """

    def __init__(self, mol):
        self.mol = mol

    def match(self, atom):
        """Not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def __str__(self):
        # str methods replace the deprecated string.split/string.join
        # functions; they give the same result on Python 2 and still exist
        # on Python 3.
        dumped = self.mol.dump()
        tagged = ["recursive> " + line for line in dumped.split("\n")]
        return "atom in\n%s\n" % ("\n".join(tagged),)

class BondSymbolMatch(PropertyMatch):
    """Match on the ``symbol`` attribute of a bond (e.g. "-" for the implicit bond)."""
    property = "symbol"

class NotMatch:
    """Logical negation of another matcher.

    ``child`` is any object with a ``match(obj)`` method.
    """

    def __init__(self, child):
        self.child = child

    def match(self, obj):
        """Return True when ``child`` does NOT match ``obj``."""
        # The original returned the child's result unchanged, so "!x"
        # matched exactly what "x" matched.
        return not self.child.match(obj)

    def __str__(self):
        return "not (%s)" % (self.child,)

class AndMatch:
    """Conjunction of two matchers; the right side is skipped if the left fails."""

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def match(self, obj):
        """Return the left result when it is false, otherwise the right result."""
        left_result = self.left.match(obj)
        if not left_result:
            return left_result
        return self.right.match(obj)

    def __str__(self):
        operands = (self.left, self.right)
        return "AND(%s, %s)" % operands
    
class OrMatch:
    """Disjunction of two matchers; the right side is skipped if the left succeeds."""

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def match(self, obj):
        """Return the left result when it is true, otherwise the right result."""
        left_result = self.left.match(obj)
        if left_result:
            return left_result
        return self.right.match(obj)

    def __str__(self):
        operands = (self.left, self.right)
        return "OR(%s, %s)" % operands

# Integer tags that stand for boolean operators inside an
# ExpressionList's ``matchers`` list, interleaved with matcher objects.
# ExpressionList.make_matcher folds them in this order: unary not,
# strong and, or, weak and.
bool_unary_not = 76
bool_strong_and = 77
bool_or = 78
bool_weak_and = 79
# Operators that take a left and a right operand.
binary_operators = [bool_strong_and, bool_or, bool_weak_and]
# Every operator tag, binary or unary.
boolean_operators = binary_operators + [bool_unary_not]
# Operator token text -> operator tag.
text_to_bool = {
    "&": bool_strong_and,
    ",": bool_or,
    ";": bool_weak_and,
    "!": bool_unary_not,
    }

class ExpressionList:
    """Flat, in-order list of matchers and boolean operator tags.

    Matchers and operator tags (``bool_unary_not``, ``bool_strong_and``,
    ``bool_or``, ``bool_weak_and``) are appended as they are parsed;
    ``make_matcher`` then folds the list into a single matcher tree.
    An instance is false while it holds no entries.
    """

    def __init__(self):
        self.matchers = []

    def __nonzero__(self):
        return len(self.matchers) != 0

    # Python 3 consults __bool__ rather than __nonzero__; without this alias
    # an empty expression would be truthy there and the "no bond given"
    # checks elsewhere in this module would never fire.
    __bool__ = __nonzero__

    def add_matcher(self, obj):
        """Append a matcher, inserting an implicit strong AND if needed."""
        if self.matchers and self.matchers[-1] not in boolean_operators:
            self.matchers.append(bool_strong_and)
        self.matchers.append(obj)

    def add_operator(self, op):
        """Append an operator tag.

        A unary not that directly follows a matcher (as in "C!R") gets the
        same implicit strong AND that ``add_matcher`` inserts between two
        adjacent matchers; without it ``make_matcher`` could not reduce the
        list to a single matcher.
        """
        assert op in binary_operators or op == bool_unary_not
        if __debug__:
            if self.matchers:
                if op in binary_operators:
                    assert self.matchers[-1] not in binary_operators
            else:
                assert op not in binary_operators
        if (op == bool_unary_not and self.matchers
                and self.matchers[-1] not in boolean_operators):
            self.matchers.append(bool_strong_and)
        self.matchers.append(op)

    def _fold_binary(self, matchers, op, combine):
        """Replace every ``left op right`` triple in place, left to right."""
        i = 1
        while i < len(matchers):
            if matchers[i] == op:
                matchers[i-1:i+2] = [combine(matchers[i-1], matchers[i+1])]
            else:
                # Only advance when nothing was folded: after a fold the next
                # operator (if any) has moved into position i.
                i = i + 1

    def make_matcher(self):
        """Fold the list into one matcher and return it.

        Operators are applied in the order: unary not, strong and, or,
        weak and.  Raises ``AssertionError`` if the list does not reduce to
        exactly one matcher (for example, when it is empty).
        """
        matchers = self.matchers[:]
        # Fold unary nots from right to left so that "!!x" becomes
        # not(not(x)); scanning left to right would wrap the second "!"
        # tag itself instead of the matcher it applies to.
        i = len(matchers) - 1
        while i >= 0:
            if matchers[i] == bool_unary_not:
                matchers[i:i+2] = [NotMatch(matchers[i+1])]
            i = i - 1
        self._fold_binary(matchers, bool_strong_and, AndMatch)
        self._fold_binary(matchers, bool_or, OrMatch)
        self._fold_binary(matchers, bool_weak_and, AndMatch)
        assert len(matchers) == 1, matchers
        return matchers[0]

class AtomExpression(ExpressionList):
    """Expression list collecting the primitives of one bracketed atom."""
    pass

class BondExpression(ExpressionList):
    """Expression list collecting the bond primitives that precede an atom."""
    pass
        

class Atom:
    """Query atom: a matcher, a component number and its attached bonds.

    ``matcher`` is an object with a ``match(atom)`` method, such as the
    ``*Match`` classes in this module.
    """

    def __init__(self, matcher, component_number):
        self.matcher = matcher
        self.component_number = component_number
        self.bonds = []

    def match(self, atom):
        """Return the matcher's verdict for ``atom``."""
        # Matchers expose .match() and are not callable, so calling
        # self.matcher(atom) directly (as the original did) always failed.
        return self.matcher.match(atom)

    def dump_info(self, bonds):
        """Describe this atom; bonds are shown as indices into ``bonds``."""
        bond_ids = [str(bonds.index(bond)) for bond in self.bonds]
        return "[%d] <%s> bonds = [%s]" % (self.component_number,
                                           str(self.matcher),
                                           ", ".join(bond_ids))


class Bond:
    """Query bond: a matcher, a component number and its two end atoms.

    ``matcher`` is an object with a ``match(bond)`` method; ``atoms`` is
    filled in by whoever creates the bond.
    """

    def __init__(self, matcher, component_number):
        self.matcher = matcher
        self.component_number = component_number
        self.atoms = []

    def match(self, bond):
        """Return the matcher's verdict for ``bond``."""
        # Matchers expose .match() and are not callable, so calling
        # self.matcher(bond) directly (as the original did) always failed.
        return self.matcher.match(bond)

    def dump_info(self, atoms):
        """Describe this bond; end atoms are shown as indices into ``atoms``."""
        first = atoms.index(self.atoms[0])
        second = atoms.index(self.atoms[1])
        return "[%d] %d <%s> %d" % (self.component_number, first,
                                    self.matcher, second)
        

class Molecule:
    """Query graph: the lists of atoms and bonds built by the parser."""

    def __init__(self):
        self.atoms = []
        self.bonds = []

    def dump(self):
        """Return a multi-line, human-readable listing of atoms and bonds.

        Each atom and bond line starts with its index in ``self.atoms`` or
        ``self.bonds`` followed by that object's ``dump_info`` text.
        """
        lines = ["atom_id [component_number] <matcher> "
                 "bonds = [list of bond ids]"]
        for atom_id, atom in enumerate(self.atoms):
            lines.append(str(atom_id) + " " + atom.dump_info(self.bonds))
        # The header's closing bracket was previously mistyped as ">".
        lines.append("bond_id [component_number] "
                     "atom1_id <matcher> atom2_id")
        for bond_id, bond in enumerate(self.bonds):
            lines.append(str(bond_id) + " " + bond.dump_info(self.atoms))
        # str.join replaces the deprecated string.join function.
        return "\n".join(lines)

    def __del__(self):
        # Atoms hold their bonds and bonds hold their atoms; dropping both
        # attributes removes those cycles (presumably so that reference
        # counting alone can reclaim the graph).
        for atom in self.atoms:
            del atom.bonds
        for bond in self.bonds:
            del bond.atoms

def get_symbol_aromatic(text):
    """Split an element token into ``(symbol, aromatic_flag)``.

    A token starting with one of the lowercase letters c, n, o, s or p is
    treated as aromatic: its symbol is returned capitalised with flag 1.
    Any other token is returned unchanged with flag 0.
    """
    if text[0] in "cnosp":
        # capitalize() equals upper() for one-letter symbols and yields a
        # proper element symbol ("Se", not "SE") for two-letter ones.  It
        # also replaces the deprecated string.upper function.
        return text.capitalize(), 1
    return text, 0

def normalize_closure(text):
    """Return the ring-closure number for a token such as "1" or "%12"."""
    digits = text
    if digits.startswith("%"):
        # Drop the leading percent marker.
        digits = digits[1:]
    return int(digits)

class BuildMatcher(Handler.TokenHandler):
    """Token handler that builds a query ``Molecule`` from SMARTS tokens.

    ``add_token`` dispatches every token to the ``do_<field>`` method named
    after the token's field.  Atom handlers create ``Atom`` objects, bond
    handlers accumulate a pending ``BondExpression``, and ``add_atom``
    connects each new atom to the previous one.  While a recursive SMARTS
    is being parsed, the attributes named in ``_save_state`` are stacked
    and a fresh molecule is built.
    """
    # Attributes saved when a recursive SMARTS opens and restored when it
    # closes.
    _save_state = ("closures", "mol", "_atom_expr", "_prev_atoms",
                   "_pending_bond_expr")

    def begin(self):
        """Initialise all parsing state."""
        self._reset()
        self.component_number = 0
        self.recursive_smarts_stack = []

    def _reset(self):
        """Start a fresh molecule with no open closures, atoms or bonds."""
        self.closures = {}
        self.mol = Molecule()
        self._atom_expr = None
        self._prev_atoms = []

        # None occurs after a '.'
        self._pending_bond_expr = None

    def _verify(self):
        """Raise ``AssertionError`` if the current molecule is incomplete."""
        if len(self._prev_atoms) >= 2:
            raise AssertionError("Missing ')'")
        if self._pending_bond_expr:
            raise AssertionError("Missing an atom expression after the bond")
        if self.closures:
            # list(...) keeps the message identical on Python 2 and 3.
            raise AssertionError("Missing closures for %s" %
                                 (list(self.closures.keys()),))

    def end(self):
        """Check that parsing finished in a consistent state."""
        self._verify()
        if self.recursive_smarts_stack:
            raise AssertionError("Inside of a recursive SMARTS")

    def add_token(self, field, pos, text):
        """Dispatch a token to its ``do_<field>`` handler (``pos`` is unused)."""
        getattr(self, "do_" + field)(text)

    def _int_value(self, digits):
        """Return ``digits`` as an int, or unchanged if it is not a number.

        The matchers compare with ``==``, and the other handlers
        (total H count, charges, chiral counts) already pass ints, so
        numeric primitives must be ints too.  Non-numeric text is passed
        through as before rather than raising.
        """
        try:
            return int(digits)
        except ValueError:
            return digits

    def do_open_recursive_smarts(self, text):
        """Stack the current state and start a nested molecule."""
        d = {}
        for k in self._save_state:
            d[k] = getattr(self, k)
        self.recursive_smarts_stack.append(d)
        self._reset()

    def do_close_recursive_smarts(self, text):
        """Finish the nested molecule and add it to the outer atom expression."""
        self._verify()
        mol = self.mol
        self._reset()
        d = self.recursive_smarts_stack.pop()
        for k, v in d.items():
            setattr(self, k, v)
        self._atom_expr.add_matcher(RecursiveMatcher(mol))

    def add_atom(self, atom):
        """Append ``atom`` to the molecule, bonding it to the previous atom.

        No bond is made when the pending bond expression is None (at the
        start, or after a '.').  An empty pending expression stands for
        the implicit bond.
        """
        if self._pending_bond_expr is not None and not self._pending_bond_expr:
            # Implicit single or aromatic bond
            self._pending_bond_expr = BondExpression()
            self._pending_bond_expr.add_matcher(BondSymbolMatch("-"))

        if self._pending_bond_expr is not None:
            bond = Bond(self._pending_bond_expr.make_matcher(),
                        self.component_number)
            prev_atom = self._prev_atoms[-1]
            bond.atoms[:] = [prev_atom, atom]
            prev_atom.bonds.append(bond)
            atom.bonds.append(bond)
            self.mol.bonds.append(bond)
        self._pending_bond_expr = BondExpression()
        # The top of _prev_atoms is the atom the next bond attaches to.
        if not self._prev_atoms:
            self._prev_atoms.append(atom)
        else:
            self._prev_atoms[-1] = atom
        self.mol.atoms.append(atom)

    def do_raw_atom(self, text):
        symbol, aromatic = get_symbol_aromatic(text)
        atom_match = AndMatch(SymbolMatch(symbol), AromaticMatch(aromatic))
        self.add_atom(Atom(atom_match,
                           self.component_number))

    def do_raw_aromatic(self, text):
        self.add_atom(Atom(AromaticMatch(1),
                           self.component_number))

    def do_raw_aliphatic(self, text):
        # The original referenced an undefined AliphaticMatch class;
        # do_aliphatic expresses "aliphatic" as AromaticMatch(0).
        self.add_atom(Atom(AromaticMatch(0),
                           self.component_number))

    # The four handlers below were "1/0" placeholders; they now fail with
    # an explicit message instead of a ZeroDivisionError.
    def do_raw_b_unknown(self, text):
        raise NotImplementedError("unsupported raw token %r" % (text,))

    def do_raw_f_unknown(self, text):
        raise NotImplementedError("unsupported raw token %r" % (text,))

    def do_raw_h_unknown(self, text):
        raise NotImplementedError("unsupported raw token %r" % (text,))

    def do_raw_i_unknown(self, text):
        raise NotImplementedError("unsupported raw token %r" % (text,))

    def do_raw_r_unknown(self, text):
        # I think this is right
        self.add_atom(Atom(BooleanRingMembershipMatch(),
                           self.component_number))

    def do_raw_R_unknown(self, text):
        # I think this is right
        self.add_atom(Atom(BooleanRingMembershipMatch(),
                           self.component_number))

    def do_open_bracket(self, text):
        self._atom_expr = AtomExpression()

    def do_atom_not(self, text):
        self._atom_expr.add_operator(text_to_bool[text])

    def do_atom_binary(self, text):
        self._atom_expr.add_operator(text_to_bool[text])

    def do_atomic_number(self, text):
        # text[1:] skips the one-character prefix of the token.
        self._atom_expr.add_matcher(
            AtomicNumberMatch(self._int_value(text[1:])))

    def do_weight(self, text):
        self._atom_expr.add_matcher(WeightMatch(self._int_value(text)))

    def do_element(self, text):
        symbol, aromatic = get_symbol_aromatic(text)
        atom_matcher = AndMatch(SymbolMatch(symbol),
                                AromaticMatch(aromatic))
        self._atom_expr.add_matcher(atom_matcher)

    def do_chiral_count(self, text):
        self._atom_expr.add_matcher(
            ChiralCountMatch(self._int_value(text[1:])))

    def do_chiral_named(self, text):
        self._atom_expr.add_matcher(ChiralClassMatch(text[1:3]))
        self._atom_expr.add_matcher(ChiralCountMatch(int(text[3:])))

    def do_chiral_symbols(self, text):
        self._atom_expr.add_matcher(ChiralCountMatch(len(text)))

    def do_aromatic(self, text):
        self._atom_expr.add_matcher(AromaticMatch(1))

    def do_aliphatic(self, text):
        self._atom_expr.add_matcher(AromaticMatch(0))

    def do_total_hcount(self, text):
        if text == "H":
            count = 1
        else:
            count = int(text[1:])
        self._atom_expr.add_matcher(TotalHMatch(count))

    def do_imp_hcount(self, text):
        # A bare "h" means exactly one implicit hydrogen (see the table in
        # the module docstring), mirroring do_total_hcount.
        if text == "h":
            count = 1
        else:
            count = self._int_value(text[1:])
        self._atom_expr.add_matcher(ImplicitHMatch(count))

    def do_degree(self, text):
        self._atom_expr.add_matcher(DegreeMatch(self._int_value(text[1:])))

    def do_ring_membership(self, text):
        if text == "R":
            self._atom_expr.add_matcher(BooleanRingMembershipMatch())
        else:
            self._atom_expr.add_matcher(
                RingMembershipMatch(self._int_value(text[1:])))

    def do_ring_size(self, text):
        if text == "r":
            self._atom_expr.add_matcher(BooleanRingMembershipMatch())
        else:
            self._atom_expr.add_matcher(
                RingSizeMatch(self._int_value(text[1:])))

    def do_valence(self, text):
        self._atom_expr.add_matcher(ValenceMatch(self._int_value(text[1:])))

    def do_connectivity(self, text):
        self._atom_expr.add_matcher(
            ConnectivityMatch(self._int_value(text[1:])))

    def do_positive_count(self, text):
        self._atom_expr.add_matcher(ChargeMatch(int(text)))

    def do_positive_symbols(self, text):
        self._atom_expr.add_matcher(ChargeMatch(len(text)))

    def do_negative_count(self, text):
        self._atom_expr.add_matcher(ChargeMatch(int(text)))

    def do_negative_symbols(self, text):
        self._atom_expr.add_matcher(ChargeMatch(-len(text)))

    def do_close_bracket(self, text):
        self.add_atom(Atom(self._atom_expr.make_matcher(),
                           self.component_number))
        self._atom_expr = None

    def do_bond(self, text):
        self._pending_bond_expr.add_matcher(BondSymbolMatch(text))

    def do_bond_not(self, text):
        self._pending_bond_expr.add_operator(text_to_bool[text])

    def do_bond_binary(self, text):
        self._pending_bond_expr.add_operator(text_to_bool[text])

    def do_dot(self, text):
        assert not self._pending_bond_expr, "not possible"
        self._pending_bond_expr = None

    def do_closure(self, text):
        """Open or close a ring closure.

        The first time a closure number is seen, the current atom and the
        pending bond expression are remembered.  The second time, a bond is
        created between the remembered atom and the current one, using
        whichever end supplied a bond expression (the implicit bond if
        neither did).  Supplying one at both ends is not implemented.
        """
        num = normalize_closure(text)
        # "in" replaces the Python-2-only dict.has_key.
        if num in self.closures:
            prev_atom, prev_bond_expr = self.closures[num]
            del self.closures[num]

            # Because things like ".1" or "1C" can't occur
            assert self._pending_bond_expr is not None, "Can't happen"
            assert prev_bond_expr is not None, "Can't happen either"

            if not prev_bond_expr:
                if not self._pending_bond_expr:
                    bond_matcher = BondSymbolMatch("-")
                else:
                    bond_matcher = self._pending_bond_expr.make_matcher()
            else:
                if not self._pending_bond_expr:
                    bond_matcher = prev_bond_expr.make_matcher()
                else:
                    raise NotImplementedError("Need to check if they match")

            bond = Bond(bond_matcher, self.component_number)

            atom = self._prev_atoms[-1]
            if prev_atom is atom:
                raise AssertionError("cannot close a ring with itself")
            bond.atoms[:] = [prev_atom, atom]
            prev_atom.bonds.append(bond)
            atom.bonds.append(bond)
            self.mol.bonds.append(bond)
        else:
            self.closures[num] = (self._prev_atoms[-1], self._pending_bond_expr)
        self._pending_bond_expr = BondExpression()

    def do_open_branch(self, text):
        # Duplicate the top so the branch can advance its own copy while
        # the atom it started from stays underneath.
        self._prev_atoms.append(self._prev_atoms[-1])

    def do_close_branch(self, text):
        self._prev_atoms.pop()

    def do_open_zero(self, text):
        # I think this component thing is right
        pass

    def do_close_zero(self, text):
        self._verify()
        self.component_number = self.component_number + 1
    
def test():
    """Tokenize a few sample patterns and print the graph built for each."""
    # Kept function-local, as in the original.
    import Smarts
    h = BuildMatcher()
    for smi in ["C",
                "CC",
                "c",
                "C(N)O",
                "[O]",
                "c1ccccc1",
                ]:
        # Each print takes a single pre-formatted string, so it produces
        # the same output as a Python 2 print statement and as a Python 3
        # function call (the old statements were a SyntaxError on 3).
        print("*" * 44)
        print("--> %s" % (smi,))
        Smarts.tokenize(smi, h)
        print(h.mol.dump())

if __name__ == "__main__":
    test()
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.