sdfile.py :  » Development » Frowns » frowns » mdl_parsers » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Development » Frowns 
Frowns » frowns » mdl_parsers » sdfile.py
"""file reader for
reading sdf (MDL) files
"""
from frowns.Atom import Atom
from frowns.Bond import Bond
from frowns.Molecule import Molecule
from frowns.perception import TrustedSmiles,RingDetection,BasicAromaticity
import re, os

# Atom format
#           111111111122222222223333333333444444444455555555556666666666
# 0123456789012345678901234567890123456789012345678901234567890123456789# 
#"xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee"
#"   -0.0187    1.5258    0.0104 C   0  0  0  0  0",
# x
# y
# z
#
# from the sdfile format distributed by mdl
# aaa atom symbol
# dd  mass difference -3,-2,-1,0,1,2,3,4 (0 if beyond these limits)
# ccc charge 0=uncharged or other than 1=+3, 2=+2, 3=+1, 4=doublet ^, 5=-1 6=-1 7=-3
# sss atom stereo parity 0 not stereo 1=odd, 2=even 3=either or unmarked stereo center
# hhh implicit hydrogen count + 1  1=HO, 2=H1, 3=H2, 4=H3, 5=H5 
# bbb
# vvv

def parse_atom(line):
    x, y, z = map(float, (line[0:10], line[10:20], line[20:30]))

    symbol =   line[31:34].strip()
    
    massdiff = line[34:36].strip()
    charge =   line[36:39].strip()
    stereo =   line[39:42].strip()
    hcount =   line[42:45].strip()

    if massdiff:
        massdiff = int(massdiff)
    else:
        massdiff = 0
        
    if charge:
        charge = int(charge)
        if charge: charge = 4 - charge
    else:
        charge = 0

    # if the hcount is set for the atom, 
    # we need to know this later
    #  i.e. the hcounts are explicit and can't be
    #  changed
    hcount_specified = 0
    if hcount:
        hcount = int(hcount)
        # an hcount of zero means it is not set
        # XXX FIX ME
        #  I don't have a way of forcing hcount to remain 0
        if hcount:
            hcount -= 1
            hcount_specified = 1
    else:
        hcount = 0
        
    # ignore stereo now
    stereo = 0
    return x, y, z, symbol, massdiff, charge, stereo, hcount, hcount_specified
#
# Bond Parser
#             11111111112 
#   012345678901234567890
#   111222tttsssxxxrrrccc
#
# 111 first atom index
# 222 second atom index
# ttt bond type 1=Single, 2=Double, 3=Triple, 4=Aromatic, 
#               Query types 5=Single or double, 6=Single or aromatic, 7=Double or aromatic, 8=Any
# sss bond stereo 0=Not stereo, 1=Up, 4=Either, 6=Down, Double Bonds
#                 0 = Use x-y-z coords from atom block to determine cis/trans
#                 3 = Cis or trans (either) double bond)

def parse_bond(line):
    atom1, atom2, type, remainder = line[0:3], line[3:6], line[6:9], line[9:]
    atom1 = int(atom1)
    atom2 = int(atom2)
    type = int(type)
    return atom1, atom2, type, remainder

BOND_SYMBOL = {1:('-', 1, 1, 1),
               2:('=', 2, 2, 1),
               3:('#', 3, 3, 1),
               4:('', 4, 4, 1)}

class MolReaderError(Exception):
    pass

class MolReader:
    def __init__(self, file, stripHydrogens=1):
        self.file = file
        self.iterator = iter(file)
        self._lastlines = []        # lastlines stores the original lines that made up the
                                            #  last molecule read
        self.stripHydrogens = stripHydrogens
        self._lastline = None
        
    def _readline(self, endOk=0):
        """internal readline function, if endOk is 0 then upon an end of
        line a MolReaderError is generated"""
        if self._lastline:
            res = self._lastline
            self._lastline = None
            return res

        try:
            line = self.iterator.next()
            self._lastlines.append(line)
        except StopIteration:
            line = None
            

        if not line and not endOk:
            raise MolReaderError, "Unexpected end of file"

        return line

    def _pushback(self, line):
        self._lastline = line
        
    def _clear(self):
        """Clear the _lastlines buffer"""
        self._lastlines = []

    def get_text(self):
        """->text that formed the last molecule read"""
        return "".join(self._lastlines)

    def get_lines(self):
        """->the lines of text that formed the last molecule read"""
        return self._lastlines
    
    def _read_to_next(self):
        readline = self._readline
        endOfMol = self._endOfMol
        
        while 1:
            line = readline(endOk=1)
            if not line:
                break

            if endOfMol(line):
                break


    def _endOfMol(self, line):
        """(line)-> return 1 if the line signifies the end of molecule
        0 otherwise"""
        if line[0:4] == "$$$$":
            return 1
        return 0

    def _readFields(self,
                    pattern=re.compile(">\s+<([^>]+)>\s+\(*([^)]*)")
                    ):
        """Read the database field component at the end of a molecule
        record.  Sets a dictionary of key->values"""
        readline = self._readline
        endOfMol = self._endOfMol
        
        fields = {}

        name = None
        while 1:
            # by setting endOk = 1 we can read mol files as
            # well as sdfiles
            line = readline(endOk=1)
            if not line:
                break
            
            if endOfMol(line):
                break
            elif line[0] == ">":
                # we have a data line so get the field
                #  and potentialID values
                if not endOfMol(line):
                    res = pattern.match(line)

                    if res:
                        field, potentialID = res.groups()
                    else:
                        field, potentialID = None, None

                if name is None:
                    name = potentialID
                elif name != potentialID:
                    name = "UNKNOWN (id clash)"
                    
                # read the data from the next line
                if field:
                    line = readline().strip()
                    data = []
                    while line:
                        data.append(line)
                        line = readline().strip()
                        
                    if not endOfMol(line):
                        fields[field] = os.linesep.join(data)
                    else:
                        break
                    
                if endOfMol(line):
                    break

        if not endOfMol(line):
            # by setting endok = 1 here we can read
            # mol files as well as sd files
            line = readline(endOk=1)
                 
        return fields, name

    def readMProps(self):
        readline = self._readline
        while 1:
            line = readline()
            if line[0] == ">":
                # need to push back the last line
                self._pushback(line)
                return
            
            if line[0:6] == "M  END":
                break

            if line[0:6] == "M  CHG":
                # parse the charge line and add charges
                # to the correct atoms
                groups = line[6:].split()[1:]
                index = 0
                while index < len(groups):
                    atomIndex = int(groups[index]) - 1
                    atom = self.atoms[atomIndex]
                    charge = int(groups[index+1])
                    self.atoms[atomIndex].charge = charge
                    index += 2
        
    def read_one(self):
        """Read one molecule from the sd file"""
        self._clear()
        readline = self._readline
        endOfMol = self._endOfMol
        try:
            name = readline().strip()
            userLine = readline()
            comment = readline()
            line = readline()
        except MolReaderError, msg:
            if str(msg) == "Unexpected end of file":
                return None            
            raise

        try:
            numAtoms, numBonds = map(int, (line[0:3], line[3:6]))
        except ValueError: # XXX FIX ME - trap exceptions and stuff
            print "cannot parse atom, bond line"
            self._read_to_next()
            return None
        atoms = self.atoms = []

        for index in range(numAtoms):
            line = readline()
            try:
                x,y,z,symbol,mass,charge,stereo,hcount,hcount_fixed = parse_atom(line)
            except: # XXX FIX ME - trap exceptions and stuff
                self._read_to_next()
                return None
            
            atom = Atom()
            atom._line = line
            atom.symbol = symbol
            atom.explicit_hcount = hcount
            atom.charge = charge

            #if hcount_fixed:                
            #symbol, hcount, charge, weight=0, aromatic=0)
            
            # XXX FIX ME
            # a really bad hack here.
            # ignore please!
            atom._line = line
            atom.x = x
            atom.y = y
            atom.z = z
            if hcount_fixed: atom.fixed_hcount = 1
            if mass:
                atom.weight = atom.mass + mass
            atom.index = len(atoms)
            atoms.append(atom)
            if vfgraph:
                insert_node(atom)

        bonds = []
        mappings = []
        bondCount = [0] * len(atoms)
        closures = {}
        for index in range(numBonds):
            line = readline()
            try:
                a1, a2, bondtype, remainder = parse_bond(line)
            except:
                self._read_to_next()
                return None
            a1 -= 1
            a2 -= 1

            symbol, bondorder, bondtype, fixed = BOND_SYMBOL[bondtype]
            atom1 = atoms[a1]
            atom2 = atoms[a2]
            if stripHydrogen:
                if atom1.symbol == "H":
                    atom2.hcount += 1
                    atom2.hcount_fixed = 1
                if atom2.symbol == "H":
                    atom1.hcount += 1
                    atom1.hcount_fixed = 1
            else:
                bond = Bond(symbol, bondorder, bondtype, fixed)
                bond._line = line
                # XXX FIX ME
                # a really bad hack here
                # ignore please!
                bond._line = remainder
                bond.atoms = [a1, a2]
                a1.bonds.append(bond)
                a2.bonds.append(bond)
                a1.oatoms.append(a1)
                a2.oatoms.append(a2)
                bonds.append(bond)
##            mappings.append((bond, a1, a2))
##                bondCount[a1] += 1
##                bondCount[a2] += 1

        self.readMProps()
        
        fields, potentialName = self._readFields()

        if not name:
            name = potentialName
        elif name != potentialName:
            # XXX FIX ME, what do I do here?
            pass
        
        # we've tokenized the molecule, now we need to build one
        # XXX FIX ME - Split this up into a builder and a tokenizer ?
        mol = Molecule()
        mol.name = name
        mol.fields = fields
#        for atom in atoms:
#            mol.add_atom(atom)
            
#        for bond, a1, a2 in mappings:
#            atom1, atom2 = atoms[a1], atoms[a2]

            # XXX FIX ME
            # does this format mean the atom's hcount can't
            #  change?
#            stripHydrogens = self.stripHydrogens
#            if not hasattr(atom1, "number"):
#                print atom
#                print atom.__dict__
##            if stripHydrogens and atom1.symbol == "H" and bondCount[a1] == 1:
##                atom2.hcount += 1
##                atom2.hcount_fixed = 1
##                bondCount[a2] -=1
##                mol.remove_atom(atom1)
##            elif stripHydrogens and atom2.symbol == "H" and bondCount[a2] == 1:
##                atom1.hcount += 1
##                atom1.hcount_fixed = 1
##                bondCount[a1] -= 1
##                mol.remove_atom(atom2)
##            else:
##                mol.add_bond(bond, atom1, atom2)
##                if bond.bondtype == 4:
##                    atom1.aromatic = 1
##                    atom2.aromatic = 1

##        # get rid of any non-bonded hydrogens
##        atomsToDelete = []
##        for atom in mol.atoms:
##            if atom.symbol == "H":
##                assert len(atom.bonds) == 0
##                atomsToDelete.append(atom)
##        for atom in atomsToDelete:
##            mol.remove_atom(atom)

        index = 0
        for atom in mol.atoms:
            assert atom.symbol != "H"
            if len(atom.bonds) > 1:
                atom._closure = 1
            atom.index = index
            index += 1

        return mol
                
                
            
def test():
    tests = [
  "   -0.0187    1.5258    0.0104 C   0  0  0  0  0",
  "    0.0021   -0.0041    0.0020 C   0  0  0  0  0",
  "    1.3951    2.0474   -0.0003 C   0  0  0  0  0",
  "    2.3236    1.2759   -0.0133 O   0  3  0  0  0",
  "    1.6511    3.5328   -0.0010 C   0  0  0  0  0",
  "    3.7321    1.7917   -0.0227 Cu  0  7  0  0  0",
  "    2.8773    3.8271   -0.8266 C   0  0  0  0  0",
  "    4.1685    2.0764    1.3837 O   0  3  0  0  0",
  "    4.6361    0.7604   -0.6303 O   0  3  0  0  0",
  "    3.7997    3.0530   -0.8316 O   0  3  0  0  0",
  "    2.9504    5.0907   -1.6446 C   0  0  0  0  0",
  "    5.3415    2.0641    1.6691 C   0  0  0  0  0",
  "    5.8107    0.7437   -0.3517 C  "
  ]

    for line in tests:
        print parse_atom(line)

    bondTests = [
  "  1  2  1  0  0  0",
  "  1  3  1  0  0  0",
  "  1 20  1  0  0  0",
  "  1 21  1  0  0  0",
  "  2 22  1  0  0  0",
  "  2 23  1  0  0  0",
  ]

    for line in bondTests:
  print parse_bond(line)


if __name__ == "__main__":
    test()

    file = open('../test/data/caffeine.mol')
    reader = MolReader(file)
    mol = reader.read_one()
    print mol.cansmiles()
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.