myChEMBL IPython Notebook Tutorial

A Chemoinformatics taster using the RDKit toolkit and cartridge, the ChEMBL database and Pandas

George Papadatos, ChEMBL group, EMBL-EBI

Start with something relatively easy

In [2]:
# Single-argument print(...) produces the same output under Python 2 and
# is also valid Python 3, unlike the bare print statement.
print('Hello World!')
Hello World!
In [3]:
# A bare expression as the last line of a cell is echoed as its Out[] value.
1+4
Out[3]:
5

Import RDKit libraries

In [4]:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit import DataStructs

Simple RDKit stuff - Molecules, descriptors and similarity

Molecule from SMILES

In [5]:
# SMILES string for sildenafil, then parsed into an RDKit molecule object (m)
# that the following cells reuse.
smi = 'CCCc1nn(C)c2C(=O)NC(=Nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4' # sildenafil
m = Chem.MolFromSmiles(smi)
In [6]:
# Echo the molecule object. Presumably rendered as a structure drawing via the
# IPythonConsole import; this text export shows no output for it.
m
Out[6]:

Simple descriptors

In [7]:
# Molecular weight of the sildenafil molecule.
Descriptors.MolWt(m)
Out[7]:
474.5870000000004
In [8]:
# Topological polar surface area (TPSA) of the molecule.
Descriptors.TPSA(m)
Out[8]:
113.41999999999999
In [9]:
# Number of rings in the molecule.
Descriptors.RingCount(m)
Out[9]:
4

Output to various text formats

In [10]:
# Canonical SMILES for the molecule; the flag is passed by name so its meaning
# (include isomeric/stereo information) is visible at the call site.
Chem.MolToSmiles(m, isomericSmiles=True)
Out[10]:
'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O'
In [11]:
# InChI identifier string for the molecule.
Chem.MolToInchi(m)
Out[11]:
'InChI=1S/C22H30N6O4S/c1-5-7-17-19-20(27(4)25-17)22(29)24-21(23-19)16-14-15(8-9-18(16)32-6-2)33(30,31)28-12-10-26(3)11-13-28/h8-9,14H,5-7,10-13H2,1-4H3,(H,23,24,29)'
In [12]:
# Print the molecule as an MDL molblock. No coordinates have been generated at
# this point, so every atom line in the output carries 0.0000 for x, y and z.
# print(...) with one argument behaves identically under Python 2 and 3.
print(Chem.MolToMolBlock(m))
     RDKit          

 33 36  0  0  0  0  0  0  0  0999 V2000
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 S   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0
  2  3  1  0
  3  4  1  0
  4  5  2  0
  5  6  1  0
  6  7  1  0
  6  8  1  0
  8  9  1  0
  9 10  2  0
  9 11  1  0
 11 12  1  0
 12 13  2  0
 13 14  1  0
 12 15  1  0
 15 16  2  0
 16 17  1  0
 17 18  2  0
 18 19  1  0
 19 20  2  0
 20 21  1  0
 21 22  1  0
 22 23  1  0
 17 24  1  0
 24 25  2  0
 24 26  2  0
 24 27  1  0
 27 28  1  0
 28 29  1  0
 29 30  1  0
 30 31  1  0
 30 32  1  0
 32 33  1  0
 14  4  1  0
 14  8  2  0
 20 15  1  0
 33 27  1  0
M  END

In [13]:
# Generate 2D depiction coordinates and store them on m itself (the molecule is
# modified in place): the molblock printed in the next cell carries real x/y
# values and a "2D" tag. The echoed 0 is the call's return value
# (presumably the id of the new conformer — confirm against the RDKit docs).
Chem.Compute2DCoords(m)
Out[13]:
0
In [14]:
# Print the molblock again: after Compute2DCoords the atom lines now hold the
# generated 2D coordinates instead of zeros.
# print(...) with one argument behaves identically under Python 2 and 3.
print(Chem.MolToMolBlock(m))
     RDKit          2D

 33 36  0  0  0  0  0  0  0  0999 V2000
   -8.2094    2.2189    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -7.5208    0.8863    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -6.0224    0.8163    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -5.3338   -0.5163    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -6.0072   -1.8566    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
   -4.9405   -2.9112    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
   -5.1666   -4.3941    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.6079   -2.2226    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.2044   -2.7522    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.9613   -4.2323    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
   -1.0441   -1.8015    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
   -1.2872   -0.3214    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.6907    0.2082    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8510   -0.7424    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.1269    0.6292    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.2765    0.0997    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.4368    1.0503    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.1937    2.5305    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.7903    3.0600    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.3700    2.1094    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.7734    2.6390    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
   -2.0166    4.1191    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.4200    4.6487    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8402    0.5208    0.0000 S   0  0  0  0  0  0  0  0  0  0  0  0
    3.3107   -0.8826    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    4.3698    1.9242    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    5.2436   -0.0088    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    5.4867   -1.4889    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    6.8902   -2.0185    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    8.0505   -1.0679    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    9.4539   -1.5974    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    7.8074    0.4123    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    6.4039    0.9418    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0
  2  3  1  0
  3  4  1  0
  4  5  2  0
  5  6  1  0
  6  7  1  0
  6  8  1  0
  8  9  1  0
  9 10  2  0
  9 11  1  0
 11 12  1  0
 12 13  2  0
 13 14  1  0
 12 15  1  0
 15 16  2  0
 16 17  1  0
 17 18  2  0
 18 19  1  0
 19 20  2  0
 20 21  1  0
 21 22  1  0
 22 23  1  0
 17 24  1  0
 24 25  2  0
 24 26  2  0
 24 27  1  0
 27 28  1  0
 28 29  1  0
 29 30  1  0
 30 31  1  0
 30 32  1  0
 32 33  1  0
 14  4  1  0
 14  8  2  0
 20 15  1  0
 33 27  1  0
M  END

Fingerprints and similarity

In [15]:
# Morgan (circular) fingerprint of sildenafil as a fixed-length bit vector:
# second argument 2 is the radius, nBits=2048 the vector length.
fp = Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048)
In [16]:
# Fingerprint rendered as a string of bits (the output is not included in this export).
fp.ToBitString()
Out[16]:

In [17]:
# Number of bits that are set in the fingerprint.
fp.GetNumOnBits()
Out[17]:
61
In [18]:
# Total fingerprint length; matches the nBits=2048 requested when fp was built.
fp.GetNumBits()
Out[18]:
2048
In [19]:
# Second molecule for the similarity comparison: vardenafil, parsed from SMILES.
smi2 = 'CCCc1nc(C)c2C(=O)N=C(Nn12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(CC)CC4' # vardenafil
m2 = Chem.MolFromSmiles(smi2)
In [20]:
# Same fingerprint settings as for fp (radius 2, 2048 bits) so the two bit
# vectors are directly comparable.
fp2 = Chem.GetMorganFingerprintAsBitVect(m2, 2, nBits=2048)
In [21]:
# Echo vardenafil (presumably rendered as a drawing; no output in this export).
m2
Out[21]:
In [22]:
# Echo sildenafil again for visual comparison with vardenafil above.
m
Out[22]:
In [23]:
# Tanimoto similarity between the sildenafil (fp) and vardenafil (fp2)
# Morgan fingerprints.
DataStructs.TanimotoSimilarity(fp,fp2)
Out[23]:
0.5

Similarity Maps

In [24]:
from rdkit.Chem.Draw import SimilarityMaps
# Draw a similarity map for the two molecules using Morgan fingerprints.
# The echoed result is a (matplotlib Figure, float) tuple, as the output shows.
# NOTE(review): per the RDKit docs the first molecule (m2, vardenafil) should be
# the reference and the second (m, sildenafil) the one drawn — confirm.
SimilarityMaps.GetSimilarityMapForFingerprint(m2, m, SimilarityMaps.GetMorganFingerprint)
Out[24]:
(<matplotlib.figure.Figure at 0x2b60210>, 0.14414414414414412)

Using the RDKit database cartridge in myChEMBL

In [25]:
# SMARTS for a three-membered ring made of two aliphatic carbons and one atom
# that is neither hydrogen nor carbon. That covers oxiranes and aziridines but
# also other heteroatoms, e.g. thiiranes.
sma = 'C1C[!#1!#6]1' # three-membered heterocycle (oxirane, aziridine, thiirane, ...)
In [26]:
from IPython.display import Image
from urllib import quote_plus
In [27]:
# Display a picture of the SMARTS pattern from the smartsview.de web service.
# quote_plus URL-encodes the pattern so characters such as '[', '#' and '!'
# can be embedded in the request URL.
Image(url='http://www.smartsview.de/smartsview/auto/png/1/dynamic/{0}'.format(quote_plus(sma)))
Out[27]:
In [28]:
import psycopg2
In [29]:
# Open a connection to the 'chembl_17' PostgreSQL database as user 'chembl'.
# No host or password is passed, so this presumably relies on a local server
# with passwordless access (as on the myChEMBL VM) — confirm for other setups.
conn = psycopg2.connect(port=5432, user='chembl', dbname='chembl_17')
In [30]:
# Cursor used to execute the SQL queries below and to iterate over their rows.
cur = conn.cursor()
In [31]:
sql1 = """
SELECT mr.*, md.chembl_id, cp.full_mwt, cp.alogp
from mols_rdkit mr, molecule_dictionary md, compound_properties cp
where
mr.m @> 'C1C[!#1!#6]1'::qmol
and
mr.molregno = md.molregno
and
md.molregno = cp.molregno
limit 100
"""
In [32]:
cur.execute(sql1)
In [33]:
# Print every result row. Each row is a tuple of
# (molregno, molecule as SMILES, chembl_id, full_mwt, alogp); alogp may be None.
# print(...) with one argument gives the same output under Python 2 and 3.
for row in cur:
    print(row)
(1296551, 'CC(=O)OC1C[C@@H](C)C2(CC(c3ccoc3)OC2=O)C2CCC(O)C3(CO3)C12CO', 'CHEMBL1975260', Decimal('420.45'), Decimal('0.57'))
(1240102, 'O=C(CCN1CC1)OCCOC(=O)CCN1CC1', 'CHEMBL1899531', Decimal('256.30'), Decimal('0.01'))
(1296481, 'CCN(CC)C(=O)CCN1CC1', 'CHEMBL1975190', Decimal('170.25'), Decimal('0.35'))
(1235869, 'Clc1ccc(N(CC2CO2)CC2CO2)cc1', 'CHEMBL1895298', Decimal('239.70'), Decimal('2.09'))
(1295075, 'O=C(C1OC1c1ccc([N+](=O)[O-])cc1)C12CC3CC(CC(C3)C1)C2', 'CHEMBL1973784', Decimal('327.37'), Decimal('3.65'))
(1246669, 'C[C@H]1OP(=O)(Oc2ccccc2)C[C@@H]2O[C@@H]21', 'CHEMBL1906098', Decimal('240.19'), Decimal('1.20'))
(1231326, 'N#C[C@H]1C2OC2c2ccccc2N1C(=O)c1ccccc1', 'CHEMBL1890755', Decimal('276.29'), Decimal('2.19'))
(1218973, 'CC12OC1C(O)C(Br)=C(CO)C2O', 'CHEMBL1878402', Decimal('251.07'), Decimal('-0.80'))
(1295513, 'OC1c2ccccc2C(O)C2OC21', 'CHEMBL1974222', Decimal('178.18'), Decimal('0.24'))
(1296770, 'CC(=O)OC1(C#N)CC2OC1C1C2N1C(=O)c1ccccc1', 'CHEMBL1975479', Decimal('298.29'), Decimal('0.41'))
(1295729, 'O=C(c1ccc(Cl)cc1)C1OC12C(=O)Nc1ccccc12', 'CHEMBL1974438', Decimal('299.71'), Decimal('2.41'))
(1294930, 'CO/C(=N/N1CC1C(F)(F)F)c1ccncc1', 'CHEMBL1973639', Decimal('245.20'), Decimal('1.36'))
(1296542, 'COc1ccc(C[C@H]2NC(=O)C=CC[C@@H]([C@H](C)[C@H]3O[C@@H]3c3ccccc3)OC(=O)[C@H](CC(C)C)OC(=O)[C@H](C)CNC2=O)cc1', 'CHEMBL1975251', Decimal('620.73'), Decimal('4.85'))
(1295673, 'C=C1C(=O)OC2CCCCC3OC3C12', 'CHEMBL1974382', Decimal('194.23'), Decimal('1.54'))
(1202551, 'Cc1ccc(/C(=N/O)N2CC2C)c(Oc2ccc3oc4ccccc4c3c2)n1', 'CHEMBL1861933', Decimal('373.40'), Decimal('4.24'))
(1290807, 'COC(C/C=C/N(C)C=O)C(C)C(=O)CCC(C)C(OC)C(C)C1OC(=O)C=CC2OC2(C)CC(OC)C(OC)C2=CC(=O)O[C@H]([C@H]2O)C(C)C(OC)CC(OC)C=CC(C)C(O)CC(OC)C=CC1C', 'CHEMBL1969516', Decimal('1006.27'), None)
(1231532, 'O=C1C=C(N2CC2)c2ccccc2C1=O', 'CHEMBL1890961', Decimal('199.21'), Decimal('1.34'))
(1243825, 'COC(C[C@@H]1O[C@H]1C(=O)COCc1ccccc1)OC', 'CHEMBL1903254', Decimal('280.32'), Decimal('1.13'))
(1209176, 'N#CC1(C#N)OC12CCS(=O)(=O)c1ccccc12', 'CHEMBL1868605', Decimal('260.27'), Decimal('0.52'))
(1237949, 'c1cc(OCCN2CC2)ccn1', 'CHEMBL1897378', Decimal('164.20'), Decimal('0.62'))
(1291819, 'O=c1n(Cc2ccccc2)c2ccccc2n1CC1CS1', 'CHEMBL1970528', Decimal('296.39'), Decimal('3.15'))
(1287453, 'C=C1C(=O)O[C@@H]2C=C(C)C=C[C@@H]3O[C@@]3(C)C[C@@H](O)C12', 'CHEMBL1966162', Decimal('262.30'), Decimal('1.12'))
(1296095, 'COC(=O)C1(COCc2ccccc2)CCCC2OC21', 'CHEMBL1974804', Decimal('276.33'), Decimal('2.16'))
(1296635, 'COc1cc(O)c2c(c1)C(O)C(O)CC1OC1C(=O)C=CCC(C)OC2=O', 'CHEMBL1975344', Decimal('378.37'), Decimal('0.90'))
(1302623, 'COc1cc2cc(c1Cl)N(C)C(=O)C[C@H](OC(=O)[C@H](C)N(C)C(C)=O)[C@]1(C)O[C@H]1[C@H](C)[C@@H]1C[C@@](O)(NC(=O)O1)[C@H](OC)C=CC=C(C)C2', 'CHEMBL1981332', Decimal('692.20'), Decimal('3.07'))
(1290845, 'C=C1CC2(C)CCC(OC(=O)C3(C)OC3C)C(C)(OC(C)=O)C2CC1=C(C)C', 'CHEMBL1969554', Decimal('390.51'), Decimal('4.15'))
(1297691, 'Cc1cn([C@@H]2O[C@H](CO[Si](C)(C)C(C)(C)C)[C@]3(O[C@H]3C(=O)NO)[C@H]2O[Si](C)(C)C(C)(C)C)c(=O)[nH]c1=O', 'CHEMBL1976400', Decimal('557.78'), None)
(1243276, 'CC12CCC3C(CCC4CC5OC5CC43C)C1CCC21OCCO1', 'CHEMBL1902705', Decimal('332.48'), Decimal('3.27'))
(1297762, 'CC1CCC2C(C(=O)O[C@@H]3[C@@H]4O[C@]4(CO)C4C3C=CO[C@H]4OC3OC(CO)C(O)C(O)C3O)=CO[C@@H](OC3OC(CO)C(O)C(O)C3O)C12', 'CHEMBL1976471', Decimal('704.67'), Decimal('-3.95'))
(1297121, 'CC1(C)C2CCC3(OCCO3)C1C1OC12', 'CHEMBL1975830', Decimal('210.27'), Decimal('0.88'))
(1297766, 'CC(=O)O[C@@H]1[C@H]2O[C@H]2[C@H]2O[C@@]2(COC(=O)c2ccccc2)[C@H]1OC(C)=O', 'CHEMBL1976475', Decimal('362.33'), Decimal('0.59'))
(1297698, 'CC(=O)OC1OC(c2ccoc2)C[C@@]12C1CCC3O[C@]3(C)[C@]1(C)C[C@H](OC(C)=O)[C@H]2C', 'CHEMBL1976407', Decimal('432.51'), Decimal('2.47'))
(1297541, 'C=CC1(CCC(Br)C(C)(C)Cl)CO1', 'CHEMBL1976250', Decimal('267.59'), Decimal('3.10'))
(1297560, '[O-][P-](Oc1ccc2ccccc2c1)(N1CC1)N1CC1', 'CHEMBL1976269', Decimal('274.25'), Decimal('0.92'))
(1297481, 'CC(=O)c1ccc2c3c1[C@@H]1O[C@@H]1c1cccc(c1-3)[C@@H]1O[C@H]21', 'CHEMBL1976190', Decimal('276.29'), Decimal('1.96'))
(1299670, 'CC1=CC2O[C@@H]3C[C@H]4OC(=O)C=CC=CC56OCCC7(OC7C(=O)OCC2(CC1)[C@]4(C)[C@]31CO1)C5OC(O)C6O', 'CHEMBL1978379', Decimal('558.57'), Decimal('-0.18'))
(1299584, 'C=C1[C@H](OC(C)=O)[C@H]2[C@@H](OC(C)=O)[C@](C)(OC(C)=O)C[C@]2(OC(C)=O)C(=O)[C@H](C)[C@@H]2O[C@H]2C(C)(C)[C@H](OC(C)=O)[C@H](OC(C)=O)[C@H]1OC(=O)C(C)C', 'CHEMBL1978293', Decimal('738.77'), Decimal('1.75'))
(1298308, 'C=C(C(=O)OC)C1C[C@@H](OC(C)=O)C2=C[C@H](C[C@@]3(C)O[C@@H]3c3cc(C)c(o3)[C@H]1OC(C)=O)OC2=O', 'CHEMBL1977017', Decimal('488.48'), Decimal('2.20'))
(1287993, 'C=C1C(=O)OC2C3OC3(C)CCC=C(C)CC(=O)C12', 'CHEMBL1966702', Decimal('262.30'), Decimal('1.78'))
(1271919, 'CC(=O)OCC1=C(C)C[C@H]([C@@H](C)[C@H]2CC[C@H]3[C@@H]4C[C@H]5O[C@]56[C@@H](OC(C)=O)C=CC(=O)[C@]6(COC(C)=O)[C@H]4CC[C@]23C)OC1=O', 'CHEMBL1934450', Decimal('612.71'), Decimal('3.43'))
(1271928, 'CC1=C(CO)C(=O)O[C@@H]([C@@H](C)[C@H]2CC[C@H]3[C@@H]4C[C@H]5O[C@]56[C@@H](O)[C@@H](OS(=O)(=O)O)CC(=O)[C@]6(CO)[C@H]4CC[C@]23C)C1', 'CHEMBL1934459', Decimal('584.68'), Decimal('1.14'))
(1271936, 'CC1=C(CO[C@@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]2O)C(=O)O[C@@H]([C@@H](C)[C@H]2CC[C@H]3[C@@H]4C[C@H]5O[C@]56[C@@H](O)C=CC(=O)[C@]6(C)[C@H]4CC[C@]23C)C1', 'CHEMBL1934467', Decimal('632.74'), Decimal('1.64'))
(1288183, 'O=C1c2c3c4ccccc4[nH]c3c3c(c4ccccc4n3CC3CO3)c2C(=O)N1Cc1ccccc1', 'CHEMBL1966892', Decimal('471.51'), Decimal('5.35'))
(1285594, 'Cc1ccc(OCC2CO2)c(Br)c1', 'CHEMBL1964303', Decimal('243.10'), Decimal('2.76'))
(1284906, 'O=C(CCc1ccc(F)cc1)c1cc(F)ccc1OCC1CO1', 'CHEMBL1963195', Decimal('318.31'), Decimal('3.84'))
(1284905, 'O=C(CCc1ccc(F)cc1)c1ccccc1OCC1CO1', 'CHEMBL1963194', Decimal('300.32'), Decimal('3.63'))
(1284904, 'O=C(CCc1ccccc1)c1ccccc1OCC1CO1', 'CHEMBL1963193', Decimal('282.33'), Decimal('3.43'))
(1299546, 'CCOC(=O)[C@H]1O[C@@H]1C(=O)NC(CC(C)C)C(=O)NCCC(C)C', 'CHEMBL1978255', Decimal('342.43'), Decimal('1.61'))
(1216668, 'O=C(O)C1OC1(c1ccccc1Cl)c1ccccc1Cl', 'CHEMBL1876097', Decimal('309.14'), Decimal('3.87'))
(1228380, 'COC(C[C@@H]1O[C@@H]1[C@@H](O)[C@@H](C)OCc1ccccc1)OC', 'CHEMBL1887809', Decimal('296.36'), Decimal('1.44'))
(1211913, 'CCOC(=O)c1ccc(N(CC2CO2)S(=O)(=O)c2ccc(C)cc2)cc1', 'CHEMBL1871342', Decimal('375.44'), Decimal('2.93'))
(1300104, 'COc1cc2c(c3oc4c(O)cccc4c(=O)c13)C(C1(C)CO1)CO2', 'CHEMBL1978813', Decimal('340.33'), Decimal('2.32'))
(1228204, 'CC[C@@H](c1ccccc1)n1c(=O)n2n(c1=O)[C@H]1[C@H](O)[C@@H]3O[C@@H]3/C(=N\\OCc3ccccc3)[C@H]1CC2', 'CHEMBL1887633', Decimal('476.52'), Decimal('2.67'))
(1299787, 'O=C1c2ccccc2OCC12OC21C=CC(Cl)=CC1', 'CHEMBL1978496', Decimal('274.70'), Decimal('2.35'))
(1299302, 'O=C(O)CCCCCCC[C@H]1S[C@H]1CCCCCCO', 'CHEMBL1978011', Decimal('302.47'), Decimal('4.47'))
(1300017, 'C=C(C)[C@@H]1C[C@H]2O[C@](O)(C(=C)CC(=O)C=C(C)C[C@@H]3OC(=O)[C@]24O[C@@H]34)[C@H]1O', 'CHEMBL1978726', Decimal('376.40'), Decimal('1.35'))
(1268906, 'Cc1coc2c1[C@H]1C=C(CC[C@@H]3O[C@@]3(C)C2)C(=O)O1', 'CHEMBL1927944', Decimal('260.29'), Decimal('2.29'))
(1251649, 'C=C1C(=O)O[C@@H]2C[C@@]3(C)O[C@@H]3CC[C@@]3(C)O[C@H]3C[C@@H]12', 'CHEMBL1912039', Decimal('264.32'), Decimal('1.30'))
(1251654, 'C=C1C(=O)O[C@@H]2C[C@H](C)[C@]3(CCC(C)O)O[C@@H]3C[C@H]12', 'CHEMBL1912044', Decimal('266.33'), Decimal('1.73'))
(1251661, 'C=C1C(=O)O[C@H]2C[C@H](C)[C@@H]3CC[C@@]4(C)O[C@@]34C[C@H]12', 'CHEMBL1912051', Decimal('248.32'), Decimal('2.21'))
(1288152, 'COC(=O)[C@@H]1O[C@]12[C@@H](CO[Si](C)(C)C(C)(C)C)O[C@@H](n1cc(C)c(=O)[nH]c1=O)[C@@H]2O[Si](C)(C)C(C)(C)C', 'CHEMBL1966861', Decimal('556.80'), None)
(1299033, 'COC(=O)/C=C/[C@@H]1[C@H](C)N1S(=O)(=O)c1ccc(C)cc1', 'CHEMBL1977742', Decimal('295.35'), Decimal('2.05'))
(1286713, 'CN1c2ccc(Cl)cc2C2(c3ccccc3)N(CC1=O)C2(Cl)Cl', 'CHEMBL1965422', Decimal('367.66'), Decimal('4.24'))
(1301826, 'CC(=O)O[C@H]1CC(C)(C)C(=C=C/C(C)=C/C=C/C=C\\C=C(C)\\C=C2\\C=C(/C=C/[C@@]34O[C@]3(C)C[C@@H](O)CC4(C)C)C(=O)O2)[C@](C)(O)C1', 'CHEMBL1980535', Decimal('630.81'), Decimal('5.36'))
(1303258, 'CC(CO)[C@H]1OC(=O)C=C2[C@@]13O[C@@H]3[C@H]1OC(=O)[C@@]3(C)[C@H]4O[C@H]4C[C@@]2(C)[C@@H]13', 'CHEMBL1981967', Decimal('362.37'), Decimal('0.17'))
(1295439, 'COC1(OC)C[C@H](C)[C@@]23O[C@]24c2cc(O)c5c(c2N[C@H]3C#CC=CC#C[C@@]41O)C(=O)c1ccccc1C5=O', 'CHEMBL1974148', Decimal('509.51'), Decimal('2.87'))
(1295527, 'CC(=O)OCC1OC(OC2CC3C(C)(C)C(O)CC[C@]3(C)C3CCC45CC4(CC[C@H]5C(C)CC(O)C4OC4(C)C)[C@]23C)C(O)C(O)C1O', 'CHEMBL1974236', Decimal('678.89'), Decimal('2.82'))
(1321195, 'CC(=O)OCC1OC(OC2CC3C(C)(C)C(OC(C)=O)CC[C@]3(C)C3CCC45CC4(CC[C@H]5C(C)CC(O)C4OC4(C)C)[C@]23C)C(O)C(O)C1O', 'CHEMBL1999904', Decimal('720.93'), Decimal('3.19'))
(1301478, 'COC1(OC)C[C@H](C)[C@@]23O[C@]24C2=CC(=O)C=CC2=N[C@H]3C#CC=CC#C[C@@]41O', 'CHEMBL1980187', Decimal('377.39'), Decimal('1.40'))
(1276021, 'C=C(C(=O)[C@H](OC(C)=O)[C@@H](C)[C@H]1[C@@H](OC(C)=O)C[C@@]2(C)[C@@H]3[C@H]4O[C@H]4[C@H]4[C@H](C)C(=O)C=C[C@@]45C[C@@]35CC[C@]12C)[C@@H](C)CO', 'CHEMBL1941159', Decimal('582.72'), Decimal('2.93'))
(1288224, 'CC(=O)O[C@H]1C[C@]2(C(C)(C)O)C(=C1C)[C@@H](OC(C)=O)C(OC(C)=O)[C@@]1(C)[C@H]([C@@H]2OC(C)=O)[C@@]2(CO2)C(OC(C)=O)C[C@@H]1OC(C)=O', 'CHEMBL1966933', Decimal('652.68'), Decimal('-0.21'))
(1301178, 'Cc1cc2c(c3oc(C4(C)OC4C4OC4C)cc(=O)c13)C(=O)c1c(O)c(C3CC(C)(N(C)C)C(O)C(C)O3)cc(C3CC(N(C)C)C(O)C(C)O3)c1C2=O', 'CHEMBL1979887', Decimal('746.84'), Decimal('2.52'))
(1321506, 'CC1CN1C(=O)NCCCCCCNC(=O)N1CC1C', 'CHEMBL2000215', Decimal('282.38'), Decimal('0.92'))
(1310842, 'C[C@]12CCC3[C@@H](CCC4=CC(=O)CC[C@@]43C)C1CC[C@@H]2OC(=O)C12OC1CCC2=O', 'CHEMBL1989551', Decimal('412.52'), Decimal('3.59'))
(1286420, 'C=C1C(=O)OC2C=C(C)C3OC3C=C(C(=O)OC)C(OC(C)=O)C(OC(=O)C(C)(O)C(C)OC(C)=O)C12', 'CHEMBL1965129', Decimal('522.50'), Decimal('0.91'))
(1311072, 'CC(=O)OC[C@]12C[C@H](OC(=O)CC(C)C)C(C)=C[C@H]1OC1[C@H](O)[C@@H](OC(C)=O)[C@@]2(C)[C@@]12CO2', 'CHEMBL1989781', Decimal('466.52'), Decimal('0.99'))
(1309747, 'CC1=CC2OC3CC(OC(=O)/C=C/C=C/C(OCCC4=CC(=O)OC4)C(C)O)C(C)(C34CO4)C2(CO)CC1', 'CHEMBL1988456', Decimal('530.61'), Decimal('1.70'))
(1310737, 'CC1CCOC(=O)C=CC=CC(=O)OC2CC3OC4C5OC5(C)C(O)CC4(COC(=O)C1O)C2(C)C31CO1', 'CHEMBL1989446', Decimal('534.55'), Decimal('0.02'))
(1312614, 'CC(=O)OCC1=C(C)C[C@H]([C@@H](COC(C)=O)C2CCC3C4C[C@H]5O[C@]56CC=CC(=O)[C@]6(C)C4CC[C@]23C)OC1=O', 'CHEMBL1991323', Decimal('554.67'), Decimal('3.76'))
(1314509, 'CNC(=O)C(C)C1C(=O)/C(=C(O)/C=C/C(C)=C/C(C)C2OC3(C)OC(C=CC34CO4)C2C)C(=O)N1C1CCC(O)C(C)O1', 'CHEMBL1993218', Decimal('600.70'), Decimal('1.16'))
(1320933, 'CC1=CC2OC3CC4OC(=O)C=CC=CC(C(C)O)OCCC(C)C(O)C(=O)OCC2(CC1O)C4(C)C31CO1', 'CHEMBL1999642', Decimal('548.62'), Decimal('0.80'))
(1322083, 'CC1(C)CCCC2OC2CCC(C)(C)C1=O', 'CHEMBL2000792', Decimal('224.34'), Decimal('3.46'))
(1314654, 'O=c1sc2ccccc2n1CC1CS1', 'CHEMBL1993363', Decimal('223.31'), Decimal('2.57'))
(1301689, 'CC(=O)O[C@H]1[C@@H]2O[C@@]2(C)CCC=C(C)C[C@H](OC(C)=O)[C@H]1C(C)C', 'CHEMBL1980398', Decimal('338.44'), Decimal('2.92'))
(1314212, 'O=C(c1ccccc1)C1OC12C(=O)Nc1ccccc12', 'CHEMBL1992921', Decimal('265.26'), Decimal('1.75'))
(1300173, 'C=C1C(=O)OC2CC3(C)OC3C3OC3C3=CC(OC3=O)C12', 'CHEMBL1978882', Decimal('290.27'), Decimal('0.13'))
(1322034, 'C[C@H](O)[C@H]1C=CC=CC(=O)O[C@@H]2C[C@H]3O[C@@H]4[C@@H]5O[C@]5(C)CCC4(COC(=O)[C@H]4O[C@]4(C)[C@@H](O)CO1)[C@]2(C)[C@]31CO1', 'CHEMBL2000743', Decimal('562.61'), Decimal('-0.21'))
(1313881, 'COc1ccc2c(c1)C1(C)CCC(O2)C12CO2', 'CHEMBL1992590', Decimal('232.28'), Decimal('2.11'))
(1312125, 'C=CCOC(=O)N1c2ccc(O)cc2[C@@]23O[C@]24[C@@H](C)CC(OC)(OC)[C@@]3(O)C#CC=CC#C[C@H]14', 'CHEMBL1990834', Decimal('463.48'), Decimal('2.66'))
(1313562, 'CC(=O)OC(C[C@H](C)[C@@H]1CC[C@]2(C)C3=CCC4C(C)(C)C(=O)CC[C@]4(C)C3CCC12C)C1OC1(C)C', 'CHEMBL1992271', Decimal('498.74'), Decimal('6.14'))
(1300502, 'Cc1cn([C@@H]2O[C@H](CO[Si](C)(C)C(C)(C)C)[C@]3(O[C@H]3C(=O)NN)[C@H]2O[Si](C)(C)C(C)(C)C)c(=O)[nH]c1=O', 'CHEMBL1979211', Decimal('556.80'), None)
(1313878, 'CC1OC12CC(C)C(C)(O)C(=O)OCC1=CCN(C)CCC(OC2=O)C1=O', 'CHEMBL1992587', Decimal('381.42'), Decimal('0.63'))
(1313202, 'CC(=O)OC1CC2C(C)(C)C(=O)C=C[C@]2(C)C2CCC3(C)C(c4ccoc4)[C@@H](O)[C@H]4O[C@]43[C@]12C', 'CHEMBL1991911', Decimal('468.58'), Decimal('3.09'))
(1302651, 'C=C(C)C12OC1[C@@]1(C)C(=CC2=O)CCC(O)[C@@H]1C', 'CHEMBL1981360', Decimal('248.32'), Decimal('1.99'))
(1309841, 'C[C@]12CCC3C4=C(CCC3C1CCC2=O)C(=O)[C@H]1O[C@H]1C4=O', 'CHEMBL1988550', Decimal('300.35'), Decimal('2.02'))
(1313550, 'CC1=C[C@H]2O[C@@H]3C[C@H]4OC(=O)C=CC=CC(C(C)O)OCC[C@@H](C)[C@H](O)C(=O)OC[C@@]2(CC1)[C@]4(C)[C@]31CO1', 'CHEMBL1992259', Decimal('532.62'), Decimal('1.90'))
(1321613, 'C/C=C/C/C=C/CCC(=O)C1OC1C(N)=O', 'CHEMBL2000322', Decimal('223.27'), Decimal('1.09'))
(1309900, 'COc1ccc(/C=C/C(=O)c2c(-c3ccccc3)nn(C)c(=O)c2N2CC2C)cc1', 'CHEMBL1988609', Decimal('401.46'), Decimal('3.43'))
(1320419, 'CC1CCC2(C)C(CC=C(C=O)C2(O)C=O)C12CO2', 'CHEMBL1999128', Decimal('264.32'), Decimal('0.99'))
(1314159, 'CC(=O)OC1(C#N)CC2OC1C1C2N1C(=O)OC(C)(C)C', 'CHEMBL1992868', Decimal('294.30'), Decimal('0.32'))
In [34]:
# Sildenafil SMILES again (identical to the earlier smi assignment), restated
# ahead of the similarity search.
smi = 'CCCc1nn(C)c2C(=O)NC(=Nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4' # sildenafil
In [35]:
sql2 = """
select molregno,m as smiles,tanimoto_sml(morganbv_fp('CCCc1nn(C)c2C(=O)NC(=Nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4'::mol),mfp2) as similarity
from fps_rdkit join mols_rdkit using (molregno)
where morganbv_fp('CCCc1nn(C)c2C(=O)NC(=Nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4'::mol)%mfp2
order by morganbv_fp('CCCc1nn(C)c2C(=O)NC(=Nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4'::mol)<%>mfp2;
"""
In [36]:
cur.execute(sql2)
In [37]:
# Print every hit as a (molregno, SMILES, similarity) tuple, in the order
# returned by the query.
# print(...) with one argument gives the same output under Python 2 and 3.
for row in cur:
    print(row)
(410802, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O', 1.0)
(1351310, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCCC3)ccc1OCC)[nH]c2=O', 0.88135593220339)
(1351311, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCC3)ccc1OCC)[nH]c2=O', 0.88135593220339)
(80636, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCNCC3)ccc1OCC)[nH]c2=O', 0.866666666666667)
(80694, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)ccc1OCC)[nH]c2=O', 0.838709677419355)
(488008, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4ccccc4)CC3)ccc1OCC)[nH]c2=O', 0.825396825396825)
(410662, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCC(=O)O)CC3)ccc1OCC)[nH]c2=O', 0.8125)
(512303, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCC(C(N)=O)CC3)ccc1OCC)[nH]c2=O', 0.8125)
(1334756, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(C)nn(C)c2c(=O)[nH]1', 0.8)
(488151, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C4CCCCC4)CC3)ccc1OCC)[nH]c2=O', 0.8)
(410656, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCC(C(=O)O)CC3)ccc1OCC)[nH]c2=O', 0.8)
(488072, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4ccc(F)cc4)CC3)ccc1OCC)[nH]c2=O', 0.8)
(488147, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4ccc(Cl)cc4)CC3)ccc1OCC)[nH]c2=O', 0.787878787878788)
(488073, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4ccccc4Cl)CC3)ccc1OCC)[nH]c2=O', 0.787878787878788)
(1351309, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N(CC)CC)ccc1OCC)[nH]c2=O', 0.783333333333333)
(488146, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4cccc(Cl)c4)CC3)ccc1OCC)[nH]c2=O', 0.776119402985075)
(488010, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4ccccc4OC)CC3)ccc1OCC)[nH]c2=O', 0.776119402985075)
(488009, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4ccccc4C)CC3)ccc1OCC)[nH]c2=O', 0.776119402985075)
(410657, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCC(CC(=O)O)CC3)ccc1OCC)[nH]c2=O', 0.776119402985075)
(488071, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4ccccc4F)CC3)ccc1OCC)[nH]c2=O', 0.764705882352941)
(488149, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4cccc5ccccc54)CC3)ccc1OCC)[nH]c2=O', 0.764705882352941)
(1351312, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N(C)C)ccc1OCC)[nH]c2=O', 0.758064516129032)
(488148, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4ccc([N+](=O)[O-])cc4)CC3)ccc1OCC)[nH]c2=O', 0.753623188405797)
(410658, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCC(CCC(=O)O)CC3)ccc1OCC)[nH]c2=O', 0.753623188405797)
(488152, 'CCCCCCCCCC(=O)OCCN1CCN(S(=O)(=O)c2ccc(OCC)c(-c3nc4c(CCC)nn(C)c4c(=O)[nH]3)c2)CC1', 0.742857142857143)
(1334601, 'CCCc1nn(-c2cccnc2)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O', 0.742857142857143)
(1334602, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(-c3ccccc3)nn(C)c2c(=O)[nH]1', 0.738461538461539)
(410664, 'CCCOc1ccc(S(=O)(=O)N2CCC(C(=O)O)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.735294117647059)
(1334603, 'CCCc1n[nH]c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O', 0.734375)
(410660, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCC(CCCC(=O)O)CC3)ccc1OCC)[nH]c2=O', 0.732394366197183)
(283528, 'CCCc1nc(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nn12', 0.727272727272727)
(410675, 'CCCOc1ccc(S(=O)(=O)N2CCC(CC(=O)O)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.72463768115942)
(410742, 'CCCOc1ccc(S(=O)(=O)N2CCN(CCP(=O)(O)O)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.72463768115942)
(488011, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(c4cccc(C(F)(F)F)c4)CC3)ccc1OCC)[nH]c2=O', 0.722222222222222)
(410755, 'CCCOc1ccc(S(=O)(=O)N2CCC(CP(=O)(O)O)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.714285714285714)
(410746, 'CCCOc1ccc(S(=O)(=O)N2CCC(P(=O)(O)O)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.714285714285714)
(410735, 'CCCOc1ccc(S(=O)(=O)N2CCC(P(=O)(OCC)OCC)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.714285714285714)
(410731, 'CCCOc1ccc(S(=O)(=O)N2CCN(CCP(=O)(OCC)OCC)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.714285714285714)
(488150, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(Cc4ccc5c(c4)OCO5)CC3)ccc1OCC)[nH]c2=O', 0.712328767123288)
(410715, 'CCCOc1ccc(S(=O)(=O)N2CCC(P(=O)(O)OCC)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.704225352112676)
(410737, 'CCCOc1ccc(S(=O)(=O)N2CCC(CP(=O)(OCC)OCC)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.704225352112676)
(1334755, 'CCCc1nn(C)c2c1nc(-c1cccc(S(=O)(=O)N3CCN(C)CC3)c1)[nH]c2=O', 0.698412698412698)
(1334754, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2cnn(C)c2c(=O)[nH]1', 0.698412698412698)
(410711, 'CCCOc1ccc(S(=O)(=O)N2CCN(CP(=O)(O)OCC)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.694444444444444)
(410713, 'CCCOc1ccc(S(=O)(=O)N2CCN(CCP(=O)(O)OCC)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.694444444444444)
(410676, 'CCCOc1ccc(S(=O)(=O)N2CCC(CCC(=O)O)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.694444444444444)
(487042, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)NCCNc3ccccc3)ccc1OCC)[nH]c2=O', 0.691176470588235)
(1351313, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)NCc3ccccc3)ccc1OCC)[nH]c2=O', 0.691176470588235)
(410717, 'CCCOc1ccc(S(=O)(=O)N2CCC(CP(=O)(O)OCC)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.684931506849315)
(488153, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCOC(=O)CCCO[N+](=O)[O-])CC3)ccc1OCC)[nH]c2=O', 0.684210526315789)
(1441770, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CCC)c(CC)c(=O)[nH]1', 0.676923076923077)
(567449, 'CCCc1c(OC)cc(OC)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12', 0.676470588235294)
(410677, 'CCCOc1ccc(S(=O)(=O)N2CCC(CCCC(=O)O)CC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.675675675675676)
(304727, 'CCCn1nc(CC)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O', 0.671641791044776)
(410679, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)cc3c1OCC3)[nH]c2=O', 0.666666666666667)
(80598, 'CCCc1nn(C)c2c1nc(-c1ccccc1OCC)[nH]c2=O', 0.666666666666667)
(1441766, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)c(CCC)c(=O)[nH]1', 0.666666666666667)
(140806, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(C)c(=O)[nH]c2=O', 0.666666666666667)
(487043, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)NCCNc3cccc4ccccc43)ccc1OCC)[nH]c2=O', 0.666666666666667)
(487044, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)NCCNCC(=O)O)ccc1OCC)[nH]c2=O', 0.661971830985915)
(487046, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)NCCN(CCO)S(=O)(=O)c3ccc(OCC)c(-c4nc5c(CCC)nn(C)c5c(=O)[nH]4)c3)ccc1OCC)[nH]c2=O', 0.657534246575342)
(410681, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)cc3c1OCO3)[nH]c2=O', 0.656716417910448)
(283502, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(ncn2CCC)c(=O)[nH]1', 0.656716417910448)
(410683, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)cc3c1OCCO3)[nH]c2=O', 0.656716417910448)
(553751, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(nc3ccccn32)c(=O)[nH]1', 0.656716417910448)
(1441475, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CCC)cc(=O)[nH]1', 0.65625)
(555103, 'CCCc1c(OC)cc(O)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12', 0.647887323943662)
(304716, 'CCCn1nc(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O', 0.647058823529412)
(1262965, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)c(Cl)c(=O)[nH]1', 0.646153846153846)
(304811, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(c(CC)nn2C2CCCC2)c(=O)[nH]1', 0.642857142857143)
(1441765, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)c(CC)c(=O)[nH]1', 0.636363636363636)
(1262967, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)c(I)c(=O)[nH]1', 0.636363636363636)
(1262966, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)c(Br)c(=O)[nH]1', 0.636363636363636)
(1441764, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)c(C)c(=O)[nH]1', 0.636363636363636)
(558511, 'CCCCc1c(OC)cc(OC)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12', 0.633802816901408)
(80661, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C(N)=O)CC3)ccc1OCC)nc2O', 0.633802816901408)
(556780, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(Br)c(OC)cc(OC)c2c(=O)[nH]1', 0.632352941176471)
(563798, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(C)c(OC)cc(OC)c2c(=O)[nH]1', 0.632352941176471)
(1441774, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(Cl)c(=O)[nH]1', 0.630769230769231)
(1351308, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)Nc3ncccc3C3CCCN3C)ccc1OCC)[nH]c2=O', 0.628205128205128)
(140060, 'CCCn1c2nc(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)[nH]c2c(=O)[nH]c1=O', 0.626865671641791)
(1262964, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)c(F)c(=O)[nH]1', 0.626865671641791)
(562127, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(Cl)c(OC)cc(OC)c2c(=O)[nH]1', 0.623188405797101)
(558510, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(CC)c(OC)cc(OC)c2c(=O)[nH]1', 0.623188405797101)
(558503, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(I)c(OC)cc(OC)c2c(=O)[nH]1', 0.623188405797101)
(558494, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(F)c(OC)cc(OC)c2c(=O)[nH]1', 0.623188405797101)
(560323, 'C=Cc1c(OC)cc(OC)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12', 0.623188405797101)
(1441771, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(Br)c(=O)[nH]1', 0.621212121212121)
(487045, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)NCCN(CC(=O)O)S(=O)(=O)c3ccc(OCC)c(-c4nc5c(CCC)nn(C)c5c(=O)[nH]4)c3)ccc1OCC)[nH]c2=O', 0.618421052631579)
(1376119, 'CCCOc1ccc(S(=O)(=O)NCCC2CCCN2C)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.618421052631579)
(140771, 'CCCn1c2nc(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)[nH]c2c(=O)n(C)c1=O', 0.617647058823529)
(410694, 'CCCOc1ccc(NC(C)=O)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.617647058823529)
(511952, 'CCCCN1C(=O)c2nc(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)[nH]c(=O)c2C1=O', 0.617647058823529)
(1441775, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(c(=O)[nH]1)CCC2', 0.617647058823529)
(1262963, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)cc(=O)[nH]1', 0.615384615384615)
(1441474, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C)cc(=O)[nH]1', 0.615384615384615)
(562135, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2cc(OC)cc(OC)c2c(=O)[nH]1', 0.611940298507463)
(1441769, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C)c(CC)c(=O)[nH]1', 0.611940298507463)
(1441768, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(CC)c(=O)[nH]1', 0.608695652173913)
(563835, 'CCCCc1c(OC)cc(O)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12', 0.608108108108108)
(1441472, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(O)cc(=O)[nH]1', 0.606060606060606)
(1441471, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(N)cc(=O)[nH]1', 0.606060606060606)
(555082, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(Br)c(OC)cc(O)c2c(=O)[nH]1', 0.605633802816901)
(139899, 'CCCCCCn1c2nc(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)[nH]c2c(=O)n(C)c1=O', 0.605633802816901)
(565660, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(C)c(OC)cc(O)c2c(=O)[nH]1', 0.605633802816901)
(1334600, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(-c3ccccc3)n[nH]c2c(=O)[nH]1', 0.605633802816901)
(1441761, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(Cl)c(=O)[nH]1', 0.602941176470588)
(1441480, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC)c(NC(C)=O)c(=O)[nH]1', 0.6)
(140172, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(CC(C)C)c(=O)[nH]c2=O', 0.6)
(140026, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(CC1CC1)c(=O)n(C)c2=O', 0.6)
(556799, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(Cl)c(OC)cc(O)c2c(=O)[nH]1', 0.597222222222222)
(563829, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(CC)c(OC)cc(O)c2c(=O)[nH]1', 0.597222222222222)
(565656, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(F)c(OC)cc(O)c2c(=O)[nH]1', 0.597222222222222)
(562146, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(I)c(OC)cc(O)c2c(=O)[nH]1', 0.597222222222222)
(1441767, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(C)c(=O)[nH]1', 0.594202898550725)
(1441763, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(I)c(=O)[nH]1', 0.594202898550725)
(1441762, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(Br)c(=O)[nH]1', 0.594202898550725)
(1441473, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(NC(C)=O)cc(=O)[nH]1', 0.594202898550725)
(140805, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(CC(C)(C)C)c(=O)n(C)c2=O', 0.591549295774648)
(1441773, 'CCCCCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(Br)c(=O)[nH]1', 0.591549295774648)
(140087, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(CC(C)C)c(=O)n(C)c2=O', 0.591549295774648)
(410684, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)cc3c1OCC3)[nH]c2=O', 0.589041095890411)
(560347, 'C=Cc1c(OC)cc(O)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12', 0.589041095890411)
(1441476, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)cc(=O)[nH]1', 0.588235294117647)
(1441479, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(F)(F)F)cc(=O)[nH]1', 0.588235294117647)
(555093, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2cc(OC)cc(O)c2c(=O)[nH]1', 0.585714285714286)
(1441772, 'CCCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(C(C)C)c(Br)c(=O)[nH]1', 0.585714285714286)
(139960, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(CC1CCCCC1)c(=O)n(C)c2=O', 0.583333333333333)
(140807, 'C=CCCn1c2[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc2c(=O)n(C)c1=O', 0.583333333333333)
(140260, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(CC(C)CC)c(=O)n(C)c2=O', 0.583333333333333)
(80559, 'CCCc1nn(C)c2c1nc(-c1ccccc1OCC1CC1)[nH]c2=O', 0.582089552238806)
(1441477, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(CC(C)C)cc(=O)[nH]1', 0.579710144927536)
(1334762, 'CCCc1nn(C)c2c1nc(-c1cccnc1OCC)[nH]c2=O', 0.578125)
(410698, 'CCCOc1ccc(NC(=O)CCC)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.577464788732394)
(410696, 'CCCOc1ccc(NC(=O)CC)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.577464788732394)
(1441478, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(-c2ccccc2)cc(=O)[nH]1', 0.571428571428571)
(304703, 'CCCc1nc(C)n2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)nc2O', 0.571428571428571)
(410702, 'CCCOc1ccc(NC(=O)C(C)C)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.569444444444444)
(80558, 'CCCc1nn(C)c2c1nc(-c1ccccc1NS(C)(=O)=O)[nH]c2=O', 0.569230769230769)
(140519, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(CC(C)C)c(=O)n(C)c2=O', 0.567567567567568)
(410687, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)cc3c1OCO3)[nH]c2=O', 0.567567567567568)
(410689, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)cc3c1OCCO3)[nH]c2=O', 0.567567567567568)
(410686, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)cc3c1OCCC3)[nH]c2=O', 0.565789473684211)
(304580, 'CCCn1cnc2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)nc2O', 0.563380281690141)
(304707, 'CCCc1nc(CC)c2c(O)nc(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nn12', 0.561643835616438)
(140816, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(Cc1ccc(OC)cc1)c(=O)n(C)c2=O', 0.56)
(102837, 'CCCOc1ccc(S(=O)(=O)N2CCN(CP(=O)(O)O)CC2)cc1-c1nc2c(CCC)nn(C)c2c(O)n1', 0.545454545454545)
(325914, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(nc3c(C)cccn32)c(O)n1', 0.540540540540541)
(512228, 'CCCc1nn(C)c2c1nc(-c1ccccc1O)[nH]c2=O', 0.53968253968254)
(410708, 'CCCOc1ccc(NC(=O)C2CCCCC2)cc1-c1nc2c(CCC)nn(C)c2c(=O)[nH]1', 0.539473684210526)
(102529, 'CCCOc1ccc(S(=O)(=O)N2CCN(CP(=O)(OCC)OCC)CC2)cc1-c1nc2c(CCC)nn(C)c2c(O)n1', 0.538461538461538)
(1334759, 'CCCc1nn(C)c2c1nc(-c1cccnc1OC)[nH]c2=O', 0.537313432835821)
(1008491, 'CCCc1nn(C)c2c1nc(-c1cc(S(N)(=O)=O)ccc1OCC)nc2O', 0.536231884057971)
(326033, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(nc3cc(C)ccn32)c(O)n1', 0.533333333333333)
(1334767, 'CCCc1nn(C)c2c1nc(-c1cccnc1O)[nH]c2=O', 0.53030303030303)
(325968, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(nc3ccc(Br)cn32)c(O)n1', 0.526315789473684)
(1334776, 'CCCc1nn(C)c2c1nc(-c1ccsc1)[nH]c2=O', 0.523809523809524)
(1334773, 'CCCc1nn(C)c2c1nc(-c1cn(C)cn1)[nH]c2=O', 0.523809523809524)
(80670, 'CCCc1nn(C)c2c1nc(-c1ccccc1[N+](=O)[O-])[nH]c2=O', 0.522388059701492)
(1334761, 'CCCc1nn(C)c2c1nc(-c1cccnc1OC(C)C)[nH]c2=O', 0.521739130434783)
(971340, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)NC)ccc1OCC)nc2O', 0.52112676056338)
(800246, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N(CCO)CCO)ccc1OCC)nc2O', 0.520547945205479)
(326091, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(nc3ccc(C)cn32)c(O)n1', 0.52)
(140606, 'CC(C)COc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c([nH]1)n(CC(C)C)c(=O)n(C)c2=O', 0.52)
(1227092, 'CCCc1nn(C)c2c1nc(-c1cc([S+](=O)([O-])N3CCN(C)CC3)ccc1OCC)nc2O', 0.52)
(1353820, 'CCCc1nn(C)c2c1nc(-c1ccc(C)cc1)[nH]c2=O', 0.516129032258065)
(512227, 'CCCc1nn(C)c2c1nc(-c1ccccc1)[nH]c2=O', 0.516129032258065)
(1353819, 'CCCc1nn(C)c2c1nc(-c1ccc(OC)cc1)[nH]c2=O', 0.515625)
(1334771, 'CCCc1nn(C)c2c1nc(-c1cn(C)nc1C)[nH]c2=O', 0.515625)
(1269220, 'CCCn1nc2c(nc(-c3cc(S(=O)(=O)N4CCN(C)CC4)cnc3OCCOC)[nH]c2=O)c1CC', 0.513513513513513)
(1212525, 'CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)NCCO)ccc1OCC)nc2O', 0.513513513513513)
(140345, 'CCOc1ccc(S(=O)(=O)N2CCN(CC)CC2)cc1-c1nc2c([nH]1)n(CC(C)C)c(=O)n(C)c2=O', 0.513157894736842)
(1353817, 'CCCc1nn(C)c2c1nc(-c1ccc(Br)cc1)[nH]c2=O', 0.507936507936508)
(1334775, 'CCCc1nn(C)c2c1nc(-c1ccnn1C)[nH]c2=O', 0.507936507936508)
(421326, 'CCCc1nc(C)c2c(=O)nc(-c3cc(S(=O)(=O)N4CCN(CC)CC4)ccc3OCC)[nH]n12', 0.506666666666667)
(28710, 'CCCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc(O)c2cc3[nH]cnc3cc2n1', 0.506666666666667)
(140382, 'CCOc1ccc(S(=O)(=O)N2CCN(CCN(C)C)CC2)cc1-c1nc2c([nH]1)n(CC(C)C)c(=O)n(C)c2=O', 0.506329113924051)
(453288, 'CCCc1c2nc(-c3cc(S(=O)(=O)N4CCN(CC)CC4)cnc3OCCOC)[nH]c(=O)c2nn1C', 0.5)
(511950, 'CCCCN1C(=O)C2=C(N=C(c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)NC2)C1=O', 0.5)
(1353816, 'CCCc1nn(C)c2c1nc(-c1ccc(Cl)cc1)[nH]c2=O', 0.5)
(326382, 'CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(nc3cccc(CC)n32)c(O)n1', 0.5)
(140451, 'CCOc1ccc(S(=O)(=O)N2CCN(CCO)CC2)cc1-c1nc2c([nH]1)n(CC(C)C)c(=O)n(C)c2=O', 0.5)
(1351307, 'CCOc1ccc(S(=O)(=O)Nc2ccc(O)c(C(=O)O)c2)cc1-c1nc2c(C(C)(C)C)nn(C)c2c(=O)[nH]1', 0.5)
(1353821, 'CCCc1nn(C)c2c1nc(-c1cccc(Br)c1)[nH]c2=O', 0.5)
(1334770, 'CCCc1nn(C)c2c1nc(-c1ccc(=O)[nH]n1)[nH]c2=O', 0.5)
(1334766, 'CCCc1nn(C)c2c1nc(-c1ccc(O)cc1)[nH]c2=O', 0.5)
(1353818, 'CCCc1nn(C)c2c1nc(-c1cccc(Cl)c1)[nH]c2=O', 0.5)

More about the RDKit Cartridge here: http://www.rdkit.org/docs/Cartridge.html

Working with Pandas

In [38]:
from rdkit.Chem import PandasTools
import pandas as pd
In [39]:
# Run the sql2 query over the open connection `conn` and load its result set
# into a DataFrame (columns molregno, smiles, similarity — see the preview below).
data = pd.read_sql(sql2, conn)
In [40]:
data.shape
Out[40]:
(187, 3)
In [41]:
data.head()
Out[41]:
molregno smiles similarity
0 410802 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O 1.000000
1 1351310 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCCC3)ccc1OCC)[nH]c2=O 0.881356
2 1351311 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCC3)ccc1OCC)[nH]c2=O 0.881356
3 80636 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCNCC3)ccc1OCC)[nH]c2=O 0.866667
4 80694 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)ccc1OCC)[nH]c2=O 0.838710

Filter the table

In [42]:
# Keep only the rows whose similarity score is at least 0.8.
is_close = data['similarity'] >= 0.8
nn = data.loc[is_close]
In [43]:
nn.shape
Out[43]:
(12, 3)
In [44]:
nn.head()
Out[44]:
molregno smiles similarity
0 410802 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O 1.000000
1 1351310 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCCC3)ccc1OCC)[nH]c2=O 0.881356
2 1351311 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCC3)ccc1OCC)[nH]c2=O 0.881356
3 80636 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCNCC3)ccc1OCC)[nH]c2=O 0.866667
4 80694 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)ccc1OCC)[nH]c2=O 0.838710

Add RDKit molecules and descriptors to the table

In [45]:
# Parse every SMILES string in the 'smiles' column into an RDKit molecule and
# store it in a new 'mol' column. This modifies `data` in place (nothing is returned).
# includeFingerprints=True presumably precomputes per-molecule fingerprints used to
# speed up the substructure searches further down — see the PandasTools docs to confirm.
PandasTools.AddMoleculeColumnToFrame(data,smilesCol='smiles',molCol='mol',includeFingerprints=True)
In [46]:
data.head()
Out[46]:
molregno smiles similarity mol
0 410802 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O 1.000000 Mol
1 1351310 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCCC3)ccc1OCC)[nH]c2=O 0.881356 Mol
2 1351311 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCC3)ccc1OCC)[nH]c2=O 0.881356 Mol
3 80636 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCNCC3)ccc1OCC)[nH]c2=O 0.866667 Mol
4 80694 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)ccc1OCC)[nH]c2=O 0.838710 Mol
In [47]:
# Compute two descriptors per molecule and attach them as new columns:
# 'logp' from Descriptors.MolLogP and 'mw' from Descriptors.MolWt.
mols = data['mol']
data['logp'] = mols.apply(Descriptors.MolLogP)
data['mw'] = mols.apply(Descriptors.MolWt)
In [48]:
data.head()
Out[48]:
molregno smiles similarity mol logp mw
0 410802 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O 1.000000 Mol 1.6109 474.587
1 1351310 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCCC3)ccc1OCC)[nH]c2=O 0.881356 Mol 3.2395 473.599
2 1351311 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCC3)ccc1OCC)[nH]c2=O 0.881356 Mol 2.8494 459.572
3 80636 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCNCC3)ccc1OCC)[nH]c2=O 0.866667 Mol 1.2687 460.560
4 80694 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)ccc1OCC)[nH]c2=O 0.838710 Mol 0.9734 504.613
In [49]:
# Show the five least similar hits (ascending sort on 'similarity').
# DataFrame.sort(columns=...) was deprecated in pandas 0.17 and removed in 0.20;
# sort_values(by=...) is the supported replacement and returns a new, sorted frame.
data.sort_values(by=['similarity']).head()
Out[49]:
molregno smiles similarity mol logp mw
186 1353818 CCCc1nn(C)c2c1nc(-c1cccc(Cl)c1)[nH]c2=O 0.5 Mol 2.9295 302.765
177 453288 CCCc1c2nc(-c3cc(S(=O)(=O)N4CCN(CC)CC4)cnc3OCCOC)[nH]c(=O)c2nn1C 0.5 Mol 1.0225 519.628
178 511950 CCCCN1C(=O)C2=C(N=C(c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)NC2)C1=O 0.5 Mol 0.7942 489.598
179 1353816 CCCc1nn(C)c2c1nc(-c1ccc(Cl)cc1)[nH]c2=O 0.5 Mol 2.9295 302.765
180 326382 CCOc1ccc(S(=O)(=O)N2CCN(C)CC2)cc1-c1nc2c(nc3cccc(CC)n32)c(O)n1 0.5 Mol 2.5473 496.593

Simple plotting using the table columns

In [50]:
data[['mw', 'logp']].describe()
Out[50]:
mw logp
count 187.000000 187.000000
mean 486.180160 2.268858
std 92.688399 0.968545
min 268.320000 -0.048300
25% 448.589000 1.566700
50% 496.593000 2.292500
75% 537.152000 2.902300
max 866.980000 4.731700
In [51]:
# Default size (width, height) for the figures drawn in the following cells.
rcParams['figure.figsize'] = (12, 12)
In [52]:
# Histogram of the 'logp' column to see how the values are distributed.
data['logp'].hist()
Out[52]:
<matplotlib.axes.AxesSubplot at 0x390e490>
In [53]:
# Scatter plot with molecular weight on the x axis and logP on the y axis.
# NOTE(review): `scatter` is used as a bare name, so it must already be in the
# namespace (e.g. from a pylab-style import outside this excerpt) — confirm.
scatter(data['mw'],data['logp'])
Out[53]:
<matplotlib.collections.PathCollection at 0x422b0d0>
In [54]:
# Inspect the outliers from the scatter plot: compounds with molecular weight above 800.
is_heavy = data['mw'] > 800
data.loc[is_heavy, ['molregno', 'mol', 'mw', 'logp']]
Out[54]:
molregno mol mw logp
60 487046 Mol 852.997 2.6245
88 487045 Mol 866.980 2.7168

Substructure search within the table

In [55]:
# SMILES of the query substructure: the N-methylated fused bicyclic ring system
# that also appears in the first table row (molregno 410802).
qsmi = 'c1nn(C)c2c1nc[nH]c2=O'
In [56]:
# Parse the query SMILES into an RDKit molecule object for use in the searches below.
qmol = Chem.MolFromSmiles(qsmi)
In [57]:
qmol
Out[57]:
In [58]:
# Substructure filter: with PandasTools imported, `mol >= query` on RDKit molecules
# is a substructure test, so the mask is True for rows whose molecule contains qmol.
has_query = data['mol'] >= qmol
subset = data.loc[has_query]
In [59]:
subset.shape
Out[59]:
(90, 6)
In [60]:
subset.head()
Out[60]:
molregno smiles similarity mol logp mw
0 410802 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O 1.000000 Mol 1.6109 474.587
1 1351310 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCCC3)ccc1OCC)[nH]c2=O 0.881356 Mol 3.2395 473.599
2 1351311 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCC3)ccc1OCC)[nH]c2=O 0.881356 Mol 2.8494 459.572
3 80636 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCNCC3)ccc1OCC)[nH]c2=O 0.866667 Mol 1.2687 460.560
4 80694 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(CCO)CC3)ccc1OCC)[nH]c2=O 0.838710 Mol 0.9734 504.613
In [61]:
# Summary statistics of the numeric columns, split into two groups by whether each
# molecule contains the query substructure (False / True), laid out one row per group.
# NOTE(review): the shape returned by groupby(...).describe() differs between pandas
# versions, so the trailing unstack() may need revisiting on a newer pandas — confirm.
data.groupby(data['mol'] >= qmol).describe().unstack()
Out[61]:
molregno similarity logp mw
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
mol
False 97 741276.649485 525340.753036 28710 304703.0 560323.0 1441471.0 1441775 97 0.595045 0.050741 0.5 0.567568 0.600000 0.626866 0.734375 97 2.018649 0.740929 -0.0483 1.482000 2.08120 2.54670 4.2180 97 492.993175 45.491272 391.453 462.61600 490.586 518.64000 624.701
True 90 709987.411111 449869.213609 80558 410694.5 488009.5 1334760.5 1376119 90 0.670030 0.113819 0.5 0.567983 0.691176 0.753623 1.000000 90 2.538528 1.107499 0.3034 1.620075 2.65905 3.32205 4.7317 90 478.837244 124.968717 268.320 386.95875 504.591 558.42875 866.980
In [62]:
# Store the substructure-match result as a boolean column so it can be used as a
# grouping key (adds the 'containsQ' column to `data` in place).
data['containsQ'] = data['mol'] >= qmol
In [63]:
data.head(2)
Out[63]:
molregno smiles similarity mol logp mw containsQ
0 410802 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCN(C)CC3)ccc1OCC)[nH]c2=O 1.000000 Mol 1.6109 474.587 True
1 1351310 CCCc1nn(C)c2c1nc(-c1cc(S(=O)(=O)N3CCCCCC3)ccc1OCC)[nH]c2=O 0.881356 Mol 3.2395 473.599 True
In [64]:
# Box plot of the similarity scores, one box per value of 'containsQ'.
data.boxplot('similarity',by='containsQ')
Out[64]:
<matplotlib.axes.AxesSubplot at 0x393c4d0>
In [65]:
# Release the database connection that was used for the queries above.
conn.close()