from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
Start by reading in a set of building blocks from ZINC
import gzip,random
inlines = gzip.open('data/zbb.smi.gz').readlines()
random.seed(42)
random.shuffle(inlines)
indata = '\n'.join(inlines[:10000])
suppl = Chem.SmilesMolSupplier()
suppl.SetData(indata)
ms = [x for x in suppl if x is not None]
ms[0]
from rdkit.Chem import FunctionalGroups
fgs = FunctionalGroups.BuildFuncGroupHierarchy()
fgs
[<rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2a9510>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2a9890>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2a9a50>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2a9b90>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2b5050>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2b51d0>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2b5390>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2b5490>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2b5650>, <rdkit.Chem.FunctionalGroups.FGHierarchyNode at 0x10e2b5a90>]
[x.name for x in fgs]
['Acid Chloride', 'Carboxylic acid', 'Sulfonyl Chloride', 'Amine', 'Boronic Acid', 'Isocyanate', 'Alcohol', 'Aldehyde', 'Halogen', 'Azide']
[x.label for x in fgs]
['AcidChloride', 'CarboxylicAcid', 'SulfonylChloride', 'Amine', 'BoronicAcid', 'Isocyanate', 'Alcohol', 'Aldehyde', 'Halogen', 'Azide']
[x.smarts for x in fgs]
['C(=O)Cl', 'C(=O)[O;H,-]', '[$(S-!@[#6])](=O)(=O)(Cl)', '[N;!H0;$(N-[#6]);!$(N-[!#6;!#1]);!$(N-C=[O,N,S])]', '[$(B-!@[#6])](O)(O)', '[$(N-!@[#6])](=!@C=!@O)', '[O;H1;$(O-!@[#6;!$(C=!@[O,N,S])])]', '[CH;D2;!$(C-[!#6;!#1])]=O', '[$([F,Cl,Br,I]-!@[#6]);!$([F,Cl,Br,I]-!@C-!@[F,Cl,Br,I]);!$([F,Cl,Br,I]-[C,S](=[O,S,N]))]', '[N;H0;$(N-[#6]);D2]=[N;D2]=[N;D1]']
[x.label for x in fgs[1].children]
['CarboxylicAcid.Aromatic', 'CarboxylicAcid.Aliphatic', 'CarboxylicAcid.AlphaAmino']
The functional groups are in a hierarchy. For our purposes a flat datastructure is better. Convert to a dictionary keyed by functional group name:
from collections import namedtuple
nt = namedtuple('pattern','smarts mol')
def flattenFgs(fgs,res):
if not fgs:
return
for x in fgs:
res[x.label]=nt(x.smarts,x.pattern)
flattenFgs(x.children,res)
allFgDefs={}
flattenFgs(fgs,allFgDefs)
allFgNames=sorted(allFgDefs.keys())
allFgNames
['AcidChloride', 'AcidChloride.Aliphatic', 'AcidChloride.Aromatic', 'Alcohol', 'Alcohol.Aliphatic', 'Alcohol.Aromatic', 'Aldehyde', 'Aldehyde.Aliphatic', 'Aldehyde.Aromatic', 'Amine', 'Amine.Aliphatic', 'Amine.Aromatic', 'Amine.Cyclic', 'Amine.Primary', 'Amine.Primary.Aliphatic', 'Amine.Primary.Aromatic', 'Amine.Secondary', 'Amine.Secondary.Aliphatic', 'Amine.Secondary.Aromatic', 'Azide', 'Azide.Aliphatic', 'Azide.Aromatic', 'BoronicAcid', 'BoronicAcid.Aliphatic', 'BoronicAcid.Aromatic', 'CarboxylicAcid', 'CarboxylicAcid.Aliphatic', 'CarboxylicAcid.AlphaAmino', 'CarboxylicAcid.Aromatic', 'Halogen', 'Halogen.Aliphatic', 'Halogen.Aromatic', 'Halogen.Bromine', 'Halogen.Bromine.Aliphatic', 'Halogen.Bromine.Aromatic', 'Halogen.NotFluorine', 'Halogen.NotFluorine.Aliphatic', 'Halogen.NotFluorine.Aromatic', 'Isocyanate', 'Isocyanate.Aliphatic', 'Isocyanate.Aromatic', 'SulfonylChloride', 'SulfonylChloride.Aliphatic', 'SulfonylChloride.Aromatic']
allFgs={}
for fgn in allFgNames:
patt = allFgDefs[fgn]
allFgs[fgn]=[m for m in ms if m.HasSubstructMatch(patt.mol)]
print '%s: Found %d '%(fgn,len(allFgs[fgn]))
AcidChloride: Found 62 AcidChloride.Aliphatic: Found 21 AcidChloride.Aromatic: Found 41 Alcohol: Found 1137 Alcohol.Aliphatic: Found 864 Alcohol.Aromatic: Found 304 Aldehyde: Found 210 Aldehyde.Aliphatic: Found 21 Aldehyde.Aromatic: Found 189 Amine: Found 3643 Amine.Aliphatic: Found 2366 Amine.Aromatic: Found 1427 Amine.Cyclic: Found 1069 Amine.Primary: Found 908 Amine.Primary.Aliphatic: Found 127 Amine.Primary.Aromatic: Found 781 Amine.Secondary: Found 857 Amine.Secondary.Aliphatic: Found 203 Amine.Secondary.Aromatic: Found 664 Azide: Found 1 Azide.Aliphatic: Found 0 Azide.Aromatic: Found 1 BoronicAcid: Found 0 BoronicAcid.Aliphatic: Found 0 BoronicAcid.Aromatic: Found 0 CarboxylicAcid: Found 2052 CarboxylicAcid.Aliphatic: Found 1325 CarboxylicAcid.AlphaAmino: Found 128 CarboxylicAcid.Aromatic: Found 736 Halogen: Found 2828 Halogen.Aliphatic: Found 477 Halogen.Aromatic: Found 2483 Halogen.Bromine: Found 607 Halogen.Bromine.Aliphatic: Found 118 Halogen.Bromine.Aromatic: Found 499 Halogen.NotFluorine: Found 2135 Halogen.NotFluorine.Aliphatic: Found 407 Halogen.NotFluorine.Aromatic: Found 1774 Isocyanate: Found 8 Isocyanate.Aliphatic: Found 0 Isocyanate.Aromatic: Found 8 SulfonylChloride: Found 0 SulfonylChloride.Aliphatic: Found 0 SulfonylChloride.Aromatic: Found 0
Note: It would be much, much more efficient to take advantage of the hierarchy in the functional groups to do this.
Pick a popular class from Stephen Roughley's paper (Roughley, S. & Jordan, A. The Medicinal Chemist’s Toolbox: An Analysis of Reactions Used in the Pursuit of Drug Candidates. J. Med. Chem. 54, 3451–3479 (2011).).
Since there are no boronic acids in the BB library from ZINC I can't do a Suzuki coupling, so I'll use N-arylation instead.
Start with a very rough definition of the reaction:
rxn = AllChem.ReactionFromSmarts('[a:1]-[Br,I].[N;H1;D2;$(N(-[#6])-[#6]);!$(N-[!#6;!#1]);!$(N-C=[O,N,S]):2]>>[a:1]-[N:2]')
Take a look at the reaction using the new (and very crude) function Draw.ReactionToImage()
Draw.ReactionToImage(rxn)
Now grab the building blocks we're going to use and look at them.
halogens = allFgs['Halogen.NotFluorine.Aromatic']
amines = allFgs['Amine.Secondary']
Draw.MolsToGridImage(halogens[:20],molsPerRow=4,legends=[x.GetProp('_Name') for x in halogens])
Draw.MolsToGridImage(amines[:20],molsPerRow=4,legends=[x.GetProp('_Name') for x in amines])
There's lots of stuff in there that looks like it may interfere, so let's do a bit more filtering
halogens = [x for x in halogens if len(x.GetSubstructMatches(allFgDefs['Halogen.NotFluorine'].mol))==1]
halogens = [x for x in halogens if not x.HasSubstructMatch(allFgDefs['Amine'].mol)]
len(halogens)
955
Draw.MolsToGridImage(halogens[:20],molsPerRow=4,legends=[x.GetProp('_Name') for x in halogens])
amines = [x for x in amines if len(x.GetSubstructMatches(allFgDefs['Amine'].mol))==1]
amines = [x for x in amines if not x.HasSubstructMatch(allFgDefs['Halogen.NotFluorine'].mol)]
len(amines)
485
Draw.MolsToGridImage(amines[:20],molsPerRow=4,legends=[x.GetProp('_Name') for x in amines])
The convenience function AllChem.EnumerateLibraryFromReaction()
makes it very easy to enumerate libraries of molecules using a reaction and some lists of building blocks.
The return value is a generator object that encodes the entire library. The members of the library are not actually generated until you iterate through the generator object. Consequently, the command below executes in milliseconds even though the underlying library contains 955x485 (>460K) members.
products = AllChem.EnumerateLibraryFromReaction(rxn,(halogens,amines))
products
<generator object EnumerateLibraryFromReaction at 0x111e4de10>
The results are tuples with one entry per product of the reaction.
This reaction has a single product, so there's only one entry in the tuple.
products.next()
(<rdkit.Chem.rdchem.Mol at 0x111e2cc80>,)
pull out a block of products:
first20 = [products.next()[0] for x in range(20)]
Draw.MolsToGridImage(first20,molsPerRow=4)
The above molecules all contain some ugliness because they have not been sanitized after being constructed. This is easy to fix:
[Chem.SanitizeMol(x) for x in first20]
Draw.MolsToGridImage(first20,molsPerRow=4)
We can always go back and get more (until we get to the end of the 450K library):
next20 = [products.next()[0] for x in range(20)]
[Chem.SanitizeMol(x) for x in next20]
Draw.MolsToGridImage(next20,molsPerRow=4)
rxnData="""$RXN
Marvin 092401121729
2 1
$MOL
Mrv0541 09241217292D
7 7 0 0 0 0 999 V2000
-6.7915 -0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 1 0 0
-7.6165 -0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0
-8.0290 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 3 0 0
-7.6165 0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 4 0 0
-6.7915 0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 5 0 0
-6.3790 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 6 0 0
-5.5540 0.0000 0.0000 * 0 0 0 0 0 0 0 0 0 0 0 0
1 2 4 0 0 0 0
2 3 4 0 0 0 0
3 4 4 0 0 0 0
4 5 4 0 0 0 0
5 6 4 0 0 0 0
6 7 1 0 0 0 0
6 1 4 0 0 0 0
V 7 Halogen.NotFluorine.Aromatic
M ALS 7 2 F Br I
M END
$MOL
Mrv0541 09241217292D
3 2 0 0 0 0 999 V2000
-2.3645 -0.2750 0.0000 N 0 0 0 0 0 0 0 0 0 7 0 0
-1.6500 0.1375 0.0000 C 0 0 0 0 0 0 0 0 0 8 0 0
-3.0790 0.1375 0.0000 C 0 0 0 0 0 0 0 0 0 9 0 0
2 1 1 0 0 0 0
1 3 1 0 0 0 0
V 1 Amine.Secondary
M END
$MOL
Mrv0541 09241217292D
9 9 0 0 0 0 999 V2000
4.4197 -0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 1 0 0
3.5947 -0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0
3.1821 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 3 0 0
3.5947 0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 4 0 0
4.4197 0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 5 0 0
4.8322 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 6 0 0
5.6572 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 7 0 0
6.0697 -0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 8 0 0
6.0697 0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 9 0 0
1 2 4 0 0 0 0
2 3 4 0 0 0 0
3 4 4 0 0 0 0
4 5 4 0 0 0 0
5 6 4 0 0 0 0
6 7 1 0 0 0 0
8 7 1 0 0 0 0
7 9 1 0 0 0 0
6 1 4 0 0 0 0
M END
"""
rxn = AllChem.ReactionFromRxnBlock(rxnData)
AllChem.Compute2DCoordsForReaction(rxn)
Draw.ReactionToImage(rxn)
m1 = Chem.MolFromSmiles('c1ccc(C)cc1Br')
m2 = Chem.MolFromSmiles('CCNCC')
ps = rxn.RunReactants((m1,m2,))
ps
((<rdkit.Chem.rdchem.Mol at 0x111e2c848>,), (<rdkit.Chem.rdchem.Mol at 0x111e2c668>,), (<rdkit.Chem.rdchem.Mol at 0x111e2c488>,), (<rdkit.Chem.rdchem.Mol at 0x111e2c8c0>,))
usmis = set()
for p in ps:
smi = Chem.MolToSmiles(p[0])
usmis.add(smi)
usmis
set(['CCN(CC)c1cccc(C)c1'])
Draw.MolsToGridImage([Chem.MolFromSmiles(x) for x in usmis])
Seems simple, but there's a problem. The query in the rxn block is actually much too simple, so the reaction will take place at Ns that are not actually amines:
m1 = Chem.MolFromSmiles('c1ccc(C)cc1Br')
m2 = Chem.MolFromSmiles('CCNCC(=O)NC')
ps = rxn.RunReactants((m1,m2,))
usmis = set()
for p in ps:
smi = Chem.MolToSmiles(p[0])
usmis.add(smi)
usmis
Draw.MolsToGridImage([Chem.MolFromSmiles(x) for x in usmis])
Solving this in general is difficult, but there's a relatively simple trick to make this work better.
Many chemical drawing tools allow you to attach extra information to atoms in rxn files using the "Value" tag. These can be used to specify the queries. We've tested that this works in ISIS/Draw and Marvin Sketch.
The idea is to capture the type of functional group as an atom property and then modify the reaction after parsing to add the corresponding query information. The code for that is below, but before getting to that, here's how Marvin Sketch displays the above reaction:
Note: The idea to use atom values in the rxn files to encode queries is from Holger Claussen at BioSolveIT.
# this should have some documentation and a lot more error checking... that's coming in the next RDKit release.
def PreprocessReaction(reaction,funcGroupFilename=None):
reaction._setImplicitPropertiesFlag(True)
reaction.Initialize()
nReactants = reaction.GetNumReactantTemplates()
nProducts = reaction.GetNumProductTemplates()
nWarn,nError = reaction.Validate()
reactantLabels = []
if not nError:
FunctionalGroups.BuildFuncGroupHierarchy(fileNm=funcGroupFilename)
gpNms={}
for k in FunctionalGroups.groupDefns.keys():
gpNms[k.lower()]=k
for i in range(nReactants):
m = reaction.GetReactantTemplate(i)
gps = []
for at in m.GetAtoms():
if at.HasProp('molFileValue'):
atIdx = at.GetIdx()
vals = at.GetProp('molFileValue').lower()
queryName=[]
queries=[]
for v in vals.split(','):
if gpNms.has_key(v):
queryName.append(gpNms[v])
queries.append(FunctionalGroups.groupDefns[gpNms[v]])
else:
queryName=None
queries=None
break
if queries:
if len(queries)>1:
# combine the individual queries into one SMARTS:
smas = ['$(%s)'%x.smarts for x in queries]
overallSmarts='[%s]'%','.join(smas)
pattern = Chem.MolFromSmarts(overallSmarts)
if not pattern:
raise ValueError,'could not build query from combined SMARTS "%s"'%overallSmarts
else:
pattern=queries[0].pattern
Chem.AddRecursiveQuery(m,pattern,atIdx)
queryName=','.join(queryName)
else:
queryName=None
gps.append((atIdx,queryName))
reactantLabels.append(tuple(gps))
return reactantLabels
Now let's apply that to the reaction from above. The return values shows the functional groups used:
PreprocessReaction(rxn)
[((6, 'Halogen.NotFluorine.Aromatic'),), ((0, 'Amine.Secondary'),)]
ps = rxn.RunReactants((m1,m2,))
usmis = set()
for p in ps:
smi = Chem.MolToSmiles(p[0])
usmis.add(smi)
usmis
Draw.MolsToGridImage([Chem.MolFromSmiles(x) for x in usmis])
Reminder of what the sample reaction is:
Draw.ReactionToImage(rxn)
mol = Chem.MolFromSmiles('n1ccc(N(C)C)cc1')
mol
rxn.IsMoleculeProduct(mol)
False
mol = Chem.MolFromSmiles('OC(=O)c1ccc(N(C)C)cc1')
mol
rxn.IsMoleculeProduct(mol)
True
mol = Chem.MolFromSmiles('c1ccncc1Br')
mol
rxn.IsMoleculeReactant(mol)
False
mol = Chem.MolFromSmiles('c1ccc(C2CC2)cc1Br')
mol
rxn.IsMoleculeReactant(mol)
True
ras=rxn.GetReactingAtoms()
ras
((5, 6), (0,))
Draw.ReactionToImage(rxn,includeAtomNumbers=True)
Map that information onto actual reactants:
match = m1.GetSubstructMatch(rxn.GetReactantTemplate(0))
Draw.MolToImage(m1,highlightAtoms=[match[x] for x in ras[0]])
match = m2.GetSubstructMatch(rxn.GetReactantTemplate(1))
Draw.MolToImage(m2,highlightAtoms=[match[x] for x in ras[1]])
This document is copyright (C) 2012 by Greg Landrum
This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to Creative Commons, 543 Howard Street, 5th Floor, San Francisco, California, 94105, USA.
The intent of this license is similar to that of the RDKit itself. In simple words: “Do whatever you want with it, but please give us some credit.”