import matplotlib.pyplot as plt import pandas as pd from IPython.display import display from IPython.display import display_pretty, display_html, HTML, Javascript, Image from rdkit.Chem import PandasTools from rdkit.Chem.Draw import IPythonConsole from rdkit.Chem import AllChem as Chem from rdkit.Chem import DataStructs from rdkit.Chem import MCS from rdkit.Chem import Draw Draw.DrawingOptions.elemDict[0]=(0.,0.,0.) # draw dummy atoms in black from itertools import cycle from sklearn import manifold from scipy.spatial.distance import * import mpld3 mpld3.enable_notebook() import warnings warnings.filterwarnings('ignore') from rdkit import rdBase rdBase.rdkitVersion base_url = 'www.ebi.ac.uk/chembl/api' pd.options.display.mpl_style = 'default' pd.options.display.float_format = '{:.2f}'.format rcParams['figure.figsize'] = 16,10 df = pd.read_csv('document chemistry_20141011_114421_271.csv',sep=',') df.info() df.shape df['SCHEMBL ID'] = df['Chemical ID'].map(lambda x: 'SCHEMBL{0}'.format(x)) df = df.sort('Chemical ID') #df.head() dff = df[(df['Claims Count'] > 0) | (df['Description Count'] > 0) | (df['Type'] != "TEXT")] dff.shape dff = dff[(dff['Rotatable Bound Count'] < 15) & (dff['Ring Count'] > 0) & (df['Radical'] == 0) & (df['Singleton'] == 0) & (df['Simple'] == 0)] dff.shape dff = dff[(dff['Molecular Weight'] >= 300) & (dff['Molecular Weight'] <= 800) & (dff['Log P'] > 0) & (dff['Log P'] < 7)] dff.shape dff = dff[(dff['Chemical Corpus Count'] < 400) & ((dff['Annotation Corpus Count'] < 400) | (dff['Annotation Corpus Count'].isnull()))] dff.shape dff['SMILES'] = dff['SMILES'].map(lambda x: max(x.split('.'), key=len)) PandasTools.AddMoleculeColumnToFrame(dff, smilesCol = 'SMILES') #dff.head(10) dff['InChI'] = dff['ROMol'].map(Chem.MolToInchi) dff['InChIKey'] = dff['InChI'].map(Chem.InchiToInchiKey) #dff.head() dff = dff.drop_duplicates('Chemical ID') dff.shape dff = dff.drop_duplicates('InChIKey') dff.shape dff = dff.ix[~(dff['ROMol'] >= Chem.MolFromSmarts('[#5]'))] dff.shape dff = dff.set_index('Chemical ID', drop=False) dff_counts = dff[['Patent ID','ROMol']].groupby('Patent ID').count() dff_counts['Link'] = dff_counts.index.map(lambda x: '{0}'.format(x)) dff_counts = dff_counts.rename(columns={'ROMol':'# Compounds'}) dff_counts dff.ix[dff.InChIKey == 'SECKRCOLJRRGGV-UHFFFAOYSA-N',['SCHEMBL ID','ROMol']] fps = [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in dff['ROMol']] dist_mat = squareform(pdist(fps,'jaccard')) mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3, n_jobs = 2) results = mds.fit(dist_mat) coords = results.embedding_ dff['X'] = [c[0] for c in coords] dff['Y'] = [c[1] for c in coords] csss = """ table { border-collapse: collapse; } th { color: #ffffff; background-color: #848482; } td { background-color: #f2f3f4; } table, th, td { font-family:Arial, Helvetica, sans-serif; border: 1px solid black; text-align: right; } """ import base64 dff['base64smi'] = dff['SMILES'].map(base64.b64encode) fig, ax = plt.subplots() fig.set_size_inches(14.0, 12.0) #colors = cycle('bgrcmykwbgrcmykbgrcmykwbgrcmykw') colors = cycle(cm.Dark2(np.linspace(0,1,len(dff_counts.index)))) for name, group in dff.groupby('Patent ID'): labels = [] for i in group.index: zz = group.ix[[i],['Patent ID','SCHEMBL ID','base64smi']] zz['mol'] = zz['base64smi'].map(lambda x: ''.format(base_url,x)) del zz['base64smi'] label = zz.T del zz label.columns = ['Row: {}'.format(i)] labels.append(str(label.to_html())) points = ax.scatter(group['X'], group['Y'],c=colors.next(), s=80, alpha=0.8) tooltip = mpld3.plugins.PointHTMLTooltip(points, labels, voffset=10, hoffset=10, css=csss) mpld3.plugins.connect(fig,tooltip) resmarts = "([R:5][C;!R:1]=[C;H2].[C;H0,H1;!R:4]-[N;H2:2])>>[R:5][*:1]=[*:2]-[*:4]" Draw.ReactionToImage(Chem.ReactionFromSmarts(resmarts)) def fixRing(mol,resmarts=resmarts): rxn = Chem.ReactionFromSmarts(resmarts) ps = rxn.RunReactants((mol,)) if ps: m = ps[0][0] Chem.SanitizeMol(m) return m else: return mol mm = dff.ix[12823029]['ROMol'] mm fixRing(mm) for i, p, m in zip(dff.index, dff['Patent ID'], dff['ROMol']): if p == 'EP-2295436-A1': dff.ix[i,'ROMol'] = fixRing(m) dff['InChI'] = dff['ROMol'].map(Chem.MolToInchi) dff['InChIKey'] = dff['InChI'].map(Chem.InchiToInchiKey) dff = dff.drop_duplicates('InChIKey') dff.shape dff['RD_SMILES'] = dff['ROMol'].map(Chem.MolToSmiles) dff['base64rdsmi'] = dff['RD_SMILES'].map(base64.b64encode) fps = [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in dff['ROMol']] dist_mat = squareform(pdist(fps,'jaccard')) mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3, n_jobs = 2) results = mds.fit(dist_mat) coords = results.embedding_ dff['X'] = [c[0] for c in coords] dff['Y'] = [c[1] for c in coords] fig, ax = plt.subplots() fig.set_size_inches(14.0, 12.0) #colors = cycle('bgrcmykwbgrcmykbgrcmykwbgrcmykw') colors = cycle(cm.Dark2(np.linspace(0,1,len(dff_counts.index)))) for name, group in dff.groupby('Patent ID'): labels = [] for i in group.index: zz = group.ix[[i],['Patent ID','SCHEMBL ID','base64rdsmi']] zz['mol'] = zz['base64rdsmi'].map(lambda x: ''.format(base_url,x)) del zz['base64rdsmi'] label = zz.T del zz label.columns = ['Row: {}'.format(i)] labels.append(str(label.to_html())) points = ax.scatter(group['X'], group['Y'],c=colors.next(), s=80, alpha=0.8) tooltip = mpld3.plugins.PointHTMLTooltip(points, labels, voffset=10, hoffset=10, css=csss) mpld3.plugins.connect(fig,tooltip) def MCS_Report(ms,atomCompare='any',**kwargs): mcs = MCS.FindMCS(ms,atomCompare=atomCompare,timeout=120,**kwargs) nAts = np.array([x.GetNumAtoms() for x in ms]) print 'Mean nAts: {0}, mcs nAts: {1}'.format(nAts.mean(),mcs.numAtoms) print 'MCS SMARTS: {0}'.format(mcs.smarts) mcsM = Chem.MolFromSmarts(mcs.smarts) # find the common atoms if atomCompare == 'any': mcsM2 = Chem.MolFromSmiles(mcs.smarts,sanitize=False) atNums=[-1]*mcsM.GetNumAtoms() for m in ms: match = m.GetSubstructMatch(mcsM) if not match: continue for qidx,midx in enumerate(match): anum = m.GetAtomWithIdx(midx).GetAtomicNum() if atNums[qidx]==-1: atNums[qidx]=anum elif anum!=atNums[qidx]: atNums[qidx]=0 for idx,atnum in enumerate(atNums): if atnum>0: mcsM.GetAtomWithIdx(idx).SetAtomicNum(atnum) mcsM.UpdatePropertyCache(False) Chem.SetHybridization(mcsM) smi = Chem.MolToSmiles(mcsM,isomericSmiles=True) print "MCS SMILES: {0}".format(smi) img=Draw.MolToImage(mcsM,kekulize=False) return mcs.smarts,smi,img,mcsM PandasTools.AddMurckoToFrame(dff) PandasTools.AddMoleculeColumnToFrame(dff,smilesCol='Murcko_SMILES', molCol='MurMol') PandasTools.AlignToScaffold(dff) dff[['ROMol','MurMol']].head() mols = list(dff.MurMol) smarts,smi,img,mcsM = MCS_Report(mols,threshold=0.8,ringMatchesRingOnly=True) img core = Chem.MolFromSmarts(smarts) pind = [] rgroups = [] for i, m in zip(dff.index, dff.ROMol): rgrps = Chem.ReplaceCore(m,core,labelByIndex=True) if rgrps: s = Chem.MolToSmiles(rgrps, True) for e in s.split("."): pind.append(e[1:4].replace('*','').replace(']','')) rgroups.append(e) #print "R-Groups: ", i, s pind = sorted(list(set(pind)),key=int) pind = map(int,pind) Draw.MolToImage(core,includeAtomNumbers=True,highlightAtoms=pind) HTML('') sim_mat = [] CUTOFF = 0.8 for i,fp in enumerate(fps): tt = DataStructs.BulkTanimotoSimilarity(fps[i],fps) for i,t in enumerate(tt): if tt[i] < CUTOFF: tt[i] = None sim_mat.append(tt) sim_mat=numpy.array(sim_mat) sim_df = pd.DataFrame(sim_mat, columns = dff['Chemical ID'], index=dff['Chemical ID']) sim_df.head() d = sim_df.count().to_dict() nn_df = pd.DataFrame(d.items(),columns=['Chemical ID', '# NNs']) nn_df.set_index('Chemical ID',inplace=True) nn_df.head() pd.merge(dff[['Chemical ID','ROMol']],nn_df,left_index=True, right_index=True, sort=False).sort('# NNs',ascending=False).head() from collections import Counter rgroups = [r.replace('[15*]','[11*]').replace('[14*]','[12*]') for r in rgroups] c = Counter(rgroups) rgroup_counts = pd.DataFrame(c.most_common(20),columns=['RGroups','Frequency']) rgroup_counts.head(10) PandasTools.AddMoleculeColumnToFrame(rgroup_counts, smilesCol = 'RGroups') rgroup_counts.head(10) rgroup_counts[[r.startswith('[12*]') for r in rgroup_counts.RGroups]].head(10) dff.ix[dff.InChIKey == 'SECKRCOLJRRGGV-UHFFFAOYSA-N',['SCHEMBL ID','ROMol']]