import pandas as pd from rdkit import Chem,rdBase from rdkit.Chem import PandasTools print 'PANDAS version ',pd.__version__ data = pd.read_table(open('smiles_cas_N6512.smi','r'),header=None,names=['smiles','cas','mutagenic']) data.head() data = data.set_index('cas') data.head() data.describe() PandasTools.AddMoleculeColumnToFrame(data,smilesCol='smiles',molCol='molecule',includeFingerprints=False) data.head(2) data = data.ix[data['molecule'].notnull()] data.describe() data.head(2) data.groupby('mutagenic').describe().unstack() #define a structure pattern from rdkit.Chem.Draw import IPythonConsole nitroso = Chem.MolFromSmiles('N=O') nitroso data.groupby(data['molecule'] >= nitroso).describe().unstack() polyarom = Chem.MolFromSmiles('c1cccc2c1cccc2') polyarom data.groupby(data['molecule'] >= polyarom).describe().unstack() from rdkit.Chem import AllChem from rdkit import DataStructs class FP: def __init__(self, fp): self.fp = fp def __str__(self): return self.fp.__str__() def computeFP(x): #compute depth-2 morgan fingerprint hashed to 1024 bits fp = AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=1024) res = numpy.zeros(len(fp),numpy.int32) #convert the fingerprint to a numpy array and wrap it into the dummy container DataStructs.ConvertToNumpyArray(fp,res) return FP(res) data['FP'] = data.apply(lambda row: computeFP(row['molecule']), axis=1) #filter potentially failed fingerprint computations data = data.ix[data['FP'].notnull()] import random rand = random.Random() rand.seed(42) train = rand.sample(data.index, len(data)/2) trainData = data.ix[train] testData = data.drop(train) from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier(random_state = 42) #resolve wrapped fingerprints X = [x.fp for x in trainData['FP']] y = trainData['mutagenic'] model.fit(X,y) #resolve wrapped fingerprints and apply the model on the test data prediction = model.predict([x.fp for x in testData['FP']]) from sklearn.metrics import metrics print metrics.confusion_matrix(testData['mutagenic'],prediction) print metrics.classification_report(testData['mutagenic'],prediction) testData['prediction'] = model.predict([x.fp for x in testData['FP']]) testData['probability'] = [p[1] for p in model.predict_proba([x.fp for x in testData['FP']])] testData.head(2) testData.sort(columns='probability').head(2) testData.sort(columns='probability',ascending=False).head(2) #assign the predicted probabilities to discrete bins testData['binnedProb'] = pd.cut(testData['probability'],bins=[-0.1,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.1],labels=False) temp = testData.groupby(['binnedProb','mutagenic'])['mutagenic'].size().unstack() print temp temp.plot(kind='bar',stacked=True,) polyarom = Chem.MolFromSmiles('c1cccc2c1cccc2') polyarom testData.groupby(['binnedProb',testData['molecule'] >= polyarom])['mutagenic'].size().unstack().plot(kind='bar',stacked=True,) temp = testData.copy() temp['containsMotif'] = temp['molecule'] >= polyarom temp.boxplot('probability',by='containsMotif') #del trainData['explFP'] def addExplFP(df,molColumn): fpCache = [] for mol in df[molColumn]: res = AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024) fpCache.append(res) arr = np.empty((len(df),), dtype=np.object) arr[:]=fpCache S = pd.Series(arr,index=df.index,name='explFP') return df.join(pd.DataFrame(S)) trainData = addExplFP(trainData,'molecule') trainData.head(2) from rdkit import DataStructs fpList = trainData['explFP'].tolist() dm=[] for i,fp in enumerate(fpList): dm.extend(DataStructs.BulkTanimotoSimilarity(fp,fpList[1+i:],returnDistance=True)) dm = array(dm) def convertToNumpy(df,fpCol): fpCache = [] for fp in df[fpCol]: res = numpy.zeros(len(fp),numpy.int32) DataStructs.ConvertToNumpyArray(fp,res) fpCache.append(res) ''' it is necessary to constructs an empty object array in advance and fill that later, because directly initializing an array with the fingerprint would trigger the numpy type recognition and result in a array of integers that again would trigger pandas to construct a Series object per bit position ''' arr = np.empty((len(df),), dtype=np.object) arr[:]=fpCache S = pd.Series(arr,index=df.index,name='npFP') return df.join(pd.DataFrame(S)) trainData = convertToNumpy(trainData,'explFP') trainData.head() model = RandomForestClassifier() #resolve wrapped fingerprints #X = [x.fp for x in trainData['FP']] #y = trainData['mutagenic'] model.fit(np.vstack(trainData['npFP']),trainData['mutagenic']) from rdkit import DataStructs from rdkit.ML.Cluster import Butina def ClusterFps(fps,cutoff=0.2): # first generate the distance matrix: dists = [] nfps = len(fps) for i in range(1,nfps): sims = DataStructs.BulkTanimotoSimilarity(fps[i],fps[:i]) dists.extend([1-x for x in sims]) # now cluster the data: cs = Butina.ClusterData(dists,nfps,cutoff,isDistData=True) return cs #the explicit bitvector column constructured a few steps earlier, can not directly used to perform a Butina clustering using the RDKit implementation bt_clusters = ClusterFps(trainData['explFP'].tolist(),cutoff=0.3) cluster_map = np.empty(len(trainData)) cluster_map[:]=-1 for _c in bt_clusters: if len(_c)<5: continue centroid = _c[0] for it in _c: cluster_map[it]=centroid trainData['btCluster'] = cluster_map def getMoleculeForCluster(c): return trainData.ix[trainData.index[int(c)]]['molecule'] cData = trainData.ix[trainData['btCluster'] != -1] cData['Centroid'] = cData.apply(lambda row: getMoleculeForCluster(row['btCluster']),axis=1) cData['Centroid CAS'] = cData.apply(lambda row: str(trainData.index[int(row['btCluster'])]),axis=1) cData[['molecule','Centroid','btCluster','Centroid CAS']].tail(2) #add an example numeric column to compute statistics for cData['nAtoms'] = cData.apply(lambda row: row['molecule'].GetNumAtoms(),axis=1) from IPython.display import HTML,display temp = dict([(row[1]['btCluster'],row[1]['Centroid']) for row in cData.iterrows()]) tC = cData.set_index('Centroid',False) #groupby clusterID (string doesn't work well) and compute statistic tC = tC.groupby('btCluster') tempC = tC.describe()[['mutagenic','nAtoms']] #display(HTML(tempC.to_html())) #the MCS was dropped because describe doesn't work on non-numerics, thus it has to be mapped back #every substatistic is now associated with an instance of the MCS def mapMCS(row): return temp[row.name[0]] tempC['Centroid'] = tempC.apply(lambda row: mapMCS(row),axis=1) #display(HTML(tempC.to_html())) #create a second index level using the MCS and reorder the indices index = tempC.index tempC = tempC.set_index('Centroid', append=True) #display(HTML(tempC.to_html())) tempC = tempC.reorder_levels([0,'Centroid',1]) display(HTML(tempC.ix[1802].to_html()))