#!/usr/bin/env python
# coding: utf-8
"""Binary activity classification of the compounds in ``bace.csv``.

Exported Jupyter notebook.  The workflow is:

1. Read ``bace.csv`` and parse the SMILES in its ``mol`` column with RDKit.
2. Encode every molecule as a 1024-bit Morgan fingerprint (radius 2).
3. Label a compound 1 when its ``pIC50`` is greater than 7, else 0.
4. Compare RandomForest, GaussianNB, XGBoost, SVC and a stacking ensemble
   (RF + GNB + XGB with an SVC meta-classifier) by 5-fold stratified
   cross-validation on the training split.
5. Refit the stacking ensemble on the full training split and plot its
   confusion matrix on the held-out 20% test split.
"""

# In[1]:

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # `get_ipython` only exists inside IPython/Jupyter; when the file is run
    # as a plain script there is no inline backend to enable.
    pass

import warnings
warnings.filterwarnings('ignore')
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import numpy as np
import pandas as pd


# In[2]:

df = pd.read_csv("bace.csv")


# In[3]:

df.head(1)


# In[4]:

# One RDKit molecule, one fingerprint and one pIC50 value per CSV row.
mols = [Chem.MolFromSmiles(smi) for smi in df.mol]
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in mols]
pIC50 = list(df.pIC50)


# In[5]:

Draw.MolsToGridImage(
    mols[:10],
    legends=["pIC50 " + str(i) for i in pIC50[:10]],
    molsPerRow=5,
)


# In[6]:

# Turn each bit vector into a NumPy row; ConvertToNumpyArray writes the bits
# into `arr` in place.
X = []
for fp in fps:
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    X.append(arr)
X = np.array(X)
y = np.array(pIC50)
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24, so use the builtin directly.
y_bin = np.asarray(y > 7, dtype=int)


# In[7]:

import matplotlib.pyplot as plt


# In[8]:

plt.scatter(range(len(y_bin)), y_bin)


# In[9]:

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier
from mlxtend.plotting import plot_decision_regions
from mlxtend.plotting import plot_confusion_matrix
import numpy as np


# In[10]:

# Seed the split like the estimators below so reported metrics are
# reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X, y_bin, test_size=0.2, random_state=0
)


# In[11]:

clf1 = RandomForestClassifier(random_state=794)
clf2 = GaussianNB()
clf3 = XGBClassifier(random_state=0)
clf4 = SVC(random_state=0)
clflist = ["RF", "GNB", "XGB", "SVC", "SCLF"]


# In[12]:

sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=clf4)


# In[13]:

skf = StratifiedKFold(n_splits=5)


# In[14]:

# Cross-validate every classifier on the training split only; the test split
# stays untouched until the final evaluation.
for j, (train_idx, test_idx) in enumerate(skf.split(x_train, y_train)):
    for name, clf in zip(clflist, [clf1, clf2, clf3, clf4, sclf]):
        clf.fit(x_train[train_idx], y_train[train_idx])
        ypred = clf.predict(x_train[test_idx])
        acc = accuracy_score(y_train[test_idx], ypred)
        b_acc = balanced_accuracy_score(y_train[test_idx], ypred)
        print("round {}".format(j))
        print(name)
        print("accuracy {}".format(acc))
        print("balanced accuracy {}".format(b_acc))
        print("="*20)


# In[15]:

# After the loop `sclf` is still fitted on the last CV fold only (4/5 of the
# training data).  Refit on the whole training split before scoring the
# held-out test set.
sclf.fit(x_train, y_train)
ypred = sclf.predict(x_test)
plot_confusion_matrix(confusion_matrix(y_test, ypred))


# In[ ]: