#!/usr/bin/env python
# coding: utf-8
"""Active/inactive classification of ChEMBL 5-HT compounds on Morgan fingerprints.

Loads ``chembl_5HT.csv``, labels each compound from its ``pchembl_value``,
converts the ``canonical_smiles`` column into fingerprint bit vectors and
trains a random forest on (a) the raw training split and (b) a
SMOTE-oversampled training split, printing a classification report and a
confusion matrix for each.
"""

# In[1]:

from IPython import get_ipython

# The notebook export called the injected ``get_ipython()`` builtin directly,
# which raises NameError under a plain ``python`` interpreter. The importable
# ``get_ipython`` returns None outside IPython, so the magic is simply skipped
# there and the printed reports can still be produced from a script.
_ipython = get_ipython()
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
from sklearn.decomposition import PCA

# One seed for the split, the oversampler and the classifiers, so that the
# baseline and the resampled runs are comparable and repeatable.
RANDOM_STATE = 123

# Compounds with pchembl_value strictly above this value are labelled active.
# NOTE(review): the original comment said "pIC50 > 8" while the code has always
# compared against 9; the code's threshold is kept -- confirm which is intended.
ACTIVE_THRESHOLD = 9


# In[2]:

df = pd.read_csv('chembl_5HT.csv')
df = df.dropna()  # keep only rows with no missing values in any column


# In[3]:

df.head(2)


# In[4]:

# Binary class label: 1 (active) when pchembl_value > ACTIVE_THRESHOLD, else 0.
# The builtin ``int`` replaces ``np.int``, which was removed in NumPy 1.24.
df['CLS'] = (df.pchembl_value > ACTIVE_THRESHOLD).astype(int)


# In[5]:

pd.plotting.hist_series(df.CLS)


# In[6]:

mols = [Chem.MolFromSmiles(smi) for smi in df.canonical_smiles]

# MolFromSmiles returns None for a SMILES string it cannot parse; passing None
# on to the fingerprint call would raise. Drop those rows from both the
# molecule list and the frame so the features and labels stay aligned.
parsed = np.array([mol is not None for mol in mols], dtype=bool)
if not parsed.all():
    print('Dropping rows with unparsable SMILES:', int((~parsed).sum()))
    df = df.loc[parsed]
    mols = [mol for mol in mols if mol is not None]

# Morgan fingerprint with radius 2, as a bit vector (library-default length).
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]


# In[7]:

def fp2np(fp) -> np.ndarray:
    """Return the RDKit fingerprint *fp* as a NumPy array.

    ``DataStructs.ConvertToNumpyArray`` fills the array it is given in place;
    the zero-length array passed here is expected to be resized by RDKit to
    the fingerprint length (see the RDKit documentation).
    """
    arr = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr


# In[8]:

# Feature matrix: one row per molecule, one column per fingerprint bit.
X = np.array([fp2np(fp) for fp in fps])


# In[9]:

Y = df.CLS.to_numpy()


# In[10]:

# without sampling
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, random_state=RANDOM_STATE, test_size=0.2
)


# In[11]:

print(train_X.shape)
print(train_Y.shape)
print(train_Y.mean())  # fraction of actives in the training split


# In[12]:

rf = RandomForestClassifier(n_estimators=10, random_state=RANDOM_STATE)
rf.fit(train_X, train_Y)
pred_Y = rf.predict(test_X)


# In[13]:

print(classification_report(test_Y, pred_Y))
print(confusion_matrix(test_Y, pred_Y))


# In[ ]:


# In[14]:

# Oversample the training split only; the test split is left untouched.
X_resampled, Y_resampled = SMOTE(random_state=RANDOM_STATE).fit_resample(
    train_X, train_Y
)


# In[15]:

print(X_resampled.shape)
print(Y_resampled.shape)
print(np.mean(Y_resampled))  # fraction of actives after resampling


# In[16]:

rf = RandomForestClassifier(n_estimators=10, random_state=RANDOM_STATE)
rf.fit(X_resampled, Y_resampled)
pred_Y = rf.predict(test_X)


# In[17]:

print(classification_report(test_Y, pred_Y))
print(confusion_matrix(test_Y, pred_Y))


# In[ ]:


# In[18]:
# Oversample the training split with ADASYN; the test split is left untouched.
# Seeded with 123, the same value the train/test split uses, so the run is
# repeatable.
X_resampled, Y_resampled = ADASYN(random_state=123).fit_resample(train_X, train_Y)


# In[19]:

print(X_resampled.shape)
print(Y_resampled.shape)
print(np.mean(Y_resampled))  # fraction of class-1 samples after resampling


# In[20]:

rf = RandomForestClassifier(n_estimators=10, random_state=123)
rf.fit(X_resampled, Y_resampled)
pred_Y = rf.predict(test_X)
# Build the report once and print that same object (it was previously
# computed, left unused, and then recomputed for printing).
clsreport = classification_report(test_Y, pred_Y)
print(clsreport)
print(confusion_matrix(test_Y, pred_Y))


# In[21]:

# Three components are fitted; only the first two are plotted below.
pca = PCA(n_components=3)


# In[22]:

res = pca.fit_transform(X)


# In[23]:

# Colour each point by its class label: 0 -> blue, 1 -> yellow.
col = {0: 'blue', 1: 'yellow'}
# The builtin ``int`` replaces ``np.int``, which was removed in NumPy 1.24.
color = [col[int(i)] for i in Y]
plt.figure(figsize=(10, 7))
plt.scatter(res[:, 0], res[:, 1], c=color, alpha=0.5)


# In[ ]: