#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
from sklearn.decomposition import PCA


# In[2]:


# Read chembl_5HT.csv into a DataFrame and drop every row that contains a
# missing value.
df = pd.read_csv('chembl_5HT.csv').dropna()


# In[3]:


# Preview the first two rows (the value is only displayed when this runs as a
# notebook cell).
df.head(n=2)


# In[4]:


# Binary activity label: 1 when pchembl_value > 9, otherwise 0.
# NOTE(review): the previous comment described the cut-off as "pIC50 > 8",
# but the code uses 9 -- confirm which threshold is intended.
# The builtin int is used as dtype because the np.int alias was removed in
# NumPy 1.24 and raises AttributeError there.
df['CLS'] = np.array(df.pchembl_value > 9, dtype=int)


# In[5]:


# Histogram of the 0/1 class labels to show how the two classes are balanced.
pd.plotting.hist_series(df['CLS'])


# In[6]:


# Parse every SMILES string into an RDKit molecule, then compute one Morgan
# fingerprint bit vector (radius 2) per molecule.
# NOTE(review): presumably every SMILES parses; a failed parse would hand a
# non-molecule to the fingerprint call -- confirm the input is clean.
mols = list(map(Chem.MolFromSmiles, df['canonical_smiles']))
fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]


# In[7]:


def fp2np(fp):
    """Convert an RDKit fingerprint into a NumPy array.

    Args:
        fp: Fingerprint object accepted by
            ``DataStructs.ConvertToNumpyArray``.

    Returns:
        The NumPy array that ``ConvertToNumpyArray`` filled from ``fp``.
    """
    # Start from an empty array; ConvertToNumpyArray is expected to resize
    # and populate it in place.
    bits = np.zeros(shape=(0,))
    DataStructs.ConvertToNumpyArray(fp, bits)
    return bits


# In[8]:


# Feature matrix: one converted fingerprint array per molecule.
X = np.array(list(map(fp2np, fps)))


# In[9]:


# Label vector: the 0/1 CLS column as a NumPy array.
Y = df['CLS'].to_numpy()


# In[10]:


# Baseline split with no resampling: hold out 20% of the samples for testing,
# using a fixed seed so the partition is repeatable.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
)


# In[11]:


# Training-set shapes, followed by the mean of the 0/1 labels (the share of
# class 1 in the training data).
print(train_X.shape, train_Y.shape, sep='\n')
print(sum(train_Y) / len(train_Y))


# In[12]:


# Baseline model: a 10-tree random forest fitted on the unresampled training
# split, then used to predict the held-out test split.
# random_state matches the seed passed to train_test_split so repeated runs
# build the same forest and report the same metrics.
rf = RandomForestClassifier(n_estimators=10, random_state=123)
rf.fit(train_X, train_Y)
pred_Y = rf.predict(test_X)


# In[13]:


# Per-class precision/recall/F1 summary and the confusion matrix for the
# current predictions on the test split.
print(classification_report(y_true=test_Y, y_pred=pred_Y))
print(confusion_matrix(y_true=test_Y, y_pred=pred_Y))


# In[ ]:


# In[14]:


# Oversample the training split with SMOTE; the test split is left untouched.
# random_state matches the seed passed to train_test_split so the synthetic
# samples are the same on every run.
X_resampled, Y_resampled = SMOTE(random_state=123).fit_resample(train_X, train_Y)


# In[15]:


# Shapes of the resampled training data, followed by the mean of the 0/1
# labels (the share of class 1 after resampling).
print(X_resampled.shape, Y_resampled.shape, sep='\n')
print(sum(Y_resampled) / len(Y_resampled))


# In[16]:


# Fit a 10-tree random forest on the resampled training data and predict the
# untouched test split.
# random_state matches the seed passed to train_test_split so repeated runs
# build the same forest and report the same metrics.
rf = RandomForestClassifier(n_estimators=10, random_state=123)
rf.fit(X_resampled, Y_resampled)
pred_Y = rf.predict(test_X)


# In[17]:


# Per-class precision/recall/F1 summary and the confusion matrix for the
# current predictions on the test split.
print(classification_report(y_true=test_Y, y_pred=pred_Y))
print(confusion_matrix(y_true=test_Y, y_pred=pred_Y))


# In[ ]:


# In[18]:


# Oversample the training split with ADASYN; the test split is left untouched.
# random_state matches the seed passed to train_test_split so the synthetic
# samples are the same on every run.
X_resampled, Y_resampled = ADASYN(random_state=123).fit_resample(train_X, train_Y)


# In[19]:


# Shapes of the resampled training data, followed by the mean of the 0/1
# labels (the share of class 1 after resampling).
print(X_resampled.shape, Y_resampled.shape, sep='\n')
print(sum(Y_resampled) / len(Y_resampled))


# In[20]:


# Fit a 10-tree random forest on the resampled training data and evaluate it
# on the untouched test split.
# random_state matches the seed passed to train_test_split so repeated runs
# build the same forest and report the same metrics.
rf = RandomForestClassifier(n_estimators=10, random_state=123)
rf.fit(X_resampled, Y_resampled)
pred_Y = rf.predict(test_X)
# Build the report once and print the stored text instead of computing the
# same report a second time.
clsreport = classification_report(test_Y, pred_Y)
print(clsreport)
print(confusion_matrix(test_Y, pred_Y))


# In[21]:


# Reduce the fingerprint matrix to three principal components.
pca = PCA(n_components=3)


# In[22]:


res = pca.fit_transform(X)


# In[23]:


# Scatter plot of the first two components, coloured by class label
# (0 -> blue, 1 -> yellow).
# The builtin int() replaces np.int, an alias that was removed in NumPy 1.24
# and raises AttributeError there.
col = {0: 'blue', 1: 'yellow'}
color = [col[int(i)] for i in Y]
plt.figure(figsize=(10, 7))
plt.scatter(res[:, 0], res[:, 1], c=color, alpha=0.5)


# In[ ]: