In [1]:

from itertools import combinations

import numpy as np
import pandas as pd
import scikit_posthocs as sp
import seaborn as sns
from bootstrap import bootstrap_error_estimate
from scipy.stats import gmean
# Model comparison imports
from delong_ci import calc_auc_ci
from mlxtend.evaluate import cochrans_q, mcnemar, mcnemar_table
from mlxtend.evaluate import paired_ttest_5x2cv
#RDKit imports
from rdkit import Chem
from rdkit import Chem
from rdkit.Chem import AllChem
# ML imports
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tqdm.notebook import tqdm
import chembl_downloader

import matplotlib.pyplot as plt
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
sns.set_style('whitegrid')
sns.set_context('talk')

Query the ChEMBL database for hERG data.

In [2]:

query = """select canonical_smiles, cs.molregno, md.chembl_id as mol_chembl_id, standard_relation, standard_value,
standard_type, standard_units, description, td.organism, assay_type, confidence_score,
td.pref_name, td.chembl_id as tgt_chembl_id
from activities act
join assays ass on act.assay_id = ass.assay_id
join target_dictionary td on td.tid = ass.tid
join compound_structures cs on cs.molregno = act.molregno
join molecule_dictionary md on md.molregno = cs.molregno
where ass.tid = 165
and assay_type in ('B','F')
and standard_value is not null
and standard_units = 'nM'
and act.standard_relation is not null
and standard_type = 'IC50'
and standard_relation = '='"""

Make a reproducible query ChEMBL database.

In [3]:

df_ok = chembl_downloader.query(query)

A quick sanity check to ensure that we extracted the data correctly.

In [4]:

df_ok.head()

Out[4]:

	canonical_smiles	molregno	mol_chembl_id	standard_relation	standard_value	standard_type	standard_units	description	organism	assay_type	confidence_score	pref_name	tgt_chembl_id
0	O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...	152751	CHEMBL1108	=	32.2	IC50	nM	K+ channel blocking activity in human embryoni...	Homo sapiens	F	9	HERG	CHEMBL240
1	O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...	1543376	CHEMBL2368925	=	5950.0	IC50	nM	K+ channel blocking activity in human embryoni...	Homo sapiens	F	9	HERG	CHEMBL240
2	COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...	1219	CHEMBL6966	=	143.0	IC50	nM	K+ channel blocking activity in human embryoni...	Homo sapiens	F	9	HERG	CHEMBL240
3	CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...	152728	CHEMBL1107	=	196.0	IC50	nM	K+ channel blocking activity in Chinese hamste...	Homo sapiens	F	8	HERG	CHEMBL240
4	CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1	110803	CHEMBL998	=	173.0	IC50	nM	K+ channel blocking activity in human embryoni...	Homo sapiens	F	9	HERG	CHEMBL240

Aggregate the results by taking the geometric mean of replicates.

In [5]:

grouper =  df_ok.groupby(["canonical_smiles","molregno"])
data_df = grouper['standard_value'].apply(gmean).to_frame(name = 'IC50').reset_index()

Add a new column with the pIC50

In [6]:

data_df['pIC50'] = -np.log10(data_df.IC50*1e-9)

Set the "Active" field to 1 if the pIC50 >= 5 (10uM), otherwise 0

Look at counts of active and inactive molecules

In [7]:

data_df['Active'] = [1 if x >= 5 else 0 for x in data_df.pIC50]
data_df['Active'].value_counts()

Out[7]:

1    4601
0    2274
Name: Active, dtype: int64

Visualize the activity distribution.

In [8]:

sns.displot(data=data_df,x='pIC50',hue='Active')
plt.show()

Define a function to get a Morgan fingerprint from a SMILES string

In [9]:

def gen_fp(smi):
    mol = Chem.MolFromSmiles(smi)
    fp = None
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol,2)
    return fp

Enable the "progress_apply" function that lets us use a progress bar.

In [10]:

tqdm.pandas()

In [11]:

data_df['fp'] = data_df.canonical_smiles.apply(gen_fp)

Remove rows where we didn't successfully generate a fingerprint.

In [12]:

num_orig_rows = len(data_df)
data_df.dropna(inplace=True)
num_filtered_rows = len(data_df)
num_orig_rows, num_filtered_rows

Out[12]:

(6875, 6875)

Build a quick ML model as a test.

In [13]:

train, test = train_test_split(data_df)
train_X = np.stack(train.fp)
train_y = train.Active.values
test_X = np.stack(test.fp)
test_Y = test.Active.values
lgbm = LGBMClassifier()
lgbm.fit(train_X,train_y)
pred = lgbm.predict(test_X)
auc = roc_auc_score(test_Y,pred)
auc

Out[13]:

0.7055758053840365

In [14]:

prob = lgbm.predict_proba(test_X)[:,1]
false_pos_rate, true_pos_rate, thresholds = roc_curve(test_Y,prob)
ax = sns.lineplot(x=false_pos_rate,y=true_pos_rate)
ax.set_xlabel("True Positive Rate")
ax.set_ylabel("False Positive Rate")
# add the unity line
linemin = 0
linemax = 1
ax.plot([linemin,linemax],[linemin,linemax],color="grey",linewidth=2,linestyle="--");

For 10 folds of cross-validation loop over the different model types, train and test models.

In [15]:

method_list = [KNeighborsClassifier, RandomForestClassifier, LGBMClassifier]
method_name_list = [x.__name__.replace("Classifier","") for x in method_list]
truth_list = []
pred_list = []
prob_list = []
cv_cycles = 10
for i in tqdm(range(0,cv_cycles)):
    train, test = train_test_split(data_df)
    train_X = np.stack(train.fp)
    train_y = train.Active.values
    test_X = np.stack(test.fp)
    test_y = test.Active.values
    cycle_pred = []
    cycle_prob = []
    for method, method_name in zip(method_list, method_name_list):
        if method_name == "XGB":
            cls = method(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
        else:
            cls = method(n_jobs=-1)
        cls.fit(train_X, train_y)
        cycle_pred.append(cls.predict(test_X))
        cycle_prob.append(cls.predict_proba(test_X))

    truth_list.append(test.Active.values)        
    pred_list.append(cycle_pred)
    prob_list.append(cycle_prob)

  0%|          | 0/10 [00:00<?, ?it/s]

Build a dataframe with the AUC values collected above.

In [16]:

auc_result = []
for truth, prob in zip(truth_list,prob_list):
    for name, p in zip(method_name_list, prob):
        auc_result.append([name,roc_auc_score(truth,p[:,1])])
auc_df = pd.DataFrame(auc_result,columns=["Method","AUC"])

Here's what people typically do in the literature. Construct a bar char with the mean AUC for each of the methods, add a "whisker" to show the standard deviation. In my mind this is not a good way to present data. It doesn't adequately reflect the performance across cross validation folds and doesn't show whether the differences between methods are statistically significant.

In [17]:

ax = sns.barplot(x="Method",y="AUC",data=auc_df)
labels = [x.get_text() for x in ax.get_xticklabels()]
ax.set(xticklabels=labels)
ax.set(ylim=[0,1])
ax.set(xlabel="")
plt.show()

Here's a somewhat better approach where we represent the distribution of AUC values as box plots. This is somewhat better, but we're still not making an adequate comparison between methods.

In [18]:

sns.boxplot(x="Method",y="AUC",data=auc_df)
plt.show()

We can calculate a 95% confidence interval around each AUC using DeLong's method.

In [19]:

auc_result = []
for cycle, [truth, prob] in enumerate(zip(truth_list,prob_list)):
    for name, p in zip(method_name_list, prob):
        truth = np.array([int(x) for x in truth])
        auc, (lb, ub) = calc_auc_ci(truth,p[:,1])
        auc_result.append([cycle,name, auc, lb, ub])
auc_ci_df = pd.DataFrame(auc_result,columns=["Cycle","Method","AUC","LB","UB"])
auc_ci_df.head()

Out[19]:

	Cycle	Method	AUC	LB	UB
0	0	KNeighbors	0.788374	0.766868	0.809879
1	0	RandomForest	0.820932	0.800811	0.841053
2	0	LGBM	0.814247	0.793943	0.834551
3	1	KNeighbors	0.803592	0.782375	0.824808
4	1	RandomForest	0.830648	0.810878	0.850418

Define a routine for displaying the AUC values and the associated 95% confidence intervals.

In [20]:

def ci_pointplot(input_df, x_col="Cycle", y_col="AUC", hue_col="Method", lb_col="LB", ub_col="UB"):
    dodge_val = 0.25
    palette_name = "deep"
    cv_cycles = len(input_df[x_col].unique())
    fig, ax = plt.subplots(1,1,figsize=(10, 5))
    g = sns.pointplot(
        x=x_col, y=y_col, hue=hue_col, data=input_df, dodge=dodge_val, join=False, palettte=palette_name, ax=ax)
    colors = sns.color_palette(palette_name, len(input_df.Method.unique())) * cv_cycles
    ax.axvline(0.5, ls="--", c="gray")
    for x in np.arange(0.5, cv_cycles, 1):
        ax.axvline(x, ls="--", c="gray")
    y_val = input_df[y_col]
    lb = y_val - input_df[lb_col]
    ub = input_df[ub_col] - y_val
    x_pos = []
    for i in range(0, cv_cycles):
        x_pos += [i - dodge_val / 2, i, i + dodge_val / 2]
    _ = ax.errorbar(x_pos, y_val, yerr=[lb, ub], fmt="none", capsize=0, ecolor=colors)

In [21]:

ci_pointplot(auc_ci_df)
plt.show()

In addition to using an analytic method for calculating confidence intervals, we can also bootstrap an estimate.

In [22]:

bootstrap_result = []
with tqdm(total=len(truth_list)) as pbar:
    for cycle,[truth,probs] in enumerate(zip(truth_list,prob_list)):
        for name,p in zip(method_name_list,probs):
            auc = roc_auc_score(truth,p[:,1])
            lb,ub = bootstrap_error_estimate(truth,p[:,1],roc_auc_score)
            bootstrap_result.append([cycle,name,auc,lb,ub])
        pbar.update(1)
bootstrap_df = pd.DataFrame(bootstrap_result,columns=["Cycle","Method","AUC","LB","UB"])

  0%|          | 0/10 [00:00<?, ?it/s]

In [23]:

ci_pointplot(bootstrap_df)
plt.show()

5x2 cross validation

In [24]:

X = np.stack(data_df.fp)
y = data_df.Active.values
classifier_list = []
for method, name in zip(method_list, method_name_list):
    if name == "XGB":
        classifier_list.append(method(use_label_encoder=False, eval_metric='logloss', n_jobs=-1))
    else:
        classifier_list.append(method(n_jobs=-1))
print(f"{'Method_1':12s} {'Method_2':12s} {'p-value'}")
for a,b in tqdm(combinations(zip(classifier_list,method_name_list),2)):
    clf1,name1 = a
    clf2,name2 = b
    t, p = paired_ttest_5x2cv(estimator1=clf1,estimator2=clf2,X=X, y=y, scoring="roc_auc")
    tqdm.write(f"{name1:12s} {name2:12s} {p:.3f}")

Method_1     Method_2     p-value

0it [00:00, ?it/s]

KNeighbors   RandomForest 0.001
KNeighbors   LGBM         0.023
RandomForest LGBM         0.281

McNemar's test

In [25]:

mc_result = []
for truth, pred in zip(truth_list,pred_list): 
    for i,j in combinations(range(len(method_list)),2):
        mc, mc_pvalue = mcnemar(mcnemar_table(truth, pred[i], pred[j]))
        mc_result.append([method_name_list[i],method_name_list[j], mc_pvalue])
mc_df = pd.DataFrame(mc_result,columns=["Method_1","Method_2","p_value"])
mc_df['Combo'] = mc_df.Method_1 + "_" + mc_df.Method_2
for k,v in mc_df.groupby("Combo"):
    print(k,v.p_value.mean())

KNeighbors_LGBM 0.3483030894730388
KNeighbors_RandomForest 0.2003823576958655
RandomForest_LGBM 0.37455199864849836

In [26]:

len(mc_result)

Out[26]:

In [27]:

alpha = 0.05/len(pred_list[0])
alpha

Out[27]:

0.016666666666666666

In [28]:

ax = sns.boxplot(x="p_value",y="Combo",data=mc_df)
ax.set(ylabel="",xlabel="p-value")
_ = ax.axvline(alpha,c="red",ls="--")