from itertools import combinations
import numpy as np
import pandas as pd
import scikit_posthocs as sp
import seaborn as sns
from bootstrap import bootstrap_error_estimate
from scipy.stats import gmean
# Model comparison imports
from delong_ci import calc_auc_ci
from mlxtend.evaluate import cochrans_q, mcnemar, mcnemar_table
from mlxtend.evaluate import paired_ttest_5x2cv
#RDKit imports
from rdkit import Chem
from rdkit import Chem
from rdkit.Chem import AllChem
# ML imports
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tqdm.notebook import tqdm
import chembl_downloader
import matplotlib.pyplot as plt
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
sns.set_style('whitegrid')
sns.set_context('talk')
Query the ChEMBL database for hERG data.
query = """select canonical_smiles, cs.molregno, md.chembl_id as mol_chembl_id, standard_relation, standard_value,
standard_type, standard_units, description, td.organism, assay_type, confidence_score,
td.pref_name, td.chembl_id as tgt_chembl_id
from activities act
join assays ass on act.assay_id = ass.assay_id
join target_dictionary td on td.tid = ass.tid
join compound_structures cs on cs.molregno = act.molregno
join molecule_dictionary md on md.molregno = cs.molregno
where ass.tid = 165
and assay_type in ('B','F')
and standard_value is not null
and standard_units = 'nM'
and act.standard_relation is not null
and standard_type = 'IC50'
and standard_relation = '='"""
Make a reproducible query ChEMBL database.
df_ok = chembl_downloader.query(query)
A quick sanity check to ensure that we extracted the data correctly.
df_ok.head()
canonical_smiles | molregno | mol_chembl_id | standard_relation | standard_value | standard_type | standard_units | description | organism | assay_type | confidence_score | pref_name | tgt_chembl_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F... | 152751 | CHEMBL1108 | = | 32.2 | IC50 | nM | K+ channel blocking activity in human embryoni... | Homo sapiens | F | 9 | HERG | CHEMBL240 |
1 | O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c... | 1543376 | CHEMBL2368925 | = | 5950.0 | IC50 | nM | K+ channel blocking activity in human embryoni... | Homo sapiens | F | 9 | HERG | CHEMBL240 |
2 | COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C... | 1219 | CHEMBL6966 | = | 143.0 | IC50 | nM | K+ channel blocking activity in human embryoni... | Homo sapiens | F | 9 | HERG | CHEMBL240 |
3 | CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(... | 152728 | CHEMBL1107 | = | 196.0 | IC50 | nM | K+ channel blocking activity in Chinese hamste... | Homo sapiens | F | 8 | HERG | CHEMBL240 |
4 | CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1 | 110803 | CHEMBL998 | = | 173.0 | IC50 | nM | K+ channel blocking activity in human embryoni... | Homo sapiens | F | 9 | HERG | CHEMBL240 |
Aggregate the results by taking the geometric mean of replicates.
grouper = df_ok.groupby(["canonical_smiles","molregno"])
data_df = grouper['standard_value'].apply(gmean).to_frame(name = 'IC50').reset_index()
Add a new column with the pIC50
data_df['pIC50'] = -np.log10(data_df.IC50*1e-9)
Set the "Active" field to 1 if the pIC50 >= 5 (10uM), otherwise 0
Look at counts of active and inactive molecules
data_df['Active'] = [1 if x >= 5 else 0 for x in data_df.pIC50]
data_df['Active'].value_counts()
1 4601 0 2274 Name: Active, dtype: int64
Visualize the activity distribution.
sns.displot(data=data_df,x='pIC50',hue='Active')
plt.show()
Define a function to get a Morgan fingerprint from a SMILES string
def gen_fp(smi):
mol = Chem.MolFromSmiles(smi)
fp = None
if mol:
fp = AllChem.GetMorganFingerprintAsBitVect(mol,2)
return fp
Enable the "progress_apply" function that lets us use a progress bar.
tqdm.pandas()
data_df['fp'] = data_df.canonical_smiles.apply(gen_fp)
Remove rows where we didn't successfully generate a fingerprint.
num_orig_rows = len(data_df)
data_df.dropna(inplace=True)
num_filtered_rows = len(data_df)
num_orig_rows, num_filtered_rows
(6875, 6875)
Build a quick ML model as a test.
train, test = train_test_split(data_df)
train_X = np.stack(train.fp)
train_y = train.Active.values
test_X = np.stack(test.fp)
test_Y = test.Active.values
lgbm = LGBMClassifier()
lgbm.fit(train_X,train_y)
pred = lgbm.predict(test_X)
auc = roc_auc_score(test_Y,pred)
auc
0.7055758053840365
prob = lgbm.predict_proba(test_X)[:,1]
false_pos_rate, true_pos_rate, thresholds = roc_curve(test_Y,prob)
ax = sns.lineplot(x=false_pos_rate,y=true_pos_rate)
ax.set_xlabel("True Positive Rate")
ax.set_ylabel("False Positive Rate")
# add the unity line
linemin = 0
linemax = 1
ax.plot([linemin,linemax],[linemin,linemax],color="grey",linewidth=2,linestyle="--");
For 10 folds of cross-validation loop over the different model types, train and test models.
method_list = [KNeighborsClassifier, RandomForestClassifier, LGBMClassifier]
method_name_list = [x.__name__.replace("Classifier","") for x in method_list]
truth_list = []
pred_list = []
prob_list = []
cv_cycles = 10
for i in tqdm(range(0,cv_cycles)):
train, test = train_test_split(data_df)
train_X = np.stack(train.fp)
train_y = train.Active.values
test_X = np.stack(test.fp)
test_y = test.Active.values
cycle_pred = []
cycle_prob = []
for method, method_name in zip(method_list, method_name_list):
if method_name == "XGB":
cls = method(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
else:
cls = method(n_jobs=-1)
cls.fit(train_X, train_y)
cycle_pred.append(cls.predict(test_X))
cycle_prob.append(cls.predict_proba(test_X))
truth_list.append(test.Active.values)
pred_list.append(cycle_pred)
prob_list.append(cycle_prob)
0%| | 0/10 [00:00<?, ?it/s]
Build a dataframe with the AUC values collected above.
auc_result = []
for truth, prob in zip(truth_list,prob_list):
for name, p in zip(method_name_list, prob):
auc_result.append([name,roc_auc_score(truth,p[:,1])])
auc_df = pd.DataFrame(auc_result,columns=["Method","AUC"])
Here's what people typically do in the literature. Construct a bar char with the mean AUC for each of the methods, add a "whisker" to show the standard deviation. In my mind this is not a good way to present data. It doesn't adequately reflect the performance across cross validation folds and doesn't show whether the differences between methods are statistically significant.
ax = sns.barplot(x="Method",y="AUC",data=auc_df)
labels = [x.get_text() for x in ax.get_xticklabels()]
ax.set(xticklabels=labels)
ax.set(ylim=[0,1])
ax.set(xlabel="")
plt.show()
Here's a somewhat better approach where we represent the distribution of AUC values as box plots. This is somewhat better, but we're still not making an adequate comparison between methods.
sns.boxplot(x="Method",y="AUC",data=auc_df)
plt.show()
We can calculate a 95% confidence interval around each AUC using DeLong's method.
auc_result = []
for cycle, [truth, prob] in enumerate(zip(truth_list,prob_list)):
for name, p in zip(method_name_list, prob):
truth = np.array([int(x) for x in truth])
auc, (lb, ub) = calc_auc_ci(truth,p[:,1])
auc_result.append([cycle,name, auc, lb, ub])
auc_ci_df = pd.DataFrame(auc_result,columns=["Cycle","Method","AUC","LB","UB"])
auc_ci_df.head()
Cycle | Method | AUC | LB | UB | |
---|---|---|---|---|---|
0 | 0 | KNeighbors | 0.788374 | 0.766868 | 0.809879 |
1 | 0 | RandomForest | 0.820932 | 0.800811 | 0.841053 |
2 | 0 | LGBM | 0.814247 | 0.793943 | 0.834551 |
3 | 1 | KNeighbors | 0.803592 | 0.782375 | 0.824808 |
4 | 1 | RandomForest | 0.830648 | 0.810878 | 0.850418 |
Define a routine for displaying the AUC values and the associated 95% confidence intervals.
def ci_pointplot(input_df, x_col="Cycle", y_col="AUC", hue_col="Method", lb_col="LB", ub_col="UB"):
dodge_val = 0.25
palette_name = "deep"
cv_cycles = len(input_df[x_col].unique())
fig, ax = plt.subplots(1,1,figsize=(10, 5))
g = sns.pointplot(
x=x_col, y=y_col, hue=hue_col, data=input_df, dodge=dodge_val, join=False, palettte=palette_name, ax=ax)
colors = sns.color_palette(palette_name, len(input_df.Method.unique())) * cv_cycles
ax.axvline(0.5, ls="--", c="gray")
for x in np.arange(0.5, cv_cycles, 1):
ax.axvline(x, ls="--", c="gray")
y_val = input_df[y_col]
lb = y_val - input_df[lb_col]
ub = input_df[ub_col] - y_val
x_pos = []
for i in range(0, cv_cycles):
x_pos += [i - dodge_val / 2, i, i + dodge_val / 2]
_ = ax.errorbar(x_pos, y_val, yerr=[lb, ub], fmt="none", capsize=0, ecolor=colors)
ci_pointplot(auc_ci_df)
plt.show()
In addition to using an analytic method for calculating confidence intervals, we can also bootstrap an estimate.
bootstrap_result = []
with tqdm(total=len(truth_list)) as pbar:
for cycle,[truth,probs] in enumerate(zip(truth_list,prob_list)):
for name,p in zip(method_name_list,probs):
auc = roc_auc_score(truth,p[:,1])
lb,ub = bootstrap_error_estimate(truth,p[:,1],roc_auc_score)
bootstrap_result.append([cycle,name,auc,lb,ub])
pbar.update(1)
bootstrap_df = pd.DataFrame(bootstrap_result,columns=["Cycle","Method","AUC","LB","UB"])
0%| | 0/10 [00:00<?, ?it/s]
ci_pointplot(bootstrap_df)
plt.show()
X = np.stack(data_df.fp)
y = data_df.Active.values
classifier_list = []
for method, name in zip(method_list, method_name_list):
if name == "XGB":
classifier_list.append(method(use_label_encoder=False, eval_metric='logloss', n_jobs=-1))
else:
classifier_list.append(method(n_jobs=-1))
print(f"{'Method_1':12s} {'Method_2':12s} {'p-value'}")
for a,b in tqdm(combinations(zip(classifier_list,method_name_list),2)):
clf1,name1 = a
clf2,name2 = b
t, p = paired_ttest_5x2cv(estimator1=clf1,estimator2=clf2,X=X, y=y, scoring="roc_auc")
tqdm.write(f"{name1:12s} {name2:12s} {p:.3f}")
Method_1 Method_2 p-value
0it [00:00, ?it/s]
KNeighbors RandomForest 0.001 KNeighbors LGBM 0.023 RandomForest LGBM 0.281
McNemar's test
mc_result = []
for truth, pred in zip(truth_list,pred_list):
for i,j in combinations(range(len(method_list)),2):
mc, mc_pvalue = mcnemar(mcnemar_table(truth, pred[i], pred[j]))
mc_result.append([method_name_list[i],method_name_list[j], mc_pvalue])
mc_df = pd.DataFrame(mc_result,columns=["Method_1","Method_2","p_value"])
mc_df['Combo'] = mc_df.Method_1 + "_" + mc_df.Method_2
for k,v in mc_df.groupby("Combo"):
print(k,v.p_value.mean())
KNeighbors_LGBM 0.3483030894730388 KNeighbors_RandomForest 0.2003823576958655 RandomForest_LGBM 0.37455199864849836
len(mc_result)
30
alpha = 0.05/len(pred_list[0])
alpha
0.016666666666666666
ax = sns.boxplot(x="p_value",y="Combo",data=mc_df)
ax.set(ylabel="",xlabel="p-value")
_ = ax.axvline(alpha,c="red",ls="--")