#!/usr/bin/env python
# coding: utf-8

# In[1]:

from itertools import combinations

import mysql.connector as sql
import numpy as np
import pandas as pd
import scikit_posthocs as sp
import seaborn as sns
from bootstrap import bootstrap_error_estimate
from scipy.stats import gmean

# Model comparison imports
from delong_ci import calc_auc_ci
from mlxtend.evaluate import cochrans_q, mcnemar, mcnemar_table
from mlxtend.evaluate import paired_ttest_5x2cv

# RDKit imports
from rdkit import Chem
from rdkit.Chem import AllChem

# ML imports
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tqdm.notebook import tqdm


# Establish a connection to a local version of the ChEMBL database.

# In[2]:

con = sql.connect(host='localhost', database='chembl_26', user='pwalters', password='itsasecret')
cursor = con.cursor()


# Query the ChEMBL database for hERG data.

# In[3]:

query = """select canonical_smiles, cs.molregno, md.chembl_id as mol_chembl_id, standard_relation,
standard_value, standard_type, standard_units, description, td.organism, assay_type,
confidence_score, td.pref_name, td.chembl_id as tgt_chembl_id
from activities act
join assays ass on act.assay_id = ass.assay_id
join target_dictionary td on td.tid = ass.tid
join compound_structures cs on cs.molregno = act.molregno
join molecule_dictionary md on md.molregno = cs.molregno
where ass.tid = 165
and assay_type in ('B','F')
and standard_value is not null
and standard_units = 'nM'
and act.standard_relation is not null
and standard_type = 'IC50'
and standard_relation = '='"""


# In[4]:

df_ok = pd.read_sql(query, con=con)


# A quick sanity check to ensure that we extracted the data correctly.

# In[5]:

df_ok.head()


# Aggregate the results by taking the geometric mean of replicate measurements for the same structure.

# In[6]:

grouper = df_ok.groupby(["canonical_smiles", "molregno"])
data_df = grouper['standard_value'].apply(gmean).to_frame(name='IC50').reset_index()


# Add a new column with the pIC50.

# In[7]:

data_df['pIC50'] = -np.log10(data_df.IC50 * 1e-9)


# Set the "Active" field to 1 if the pIC50 >= 5 (i.e. IC50 <= 10uM), otherwise 0, then
# look at the counts of active and inactive molecules.

# In[8]:

data_df['Active'] = [1 if x >= 5 else 0 for x in data_df.pIC50]
data_df['Active'].value_counts()


# Visualize the activity distribution.

# In[9]:

sns.set_style('whitegrid')
sns.set_context('talk')
sns.displot(data=data_df, x='pIC50', hue='Active', height=7);


# Define a function to generate a Morgan fingerprint from a SMILES string.

# In[10]:

def gen_fp(smi):
    mol = Chem.MolFromSmiles(smi)
    fp = None
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
    return fp


# Enable "progress_apply", which lets us use a progress bar with Pandas.

# In[11]:

tqdm.pandas()


# In[12]:

data_df['fp'] = data_df.canonical_smiles.progress_apply(gen_fp)


# Remove rows where we didn't successfully generate a fingerprint.

# In[13]:

num_orig_rows = len(data_df)
data_df.dropna(inplace=True)
num_filtered_rows = len(data_df)
num_orig_rows, num_filtered_rows
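
# As a quick sanity check on the fingerprints, we can look at one in isolation. This is a
# minimal sketch; the aspirin SMILES below is just an illustrative input, and the values
# reflect the RDKit defaults for the call in gen_fp (2048 bits).

# In[ ]:

example_fp = gen_fp("CC(=O)Oc1ccccc1C(=O)O")  # aspirin, used here only for illustration
# the result is an ExplicitBitVect: total bits and how many are set
example_fp.GetNumBits(), example_fp.GetNumOnBits()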

# Build a quick ML model as a test.

# In[14]:

train, test = train_test_split(data_df)
train_X = np.stack(train.fp)
train_y = train.Active.values
test_X = np.stack(test.fp)
test_y = test.Active.values
lgbm = LGBMClassifier()
lgbm.fit(train_X, train_y)
pred = lgbm.predict(test_X)
auc = roc_auc_score(test_y, pred)
auc


# In[15]:

prob = lgbm.predict_proba(test_X)[:, 1]
false_pos_rate, true_pos_rate, thresholds = roc_curve(test_y, prob)
ax = sns.lineplot(x=false_pos_rate, y=true_pos_rate)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
# add the unity line
linemin = 0
linemax = 1
ax.plot([linemin, linemax], [linemin, linemax], color="grey", linewidth=2, linestyle="--");


# For 10 cross-validation cycles (random train/test splits), loop over the different model types, then train and test each model.

# In[16]:

method_list = [KNeighborsClassifier, RandomForestClassifier, LGBMClassifier]
method_name_list = [x().__class__.__name__ for x in method_list]
method_name_list = [x.replace("Classifier", "") for x in method_name_list]
truth_list = []
pred_list = []
prob_list = []
cv_cycles = 10
for i in tqdm(range(0, cv_cycles)):
    train, test = train_test_split(data_df)
    train_X = np.stack(train.fp)
    train_y = train.Active.values
    test_X = np.stack(test.fp)
    test_y = test.Active.values
    cycle_pred = []
    cycle_prob = []
    for method, method_name in zip(method_list, method_name_list):
        # XGBClassifier needs a couple of extra arguments; this branch only
        # fires if it is added to method_list
        if method_name == "XGB":
            cls = method(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
        else:
            cls = method(n_jobs=-1)
        cls.fit(train_X, train_y)
        cycle_pred.append(cls.predict(test_X))
        cycle_prob.append(cls.predict_proba(test_X))
    truth_list.append(test_y)
    pred_list.append(cycle_pred)
    prob_list.append(cycle_prob)


# Build a dataframe with the AUC values collected above.

# In[17]:

auc_result = []
for truth, prob in zip(truth_list, prob_list):
    for name, p in zip(method_name_list, prob):
        auc_result.append([name, roc_auc_score(truth, p[:, 1])])
auc_df = pd.DataFrame(auc_result, columns=["Method", "AUC"])


# Here's what people typically do in the literature: construct a bar chart with the mean AUC for each of the methods and add a "whisker" to show the standard deviation. **In my mind this is not a good way to present data**. It doesn't adequately reflect the performance across cross-validation cycles, and it doesn't show whether the differences between methods are statistically significant.

# In[18]:

sns.set(rc={'figure.figsize': (6, 6)})
sns.set_style('whitegrid')
sns.set_context('talk')
ax = sns.barplot(x="Method", y="AUC", data=auc_df)
labels = [x.get_text() for x in ax.get_xticklabels()]
ax.set(xticklabels=labels)
ax.set(ylim=[0, 1])
_ = ax.set(xlabel="")


# A somewhat better approach is to represent the distribution of AUC values as box plots. This shows the spread across cycles, but we're still not making a statistical comparison between methods.

# In[19]:

sns.boxplot(x="Method", y="AUC", data=auc_df)


# We can calculate a 95% confidence interval around each AUC using [DeLong's method](https://github.com/yandexdataschool/roc_comparison/blob/master/compare_auc_delong_xu.py).

# In[20]:

auc_result = []
for cycle, [truth, prob] in enumerate(zip(truth_list, prob_list)):
    for name, p in zip(method_name_list, prob):
        truth = np.array([int(x) for x in truth])
        auc, (lb, ub) = calc_auc_ci(truth, p[:, 1])
        auc_result.append([cycle, name, auc, lb, ub])
auc_ci_df = pd.DataFrame(auc_result, columns=["Cycle", "Method", "AUC", "LB", "UB"])
auc_ci_df.head()
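
# For intuition about where these intervals come from, we can also compute a simpler analytic
# CI with the Hanley-McNeil (1982) approximation. This is a minimal sketch for comparison with
# calc_auc_ci, not part of the DeLong implementation; hanley_mcneil_ci is a hypothetical
# helper name introduced here.

# In[ ]:

def hanley_mcneil_ci(y_true, y_prob, z=1.96):
    # Hanley-McNeil standard error approximation for the AUC;
    # a rougher alternative to DeLong's method, shown here for intuition
    auc = roc_auc_score(y_true, y_prob)
    n1 = sum(y_true == 1)  # positives
    n2 = sum(y_true == 0)  # negatives
    q1 = auc / (2 - auc)
    q2 = 2 * auc**2 / (1 + auc)
    se = np.sqrt((auc * (1 - auc) + (n1 - 1) * (q1 - auc**2) + (n2 - 1) * (q2 - auc**2)) / (n1 * n2))
    return auc - z * se, auc + z * se

hanley_mcneil_ci(np.array(truth_list[0]), prob_list[0][0][:, 1])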

# Define a routine for displaying the AUC values and the associated 95% confidence intervals.

# In[21]:

def ci_pointplot(input_df, x_col="Cycle", y_col="AUC", hue_col="Method", lb_col="LB", ub_col="UB"):
    dodge_val = 0.25
    palette_name = "deep"
    cv_cycles = len(input_df[x_col].unique())
    ax = sns.pointplot(x=x_col, y=y_col, hue=hue_col, data=input_df,
                       dodge=dodge_val, join=False, palette=palette_name)
    colors = sns.color_palette(palette_name, len(input_df[hue_col].unique())) * cv_cycles
    # draw dashed separators between cross-validation cycles
    for x in np.arange(0.5, cv_cycles, 1):
        ax.axvline(x, ls="--", c="gray")
    # seaborn's pointplot doesn't draw asymmetric error bars, so add them with errorbar
    y_val = input_df[y_col]
    lb = y_val - input_df[lb_col]
    ub = input_df[ub_col] - y_val
    x_pos = []
    for i in range(0, cv_cycles):
        x_pos += [i - dodge_val / 2, i, i + dodge_val / 2]
    _ = ax.errorbar(x_pos, y_val, yerr=[lb, ub], fmt="none", capsize=0, color=colors)


# In[22]:

sns.set(rc={'figure.figsize': (14, 7)})
sns.set(font_scale=1.5)
sns.set_style('white')
ci_pointplot(auc_ci_df)


# In addition to using an analytic method for calculating confidence intervals, we can also bootstrap an estimate.

# In[23]:

bootstrap_result = []
with tqdm(total=len(truth_list)) as pbar:
    for cycle, [truth, probs] in enumerate(zip(truth_list, prob_list)):
        for name, p in zip(method_name_list, probs):
            auc = roc_auc_score(truth, p[:, 1])
            lb, ub = bootstrap_error_estimate(truth, p[:, 1], roc_auc_score)
            bootstrap_result.append([cycle, name, auc, lb, ub])
        pbar.update(1)
bootstrap_df = pd.DataFrame(bootstrap_result, columns=["Cycle", "Method", "AUC", "LB", "UB"])


# In[24]:

ci_pointplot(bootstrap_df)


# Compare each pair of classifiers with the [5x2 cross-validation paired t-test](https://sci2s.ugr.es/keel/pdf/algorithm/articulo/dietterich1998.pdf) proposed by Dietterich.

# In[ ]:

X = np.stack(data_df.fp)
y = data_df.Active.values
classifier_list = []
for method, name in zip(method_list, method_name_list):
    if name == "XGB":
        classifier_list.append(method(use_label_encoder=False, eval_metric='logloss', n_jobs=-1))
    else:
        classifier_list.append(method(n_jobs=-1))
print(f"{'Method_1':12s} {'Method_2':12s} {'p-value'}")
for a, b in combinations(zip(classifier_list, method_name_list), 2):
    clf1, name1 = a
    clf2, name2 = b
    t, p = paired_ttest_5x2cv(estimator1=clf1, estimator2=clf2, X=X, y=y, scoring="roc_auc")
    print(f"{name1:12s} {name2:12s} {p:.3f}")


# Compare each pair of classifiers on each cross-validation cycle with McNemar's test.

# In[ ]:

mc_result = []
for truth, pred in zip(truth_list, pred_list):
    for i, j in combinations(range(len(method_list)), 2):
        mc, mc_pvalue = mcnemar(mcnemar_table(truth, pred[i], pred[j]))
        mc_result.append([method_name_list[i], method_name_list[j], mc_pvalue])
mc_df = pd.DataFrame(mc_result, columns=["Method_1", "Method_2", "p_value"])
mc_df['Combo'] = mc_df.Method_1 + "_" + mc_df.Method_2
for k, v in mc_df.groupby("Combo"):
    print(k, v.p_value.mean())


# In[ ]:

len(mc_result)


# Apply a Bonferroni correction to the significance threshold to account for the multiple comparisons.

# In[ ]:

alpha = 0.05 / len(pred_list[0])
alpha


# In[ ]:

ax = sns.boxplot(x="p_value", y="Combo", data=mc_df)
ax.set(ylabel="", xlabel="p value")
_ = ax.axvline(alpha, c="red", ls="--")
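
# Since cochrans_q is already imported from mlxtend, we can also run Cochran's Q test, an
# omnibus test of whether all the classifiers have the same error rate, as a companion to the
# pairwise McNemar comparisons. A minimal sketch, run on the first cross-validation cycle only.

# In[ ]:

# Cochran's Q: omnibus test that all classifiers have the same proportion of errors
q, q_pvalue = cochrans_q(truth_list[0], *pred_list[0])
print(f"Q: {q:.3f} p-value: {q_pvalue:.3e}")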