#!/usr/bin/env python # coding: utf-8 # # Requirements for running this notebook: # - RDKit # - SQLAlchemy # - NumPy # - Pandas # - PyTables # - A connection to a ChEMBL database! # # ## This notebook extracts data from ChEMBL database and formats it for a multi-task neural network training problem. # In[1]: from rdkit import Chem from rdkit.Chem import AllChem from rdkit.Chem import DataStructs from rdkit.Chem import rdMolDescriptors from sqlalchemy import create_engine from sqlalchemy.sql import text import numpy as np import pandas as pd import tables as tb from tables.atom import ObjectAtom import json # # Get data from ChEMBL # - You can use our SQLite dump: http://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_25/chembl_25_sqlite.tar.gz # In[2]: engine = create_engine('sqlite:///chembl_25/chembl_25_sqlite/chembl_25.db') # ex postgresql: 'postgresql://user:pass@host:5432/chembl_25' qtext = """ SELECT activities.doc_id AS doc_id, activities.standard_value AS standard_value, molecule_hierarchy.parent_molregno AS molregno, compound_structures.canonical_smiles AS canonical_smiles, molecule_dictionary.chembl_id AS chembl_id, target_dictionary.tid AS tid, target_dictionary.chembl_id AS target_chembl_id, protein_family_classification.l1 AS l1, protein_family_classification.l2 AS l2, protein_family_classification.l3 AS l3 FROM activities JOIN assays ON activities.assay_id = assays.assay_id JOIN target_dictionary ON assays.tid = target_dictionary.tid JOIN target_components ON target_dictionary.tid = target_components.tid JOIN component_class ON target_components.component_id = component_class.component_id JOIN protein_family_classification ON component_class.protein_class_id = protein_family_classification.protein_class_id JOIN molecule_dictionary ON activities.molregno = molecule_dictionary.molregno JOIN molecule_hierarchy ON molecule_dictionary.molregno = molecule_hierarchy.molregno JOIN compound_structures ON molecule_hierarchy.parent_molregno = compound_structures.molregno WHERE activities.standard_units = 'nM' AND activities.standard_type IN ('EC50', 'IC50', 'Ki', 'Kd', 'XC50', 'AC50', 'Potency') AND activities.data_validity_comment IS NULL AND activities.standard_relation IN ('=', '<') AND activities.potential_duplicate = 0 AND assays.confidence_score >= 8 AND target_dictionary.target_type = 'SINGLE PROTEIN'""" with engine.begin() as conn: res = conn.execute(text(qtext)) df = pd.DataFrame(res.fetchall()) df.columns = res.keys() df = df.where((pd.notnull(df)), None) # # Drop duplicate activities keeping the activity with lower concentration for each molecule-target pair # In[3]: df = df.sort_values(by=['standard_value', 'molregno', 'tid'], ascending=True) df = df.drop_duplicates(subset=['molregno', 'tid'], keep='first') # save to csv df.to_csv('chembl_activity_data.csv', index=False) # # Set to active/inactive by threshold # - Depending on family type from IDG: https://druggablegenome.net/ProteinFam # # - Kinases: <= 30nM # - GPCRs: <= 100nM # - Nuclear Receptors: <= 100nM # - Ion Channels: <= 10μM # - Non-IDG Family Targets: <= 1μM # # In[4]: def set_active(row): active = 0 if row['standard_value'] <= 1000: active = 1 if row['l1'] == 'Ion channel': if row['standard_value'] <= 10000: active = 1 if row['l2'] == 'Kinase': if row['standard_value'] > 30: active = 0 if row['l2'] == 'Nuclear receptor': if row['standard_value'] > 100: active = 0 if row['l3'] and 'GPCR' in row['l3']: if row['standard_value'] > 100: active = 0 return active df['active'] = df.apply(lambda row: set_active(row), axis=1) # # Filter target data # # - Keep targets mentioned at least in two different docs # - Keep targets with at least 100 active and 100 inactive molecules. Threshold set to 100 to get a 'small' dataset that will train faster on this example. # In[5]: # get targets with at least 100 different active molecules acts = df[df['active'] == 1].groupby(['target_chembl_id']).agg('count') acts = acts[acts['molregno'] >= 100].reset_index()['target_chembl_id'] # get targets with at least 100 different inactive molecules inacts = df[df['active'] == 0].groupby(['target_chembl_id']).agg('count') inacts = inacts[inacts['molregno'] >= 100].reset_index()['target_chembl_id'] # get targets mentioned in at least two docs docs = df.drop_duplicates(subset=['doc_id', 'target_chembl_id']) docs = docs.groupby(['target_chembl_id']).agg('count') docs = docs[docs['doc_id'] >= 2.0].reset_index()['target_chembl_id'] # In[6]: t_keep = set(acts).intersection(set(inacts)).intersection(set(docs)) # get dta for filtered targets activities = df[df['target_chembl_id'].isin(t_keep)] ion = pd.unique(activities[activities['l1'] == 'Ion channel']['tid']).shape[0] kin = pd.unique(activities[activities['l2'] == 'Kinase']['tid']).shape[0] nuc = pd.unique(activities[activities['l2'] == 'Nuclear receptor']['tid']).shape[0] gpcr = pd.unique(activities[activities['l3'].str.contains('GPCR', na=False)]['tid']).shape[0] print('Number of unique targets: ', len(t_keep)) print(' Ion channel: ', ion) print(' Kinase: ', kin) print(' Nuclear receptor: ', nuc) print(' GPCR: ', gpcr) print(' Others: ', len(t_keep) - ion - kin - nuc - gpcr) # In[7]: # save it to a file activities.to_csv('chembl_activity_data_filtered.csv', index=False) # # Prepare the label matrix for the multi-task deep neural network # # - known active = 1 # - known no-active = 0 # - unknown activity = -1, so we'll be able to easilly filter them and won't be taken into account when calculating the loss during model training. # # The matrix is extremely sparse so using sparse matrices (COO/CSR/CSC) should be considered. There are a couple of issues making it a bit tricker than what it should be so we'll keep the example without them. # # - https://github.com/pytorch/pytorch/issues/20248 # - https://github.com/scipy/scipy/issues/7531 # # In[8]: def gen_dict(group): return {tid: act for tid, act in zip(group['target_chembl_id'], group['active'])} group = activities.groupby('chembl_id') temp = pd.DataFrame(group.apply(gen_dict)) mt_df = pd.DataFrame(temp[0].tolist()) mt_df['chembl_id'] = temp.index mt_df = mt_df.where((pd.notnull(mt_df)), -1) # In[9]: structs = activities[['chembl_id', 'canonical_smiles']].drop_duplicates(subset='chembl_id') # drop mols not sanitizing on rdkit structs['romol'] = structs.apply(lambda row: Chem.MolFromSmiles(row['canonical_smiles']), axis=1) structs = structs.dropna() del structs['romol'] # add the structures to the final df mt_df = pd.merge(structs, mt_df, how='inner', on='chembl_id') # In[10]: # save to csv mt_df.to_csv('chembl_multi_task_data.csv', index=False) # # Calc fingeprints and save data to a PyTables H5 file # In[11]: FP_SIZE = 1024 RADIUS = 2 def calc_fp(smiles, fp_size, radius): """ calcs morgan fingerprints as a numpy array. """ mol = Chem.MolFromSmiles(smiles, sanitize=False) mol.UpdatePropertyCache(False) Chem.GetSSSR(mol) fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=fp_size) a = np.zeros((0,), dtype=np.float32) Chem.DataStructs.ConvertToNumpyArray(fp, a) return a # calc fps descs = [calc_fp(smi, FP_SIZE, RADIUS) for smi in mt_df['canonical_smiles'].values] descs = np.asarray(descs, dtype=np.float32) # put all training data in a pytables file with tb.open_file('mt_data.h5', mode='w') as t_file: # set compression filter. It will make the file much smaller filters = tb.Filters(complib='blosc', complevel=5) # save chembl_ids tatom = ObjectAtom() cids = t_file.create_vlarray(t_file.root, 'chembl_ids', atom=tatom) for cid in mt_df['chembl_id'].values: cids.append(cid) # save fps fatom = tb.Atom.from_dtype(descs.dtype) fps = t_file.create_carray(t_file.root, 'fps', fatom, descs.shape, filters=filters) fps[:] = descs del mt_df['chembl_id'] del mt_df['canonical_smiles'] # save target chembl ids tcids = t_file.create_vlarray(t_file.root, 'target_chembl_ids', atom=tatom) for tcid in mt_df.columns.values: tcids.append(tcid) # save labels labs = t_file.create_carray(t_file.root, 'labels', fatom, mt_df.values.shape, filters=filters) labs[:] = mt_df.values # save task weights # each task loss will be weighted inversely proportional to its number of data points weights = [] for col in mt_df.columns.values: c = mt_df[mt_df[col] >= 0.0].shape[0] weights.append(1 / c) weights = np.array(weights) ws = t_file.create_carray(t_file.root, 'weights', fatom, weights.shape) ws[:] = weights # # Open H5 file and show the shape of all collections # In[12]: with tb.open_file('mt_data.h5', mode='r') as t_file: print(t_file.root.chembl_ids.shape) print(t_file.root.target_chembl_ids.shape) print(t_file.root.fps.shape) print(t_file.root.labels.shape) print(t_file.root.weights.shape) # save targets to a json file with open('targets.json', 'w') as f: json.dump(t_file.root.target_chembl_ids[:], f) # In[ ]: