#!/usr/bin/env python # coding: utf-8 # # Parse the BindingDB tsv export # # This notebook peforms the following processing steps on the [BindingDB](http://www.bindingdb.org/bind/index.jsp) export: # # + processes affinities to floats # + converts to entrez genes # + simplifies observation into essential fields # # See the corresponding [Thinklab discussion](http://doi.org/10.15363/thinklab.d53) for more information. # In[1]: import os import csv import gzip import pprint import collections import operator import pandas import requests # ## Download BindingDB # In[2]: # Download all data from BindingDB filename = 'BindingDB_All_2015m10.tsv' # ! wget --directory-prefix download https://www.bindingdb.org/bind/downloads/{filename}.zip # ! unzip -d download download/{filename}.zip # ! rm download/{filename}.zip # ! mv download/BindingDB_All.tsv download/{filename} # ! gzip -f download/{filename} get_ipython().system(' shasum download/{filename}.gz') # ## Load uniprot to entrez gene mapping # In[3]: # uniprot to entrez gene mapping url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz' uniprot_df = pandas.read_table(url, compression='gzip') # In[4]: uniprot_to_entrez = dict() for uniprot, entrez in zip(uniprot_df.uniprot, uniprot_df.GeneID): uniprot_to_entrez.setdefault(uniprot, set()).add(str(entrez)) # ## Read and process BindingDB tsv # In[5]: target_fields = [ 'BindingDB Target Chain Sequence', 'PDB ID(s) of Target Chain', 'UniProt (SwissProt) Recommended Name of Target Chain', 'UniProt (SwissProt) Entry Name of Target Chain', 'UniProt (SwissProt) Primary ID of Target Chain', 'UniProt (SwissProt) Secondary ID(s) of Target Chain', 'UniProt (SwissProt) Alternative ID(s) of Target Chain', 'UniProt (TrEMBL) Submitted Name of Target Chain', 'UniProt (TrEMBL) Entry Name of Target Chain', 'UniProt (TrEMBL) Primary ID of Target Chain', 'UniProt (TrEMBL) Secondary ID(s) of Target Chain', 'UniProt (TrEMBL) Alternative ID(s) of Target Chain', ] chains_key = 'Number of Protein Chains in Target (>1 implies a multichain complex)' def read_bindingdb(path, verbose=False, max_rows=None): """ Field documentation: https://www.bindingdb.org/bind/chemsearch/marvin/BindingDB-TSV-Format.pdf """ read_file = gzip.open(path, 'rt') reader = csv.reader(read_file, delimiter='\t') header = next(reader) chains_index = header.index(chains_key) target0_index = chains_index + 1 ligand_fields = header[:chains_index + 1] for j, row in enumerate(reader): if max_rows is not None and j == max_rows: break row = [x if x else None for x in row] ligand_values = row[:chains_index + 1] # Ensure line has sufficient ligand fields if len(row) < chains_index + 1: if verbose: print('Line', j + 2, 'is deficient') continue rowdict = collections.OrderedDict(zip(ligand_fields, ligand_values)) for key in [chains_key]: if key not in rowdict: print(j+2) print(row) print(rowdict) rowdict[key] = int(rowdict[key]) chains = list() assert rowdict[chains_key] == len(row[target0_index:]) / len(target_fields) for i in range(rowdict[chains_key]): i_0 = target0_index + i * len(target_fields) i_1 = target0_index + (i + 1) * len(target_fields) target_values = row[i_0:i_1] chain = collections.OrderedDict(zip(target_fields, target_values)) chains.append(chain) rowdict['chains'] = chains yield rowdict read_file.close() # In[6]: path = os.path.join('download', filename + '.gz') bindingdb_generator = read_bindingdb(path, verbose=True) bindings = list() for i, row in enumerate(bindingdb_generator): #if i > 10000: # break if len(row['chains']) != 1: continue chain, = row['chains'] uniprots = chain['UniProt (SwissProt) Primary ID of Target Chain'] if not uniprots: continue uniprots = uniprots.split(',') template = dict() template['bindingdb_id'] = row['BindingDB MonomerID'] template['reaction_id'] = row['BindingDB Reactant_set_id'] template['source'] = row['Curation/DataSource'] template['organism'] = row['Target Source Organism According to Curator or DataSource'] template['pubmed'] = row['PMID'] template['doi'] = row['Article DOI'] affinities = {'Ki': row['Ki (nM)'], 'Kd': row['Kd (nM)'], 'IC50': row['IC50 (nM)']} for measure, affinity in affinities.items(): if affinity is None: continue for uniprot in uniprots: entrez_set = uniprot_to_entrez.get(uniprot) if not entrez_set: # uniprot_id not found in mapping continue for entrez in entrez_set: binding = template.copy() binding['measure'] = measure binding['affinity_nM'] = affinity binding['uniprot'] = uniprot binding['entrez_gene'] = entrez bindings.append(binding) # In[7]: # Convert affinities to floats lt, gt, eq, err = 0, 0, 0, 0 for binding in bindings: affinity = binding['affinity_nM'] if affinity.startswith('<'): affinity = affinity.lstrip('<') affinity = float(affinity) if affinity >= 10.0: affinity -= 1.0 lt += 1 elif affinity.startswith('>'): affinity = affinity.lstrip('>') affinity = float(affinity) affinity += 1.0 gt += 1 else: try: affinity = float(affinity) eq += 1 except ValueError: affinity = None err += 1 binding['affinity_nM'] = affinity print('< {}\n> {}\n= {}\nerrors {}'.format(lt, gt, eq, err)) # In[8]: fields = ['reaction_id', 'bindingdb_id', 'uniprot', 'entrez_gene', 'measure', 'affinity_nM', 'source', 'organism', 'pubmed', 'doi'] with gzip.open('data/binding.tsv.gz', 'wt') as write_file: writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=fields) writer.writeheader() bindings.sort(key=operator.itemgetter(*fields)) writer.writerows(bindings) # ## Calculate summary and diagnostic information # In[9]: # Measurement types path = os.path.join('download', filename + '.gz') bindingdb_generator = read_bindingdb(path) measure_keys = ['Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)'] #, 'kon (M-1-s-1)', 'koff (s-1)'] measures = list() for i, row in enumerate(bindingdb_generator): if len(row['chains']) != 1: continue chain, = row['chains'] uniprot = chain['UniProt (SwissProt) Primary ID of Target Chain'] if not uniprot: continue measure_set = frozenset(key for key in measure_keys if row[key] is not None) measures.append(measure_set) pprint.pprint(collections.Counter(measures)) # In[10]: # Number of chains (proteins in target) path = os.path.join('download', filename + '.gz') bindingdb_generator = read_bindingdb(path) collections.Counter(int(row[chains_key]) for row in bindingdb_generator) # In[11]: # Targets that mapped to SwissProt path = os.path.join('download', filename + '.gz') bindingdb_generator = read_bindingdb(path) collections.Counter( bool(row['chains'][0]['UniProt (SwissProt) Primary ID of Target Chain']) for row in bindingdb_generator if len(row['chains']) == 1 ) # In[12]: # Species path = os.path.join('download', filename + '.gz') bindingdb_generator = read_bindingdb(path) collections.Counter( row['Target Source Organism According to Curator or DataSource'] for row in bindingdb_generator if len(row['chains']) == 1 and row['chains'][0]['UniProt (SwissProt) Primary ID of Target Chain'] )