Parse the BindingDB tsv export

This notebook performs the following processing steps on the BindingDB export:

  • processes affinities to floats
  • converts UniProt target identifiers to Entrez Gene IDs
  • simplifies observation into essential fields

See the corresponding Thinklab discussion for more information.

In [1]:
import os
import csv
import gzip
import pprint
import collections
import operator

import pandas
import requests

Download BindingDB

In [2]:
# Download all data from BindingDB
# `filename` names the uncompressed export; the gzipped copy lives at download/<filename>.gz
filename = 'BindingDB_All_2015m10.tsv'
# The commands below are commented out (presumably to avoid re-downloading the export on
# every run). Enabled, they would fetch the zip archive, extract it into download/, delete the
# archive, rename the extracted BindingDB_All.tsv to the dated filename, and gzip it.
# ! wget --directory-prefix download https://www.bindingdb.org/bind/downloads/{filename}.zip
# ! unzip -d download download/{filename}.zip
# ! rm download/{filename}.zip
# ! mv download/BindingDB_All.tsv download/{filename}
# ! gzip -f download/{filename}
# Print a checksum of the compressed export so the exact input file is on record
! shasum download/{filename}.gz
9dae0b2175a1ac22b11733e2b9343a7efec7936e  download/BindingDB_All_2015m10.tsv.gz

Load uniprot to entrez gene mapping

In [3]:
# Tab-separated UniProt accession -> Entrez GeneID table, pinned to a specific commit
url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz'
uniprot_df = pandas.read_csv(url, sep='\t', compression='gzip')
In [4]:
# Map each UniProt accession to the set of GeneIDs (stored as strings) listed for it
uniprot_to_entrez = {}
for accession, gene_id in zip(uniprot_df['uniprot'], uniprot_df['GeneID']):
    gene_ids = uniprot_to_entrez.get(accession)
    if gene_ids is None:
        gene_ids = uniprot_to_entrez[accession] = set()
    gene_ids.add(str(gene_id))

Read and process BindingDB tsv

In [5]:
# Per-chain columns, in the order they are read from the end of each row.
# One group of these fields is expected per protein chain in the target.
target_fields = [
    'BindingDB Target Chain  Sequence',
    'PDB ID(s) of Target Chain',
    'UniProt (SwissProt) Recommended Name of Target Chain',
    'UniProt (SwissProt) Entry Name of Target Chain',
    'UniProt (SwissProt) Primary ID of Target Chain',
    'UniProt (SwissProt) Secondary ID(s) of Target Chain',
    'UniProt (SwissProt) Alternative ID(s) of Target Chain',
    'UniProt (TrEMBL) Submitted Name of Target Chain',
    'UniProt (TrEMBL) Entry Name of Target Chain',
    'UniProt (TrEMBL) Primary ID of Target Chain',
    'UniProt (TrEMBL) Secondary ID(s) of Target Chain',
    'UniProt (TrEMBL) Alternative ID(s) of Target Chain',
]

# Header of the last ligand-level column; every column after it belongs to a target chain
chains_key = 'Number of Protein Chains in Target (>1 implies a multichain complex)'

def read_bindingdb(path, verbose=False, max_rows=None):
    """
    Lazily parse a gzipped BindingDB TSV export, yielding one record per data row.

    Field documentation: https://www.bindingdb.org/bind/chemsearch/marvin/BindingDB-TSV-Format.pdf

    Parameters
    ----------
    path : str
        Location of the gzip-compressed, tab-delimited export.
    verbose : bool
        When true, print a message for every row that is too short to hold
        all ligand-level fields. Such rows are always skipped.
    max_rows : int or None
        Stop once this many data rows have been read. Skipped rows count
        towards the limit. None reads the whole file.

    Yields
    ------
    collections.OrderedDict
        The ligand-level columns keyed by header name, with empty strings
        replaced by None and the chain count converted to int, plus a
        'chains' entry holding one OrderedDict of `target_fields` per chain.

    Raises
    ------
    AssertionError
        If the number of trailing columns does not equal the declared chain
        count times `len(target_fields)`.

    The file is opened in a `with` block so that it is closed even when the
    consumer stops iterating early or a row fails the assertion.
    """
    with gzip.open(path, 'rt') as read_file:
        reader = csv.reader(read_file, delimiter='\t')
        header = next(reader)
        chains_index = header.index(chains_key)
        target0_index = chains_index + 1
        ligand_fields = header[:target0_index]
        n_target_fields = len(target_fields)
        for j, row in enumerate(reader):
            if max_rows is not None and j == max_rows:
                break
            # Ensure line has sufficient ligand fields
            if len(row) < target0_index:
                if verbose:
                    # +2: one for the header line, one for 1-based numbering
                    print('Line', j + 2, 'is deficient')
                continue
            # Represent empty cells as None
            row = [x if x else None for x in row]
            rowdict = collections.OrderedDict(zip(ligand_fields, row[:target0_index]))
            n_chains = int(rowdict[chains_key])
            rowdict[chains_key] = n_chains
            # The remaining columns must split evenly into one group per chain
            assert n_chains == len(row[target0_index:]) / n_target_fields
            chains = list()
            for i in range(n_chains):
                i_0 = target0_index + i * n_target_fields
                i_1 = i_0 + n_target_fields
                chains.append(collections.OrderedDict(zip(target_fields, row[i_0:i_1])))
            rowdict['chains'] = chains
            yield rowdict
In [6]:
# Collect one binding record per (measure, UniProt accession, Entrez gene) combination.
# For a quick trial run, pass max_rows to read_bindingdb to read only the first rows.
path = os.path.join('download', filename + '.gz')
bindingdb_generator = read_bindingdb(path, verbose=True)

swissprot_key = 'UniProt (SwissProt) Primary ID of Target Chain'
# Short measure name paired with the export column that holds it
affinity_columns = [('Ki', 'Ki (nM)'), ('Kd', 'Kd (nM)'), ('IC50', 'IC50 (nM)')]

bindings = []
for row in bindingdb_generator:
    # Keep single-chain targets only
    if len(row['chains']) != 1:
        continue
    accessions = row['chains'][0][swissprot_key]
    # Skip targets without a SwissProt primary ID
    if not accessions:
        continue

    # Fields shared by every binding derived from this row
    template = {
        'bindingdb_id': row['BindingDB MonomerID'],
        'reaction_id': row['BindingDB Reactant_set_id'],
        'source': row['Curation/DataSource'],
        'organism': row['Target Source Organism According to Curator or DataSource'],
        'pubmed': row['PMID'],
        'doi': row['Article DOI'],
    }

    reported = [(measure, row[column]) for measure, column in affinity_columns]
    for measure, affinity in reported:
        if affinity is None:
            continue
        # The ID cell may list several comma-separated accessions
        for uniprot in accessions.split(','):
            # Accessions missing from the mapping contribute no records
            for entrez in uniprot_to_entrez.get(uniprot) or ():
                bindings.append(dict(
                    template,
                    measure=measure,
                    affinity_nM=affinity,
                    uniprot=uniprot,
                    entrez_gene=entrez,
                ))
Line 192304 is deficient
Line 192305 is deficient
Line 192306 is deficient
Line 192307 is deficient
Line 192308 is deficient
Line 192309 is deficient
Line 192310 is deficient
Line 192311 is deficient
Line 192312 is deficient
Line 192313 is deficient
Line 192314 is deficient
Line 192315 is deficient
Line 192316 is deficient
Line 192317 is deficient
Line 192318 is deficient
Line 192319 is deficient
Line 192320 is deficient
Line 192321 is deficient
Line 192322 is deficient
Line 192323 is deficient
Line 192324 is deficient
Line 192325 is deficient
Line 192326 is deficient
Line 192327 is deficient
Line 192328 is deficient
Line 192329 is deficient
Line 192330 is deficient
Line 192331 is deficient
Line 192332 is deficient
Line 192333 is deficient
Line 192334 is deficient
Line 192335 is deficient
Line 192336 is deficient
Line 192337 is deficient
Line 192338 is deficient
Line 192339 is deficient
Line 192340 is deficient
Line 192341 is deficient
Line 192342 is deficient
Line 192343 is deficient
Line 192344 is deficient
Line 192345 is deficient
Line 192346 is deficient
Line 192347 is deficient
Line 192348 is deficient
Line 192349 is deficient
Line 192350 is deficient
Line 192351 is deficient
Line 192352 is deficient
Line 192353 is deficient
Line 192354 is deficient
Line 192355 is deficient
Line 192356 is deficient
Line 192357 is deficient
Line 192358 is deficient
Line 192359 is deficient
Line 192360 is deficient
Line 192361 is deficient
Line 192362 is deficient
Line 192363 is deficient
Line 192364 is deficient
Line 192365 is deficient
Line 192366 is deficient
Line 192367 is deficient
Line 192368 is deficient
Line 192369 is deficient
Line 192370 is deficient
Line 192371 is deficient
Line 192372 is deficient
Line 192373 is deficient
Line 192374 is deficient
Line 192375 is deficient
Line 192376 is deficient
Line 192377 is deficient
Line 192378 is deficient
Line 192379 is deficient
Line 192380 is deficient
Line 192381 is deficient
Line 192382 is deficient
Line 192383 is deficient
Line 192384 is deficient
Line 192385 is deficient
Line 192386 is deficient
Line 192387 is deficient
Line 192388 is deficient
Line 192389 is deficient
Line 192390 is deficient
Line 192391 is deficient
Line 192392 is deficient
Line 192393 is deficient
Line 192394 is deficient
Line 192395 is deficient
Line 192396 is deficient
Line 192397 is deficient
Line 192398 is deficient
Line 192399 is deficient
Line 192400 is deficient
Line 192401 is deficient
Line 192402 is deficient
Line 192403 is deficient
Line 192404 is deficient
Line 192405 is deficient
Line 192406 is deficient
Line 192407 is deficient
Line 192408 is deficient
Line 192409 is deficient
Line 192410 is deficient
Line 192411 is deficient
Line 192412 is deficient
Line 192413 is deficient
Line 192414 is deficient
Line 192415 is deficient
Line 192416 is deficient
Line 192417 is deficient
Line 192418 is deficient
Line 192419 is deficient
Line 192420 is deficient
Line 192421 is deficient
Line 192422 is deficient
Line 192423 is deficient
Line 192424 is deficient
Line 192425 is deficient
Line 192426 is deficient
Line 192427 is deficient
Line 192428 is deficient
Line 192429 is deficient
Line 192430 is deficient
Line 192431 is deficient
Line 192432 is deficient
Line 192433 is deficient
Line 192434 is deficient
Line 192435 is deficient
Line 192436 is deficient
Line 192437 is deficient
Line 192438 is deficient
Line 192439 is deficient
Line 192440 is deficient
Line 192441 is deficient
Line 192442 is deficient
Line 192443 is deficient
Line 192444 is deficient
Line 192445 is deficient
Line 192446 is deficient
Line 192447 is deficient
Line 192448 is deficient
Line 192449 is deficient
Line 192450 is deficient
Line 192451 is deficient
Line 192452 is deficient
Line 192453 is deficient
Line 192454 is deficient
Line 192455 is deficient
Line 192456 is deficient
Line 192457 is deficient
Line 192458 is deficient
Line 192459 is deficient
Line 192460 is deficient
Line 192461 is deficient
Line 192462 is deficient
Line 192463 is deficient
Line 192464 is deficient
Line 192465 is deficient
Line 192466 is deficient
Line 192467 is deficient
Line 192468 is deficient
Line 192469 is deficient
Line 192470 is deficient
Line 192471 is deficient
Line 192472 is deficient
Line 192473 is deficient
In [7]:
def parse_affinity(raw):
    """
    Convert a raw affinity string to a float.

    Returns a `(value, kind)` tuple where `kind` is one of:

      '<'      the string started with '<'; values >= 10.0 are reduced by 1.0
      '>'      the string started with '>'; the value is increased by 1.0
      '='      the string was a plain number
      'error'  the numeric part could not be parsed; `value` is None

    Every branch goes through the same `float()` guard, so a malformed
    '<...' or '>...' entry is reported as an error instead of raising
    ValueError and aborting the conversion partway through.
    """
    if raw.startswith('<'):
        kind, text = '<', raw.lstrip('<')
    elif raw.startswith('>'):
        kind, text = '>', raw.lstrip('>')
    else:
        kind, text = '=', raw
    try:
        value = float(text)
    except ValueError:
        return None, 'error'
    # NOTE(review): the +/- 1.0 shifts presumably move censored values just past
    # the reported bound; rationale is not stated in this notebook. Confirm.
    if kind == '<':
        if value >= 10.0:
            value -= 1.0
    elif kind == '>':
        value += 1.0
    return value, kind

# Convert affinities to floats
affinity_kinds = collections.Counter()
for binding in bindings:
    value, kind = parse_affinity(binding['affinity_nM'])
    binding['affinity_nM'] = value
    affinity_kinds[kind] += 1
lt, gt, eq, err = (affinity_kinds[kind] for kind in ('<', '>', '=', 'error'))
print('< {}\n> {}\n= {}\nerrors {}'.format(lt, gt, eq, err))
< 1267
> 129495
= 603956
errors 19
In [8]:
fields = ['reaction_id', 'bindingdb_id', 'uniprot', 'entrez_gene',
          'measure', 'affinity_nM', 'source', 'organism', 'pubmed', 'doi']

def binding_sort_key(binding):
    """
    Sort key over `fields` that tolerates None values.

    Each value is paired with an "is None" flag so that None is never compared
    directly with a str or float (a TypeError in Python 3). Missing values sort
    after present ones. The ordering of non-None values is unchanged.
    """
    return tuple((binding[field] is None, binding[field]) for field in fields)

# Sort before opening the output so a failed sort cannot leave a header-only file
bindings.sort(key=binding_sort_key)

# Create the output directory if it does not exist yet
os.makedirs('data', exist_ok=True)
with gzip.open('data/binding.tsv.gz', 'wt') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=fields)
    writer.writeheader()
    writer.writerows(bindings)

Calculate summary and diagnostic information

In [9]:
# Measurement types
# Tally which combinations of measures are reported together, restricted to
# single-chain targets that have a SwissProt primary ID.
path = os.path.join('download', filename + '.gz')
bindingdb_generator = read_bindingdb(path)

measure_keys = ['Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)'] #, 'kon (M-1-s-1)', 'koff (s-1)']
swissprot_key = 'UniProt (SwissProt) Primary ID of Target Chain'

measures = [
    frozenset({key for key in measure_keys if row[key] is not None})
    for row in bindingdb_generator
    if len(row['chains']) == 1 and row['chains'][0][swissprot_key]
]

pprint.pprint(collections.Counter(measures))
Counter({frozenset({'IC50 (nM)'}): 462031,
         frozenset({'Ki (nM)'}): 227288,
         frozenset({'EC50 (nM)'}): 65727,
         frozenset({'Kd (nM)'}): 52197,
         frozenset({'Ki (nM)', 'IC50 (nM)'}): 1184,
         frozenset(): 892,
         frozenset({'IC50 (nM)', 'EC50 (nM)'}): 628,
         frozenset({'Ki (nM)', 'EC50 (nM)'}): 574,
         frozenset({'IC50 (nM)', 'Kd (nM)'}): 93,
         frozenset({'Ki (nM)', 'IC50 (nM)', 'EC50 (nM)'}): 23,
         frozenset({'EC50 (nM)', 'Kd (nM)'}): 8,
         frozenset({'Ki (nM)', 'Kd (nM)'}): 4,
         frozenset({'Ki (nM)', 'IC50 (nM)', 'Kd (nM)'}): 1})
In [10]:
# Number of chains (proteins in target)
# Counts how many rows declare each chain count
path = os.path.join('download', filename + '.gz')
bindingdb_generator = read_bindingdb(path)
chain_counts = collections.Counter()
for row in bindingdb_generator:
    chain_counts[int(row[chains_key])] += 1
chain_counts
Out[10]:
Counter({1: 1101095, 2: 31218, 3: 5393, 4: 526, 5: 302, 6: 343, 12: 3, 19: 1})
In [11]:
# Targets that mapped to SwissProt
# Among single-chain rows, count those with (True) and without (False) a SwissProt primary ID
path = os.path.join('download', filename + '.gz')
bindingdb_generator = read_bindingdb(path)

swissprot_key = 'UniProt (SwissProt) Primary ID of Target Chain'
swissprot_mapped = collections.Counter()
for row in bindingdb_generator:
    if len(row['chains']) != 1:
        continue
    chain, = row['chains']
    swissprot_mapped[bool(chain[swissprot_key])] += 1
swissprot_mapped
Out[11]:
Counter({False: 290445, True: 810650})
In [12]:
# Species
# Count source organisms over single-chain rows that have a SwissProt primary ID
path = os.path.join('download', filename + '.gz')
bindingdb_generator = read_bindingdb(path)

swissprot_key = 'UniProt (SwissProt) Primary ID of Target Chain'
organism_key = 'Target Source Organism According to Curator or DataSource'
organism_counts = collections.Counter()
for row in bindingdb_generator:
    if len(row['chains']) != 1:
        continue
    if not row['chains'][0][swissprot_key]:
        continue
    organism_counts[row[organism_key]] += 1
organism_counts
Out[12]:
Counter({'Rattus norvegicus': 78798,
         'Vibrio harveyi': 60,
         'Malus domestica': 4,
         'Lymnaea stagnalis': 154,
         'Macaca fascicularis': 271,
         'Naja mossambica': 9,
         'Thermus thermophilus': 13,
         'Pseudomonas aeruginosa': 8,
         'Varicella-zoster virus (strain Dumas)': 66,
         'Human immunodeficiency virus type 1 group M subtype B (isolate HXB2)': 301,
         'Pseudomonas fluorescens': 3,
         'Streptococcus pyogenes': 24,
         'Vibrio proteolyticus': 40,
         'Musca domestica': 13,
         'Mesocricetus auratus': 74,
         'Bacillus amyloliquefaciens': 4,
         'Hepatitis C virus genotype 1b (isolate Con1)': 34,
         'Hepatitis C virus genotype 3a (isolate NZL1)': 38,
         'Vibrio fischeri': 83,
         'Saccharomyces cerevisiae': 17,
         'Pichia angusta': 14,
         'Clostridium perfringens': 124,
         'Poliovirus type 1 (strain Mahoney)': 20,
         'Klebsiella pneumoniae': 35,
         'Mus musculus': 25316,
         'Photinus pyralis': 96,
         'Spiroplasma sp. (strain MQ-1)': 36,
         'Humicola insolens': 7,
         'Influenza A virus (strain A/Memphis/1/1971 H3N2)': 8,
         'Influenza B virus (strain B/Lee/1940)': 224,
         'Drosophila melanogaster': 130,
         'Rhizobium radiobacter': 41,
         'Toxoplasma gondii': 1205,
         'Paramecium tetraurelia': 6,
         'Dictyostelium discoideum': 7,
         'Gallus gallus': 875,
         'Influenza A virus (strain A/Tokyo/3/1967 H2N2)': 14,
         'Neisseria gonorrhoeae': 103,
         'Bacillus lentus': 36,
         'Carica papaya': 201,
         'Mycoplana ramosa': 11,
         'Bacillus subtilis': 23,
         'Streptococcus pneumoniae (strain ATCC BAA-255 / R6)': 11,
         'Legionella pneumophila': 13,
         'Stenotrophomonas maltophilia': 58,
         'Yersinia pestis': 34,
         'Thermoanaerobacter saccharolyticum': 2,
         'Leishmania mexicana': 46,
         'Human immunodeficiency virus type 1 group M subtype B (isolate BRU/LAI)': 1,
         'Ovis aries': 1618,
         'Staphylococcus aureus': 656,
         'Methanosarcina thermophila': 79,
         'Sus scrofa': 4259,
         'Rhizopus oryzae': 2,
         'Bos taurus': 13461,
         'Human immunodeficiency virus type 1 group M subtype B (isolate PCV12)': 36,
         'Plasmodium falciparum': 819,
         'Leuconostoc mesenteroides': 168,
         'Clostridium botulinum': 427,
         'Vibrio harveyi (strain ATCC BAA-1116 / BB120)': 3,
         'Lucilia cuprina': 24,
         'Photuris pennsylvanica': 76,
         'Citrobacter freundii': 9,
         'Avian erythroblastosis virus (strain ES4)': 15,
         'Human immunodeficiency virus type 1 group M subtype B (isolate YU-2)': 166,
         'Streptococcus pyogenes serotype M1': 38,
         'Bacillus anthracis': 407,
         'Trypanosoma brucei brucei': 131,
         'Thermus aquaticus': 95,
         'Solanum tuberosum': 1,
         'Vaccinia virus (strain Western Reserve)': 28,
         'Human herpesvirus 1 (strain SC16)': 168,
         'Human herpesvirus 6A (strain Uganda-1102)': 69,
         'Aspergillus aculeatus': 5,
         'Fujinami sarcoma virus': 1,
         'Electrophorus electricus': 2784,
         'Human herpesvirus 1 (strain 17)': 680,
         'Serratia marcescens': 36,
         'Rhizobium meliloti': 2,
         'Aspergillus niger': 20,
         'Pneumocystis carinii': 889,
         'Bacillus licheniformis': 26,
         'Plasmodium falciparum (isolate K1 / Thailand)': 966,
         'West Nile virus': 158,
         'Luciola lateralis': 22,
         'Staphylococcus aureus (strain MRSA252)': 74,
         'Penicillium janthinellum': 9,
         'Apis mellifera': 33,
         None: 1738,
         'Streptomyces caespitosus': 4,
         'Torpedo marmorata': 5,
         'Danio rerio': 35,
         'Staphylococcus aureus (strain Mu50 / ATCC 700699)': 46,
         'Meleagris gallopavo': 98,
         'Candida albicans': 491,
         'Cavia porcellus': 7656,
         'Canis familiaris': 312,
         'Echis carinatus': 8,
         'Providencia stuartii': 4,
         'Carassius auratus': 39,
         'Naja naja': 9,
         'Human cytomegalovirus (strain AD169)': 375,
         'Aedes aegypti': 34,
         'Yersinia enterocolitica': 21,
         'Actinomadura sp. (strain R39)': 32,
         'Escherichia coli': 353,
         'Plasmodium falciparum (isolate 3D7)': 100,
         'Brassica oleracea var. capitata': 12,
         'Streptococcus pneumoniae': 31,
         'Haemophilus influenzae (strain ATCC 51907 / DSM 11121 / KW20 / Rd)': 1,
         'Mycobacterium smegmatis': 4,
         'Epstein-Barr virus (strain B95-8)': 30,
         'Feline herpesvirus 1': 4,
         'Abelson murine leukemia virus': 80,
         'Bovine viral diarrhea virus (strain CP7)': 1,
         'Human papillomavirus type 16': 42,
         'Nicotiana tabacum': 13,
         'Caenorhabditis elegans': 2245,
         'Influenza A virus (strain A/Aichi/2/1968 H3N2)': 5,
         'Influenza A virus (strain A/Puerto Rico/8/1934 H1N1)': 208,
         'Xenopus laevis': 24,
         'Glycine max': 171,
         'Canis lupus dingo': 70,
         'Hansenula anomala': 5,
         'Canavalia ensiformis': 194,
         'Agaricus bisporus': 813,
         'Enterococcus faecium': 14,
         'Naja melanoleuca': 8,
         'Plasmodium falciparum (isolate FcB1 / Columbia)': 726,
         'Lactobacillus fermentum': 4,
         'Influenza A virus (strain A/Brevig Mission/1/1918 H1N1)': 27,
         'Human papillomavirus type 11': 54,
         'Bacillus thermoproteolyticus': 134,
         'Enterobacter cloacae': 307,
         'Pisum sativum': 9,
         'Mycobacterium tuberculosis': 12,
         'Human SARS coronavirus': 100,
         'Oryctolagus cuniculus': 4261,
         'Clostridium botulinum (strain Hall / ATCC 3502 / NCTC 13319 / Type A)': 179,
         'Flavobacterium meningosepticum': 3,
         'Crithidia fasciculata': 50,
         'Equus caballus': 1720,
         'Macaca mulatta': 100,
         'Trypanosoma cruzi': 1222,
         'Saccharomyces cerevisiae (strain ATCC 204508 / S288c)': 1383,
         'Vibrio fischeri (strain ATCC 700601 / ES114)': 32,
         'Human herpesvirus 1 (strain KOS)': 4,
         'Human herpesvirus 1': 51,
         'Ricinus communis': 71,
         'Bison bison': 15,
         'Lactobacillus casei': 700,
         'Alcaligenes sp. (strain DSM 11172)': 44,
         'Aeromonas hydrophila': 14,
         'Human T-cell leukemia virus 1 (strain Japan ATK-1 subtype A)': 31,
         'Enterobacteria phage T4': 77,
         'Rhizopus chinensis': 11,
         'Nipah virus': 7,
         'Hordeum vulgare': 6,
         'Caldocellum saccharolyticum': 41,
         'Human papillomavirus type 1a': 5,
         'Cryptosporidium parvum': 355,
         'Enterobacteria phage lambda': 11,
         'Leishmania major': 118,
         'Zea mays': 46,
         'Hepatitis C virus genotype 4a (isolate ED43)': 1,
         'Woolly monkey sarcoma virus': 22,
         'Human rhinovirus 16': 5,
         'Human immunodeficiency virus type 1 group M subtype B (isolate MN)': 10,
         'Human herpesvirus 2 (strain HG52)': 45,
         'Influenza B virus (strain B/Memphis/3/1989)': 2,
         'Arabidopsis thaliana': 173,
         'Bombyx mori': 25,
         'Oryza sativa subsp. japonica': 16,
         'Homo sapiens': 645210,
         'Bacillus cereus': 51,
         'Staphylococcus aureus (strain MW2)': 42,
         'Hepatitis C virus genotype 1b (isolate BK)': 1,
         'Torpedo californica': 574,
         'Pseudomonas putida': 37})