#!/usr/bin/env python
# coding: utf-8

# In[2]:


import os
import csv
import collections
import json
import gzip
import io

import requests


# In[2]:


path = os.path.join('data', 'drugbank.tsv')
with open(path) as read_file:
    reader = csv.DictReader(read_file, delimiter='\t')
    drugbank = list(reader)

# DrugBank identifiers should be unique
drugbank_ids = [drug['drugbank_id'] for drug in drugbank]
assert len(drugbank_ids) == len(set(drugbank_ids))


# In[3]:


# Compounds with an InChIKey
collections.Counter(bool(drug['inchikey']) for drug in drugbank)


# In[4]:


# Compound types
collections.Counter(drug['type'] for drug in drugbank)


# In[1]:


# UniChem source identifiers (src_id) to source names
id_to_source = {
    0: None,
    1: 'chembl',
    2: 'drugbank',
    3: 'pdb',
    4: 'iuphar',
    5: 'pubchem_dotf',
    6: 'kegg_ligand',
    7: 'chebi',
    8: 'nih_ncc',
    9: 'zinc',
    10: 'emolecules',
    11: 'ibm',
    12: 'atlas',
    13: 'ibm_patents',
    14: 'fdasrs',
    15: 'surechembl',
    17: 'pharmgkb',
    18: 'hmdb',
    20: 'selleck',
    21: 'pubchem_tpharma',
    22: 'pubchem',
    23: 'mcule',
    24: 'nmrshiftdb2',
    25: 'lincs',
    26: 'actor',
    27: 'recon',
    28: 'molport',
    29: 'nikkaji',
    31: 'bindingdb',
}

source_to_id = {v: k for k, v in id_to_source.items()}


# In[37]:


def connectivity_query(search_url, target=None, B=0, C=0, D=0, E=0, F=0, G=0):
    """
    Run a UniChem connectivity (wide) search and yield each match as an
    OrderedDict. The A-H parameters are documented at
    https://www.ebi.ac.uk/unichem/info/widesearchInfo
    """
    url = '{search_url}/{A}/{B}/{C}/{D}/{E}/{F}/{G}/{H}'.format(
        search_url=search_url,
        A=source_to_id[target],  # Sources
        B=B,  # Pattern
        C=C,  # Component Mapping
        D=D,  # Frequency Block
        E=E,  # InChI Length Block
        F=F,  # UniChem Labels
        G=G,  # Assignment Status
        H=1,  # Data Structure
    )
    response = requests.get(url)
    try:
        response = response.json()
    except ValueError:
        print('cannot decode json:', url)
        return
    if 'error' in response:
        print('UniChem error:', response['error'])
        return
    for assignment in response.values():
        # the first row of each assignment is the column header
        header = assignment.pop(0)
        for match in assignment:
            yield collections.OrderedDict(zip(header, match))

def key_search(inchikey, **kwargs):
    """Search UniChem by standard InChIKey."""
    if inchikey.startswith('InChIKey='):
        prefix, inchikey = inchikey.split('=', 1)
    base_url = 'https://www.ebi.ac.uk/unichem/rest/key_search'
    search_url = '{base_url}/{StandardInChIKey}'.format(
        base_url=base_url,
        StandardInChIKey=inchikey)
    return connectivity_query(search_url, **kwargs)

def cpd_search(source, compound_id, **kwargs):
    """Search UniChem by a source-specific compound identifier."""
    base_url = 'https://www.ebi.ac.uk/unichem/rest/cpd_search'
    search_url = '{base_url}/{src_compound_id}/{src_id}'.format(
        base_url=base_url,
        src_compound_id=compound_id,
        src_id=source_to_id[source])
    return connectivity_query(search_url, **kwargs)


# In[ ]:


# In[38]:


# mapping writer
mapping_path = os.path.join('data', 'mapping.tsv.gz')
mapping_file = gzip.open(mapping_path, 'wb')
mapping_buffer = io.TextIOWrapper(mapping_file, line_buffering=True)
mapping_fields = ['drugbank_id', 'drugbank_name', 'src_id', 'source_name',
                  'src_compound_id', 'C', 'Query_InChIKey', 'CpdId_InChIKey',
                  'Full_Query_InChI', 'Full_CpdId_InChI', 'Matching_Query_InChI',
                  'Matching_CpdId_InChI', 'b', 'i', 'm', 'p', 's', 't']
mapping_writer = csv.DictWriter(mapping_buffer, delimiter='\t',
                                fieldnames=mapping_fields, extrasaction='ignore')
mapping_writer.writeheader()

# mapping counts writer
count_path = os.path.join('data', 'mapping-counts.tsv')
count_file = open(count_path, 'w')
source_names = [id_to_source[i] for i in sorted(set(id_to_source) - {0})]
count_fields = ['drugbank_id', 'drugbank_name'] + source_names
count_writer = csv.DictWriter(count_file, delimiter='\t',
                              fieldnames=count_fields, restval=0)
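# mapping-counts.tsv gets one row per queried DrugBank compound and one
# column per UniChem source, holding the number of distinct matching
# identifiers; restval=0 fills in sources without any match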
count_writer.writeheader()

for drug in drugbank:
    # restrict to small molecules with an InChIKey
    if drug['type'] != 'small molecule':
        continue
    if not drug['inchikey']:
        continue
    drugbank_id = drug['drugbank_id']
    drugbank_name = drug['name']
    print(drugbank_id, drugbank_name)
    query_matches = list(cpd_search('drugbank', drugbank_id, C=4))
    if not query_matches:
        if drug['inchi'].startswith('InChI=1S'):
            # fall back to querying by standard InChIKey
            query_matches = list(key_search(drug['inchikey'], C=4))
        else:
            # non-standard InChI
            print('non-standard InChI: cannot query compound')
            continue
    for match in query_matches:
        match['drugbank_id'] = drugbank_id
        match['drugbank_name'] = drugbank_name
        match['source_name'] = id_to_source[int(match['src_id'])]
        mapping_writer.writerow(match)
    # count distinct matched identifiers per source
    source_to_matches = dict()
    for match in query_matches:
        match_set = source_to_matches.setdefault(match['source_name'], set())
        match_set.add(match['src_compound_id'])
    count = {k: len(v) for k, v in source_to_matches.items()}
    count = collections.defaultdict(int, count)
    count['drugbank_id'] = drugbank_id
    count['drugbank_name'] = drugbank_name
    count_writer.writerow(count)

# close the text wrapper first so buffered rows are flushed to the gzip file
mapping_buffer.close()
count_file.close()


# In[ ]:


# In[3]:


# write source-specific mapping files
mapping_path = os.path.join('data', 'mapping.tsv.gz')
mapping_file = gzip.open(mapping_path, 'rb')
mapping_buffer = io.TextIOWrapper(mapping_file)
reader = csv.DictReader(mapping_buffer, delimiter='\t')
source_to_pairs = dict()
for row in reader:
    pair = row['drugbank_id'], row['src_compound_id']
    pairs = source_to_pairs.setdefault(row['source_name'], set())
    pairs.add(pair)
mapping_file.close()

# omit trivial DrugBank-to-DrugBank mappings
del source_to_pairs['drugbank']

# ensure the per-source output directory exists
os.makedirs(os.path.join('data', 'mapping'), exist_ok=True)

for source, pairs in source_to_pairs.items():
    path = os.path.join('data', 'mapping', '{}.tsv'.format(source))
    write_file = open(path, 'w')
    writer = csv.writer(write_file, delimiter='\t')
    writer.writerow(['drugbank_id', '{}_id'.format(source)])
    writer.writerows(sorted(pairs))
    write_file.close()


# In[ ]:
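
# a quick sanity check (a sketch, not part of the original pipeline):
# read one source-specific mapping file back and count the pairs.
# 'chembl.tsv' is assumed to exist, i.e. at least one compound mapped to ChEMBL.
path = os.path.join('data', 'mapping', 'chembl.tsv')
with open(path) as read_file:
    reader = csv.DictReader(read_file, delimiter='\t')
    pairs = list(reader)
print('{} drugbank-to-chembl pairs'.format(len(pairs)))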