#!/usr/bin/env python
# coding: utf-8

# In[2]:


import os
import csv
import collections
import json
import gzip
import io

import requests


# In[2]:


path = os.path.join('data', 'drugbank.tsv')
with open(path) as read_file:
    reader = csv.DictReader(read_file, delimiter='\t')
    drugbank = list(reader)

# DrugBank identifiers should be unique
drugbank_ids = [drug['drugbank_id'] for drug in drugbank]
assert len(drugbank_ids) == len(set(drugbank_ids))


# In[3]:


# Compounds with an InChIKey
collections.Counter(bool(drug['inchikey']) for drug in drugbank)


# In[4]:


# Compound types
collections.Counter(drug['type'] for drug in drugbank)


# In[1]:


# UniChem source identifiers (src_id) to source names
id_to_source = {
    0: None,
    1: 'chembl',
    2: 'drugbank',
    3: 'pdb',
    4: 'iuphar',
    5: 'pubchem_dotf',
    6: 'kegg_ligand',
    7: 'chebi',
    8: 'nih_ncc',
    9: 'zinc',
    10: 'emolecules',
    11: 'ibm',
    12: 'atlas',
    13: 'ibm_patents',
    14: 'fdasrs',
    15: 'surechembl',
    17: 'pharmgkb',
    18: 'hmdb',
    20: 'selleck',
    21: 'pubchem_tpharma',
    22: 'pubchem',
    23: 'mcule',
    24: 'nmrshiftdb2',
    25: 'lincs',
    26: 'actor',
    27: 'recon',
    28: 'molport',
    29: 'nikkaji',
    31: 'bindingdb',
}

source_to_id = {v: k for k, v in id_to_source.items()}


# In[37]:


def connectivity_query(search_url, target=None, B=0, C=0, D=0, E=0, F=0, G=0):
    """
    Run a UniChem connectivity (wide) search and yield each match as an
    OrderedDict. The A-H parameters are documented at
    https://www.ebi.ac.uk/unichem/info/widesearchInfo
    """
    url = '{search_url}/{A}/{B}/{C}/{D}/{E}/{F}/{G}/{H}'.format(
        search_url=search_url,
        A=source_to_id[target],  # Sources
        B=B,  # Pattern
        C=C,  # Component Mapping
        D=D,  # Frequency Block
        E=E,  # InChI Length Block
        F=F,  # UniChem Labels
        G=G,  # Assignment Status
        H=1,  # Data Structure
    )
    response = requests.get(url)
    try:
        response = response.json()
    except ValueError:
        print('cannot decode json:', url)
        return
    if 'error' in response:
        print('UniChem error:', response['error'])
        return
    for assignment in response.values():
        # the first row of each assignment is the column header
        header = assignment.pop(0)
        for match in assignment:
            yield collections.OrderedDict(zip(header, match))

def key_search(inchikey, **kwargs):
    """Search UniChem by standard InChIKey."""
    if inchikey.startswith('InChIKey='):
        prefix, inchikey = inchikey.split('=', 1)
    base_url = 'https://www.ebi.ac.uk/unichem/rest/key_search'
    search_url = '{base_url}/{StandardInChIKey}'.format(
        base_url=base_url,
        StandardInChIKey=inchikey)
    return connectivity_query(search_url, **kwargs)

def cpd_search(source, compound_id, **kwargs):
    """Search UniChem by a source-specific compound identifier."""
    base_url = 'https://www.ebi.ac.uk/unichem/rest/cpd_search'
    search_url = '{base_url}/{src_compound_id}/{src_id}'.format(
        base_url=base_url,
        src_compound_id=compound_id,
        src_id=source_to_id[source])
    return connectivity_query(search_url, **kwargs)


# In[ ]:


# In[38]:


# mapping writer
mapping_path = os.path.join('data', 'mapping.tsv.gz')
mapping_file = gzip.open(mapping_path, 'wb')
mapping_buffer = io.TextIOWrapper(mapping_file, line_buffering=True)
mapping_fields = ['drugbank_id', 'drugbank_name', 'src_id', 'source_name',
                  'src_compound_id', 'C', 'Query_InChIKey', 'CpdId_InChIKey',
                  'Full_Query_InChI', 'Full_CpdId_InChI', 'Matching_Query_InChI',
                  'Matching_CpdId_InChI', 'b', 'i', 'm', 'p', 's', 't']
mapping_writer = csv.DictWriter(mapping_buffer, delimiter='\t',
                                fieldnames=mapping_fields, extrasaction='ignore')
mapping_writer.writeheader()

# mapping counts writer
count_path = os.path.join('data', 'mapping-counts.tsv')
count_file = open(count_path, 'w')
source_names = [id_to_source[i] for i in sorted(set(id_to_source) - {0})]
count_fields = ['drugbank_id', 'drugbank_name'] + source_names
count_writer = csv.DictWriter(count_file, delimiter='\t',
                              fieldnames=count_fields, restval=0)
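# mapping-counts.tsv gets one row per queried DrugBank compound and one
# column per UniChem source, holding the number of distinct matching
# identifiers; restval=0 fills in sources without any match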
count_writer.writeheader()

for drug in drugbank:
    # restrict to small molecules with an InChIKey
    if drug['type'] != 'small molecule':
        continue
    if not drug['inchikey']:
        continue
    drugbank_id = drug['drugbank_id']
    drugbank_name = drug['name']
    print(drugbank_id, drugbank_name)
    query_matches = list(cpd_search('drugbank', drugbank_id, C=4))
    if not query_matches:
        if drug['inchi'].startswith('InChI=1S'):
            # fall back to querying by standard InChIKey
            query_matches = list(key_search(drug['inchikey'], C=4))
        else:
            # non-standard InChI
            print('non-standard InChI: cannot query compound')
            continue
    for match in query_matches:
        match['drugbank_id'] = drugbank_id
        match['drugbank_name'] = drugbank_name
        match['source_name'] = id_to_source[int(match['src_id'])]
        mapping_writer.writerow(match)
    # count distinct matched identifiers per source
    source_to_matches = dict()
    for match in query_matches:
        match_set = source_to_matches.setdefault(match['source_name'], set())
        match_set.add(match['src_compound_id'])
    count = {k: len(v) for k, v in source_to_matches.items()}
    count = collections.defaultdict(int, count)
    count['drugbank_id'] = drugbank_id
    count['drugbank_name'] = drugbank_name
    count_writer.writerow(count)

# close the text wrapper first so buffered rows are flushed to the gzip file
mapping_buffer.close()
count_file.close()


# In[ ]:


# In[3]:


# write source-specific mapping files
mapping_path = os.path.join('data', 'mapping.tsv.gz')
mapping_file = gzip.open(mapping_path, 'rb')
mapping_buffer = io.TextIOWrapper(mapping_file)
reader = csv.DictReader(mapping_buffer, delimiter='\t')
source_to_pairs = dict()
for row in reader:
    pair = row['drugbank_id'], row['src_compound_id']
    pairs = source_to_pairs.setdefault(row['source_name'], set())
    pairs.add(pair)
mapping_file.close()

# omit trivial DrugBank-to-DrugBank mappings
del source_to_pairs['drugbank']

# ensure the per-source output directory exists
os.makedirs(os.path.join('data', 'mapping'), exist_ok=True)

for source, pairs in source_to_pairs.items():
    path = os.path.join('data', 'mapping', '{}.tsv'.format(source))
    write_file = open(path, 'w')
    writer = csv.writer(write_file, delimiter='\t')
    writer.writerow(['drugbank_id', '{}_id'.format(source)])
    writer.writerows(sorted(pairs))
    write_file.close()


# In[ ]:
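
# a quick sanity check (a sketch, not part of the original pipeline):
# read one source-specific mapping file back and count the pairs.
# 'chembl.tsv' is assumed to exist, i.e. at least one compound mapped to ChEMBL.
path = os.path.join('data', 'mapping', 'chembl.tsv')
with open(path) as read_file:
    reader = csv.DictReader(read_file, delimiter='\t')
    pairs = list(reader)
print('{} drugbank-to-chembl pairs'.format(len(pairs)))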