import json
import urllib
import urllib.parse

import pandas
# Slim-terms table from dhimmel/disease-ontology, pinned to a specific commit.
# Only the slim term (id, name) and the id of the term it subsumes are kept.
url = 'https://github.com/dhimmel/disease-ontology/raw/5cb93c38568536222b0a14fbcb7fb644a348931d/data/slim-terms-prop.tsv'
do_slim = pandas.read_table(url)[['slim_id', 'slim_name', 'subsumed_id']]
do_slim.head(2)
slim_id | slim_name | subsumed_id | |
---|---|---|---|
0 | DOID:0050156 | idiopathic pulmonary fibrosis | DOID:0050156 |
1 | DOID:0050425 | restless legs syndrome | DOID:0050425 |
# Gzipped GeneID mapping table from dhimmel/uniprot, pinned to a specific
# commit. It pairs UniProt accessions (`uniprot`) with Entrez `GeneID`s.
url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz'
entrez_map_df = pandas.read_table(url, compression='gzip')
# Preview the first two rows
entrez_map_df.head(2)
uniprot | GeneID | |
---|---|---|
0 | A0A010PZJ8 | 19039206 |
1 | A0A010PZK3 | 19039211 |
# DrugBank slim table from dhimmel/drugbank, pinned to a specific commit.
# Keep the identifier and the compound name, exposing the latter as
# `drugbank_name`.
url = 'https://github.com/dhimmel/drugbank/raw/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank-slim.tsv'
drugbank_df = (
    pandas.read_table(url)[['drugbank_id', 'name']]
    .rename(columns={'name': 'drugbank_name'})
)
drugbank_df.head(2)
drugbank_id | drugbank_name | |
---|---|---|
0 | DB00014 | Goserelin |
1 | DB00035 | Desmopressin |
# Number of rows in the DrugBank slim table
len(drugbank_df)
1552
# Identifier table: keep only the DrugBank identifiers and expose them as
# `drugbank_id` so they can be merged with the DrugBank table.
path = 'drugtarget/identifiers.tsv'
id_df = (
    pandas.read_table(path)
    .query("ID_TYPE == 'DRUGBANK_ID'")[['DRUG_ID', 'IDENTIFIER']]
    .rename(columns={'IDENTIFIER': 'drugbank_id'})
)
# Inner merge on the shared `drugbank_id` column attaches DRUG_ID to each
# DrugBank compound present in both tables.
drugbank_df = id_df.merge(drugbank_df)
drugbank_df.head(2)
DRUG_ID | drugbank_id | drugbank_name | |
---|---|---|---|
0 | 1327 | DB00014 | Goserelin |
1 | 817 | DB00035 | Desmopressin |
# Row count after attaching DRUG_ID; it grows when one drugbank_id is paired
# with more than one DRUG_ID.
len(drugbank_df)
1634
# Drug-target relationships, restricted to compounds present in drugbank_df
# (inner merge on the columns the two tables share -- presumably DRUG_ID).
path = 'drugtarget/drug_target.tsv'
target_df = pandas.read_table(path)
target_df = drugbank_df.merge(target_df)
target_df = target_df[['drugbank_id', 'drugbank_name', 'TARGET_NAME', 'TARGET_FAMILY', 'UNIPROT', 'ACTION_TYPE', 'SOURCE', 'REFERENCE']]

# Split multi-protein targets into many rows: UNIPROT holds '|'-separated
# accessions, so expand them into columns and stack them into one accession
# per row. After dropping the inner level, the stacked index repeats the
# original row label, which is what the join below aligns on.
s = target_df.UNIPROT.str.split('|', expand=True).stack()
s.index = s.index.droplevel(-1)
s.name = 'uniprot'
del target_df['UNIPROT']
target_df = target_df.join(s)

# Translate UniProt accessions to Entrez GeneIDs. The inner merge on the
# shared `uniprot` column drops accessions without a mapping.
target_df = entrez_map_df.merge(target_df)
del target_df['uniprot']

# Normalize action names to lower case.
target_df['action'] = target_df['ACTION_TYPE'].str.lower()
del target_df['ACTION_TYPE']

# expand=False makes extract return a Series for the single capture group
# instead of a one-column DataFrame. References without a 'pubmed/<digits>'
# match yield NaN.
target_df['pubmed_id'] = target_df.REFERENCE.str.extract('pubmed/([0-9]+)', expand=False)
target_df = target_df.drop_duplicates()
target_df.head(2)
GeneID | drugbank_id | drugbank_name | TARGET_NAME | TARGET_FAMILY | SOURCE | REFERENCE | action | pubmed_id | |
---|---|---|---|---|---|---|---|---|---|
0 | 8233868 | DB00431 | Lindane | GABA-A receptor | Ion channel | CHEMBL | https://www.ebi.ac.uk/chembl/compound/inspect/... | negative allosteric modulator | NaN |
1 | 8232849 | DB08823 | Spinosad | Nicotinic acetylcholine receptor | Ion channel | CHEMBL | https://www.ebi.ac.uk/chembl/compound/inspect/... | agonist | NaN |
# Display labels for the raw SOURCE codes of the drug-target table.
target_source_map = {
    'CHEMBL': 'DrugCentral (ChEMBL)',
    'SCIENTIFIC LITERATURE': 'DrugCentral (literature)',
    'DRUG LABEL': 'DrugCentral (label)',
    'IUPHAR': 'DrugCentral (IUPHAR)',
    'KEGG DRUG': 'DrugCentral (KEGG DRUG)',
}
# Replace raw SOURCE codes with their display labels; codes that are not keys
# of target_source_map become NaN.
target_df['SOURCE'] = target_df['SOURCE'].map(target_source_map)
# Tally of relationships per source label
target_df['SOURCE'].value_counts()
DrugCentral (ChEMBL) 2922 DrugCentral (literature) 182 DrugCentral (label) 89 DrugCentral (IUPHAR) 56 DrugCentral (KEGG DRUG) 25 Name: SOURCE, dtype: int64
def condense_targets(df):
    """Condense drug-target relationships.

    Collapse the rows of one group into a single record whose fields are
    sorted, de-duplicated, pipe-delimited strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group. Must provide the columns ``pubmed_id``,
        ``SOURCE``, ``action`` and ``REFERENCE``.

    Returns
    -------
    pandas.Series
        Entries ``pubmed_ids``, ``sources``, ``actions`` and ``urls``, in
        that order. ``urls`` holds the references that do not contain
        ``'pubmed'``. Missing values are ignored in every field, and a field
        without any value is the empty string.
    """
    def _distinct(values):
        # Missing values are dropped first: NaN can neither be ordered
        # against strings by sorted() nor concatenated by str.join().
        return values.dropna().unique()

    urls = (url for url in _distinct(df.REFERENCE) if 'pubmed' not in url)
    # dtype=object keeps the result a plain object Series of strings.
    return pandas.Series(
        {
            'pubmed_ids': '|'.join(sorted(_distinct(df.pubmed_id))),
            'sources': '|'.join(sorted(_distinct(df.SOURCE))),
            'actions': '|'.join(sorted(_distinct(df.action))),
            'urls': '|'.join(sorted(urls)),
        },
        dtype=object,
    )
# One row per (GeneID, drugbank_id, drugbank_name), with the remaining
# information condensed into the fields returned by condense_targets.
target_df = (
    target_df
    .groupby(['GeneID', 'drugbank_id', 'drugbank_name'])
    .apply(condense_targets)
    .reset_index()
)
# Write the condensed table as tab-separated text without the index column.
target_df.to_csv('rephetio/targets.tsv', sep='\t', index=False)
# Drug indications. SNOMEDCT_CUI is read as text, and DOID is exposed as
# `subsumed_id` so that it lines up with the slim-terms table.
path = 'drugtarget/drug_indication.tsv'
indication_df = (
    pandas.read_table(path, dtype={'SNOMEDCT_CUI': str})
    .rename(columns={'DOID': 'subsumed_id'})
)
# Inner merges on shared columns: first attach the DrugBank compounds, then
# map each subsumed disease term to its slim term.
indication_df = do_slim.merge(drugbank_df.merge(indication_df))
del indication_df['DRUG_ID']
# Reduce to unique disease-drug pairs, ordered by disease and drug name.
indication_df = (
    indication_df[['slim_id', 'drugbank_id', 'slim_name', 'drugbank_name']]
    .rename(columns={'slim_id': 'doid_id', 'slim_name': 'disease', 'drugbank_name': 'drug'})
    .sort_values(['disease', 'drug'])
    .drop_duplicates()
)
# Indication catalog from dhimmel/indications, pinned to a specific commit.
url = 'https://github.com/dhimmel/indications/raw/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv'
phcoth_df = pandas.read_table(url)[['doid_id', 'drugbank_id', 'category']]
# Left merge on the shared (doid_id, drugbank_id) columns: every indication is
# kept, and pairs absent from the catalog get a NaN category.
indication_df = indication_df.merge(phcoth_df, how='left')
indication_df.head(2)
doid_id | drugbank_id | disease | drug | category | |
---|---|---|---|---|---|
0 | DOID:10652 | DB00843 | Alzheimer's disease | Donepezil | DM |
1 | DOID:10652 | DB00674 | Alzheimer's disease | Galantamine | DM |
# Number of indication rows after the category merge
len(indication_df)
671
# Indications per category; dropna=False also counts rows without a category
indication_df.category.value_counts(dropna=False)
DM 359 NaN 210 SYM 77 NOT 25 Name: category, dtype: int64
# Write the indications as tab-separated text without the index column
indication_df.to_csv('rephetio/indications.tsv', sep='\t', index=False)
# Pharmacological classes, restricted to compounds present in drugbank_df
# (inner merge on the columns the two tables share).
path = 'drugtarget/pharm_class.tsv'
class_df = drugbank_df.merge(pandas.read_table(path))
# classes_df: the distinct class records themselves.
classes_df = class_df[['TYPE', 'CLASS_SOURCE_ID', 'CLASS', 'SOURCE']].drop_duplicates()
# class_df: the distinct drug-to-class assignments.
class_df = (
    class_df[['drugbank_id', 'drugbank_name', 'CLASS_SOURCE_ID', 'CLASS']]
    .rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name'})
    .drop_duplicates()
)
class_df.head(2)
drugbank_id | drugbank_name | class_id | class_name | |
---|---|---|---|---|
0 | DB00014 | Goserelin | N0000175655 | Gonadotropin Releasing Hormone Receptor Agonist |
1 | DB00014 | Goserelin | N0000175654 | Gonadotropin Releasing Hormone Receptor Agonists |
# Number of distinct pharmacologic class records (rows of classes_df)
len(classes_df)
1262
# Number of distinct drug-to-class assignments (rows of class_df)
len(class_df)
10959
# Readable names for the TYPE codes of the pharmacological class table.
class_type_map = {
    'MoA': 'Mechanism of Action',
    'PE': 'Physiologic Effect',
    'CS': 'Chemical Structure',
    'EPC': 'FDA Established Pharmacologic Class',
    'PA': 'Pharmacological Action',
    'has role': 'Application',
    'Chemical/Ingredient': 'Chemical/Ingredient',
}
def get_class_url(class_source, class_id):
    """Create URLs for pharmacological classes based on their source.

    Parameters
    ----------
    class_source : str
        Source of the class; ``'CHEBI'``, ``'MeSH'`` and ``'FDA'`` are
        recognized.
    class_id : str
        Identifier of the class within its source. It is percent-encoded
        before being inserted into the URL (so ``':'`` becomes ``'%3A'``).

    Returns
    -------
    str or None
        The URL for the class, or ``None`` when ``class_source`` is not one
        of the recognized sources.
    """
    url_templates = {
        'CHEBI': 'http://identifiers.org/chebi/{}',
        'MeSH': 'http://identifiers.org/mesh/{}',
        # Use bioportal link until something better arises
        'FDA': 'http://purl.bioontology.org/ontology/NDFRT/{}',
    }
    template = url_templates.get(class_source)
    if template is None:
        return None
    return template.format(urllib.parse.quote(class_id))
# Translate TYPE codes into readable class types, then drop the raw code.
classes_df['class_type'] = classes_df['TYPE'].map(class_type_map)
del classes_df['TYPE']
# Order by class type and identifier, and switch to the output column names.
classes_df = (
    classes_df
    .sort_values(['class_type', 'CLASS_SOURCE_ID'])
    .rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name', 'SOURCE': 'class_source'})
)
# Build one URL per class from its source and identifier.
classes_df['url'] = classes_df.apply(
    lambda row: get_class_url(row.class_source, row.class_id),
    axis='columns',
)
classes_df.head(2)
class_id | class_name | class_source | class_type | url | |
---|---|---|---|---|---|
73 | CHEBI:21241 | vitamin C | CHEBI | Application | http://identifiers.org/chebi/CHEBI%3A21241 |
4385 | CHEBI:22153 | acaricide | CHEBI | Application | http://identifiers.org/chebi/CHEBI%3A22153 |
# Write both class tables as tab-separated text without the index column:
# the drug-to-class assignments and the class records themselves.
class_df.to_csv('rephetio/drug-to-class.tsv', sep='\t', index=False)
classes_df.to_csv('rephetio/classes.tsv', sep='\t', index=False)