import csv
import gzip
import collections
import pandas
# Download SIDER data
base_url = 'http://sideeffects.embl.de/media/download/'
filenames = [
'README',
'meddra_all_indications.tsv.gz',
'meddra_all_se.tsv.gz',
'meddra_freq.tsv.gz',
]
for filename in filenames:
! wget --no-verbose --timestamping --directory-prefix download {base_url}/{filename}
! mv download/README download/README.txt
2016-02-03 14:13:11 URL:http://sideeffects.embl.de/media/download//README [2270/2270] -> "download/README" [1]
def stitch_flat_to_pubchem(cid):
    """
    Convert a flat STITCH compound ID to an integer PubChem compound ID.

    The numeric part of a flat ID is the PubChem ID offset by 100,000,000,
    e.g. 'CID100047725' -> 47725.

    cid: string beginning with 'CID' followed by digits.
    Returns the PubChem compound ID as an int.
    Raises AssertionError when cid does not start with 'CID'.
    """
    assert cid.startswith('CID')
    # Use an integer offset: subtracting the float literal 1e8 would turn
    # the identifier into a float (47725.0), unlike the stereo converter.
    return int(cid[3:]) - 100000000
def stitch_stereo_to_pubchem(cid):
    """
    Convert a stereo-specific STITCH compound ID to a PubChem compound ID.

    The digits following the 'CID' prefix are parsed as the PubChem ID,
    e.g. 'CID000047725' -> 47725.

    cid: string beginning with 'CID' followed by digits.
    Returns the PubChem compound ID as an int.
    Raises AssertionError when cid does not start with 'CID'.
    """
    prefix = 'CID'
    assert cid.startswith(prefix)
    digits = cid[len(prefix):]
    return int(digits)
# DrugBank identifiers and names, read from a pinned commit of dhimmel/drugbank.
# Only the ID and name columns are kept; 'name' becomes 'drugbank_name'.
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv'
drugbank_df = pandas.read_table(url)
drugbank_df = drugbank_df[['drugbank_id', 'name']]
drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})

# PubChem-to-DrugBank identifier mapping from the same pinned commit.
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'
drugbank_map_df = pandas.read_table(url)
# The SIDER frequency file has no header row, so column names are supplied here.
columns = [
    'stitch_id_flat',
    'stitch_id_sterio',
    'umls_cui_from_label',
    'placebo',
    'frequency',
    'lower',
    'upper',
    'meddra_type',
    'umls_cui_from_meddra',
    'side_effect_name',
]
freq_df = pandas.read_csv(
    'download/meddra_freq.tsv.gz',
    sep='\t',
    names=columns,
)
# Preview the first two rows.
freq_df.head(2)
| | stitch_id_flat | stitch_id_sterio | umls_cui_from_label | placebo | frequency | lower | upper | meddra_type | umls_cui_from_meddra | side_effect_name |
---|---|---|---|---|---|---|---|---|---|---|
0 | CID100000085 | CID000010917 | C0000737 | NaN | 21% | 0.21 | 0.21 | LLT | C0000737 | Abdominal pain |
1 | CID100000085 | CID000010917 | C0000737 | NaN | 21% | 0.21 | 0.21 | PT | C0000737 | Abdominal pain |
# The SIDER side effect file has no header row, so column names are supplied here.
columns = [
    'stitch_id_flat',
    'stitch_id_sterio',
    'umls_cui_from_label',
    'meddra_type',
    'umls_cui_from_meddra',
    'side_effect_name',
]
se_df = pandas.read_csv(
    'download/meddra_all_se.tsv.gz',
    sep='\t',
    names=columns,
)
# Derive the PubChem ID from the stereo STITCH ID, then attach DrugBank IDs
# by merging with the PubChem-to-DrugBank mapping on their shared columns.
se_df['pubchem_id'] = se_df['stitch_id_sterio'].map(stitch_stereo_to_pubchem)
se_df = pandas.merge(drugbank_map_df, se_df)
# Preview the first two rows.
se_df.head(2)
| | drugbank_id | pubchem_id | stitch_id_flat | stitch_id_sterio | umls_cui_from_label | meddra_type | umls_cui_from_meddra | side_effect_name |
---|---|---|---|---|---|---|---|---|
0 | DB00014 | 47725 | CID100047725 | CID000047725 | C0000737 | LLT | C0000737 | Abdominal pain |
1 | DB00014 | 47725 | CID100047725 | CID000047725 | C0000737 | PT | C0687713 | Gastrointestinal pain |
# Reduce to one row per (drug, side effect): keep the identifying columns,
# discard rows with missing values, and drop repeated drug/CUI pairs.
se_df = (
    se_df[['drugbank_id', 'umls_cui_from_meddra', 'side_effect_name']]
    .dropna()
    .drop_duplicates(subset=['drugbank_id', 'umls_cui_from_meddra'])
)
# Attach DrugBank names and order rows by drug name, then side effect name.
se_df = pandas.merge(drugbank_df, se_df).sort_values(
    by=['drugbank_name', 'side_effect_name']
)
# Number of drug/side effect pairs.
len(se_df)
153663
# Build a lookup table of side effect CUIs and their names.
se_terms_df = se_df[['umls_cui_from_meddra', 'side_effect_name']].drop_duplicates()
# Sanity check: no side effect name may appear on more than one row.
assert not se_terms_df['side_effect_name'].duplicated().any()
se_terms_df = se_terms_df.sort_values(by='side_effect_name')
se_terms_df.to_csv(
    'data/side-effect-terms.tsv',
    sep='\t',
    index=False,
)
# Example lookup: every side effect row for cocaine (DrugBank ID DB00907).
se_df[se_df['drugbank_id'] == 'DB00907']
| | drugbank_id | drugbank_name | umls_cui_from_meddra | side_effect_name |
---|---|---|---|---|
80494 | DB00907 | Cocaine | C0085631 | Agitation |
80495 | DB00907 | Cocaine | C0233571 | Excitement |
80486 | DB00907 | Cocaine | C0014549 | Grand mal convulsion |
80487 | DB00907 | Cocaine | C0020517 | Hypersensitivity |
80488 | DB00907 | Cocaine | C0026961 | Mydriasis |
80489 | DB00907 | Cocaine | C0027769 | Nervousness |
80496 | DB00907 | Cocaine | C1145670 | Respiratory failure |
80497 | DB00907 | Cocaine | C1325847 | Sensitisation |
80490 | DB00907 | Cocaine | C0233494 | Tension |
80491 | DB00907 | Cocaine | C0040822 | Tremor |
80492 | DB00907 | Cocaine | C0041582 | Ulcer |
80493 | DB00907 | Cocaine | C0042963 | Vomiting |
# Count of distinct DrugBank drugs that have at least one side effect row.
se_df['drugbank_id'].nunique()
1223
# Count of distinct UMLS side effect concepts across all drugs.
se_df['umls_cui_from_meddra'].nunique()
5734
# Write the drug/side effect table as tab-separated text, without the index.
se_df.to_csv(
    'data/side-effects.tsv',
    index=False,
    sep='\t',
)
# The SIDER indications file has no header row, so column names are supplied here.
columns = [
    'stitch_id_flat',
    'umls_cui_from_label',
    'method',
    'concept_name',
    'meddra_type',
    'umls_cui_from_meddra',
    'meddra_name',
]
indication_df = pandas.read_csv(
    'download/meddra_all_indications.tsv.gz',
    sep='\t',
    names=columns,
)
# This file only carries the flat STITCH ID, so the PubChem ID comes from it.
indication_df['pubchem_id'] = indication_df['stitch_id_flat'].map(stitch_flat_to_pubchem)
# Attach DrugBank IDs first, then DrugBank names.
indication_df = pandas.merge(drugbank_map_df, indication_df)
indication_df = pandas.merge(drugbank_df, indication_df)
# Keep only rows whose MedDRA term type is 'PT'.
indication_df = indication_df[indication_df['meddra_type'] == 'PT']
# Preview the first two rows.
indication_df.head(2)
| | drugbank_id | drugbank_name | pubchem_id | stitch_id_flat | umls_cui_from_label | method | concept_name | meddra_type | umls_cui_from_meddra | meddra_name |
---|---|---|---|---|---|---|---|---|---|---|
1 | DB00014 | Goserelin | 47725 | CID100047725 | C0002871 | text_mention | Anemia | PT | C0002871 | Anaemia |
3 | DB00014 | Goserelin | 47725 | CID100047725 | C0006142 | NLP_indication | Malignant neoplasm of breast | PT | C0006142 | Breast cancer |
# Example lookup: names of drugs indicated for multiple sclerosis (C0026769).
indication_df.loc[
    indication_df['umls_cui_from_meddra'] == 'C0026769',
    'drugbank_name',
].tolist()
['Baclofen', 'Betamethasone', 'Carbamazepine', 'Triamcinolone', 'Prednisone', 'Tizanidine', 'Hydrocortisone', 'Prednisolone', 'Methylprednisolone', 'Mitoxantrone', 'Dantrolene', 'Dexamethasone', 'FTY 720', 'Dalfampridine', '(11alpha,14beta)-11,17,21-trihydroxypregn-4-ene-3,20-dione', 'Fingolimod']
# Write the indications table as tab-separated text, without the index.
indication_df.to_csv(
    'data/indications.tsv',
    index=False,
    sep='\t',
)