Analyzing the SIDER 4.1 data¶

In [1]:

import csv
import gzip
import collections

In [2]:

import pandas

In [3]:

# Download SIDER data
base_url = 'http://sideeffects.embl.de/media/download/'
filenames = [
    'README',
    'meddra_all_indications.tsv.gz',
    'meddra_all_se.tsv.gz',
    'meddra_freq.tsv.gz',
]
for filename in filenames:
    ! wget --no-verbose --timestamping --directory-prefix download {base_url}/{filename}

! mv download/README download/README.txt

2016-02-03 14:13:11 URL:http://sideeffects.embl.de/media/download//README [2270/2270] -> "download/README" [1]

STITCH to DrugBank mapping utilities¶

In [4]:

def stitch_flat_to_pubchem(cid):
    """
    Convert a flat STITCH compound identifier to an integer PubChem compound ID.

    The numeric part of a flat identifier carries an offset of 100,000,000
    (for example 'CID100047725' corresponds to PubChem compound 47725), which
    is removed here.

    Parameters
    ----------
    cid : str
        STITCH identifier beginning with 'CID'.

    Returns
    -------
    int
        The PubChem compound ID.

    Raises
    ------
    AssertionError
        If `cid` does not start with 'CID'.
    """
    assert cid.startswith('CID')
    # Use an integer offset: the float literal 1e8 would silently turn the
    # identifier into a float (47725.0) and yield a float join key downstream.
    return int(cid[3:]) - 10 ** 8

def stitch_stereo_to_pubchem(cid):
    """
    Convert a stereo-specific STITCH compound identifier to a PubChem compound ID.

    The identifier must begin with 'CID'; the remaining characters are parsed
    as an integer (leading zeros are dropped by the conversion).

    Raises AssertionError when the 'CID' prefix is missing.
    """
    prefix = 'CID'
    assert cid.startswith(prefix)
    digits = cid[len(prefix):]
    return int(digits)

In [5]:

# DrugBank identifiers and names, read from a commit-pinned copy of dhimmel/drugbank
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv'
drugbank_df = pandas.read_table(url)
drugbank_df = drugbank_df[['drugbank_id', 'name']]
drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})

# PubChem compound to DrugBank mapping, taken from the same commit
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'
drugbank_map_df = pandas.read_table(url)

meddra_freq.tsv.gz¶

In [6]:

# The frequency file has no header row, so column names are supplied here.
# NOTE: 'stitch_id_sterio' is a misspelling of "stereo"; the spelling is kept as is.
columns = [
    'stitch_id_flat', 'stitch_id_sterio',
    'umls_cui_from_label',
    'placebo',
    'frequency', 'lower', 'upper',
    'meddra_type', 'umls_cui_from_meddra', 'side_effect_name',
]
# header=None is what pandas already does when `names` is given; spelled out for clarity
freq_df = pandas.read_table(
    'download/meddra_freq.tsv.gz',
    names=columns,
    header=None,
)
freq_df.head(n=2)

Out[6]:

	stitch_id_flat	stitch_id_sterio	umls_cui_from_label	placebo	frequency	lower	upper	meddra_type	umls_cui_from_meddra	side_effect_name
0	CID100000085	CID000010917	C0000737	NaN	21%	0.21	0.21	LLT	C0000737	Abdominal pain
1	CID100000085	CID000010917	C0000737	NaN	21%	0.21	0.21	PT	C0000737	Abdominal pain

meddra_all_se.tsv.gz¶

In [7]:

# The side effect file has no header row, so column names are supplied here.
# NOTE: 'stitch_id_sterio' is a misspelling of "stereo"; the spelling is kept as is.
columns = [
    'stitch_id_flat', 'stitch_id_sterio',
    'umls_cui_from_label',
    'meddra_type', 'umls_cui_from_meddra', 'side_effect_name',
]
se_df = pandas.read_table(
    'download/meddra_all_se.tsv.gz',
    names=columns,
    header=None,
)
# Derive the PubChem compound id from the stereo-specific STITCH id
se_df = se_df.assign(
    pubchem_id=se_df['stitch_id_sterio'].map(stitch_stereo_to_pubchem)
)
# merge() without `on` joins on the columns both frames share; the default
# inner join keeps only compounds that have a DrugBank mapping.
se_df = drugbank_map_df.merge(se_df)
se_df.head(n=2)

Out[7]:

	drugbank_id	pubchem_id	stitch_id_flat	stitch_id_sterio	umls_cui_from_label	meddra_type	umls_cui_from_meddra	side_effect_name
0	DB00014	47725	CID100047725	CID000047725	C0000737	LLT	C0000737	Abdominal pain
1	DB00014	47725	CID100047725	CID000047725	C0000737	PT	C0687713	Gastrointestinal pain

In [8]:

# Keep one complete row per (drug, side effect concept) pair
se_df = (
    se_df[['drugbank_id', 'umls_cui_from_meddra', 'side_effect_name']]
    .dropna()
    .drop_duplicates(subset=['drugbank_id', 'umls_cui_from_meddra'])
)
# Attach DrugBank names (inner join on the shared column) and order for readability
se_df = drugbank_df.merge(se_df).sort_values(['drugbank_name', 'side_effect_name'])
len(se_df)

Out[8]:

In [9]:

# Build a lookup table of side effect identifiers and their names
se_terms_df = (
    se_df[['umls_cui_from_meddra', 'side_effect_name']]
    .drop_duplicates()
)
# Every side effect name must appear exactly once in the lookup table
assert not se_terms_df['side_effect_name'].duplicated().any()
se_terms_df = se_terms_df.sort_values(by='side_effect_name')
se_terms_df.to_csv(path_or_buf='data/side-effect-terms.tsv', sep='\t', index=False)

In [10]:

# Example lookup: all side effect rows for cocaine (DrugBank ID DB00907)
se_df[se_df['drugbank_id'] == 'DB00907']

Out[10]:

	drugbank_id	drugbank_name	umls_cui_from_meddra	side_effect_name
80494	DB00907	Cocaine	C0085631	Agitation
80495	DB00907	Cocaine	C0233571	Excitement
80486	DB00907	Cocaine	C0014549	Grand mal convulsion
80487	DB00907	Cocaine	C0020517	Hypersensitivity
80488	DB00907	Cocaine	C0026961	Mydriasis
80489	DB00907	Cocaine	C0027769	Nervousness
80496	DB00907	Cocaine	C1145670	Respiratory failure
80497	DB00907	Cocaine	C1325847	Sensitisation
80490	DB00907	Cocaine	C0233494	Tension
80491	DB00907	Cocaine	C0040822	Tremor
80492	DB00907	Cocaine	C0041582	Ulcer
80493	DB00907	Cocaine	C0042963	Vomiting

In [11]:

# How many distinct DrugBank drugs appear in the side effect table
se_df['drugbank_id'].nunique()

Out[11]:

In [12]:

# How many distinct UMLS side effect concepts appear in the side effect table
se_df['umls_cui_from_meddra'].nunique()

Out[12]:

In [13]:

# Write the drug-side effect table as tab-separated text, without the index column
se_df.to_csv(path_or_buf='data/side-effects.tsv', sep='\t', index=False)

meddra_all_indications.tsv.gz¶

In [14]:

# The indications file has no header row, so column names are supplied here.
# Unlike the side effect files, only the flat STITCH id is listed.
columns = [
    'stitch_id_flat',
    'umls_cui_from_label',
    'method', 'concept_name',
    'meddra_type', 'umls_cui_from_meddra', 'meddra_name',
]
indication_df = pandas.read_table(
    'download/meddra_all_indications.tsv.gz',
    names=columns,
    header=None,
)
# Derive the PubChem compound id from the flat STITCH id
indication_df = indication_df.assign(
    pubchem_id=indication_df['stitch_id_flat'].map(stitch_flat_to_pubchem)
)

In [15]:

# Map compounds to DrugBank ids, then attach DrugBank names; both merges are
# inner joins on the columns the frames share.
indication_df = drugbank_map_df.merge(indication_df)
indication_df = drugbank_df.merge(indication_df)
# Keep only rows whose MedDRA concept type is 'PT'
indication_df = indication_df.query("meddra_type == 'PT'")
indication_df.head(n=2)

Out[15]:

	drugbank_id	drugbank_name	pubchem_id	stitch_id_flat	umls_cui_from_label	method	concept_name	meddra_type	umls_cui_from_meddra	meddra_name
1	DB00014	Goserelin	47725	CID100047725	C0002871	text_mention	Anemia	PT	C0002871	Anaemia
3	DB00014	Goserelin	47725	CID100047725	C0006142	NLP_indication	Malignant neoplasm of breast	PT	C0006142	Breast cancer

In [16]:

# Example lookup: names of drugs indicated for multiple sclerosis (UMLS C0026769)
indication_df.loc[
    indication_df['umls_cui_from_meddra'] == 'C0026769',
    'drugbank_name',
].tolist()

Out[16]:

['Baclofen',
 'Betamethasone',
 'Carbamazepine',
 'Triamcinolone',
 'Prednisone',
 'Tizanidine',
 'Hydrocortisone',
 'Prednisolone',
 'Methylprednisolone',
 'Mitoxantrone',
 'Dantrolene',
 'Dexamethasone',
 'FTY 720',
 'Dalfampridine',
 '(11alpha,14beta)-11,17,21-trihydroxypregn-4-ene-3,20-dione',
 'Fingolimod']

In [17]:

# Write the indication table as tab-separated text, without the index column
indication_df.to_csv(path_or_buf='data/indications.tsv', sep='\t', index=False)