Collapsed source/target edge contributions to epilepsy predictions¶

In [1]:

import re
import itertools
import collections

import pandas

from utilities import tidy_split

In [2]:

# Read Project Rephetio DrugBank Info
url = 'https://github.com/dhimmel/drugbank/raw/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank-slim.tsv'
drugbank_df = (pandas.read_table(url)
    .rename(columns={'drugbank_id': 'compound_id', 'name': 'compound_name'})
    [['compound_id', 'compound_name', 'atc_codes', 'categories']]
)

In [3]:

# Read top epilepsy predictions
top_compounds_df = (pandas.read_table('./data/windows.tsv')
    .rename(columns={'name': 'compound_name'})
    [['compound_name', 'disease_pctl', 'phcodb']]
    .merge(drugbank_df)
)
top_compounds_df.head(2)

Out[3]:

	compound_name	disease_pctl	phcodb	compound_id	atc_codes	categories
0	Topiramate	1.0000	DM	DB00273	N03AX11	Anticonvulsants\|Anti-Obesity Agents\|Neuroprote...
1	Ethotoin	0.9993	NaN	DB00754	N03AB01	Anticonvulsants

In [4]:

path_dfs = list()
for compound_id in top_compounds_df.compound_id:
    path = '../../het.io-rep-data/prediction-info/{}/DOID_1826/paths.tsv'.format(compound_id)
    path_dfs.append(pandas.read_table(path))
path_df = pandas.concat(path_dfs)
path_df.head(2)

Out[4]:

	nodes	percent_of_prediction	percent_of_DWPC	source_edge	target_edge	metapath
0	Topiramate—migraine—epilepsy syndrome	0.1780	1.000	Topiramate—treats—migraine	epilepsy syndrome—resembles—migraine	CtDrD
1	Topiramate—GRIK5—epilepsy syndrome	0.0385	0.249	Topiramate—binds—GRIK5	epilepsy syndrome—associates—GRIK5	CbGaD

In [5]:

def summarize(df):
    s = pandas.Series()
    s['paths'] = len(df)
    s['contribution'] = sum(df.percent_of_prediction)
    return s

Metapath contributions¶

In [6]:

metapath_df = (path_df
    .groupby('metapath')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
metapath_df.head(2)

Out[6]:

	metapath	paths	contribution
1	CbGbCtD	6358.0	20.623795
10	CrCtD	160.0	18.265600

In [7]:

metapath_df.to_csv('data/metapath-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(metapath_df)

Out[7]:

Source edge contributions¶

In [8]:

source_df = (path_df
    .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—', 1)[1]))
    .groupby('source_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
source_df.head()

Out[8]:

	source_edge	paths	contribution
1437	Compound—includes—Decreased Central Nervous Sy...	238.0	6.341200
1429	Compound—includes—Benzodiazepines	52.0	3.844600
104	Compound—binds—GABRA1	12385.0	2.819223
1519	Compound—resembles—Diazepam	402.0	2.708075
1438	Compound—includes—General Anesthesia	6.0	2.456000

In [9]:

source_df.to_csv('data/source-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(source_df)

Out[9]:

In [10]:

# Source metaedge contributions
(path_df
    .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—')[1]))
    .groupby('source_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)

Out[10]:

	source_edge	paths	contribution
0	Compound—binds	266192.0	43.778627
4	Compound—resembles	8507.0	30.306896
2	Compound—includes	322.0	15.050900
3	Compound—palliates	212.0	5.319900
1	Compound—causes	117724.0	4.418177
5	Compound—treats	5.0	1.130000

Target edge contributions¶

In [11]:

target_df = (path_df
    .groupby('target_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
target_df.head()

Out[11]:

	target_edge	paths	contribution
355	epilepsy syndrome—treats—Diazepam	6843.0	8.123404
354	epilepsy syndrome—treats—Clonazepam	6488.0	6.273890
362	epilepsy syndrome—treats—Midazolam	4832.0	6.116992
353	epilepsy syndrome—treats—Clobazam	4159.0	5.670810
351	epilepsy syndrome—treats—Amobarbital	2002.0	4.840363

In [12]:

target_df.to_csv('data/target-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(target_df)

Out[12]:

In [13]:

# Source metaedge contributions
(path_df
    .assign(target_edge = path_df.target_edge.map(lambda x: 'epilepsy syndrome—' + x.split('—')[-2]))
    .groupby('target_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)

Out[13]:

	target_edge	paths	contribution
3	epilepsy syndrome—treats	127343.0	75.582070
0	epilepsy syndrome—associates	255092.0	21.689669
1	epilepsy syndrome—localizes	10522.0	1.602761
2	epilepsy syndrome—resembles	5.0	1.130000

Anatomy (intermediate node) contributions¶

In [14]:

anatomy_df = path_df.query("metapath == 'CbGeAlD'").copy()
anatomy_df['anatomy'] = anatomy_df.nodes.map(lambda x: x.split('—')[2])
anatomy_df = (anatomy_df
    .groupby('anatomy')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
anatomy_df.head(2)

Out[14]:

	anatomy	paths	contribution
21	telencephalon	928.0	0.153357
7	forebrain	834.0	0.147358

In [15]:

anatomy_df.to_csv('data/anatomy-node-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(anatomy_df)

Out[15]:

Classes of predicted compounds¶

In [16]:

def get_counts(df):
    s = pandas.Series()
    s['count'] = len(df)
    s['compounds'] = ', '.join(sorted(df.compound_name))
    return s

Third-level ATC Codes¶

In [17]:

# Anatomical Therapeutic Chemical Classification System

# Read ATC Code to name mapping http://biology.stackexchange.com/a/55023/28907
url = 'https://github.com/OHDSI/Vocabulary-v4.5/raw/661804cf3c17add61b02e2e83e477f48acb011d5/21-ATC/atc_code.txt'
atc_df = (pandas.read_table(url)
    .rename(columns={'Code': 'atc_code', 'Description': 'atc_name'})
)
df = tidy_split(top_compounds_df, 'atc_codes')
print('{} compounds have at least 1 ATC code'.format(df.compound_name.nunique()))

# The third level of the code indicates the therapeutic/pharmacological subgroup and consists of one letter.
df['atc_code'] = df.pop('atc_codes').str.slice(0, 4)
df = df.drop_duplicates()
df = df.merge(atc_df, how='left')
df.atc_name = df.atc_name.str.lower()
df = df.groupby(['atc_code', 'atc_name']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-third-level-atc-codes.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)

91 compounds have at least 1 ATC code

Out[17]:

	atc_code	atc_name	count	compounds
10	N03A	antiepileptics	25	Carbamazepine, Clonazepam, Ethosuximide, Ethot...
13	N05C	hypnotics and sedatives	21	Amobarbital, Aprobarbital, Cinolazepam, Estazo...

DrugBank Categories¶

In [18]:

df = tidy_split(top_compounds_df, 'categories')
print('{} compounds have at least 1 category'.format(df.compound_name.nunique()))
df = df.rename(columns={'categories': 'category'})
df = df.groupby(['category']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-categories.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)

84 compounds have at least 1 category

Out[18]:

	category	count	compounds
16	Anticonvulsants	28	Acetazolamide, Carbamazepine, Clobazam, Clonaz...
46	Hypnotics and Sedatives	24	Alprazolam, Butabarbital, Butethal, Chlordiaze...

DurgCentral Pharmacologic Classes¶

In [19]:

class_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/classes.tsv'
class_rel_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/drug-to-class.tsv'
class_df = (pandas.read_table(class_url)
    .merge(pandas.read_table(class_rel_url))
    .rename(columns={'drugbank_id': 'compound_id'})
    [['compound_id', 'class_id', 'class_name', 'class_source', 'class_type']]
)

class_df.head(2)

Out[19]:

	compound_id	class_id	class_name	class_source	class_type
0	DB00126	CHEBI:21241	vitamin C	CHEBI	Application
1	DB00676	CHEBI:22153	acaricide	CHEBI	Application

In [20]:

df = top_compounds_df.merge(class_df)
print('{} compounds have at least 1 pharmacologic class'.format(df.compound_name.nunique()))
df = df.groupby(['class_id', 'class_name', 'class_source', 'class_type']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-pharmacologic-classes.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)

92 compounds have at least 1 pharmacologic class

Out[20]:

	class_id	class_name	class_source	class_type	count	compounds
89	D002491	Central Nervous System Agents	MeSH	Pharmacological Action	76	Acamprosate, Acetazolamide, Adinazolam, Alpraz...
90	D002492	Central Nervous System Depressants	MeSH	Pharmacological Action	46	Alprazolam, Amobarbital, Bromazepam, Butabarbi...

Contribution by gene groups¶

In [21]:

# Read Entrez Gene
url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'
gene_df = pandas.read_table(url)
symbol_to_name = dict(zip(gene_df.Symbol, gene_df.description))

In [22]:

# Split on comma not in parenthesis. http://stackoverflow.com/a/26634150/4651668
gene_split = re.compile(r',\s*(?![^()]*\))')

def summarize(df):
    s = pandas.Series()
    s['paths'] = sum(df.paths)
    s['contribution'] = sum(df.contribution)
    s['gene_symbols'] = ', '.join(df.gene_symbol)
    return s

def contributions_by_gene(df, edge_column):
    df['gene_symbol'] = df[edge_column].map(lambda x: x.split('—')[-1])
    df['gene_name'] = df['gene_symbol'].map(symbol_to_name)
    df['gene_main_name'] = df.gene_name.map(lambda x: gene_split.split(x, 1)[0])
    return df.groupby('gene_main_name').apply(summarize).reset_index().sort_values('contribution', ascending=False)

In [23]:

source_bind_df = source_df[source_df.source_edge.str.startswith('Compound—binds—')].copy()
source_bind_df = contributions_by_gene(source_bind_df, 'source_edge')
source_bind_df.to_csv('data/source-edge-binds-contributions.tsv', sep='\t', index=False, float_format='%.5g')
source_bind_df.head(2)

Out[23]:

	gene_main_name	paths	contribution	gene_symbols
74	gamma-aminobutyric acid (GABA) A receptor	91967.0	15.329433	GABRA1, GABRG2, GABRB2, GABRA5, GABRB3, GABRD,...
66	cytochrome P450	34323.0	5.585686	CYP2C19, CYP3A4, CYP2E1, CYP2B6, CYP2C8, CYP2C...

In [24]:

target_bind_df = target_df[target_df.target_edge.str.contains('—associates—')].copy()
target_bind_df = contributions_by_gene(target_bind_df, 'target_edge')
target_bind_df.to_csv('data/target-edge-associates-contributions.tsv', sep='\t', index=False, float_format='%.5g')
target_bind_df.head(2)

Out[24]:

	gene_main_name	paths	contribution	gene_symbols
119	gamma-aminobutyric acid (GABA) A receptor	23347.0	6.834028	GABRG2, GABRA1, GABRA5, GABRB3, GABRB2, GABRD
128	glutamate receptor	20142.0	2.284863	GRIA2, GRIK2, GRIN2A, GRIN2B, GRM5, GRIA1, GRI...

Contribution by Side Effect (source edges)¶

In [25]:

side_effect_df = source_df[source_df.source_edge.str.contains('Compound—causes—')]
side_effect_df.head()

Out[25]:

	source_edge	paths	contribution
390	Compound—causes—Ataxia	1312.0	0.069241
1057	Compound—causes—Nystagmus	611.0	0.048500
579	Compound—causes—Diplopia	948.0	0.044986
1278	Compound—causes—Somnolence	1577.0	0.043543
1416	Compound—causes—Vomiting	1777.0	0.042753