Collapsed source/target edge contributions to epilepsy predictions

In [1]:
import re
import itertools
import collections

import pandas

from utilities import tidy_split
In [2]:
# Read Project Rephetio DrugBank Info
url = 'https://github.com/dhimmel/drugbank/raw/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank-slim.tsv'
drugbank_df = (pandas.read_table(url)
    .rename(columns={'drugbank_id': 'compound_id', 'name': 'compound_name'})
    [['compound_id', 'compound_name', 'atc_codes', 'categories']]
)
In [3]:
# Read top epilepsy predictions
top_compounds_df = (pandas.read_table('./data/windows.tsv')
    .rename(columns={'name': 'compound_name'})
    [['compound_name', 'disease_pctl', 'phcodb']]
    .merge(drugbank_df)
)
top_compounds_df.head(2)
Out[3]:
compound_name disease_pctl phcodb compound_id atc_codes categories
0 Topiramate 1.0000 DM DB00273 N03AX11 Anticonvulsants|Anti-Obesity Agents|Neuroprote...
1 Ethotoin 0.9993 NaN DB00754 N03AB01 Anticonvulsants
In [4]:
path_dfs = list()
for compound_id in top_compounds_df.compound_id:
    path = '../../het.io-rep-data/prediction-info/{}/DOID_1826/paths.tsv'.format(compound_id)
    path_dfs.append(pandas.read_table(path))
path_df = pandas.concat(path_dfs)
path_df.head(2)
Out[4]:
nodes percent_of_prediction percent_of_DWPC source_edge target_edge metapath
0 Topiramate—migraine—epilepsy syndrome 0.1780 1.000 Topiramate—treats—migraine epilepsy syndrome—resembles—migraine CtDrD
1 Topiramate—GRIK5—epilepsy syndrome 0.0385 0.249 Topiramate—binds—GRIK5 epilepsy syndrome—associates—GRIK5 CbGaD
In [5]:
def summarize(df):
    s = pandas.Series()
    s['paths'] = len(df)
    s['contribution'] = sum(df.percent_of_prediction)
    return s

Metapath contributions

In [6]:
metapath_df = (path_df
    .groupby('metapath')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
metapath_df.head(2)
Out[6]:
metapath paths contribution
1 CbGbCtD 6358.0 20.623795
10 CrCtD 160.0 18.265600
In [7]:
metapath_df.to_csv('data/metapath-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(metapath_df)
Out[7]:
12

Source edge contributions

In [8]:
source_df = (path_df
    .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—', 1)[1]))
    .groupby('source_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
source_df.head()
Out[8]:
source_edge paths contribution
1437 Compound—includes—Decreased Central Nervous Sy... 238.0 6.341200
1429 Compound—includes—Benzodiazepines 52.0 3.844600
104 Compound—binds—GABRA1 12385.0 2.819223
1519 Compound—resembles—Diazepam 402.0 2.708075
1438 Compound—includes—General Anesthesia 6.0 2.456000
In [9]:
source_df.to_csv('data/source-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(source_df)
Out[9]:
1667
In [10]:
# Source metaedge contributions
(path_df
    .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—')[1]))
    .groupby('source_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
Out[10]:
source_edge paths contribution
0 Compound—binds 266192.0 43.778627
4 Compound—resembles 8507.0 30.306896
2 Compound—includes 322.0 15.050900
3 Compound—palliates 212.0 5.319900
1 Compound—causes 117724.0 4.418177
5 Compound—treats 5.0 1.130000

Target edge contributions

In [11]:
target_df = (path_df
    .groupby('target_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
target_df.head()
Out[11]:
target_edge paths contribution
355 epilepsy syndrome—treats—Diazepam 6843.0 8.123404
354 epilepsy syndrome—treats—Clonazepam 6488.0 6.273890
362 epilepsy syndrome—treats—Midazolam 4832.0 6.116992
353 epilepsy syndrome—treats—Clobazam 4159.0 5.670810
351 epilepsy syndrome—treats—Amobarbital 2002.0 4.840363
In [12]:
target_df.to_csv('data/target-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(target_df)
Out[12]:
375
In [13]:
# Source metaedge contributions
(path_df
    .assign(target_edge = path_df.target_edge.map(lambda x: 'epilepsy syndrome—' + x.split('—')[-2]))
    .groupby('target_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
Out[13]:
target_edge paths contribution
3 epilepsy syndrome—treats 127343.0 75.582070
0 epilepsy syndrome—associates 255092.0 21.689669
1 epilepsy syndrome—localizes 10522.0 1.602761
2 epilepsy syndrome—resembles 5.0 1.130000

Anatomy (intermediate node) contributions

In [14]:
anatomy_df = path_df.query("metapath == 'CbGeAlD'").copy()
anatomy_df['anatomy'] = anatomy_df.nodes.map(lambda x: x.split('—')[2])
anatomy_df = (anatomy_df
    .groupby('anatomy')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
anatomy_df.head(2)
Out[14]:
anatomy paths contribution
21 telencephalon 928.0 0.153357
7 forebrain 834.0 0.147358
In [15]:
anatomy_df.to_csv('data/anatomy-node-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(anatomy_df)
Out[15]:
24

Classes of predicted compounds

In [16]:
def get_counts(df):
    s = pandas.Series()
    s['count'] = len(df)
    s['compounds'] = ', '.join(sorted(df.compound_name))
    return s

Third-level ATC Codes

In [17]:
# Anatomical Therapeutic Chemical Classification System

# Read ATC Code to name mapping http://biology.stackexchange.com/a/55023/28907
url = 'https://github.com/OHDSI/Vocabulary-v4.5/raw/661804cf3c17add61b02e2e83e477f48acb011d5/21-ATC/atc_code.txt'
atc_df = (pandas.read_table(url)
    .rename(columns={'Code': 'atc_code', 'Description': 'atc_name'})
)
df = tidy_split(top_compounds_df, 'atc_codes')
print('{} compounds have at least 1 ATC code'.format(df.compound_name.nunique()))

# The third level of the code indicates the therapeutic/pharmacological subgroup and consists of one letter.
df['atc_code'] = df.pop('atc_codes').str.slice(0, 4)
df = df.drop_duplicates()
df = df.merge(atc_df, how='left')
df.atc_name = df.atc_name.str.lower()
df = df.groupby(['atc_code', 'atc_name']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-third-level-atc-codes.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)
91 compounds have at least 1 ATC code
Out[17]:
atc_code atc_name count compounds
10 N03A antiepileptics 25 Carbamazepine, Clonazepam, Ethosuximide, Ethot...
13 N05C hypnotics and sedatives 21 Amobarbital, Aprobarbital, Cinolazepam, Estazo...

DrugBank Categories

In [18]:
df = tidy_split(top_compounds_df, 'categories')
print('{} compounds have at least 1 category'.format(df.compound_name.nunique()))
df = df.rename(columns={'categories': 'category'})
df = df.groupby(['category']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-categories.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)
84 compounds have at least 1 category
Out[18]:
category count compounds
16 Anticonvulsants 28 Acetazolamide, Carbamazepine, Clobazam, Clonaz...
46 Hypnotics and Sedatives 24 Alprazolam, Butabarbital, Butethal, Chlordiaze...

DurgCentral Pharmacologic Classes

In [19]:
class_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/classes.tsv'
class_rel_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/drug-to-class.tsv'
class_df = (pandas.read_table(class_url)
    .merge(pandas.read_table(class_rel_url))
    .rename(columns={'drugbank_id': 'compound_id'})
    [['compound_id', 'class_id', 'class_name', 'class_source', 'class_type']]
)

class_df.head(2)
Out[19]:
compound_id class_id class_name class_source class_type
0 DB00126 CHEBI:21241 vitamin C CHEBI Application
1 DB00676 CHEBI:22153 acaricide CHEBI Application
In [20]:
df = top_compounds_df.merge(class_df)
print('{} compounds have at least 1 pharmacologic class'.format(df.compound_name.nunique()))
df = df.groupby(['class_id', 'class_name', 'class_source', 'class_type']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-pharmacologic-classes.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)
92 compounds have at least 1 pharmacologic class
Out[20]:
class_id class_name class_source class_type count compounds
89 D002491 Central Nervous System Agents MeSH Pharmacological Action 76 Acamprosate, Acetazolamide, Adinazolam, Alpraz...
90 D002492 Central Nervous System Depressants MeSH Pharmacological Action 46 Alprazolam, Amobarbital, Bromazepam, Butabarbi...

Contribution by gene groups

In [21]:
# Read Entrez Gene
url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'
gene_df = pandas.read_table(url)
symbol_to_name = dict(zip(gene_df.Symbol, gene_df.description))
In [22]:
# Split on comma not in parenthesis. http://stackoverflow.com/a/26634150/4651668
gene_split = re.compile(r',\s*(?![^()]*\))')

def summarize(df):
    s = pandas.Series()
    s['paths'] = sum(df.paths)
    s['contribution'] = sum(df.contribution)
    s['gene_symbols'] = ', '.join(df.gene_symbol)
    return s

def contributions_by_gene(df, edge_column):
    df['gene_symbol'] = df[edge_column].map(lambda x: x.split('—')[-1])
    df['gene_name'] = df['gene_symbol'].map(symbol_to_name)
    df['gene_main_name'] = df.gene_name.map(lambda x: gene_split.split(x, 1)[0])
    return df.groupby('gene_main_name').apply(summarize).reset_index().sort_values('contribution', ascending=False)
In [23]:
source_bind_df = source_df[source_df.source_edge.str.startswith('Compound—binds—')].copy()
source_bind_df = contributions_by_gene(source_bind_df, 'source_edge')
source_bind_df.to_csv('data/source-edge-binds-contributions.tsv', sep='\t', index=False, float_format='%.5g')
source_bind_df.head(2)
Out[23]:
gene_main_name paths contribution gene_symbols
74 gamma-aminobutyric acid (GABA) A receptor 91967.0 15.329433 GABRA1, GABRG2, GABRB2, GABRA5, GABRB3, GABRD,...
66 cytochrome P450 34323.0 5.585686 CYP2C19, CYP3A4, CYP2E1, CYP2B6, CYP2C8, CYP2C...
In [24]:
target_bind_df = target_df[target_df.target_edge.str.contains('—associates—')].copy()
target_bind_df = contributions_by_gene(target_bind_df, 'target_edge')
target_bind_df.to_csv('data/target-edge-associates-contributions.tsv', sep='\t', index=False, float_format='%.5g')
target_bind_df.head(2)
Out[24]:
gene_main_name paths contribution gene_symbols
119 gamma-aminobutyric acid (GABA) A receptor 23347.0 6.834028 GABRG2, GABRA1, GABRA5, GABRB3, GABRB2, GABRD
128 glutamate receptor 20142.0 2.284863 GRIA2, GRIK2, GRIN2A, GRIN2B, GRM5, GRIA1, GRI...

Contribution by Side Effect (source edges)

In [25]:
side_effect_df = source_df[source_df.source_edge.str.contains('Compound—causes—')]
side_effect_df.head()
Out[25]:
source_edge paths contribution
390 Compound—causes—Ataxia 1312.0 0.069241
1057 Compound—causes—Nystagmus 611.0 0.048500
579 Compound—causes—Diplopia 948.0 0.044986
1278 Compound—causes—Somnolence 1577.0 0.043543
1416 Compound—causes—Vomiting 1777.0 0.042753