import re
import itertools
import collections
import pandas
from utilities import tidy_split
# Read Project Rephetio DrugBank Info
url = 'https://github.com/dhimmel/drugbank/raw/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank-slim.tsv'
drugbank_df = (pandas.read_table(url)
.rename(columns={'drugbank_id': 'compound_id', 'name': 'compound_name'})
[['compound_id', 'compound_name', 'atc_codes', 'categories']]
)
# Read top epilepsy predictions
top_compounds_df = (pandas.read_table('./data/windows.tsv')
.rename(columns={'name': 'compound_name'})
[['compound_name', 'disease_pctl', 'phcodb']]
.merge(drugbank_df)
)
top_compounds_df.head(2)
compound_name | disease_pctl | phcodb | compound_id | atc_codes | categories | |
---|---|---|---|---|---|---|
0 | Topiramate | 1.0000 | DM | DB00273 | N03AX11 | Anticonvulsants|Anti-Obesity Agents|Neuroprote... |
1 | Ethotoin | 0.9993 | NaN | DB00754 | N03AB01 | Anticonvulsants |
path_dfs = list()
for compound_id in top_compounds_df.compound_id:
path = '../../het.io-rep-data/prediction-info/{}/DOID_1826/paths.tsv'.format(compound_id)
path_dfs.append(pandas.read_table(path))
path_df = pandas.concat(path_dfs)
path_df.head(2)
nodes | percent_of_prediction | percent_of_DWPC | source_edge | target_edge | metapath | |
---|---|---|---|---|---|---|
0 | Topiramate—migraine—epilepsy syndrome | 0.1780 | 1.000 | Topiramate—treats—migraine | epilepsy syndrome—resembles—migraine | CtDrD |
1 | Topiramate—GRIK5—epilepsy syndrome | 0.0385 | 0.249 | Topiramate—binds—GRIK5 | epilepsy syndrome—associates—GRIK5 | CbGaD |
def summarize(df):
s = pandas.Series()
s['paths'] = len(df)
s['contribution'] = sum(df.percent_of_prediction)
return s
metapath_df = (path_df
.groupby('metapath')
.apply(summarize).reset_index()
.sort_values('contribution', ascending=False)
)
metapath_df.head(2)
metapath | paths | contribution | |
---|---|---|---|
1 | CbGbCtD | 6358.0 | 20.623795 |
10 | CrCtD | 160.0 | 18.265600 |
metapath_df.to_csv('data/metapath-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(metapath_df)
12
source_df = (path_df
.assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—', 1)[1]))
.groupby('source_edge')
.apply(summarize).reset_index()
.sort_values('contribution', ascending=False)
)
source_df.head()
source_edge | paths | contribution | |
---|---|---|---|
1437 | Compound—includes—Decreased Central Nervous Sy... | 238.0 | 6.341200 |
1429 | Compound—includes—Benzodiazepines | 52.0 | 3.844600 |
104 | Compound—binds—GABRA1 | 12385.0 | 2.819223 |
1519 | Compound—resembles—Diazepam | 402.0 | 2.708075 |
1438 | Compound—includes—General Anesthesia | 6.0 | 2.456000 |
source_df.to_csv('data/source-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(source_df)
1667
# Source metaedge contributions
(path_df
.assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—')[1]))
.groupby('source_edge')
.apply(summarize).reset_index()
.sort_values('contribution', ascending=False)
)
source_edge | paths | contribution | |
---|---|---|---|
0 | Compound—binds | 266192.0 | 43.778627 |
4 | Compound—resembles | 8507.0 | 30.306896 |
2 | Compound—includes | 322.0 | 15.050900 |
3 | Compound—palliates | 212.0 | 5.319900 |
1 | Compound—causes | 117724.0 | 4.418177 |
5 | Compound—treats | 5.0 | 1.130000 |
target_df = (path_df
.groupby('target_edge')
.apply(summarize).reset_index()
.sort_values('contribution', ascending=False)
)
target_df.head()
target_edge | paths | contribution | |
---|---|---|---|
355 | epilepsy syndrome—treats—Diazepam | 6843.0 | 8.123404 |
354 | epilepsy syndrome—treats—Clonazepam | 6488.0 | 6.273890 |
362 | epilepsy syndrome—treats—Midazolam | 4832.0 | 6.116992 |
353 | epilepsy syndrome—treats—Clobazam | 4159.0 | 5.670810 |
351 | epilepsy syndrome—treats—Amobarbital | 2002.0 | 4.840363 |
target_df.to_csv('data/target-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(target_df)
375
# Source metaedge contributions
(path_df
.assign(target_edge = path_df.target_edge.map(lambda x: 'epilepsy syndrome—' + x.split('—')[-2]))
.groupby('target_edge')
.apply(summarize).reset_index()
.sort_values('contribution', ascending=False)
)
target_edge | paths | contribution | |
---|---|---|---|
3 | epilepsy syndrome—treats | 127343.0 | 75.582070 |
0 | epilepsy syndrome—associates | 255092.0 | 21.689669 |
1 | epilepsy syndrome—localizes | 10522.0 | 1.602761 |
2 | epilepsy syndrome—resembles | 5.0 | 1.130000 |
anatomy_df = path_df.query("metapath == 'CbGeAlD'").copy()
anatomy_df['anatomy'] = anatomy_df.nodes.map(lambda x: x.split('—')[2])
anatomy_df = (anatomy_df
.groupby('anatomy')
.apply(summarize).reset_index()
.sort_values('contribution', ascending=False)
)
anatomy_df.head(2)
anatomy | paths | contribution | |
---|---|---|---|
21 | telencephalon | 928.0 | 0.153357 |
7 | forebrain | 834.0 | 0.147358 |
anatomy_df.to_csv('data/anatomy-node-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(anatomy_df)
24
def get_counts(df):
s = pandas.Series()
s['count'] = len(df)
s['compounds'] = ', '.join(sorted(df.compound_name))
return s
# Anatomical Therapeutic Chemical Classification System
# Read ATC Code to name mapping http://biology.stackexchange.com/a/55023/28907
url = 'https://github.com/OHDSI/Vocabulary-v4.5/raw/661804cf3c17add61b02e2e83e477f48acb011d5/21-ATC/atc_code.txt'
atc_df = (pandas.read_table(url)
.rename(columns={'Code': 'atc_code', 'Description': 'atc_name'})
)
df = tidy_split(top_compounds_df, 'atc_codes')
print('{} compounds have at least 1 ATC code'.format(df.compound_name.nunique()))
# The third level of the code indicates the therapeutic/pharmacological subgroup and consists of one letter.
df['atc_code'] = df.pop('atc_codes').str.slice(0, 4)
df = df.drop_duplicates()
df = df.merge(atc_df, how='left')
df.atc_name = df.atc_name.str.lower()
df = df.groupby(['atc_code', 'atc_name']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-third-level-atc-codes.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)
91 compounds have at least 1 ATC code
atc_code | atc_name | count | compounds | |
---|---|---|---|---|
10 | N03A | antiepileptics | 25 | Carbamazepine, Clonazepam, Ethosuximide, Ethot... |
13 | N05C | hypnotics and sedatives | 21 | Amobarbital, Aprobarbital, Cinolazepam, Estazo... |
df = tidy_split(top_compounds_df, 'categories')
print('{} compounds have at least 1 category'.format(df.compound_name.nunique()))
df = df.rename(columns={'categories': 'category'})
df = df.groupby(['category']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-categories.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)
84 compounds have at least 1 category
category | count | compounds | |
---|---|---|---|
16 | Anticonvulsants | 28 | Acetazolamide, Carbamazepine, Clobazam, Clonaz... |
46 | Hypnotics and Sedatives | 24 | Alprazolam, Butabarbital, Butethal, Chlordiaze... |
class_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/classes.tsv'
class_rel_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/drug-to-class.tsv'
class_df = (pandas.read_table(class_url)
.merge(pandas.read_table(class_rel_url))
.rename(columns={'drugbank_id': 'compound_id'})
[['compound_id', 'class_id', 'class_name', 'class_source', 'class_type']]
)
class_df.head(2)
compound_id | class_id | class_name | class_source | class_type | |
---|---|---|---|---|---|
0 | DB00126 | CHEBI:21241 | vitamin C | CHEBI | Application |
1 | DB00676 | CHEBI:22153 | acaricide | CHEBI | Application |
df = top_compounds_df.merge(class_df)
print('{} compounds have at least 1 pharmacologic class'.format(df.compound_name.nunique()))
df = df.groupby(['class_id', 'class_name', 'class_source', 'class_type']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-pharmacologic-classes.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)
92 compounds have at least 1 pharmacologic class
class_id | class_name | class_source | class_type | count | compounds | |
---|---|---|---|---|---|---|
89 | D002491 | Central Nervous System Agents | MeSH | Pharmacological Action | 76 | Acamprosate, Acetazolamide, Adinazolam, Alpraz... |
90 | D002492 | Central Nervous System Depressants | MeSH | Pharmacological Action | 46 | Alprazolam, Amobarbital, Bromazepam, Butabarbi... |
# Read Entrez Gene
url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'
gene_df = pandas.read_table(url)
symbol_to_name = dict(zip(gene_df.Symbol, gene_df.description))
# Split on comma not in parenthesis. http://stackoverflow.com/a/26634150/4651668
gene_split = re.compile(r',\s*(?![^()]*\))')
def summarize(df):
s = pandas.Series()
s['paths'] = sum(df.paths)
s['contribution'] = sum(df.contribution)
s['gene_symbols'] = ', '.join(df.gene_symbol)
return s
def contributions_by_gene(df, edge_column):
df['gene_symbol'] = df[edge_column].map(lambda x: x.split('—')[-1])
df['gene_name'] = df['gene_symbol'].map(symbol_to_name)
df['gene_main_name'] = df.gene_name.map(lambda x: gene_split.split(x, 1)[0])
return df.groupby('gene_main_name').apply(summarize).reset_index().sort_values('contribution', ascending=False)
source_bind_df = source_df[source_df.source_edge.str.startswith('Compound—binds—')].copy()
source_bind_df = contributions_by_gene(source_bind_df, 'source_edge')
source_bind_df.to_csv('data/source-edge-binds-contributions.tsv', sep='\t', index=False, float_format='%.5g')
source_bind_df.head(2)
gene_main_name | paths | contribution | gene_symbols | |
---|---|---|---|---|
74 | gamma-aminobutyric acid (GABA) A receptor | 91967.0 | 15.329433 | GABRA1, GABRG2, GABRB2, GABRA5, GABRB3, GABRD,... |
66 | cytochrome P450 | 34323.0 | 5.585686 | CYP2C19, CYP3A4, CYP2E1, CYP2B6, CYP2C8, CYP2C... |
target_bind_df = target_df[target_df.target_edge.str.contains('—associates—')].copy()
target_bind_df = contributions_by_gene(target_bind_df, 'target_edge')
target_bind_df.to_csv('data/target-edge-associates-contributions.tsv', sep='\t', index=False, float_format='%.5g')
target_bind_df.head(2)
gene_main_name | paths | contribution | gene_symbols | |
---|---|---|---|---|
119 | gamma-aminobutyric acid (GABA) A receptor | 23347.0 | 6.834028 | GABRG2, GABRA1, GABRA5, GABRB3, GABRB2, GABRD |
128 | glutamate receptor | 20142.0 | 2.284863 | GRIA2, GRIK2, GRIN2A, GRIN2B, GRM5, GRIA1, GRI... |
side_effect_df = source_df[source_df.source_edge.str.contains('Compound—causes—')]
side_effect_df.head()
source_edge | paths | contribution | |
---|---|---|---|
390 | Compound—causes—Ataxia | 1312.0 | 0.069241 |
1057 | Compound—causes—Nystagmus | 611.0 | 0.048500 |
579 | Compound—causes—Diplopia | 948.0 | 0.044986 |
1278 | Compound—causes—Somnolence | 1577.0 | 0.043543 |
1416 | Compound—causes—Vomiting | 1777.0 | 0.042753 |