#!/usr/bin/env python # coding: utf-8 # # Collapsed source/target edge contributions to epilepsy predictions # In[1]: import re import itertools import collections import pandas from utilities import tidy_split # In[2]: # Read Project Rephetio DrugBank Info url = 'https://github.com/dhimmel/drugbank/raw/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank-slim.tsv' drugbank_df = (pandas.read_table(url) .rename(columns={'drugbank_id': 'compound_id', 'name': 'compound_name'}) [['compound_id', 'compound_name', 'atc_codes', 'categories']] ) # In[3]: # Read top epilepsy predictions top_compounds_df = (pandas.read_table('./data/windows.tsv') .rename(columns={'name': 'compound_name'}) [['compound_name', 'disease_pctl', 'phcodb']] .merge(drugbank_df) ) top_compounds_df.head(2) # In[4]: path_dfs = list() for compound_id in top_compounds_df.compound_id: path = '../../het.io-rep-data/prediction-info/{}/DOID_1826/paths.tsv'.format(compound_id) path_dfs.append(pandas.read_table(path)) path_df = pandas.concat(path_dfs) path_df.head(2) # In[5]: def summarize(df): s = pandas.Series() s['paths'] = len(df) s['contribution'] = sum(df.percent_of_prediction) return s # ## Metapath contributions # In[6]: metapath_df = (path_df .groupby('metapath') .apply(summarize).reset_index() .sort_values('contribution', ascending=False) ) metapath_df.head(2) # In[7]: metapath_df.to_csv('data/metapath-contributions.tsv', sep='\t', index=False, float_format='%.5g') len(metapath_df) # ## Source edge contributions # In[8]: source_df = (path_df .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—', 1)[1])) .groupby('source_edge') .apply(summarize).reset_index() .sort_values('contribution', ascending=False) ) source_df.head() # In[9]: source_df.to_csv('data/source-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g') len(source_df) # In[10]: # Source metaedge contributions (path_df .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—')[1])) .groupby('source_edge') .apply(summarize).reset_index() .sort_values('contribution', ascending=False) ) # ## Target edge contributions # In[11]: target_df = (path_df .groupby('target_edge') .apply(summarize).reset_index() .sort_values('contribution', ascending=False) ) target_df.head() # In[12]: target_df.to_csv('data/target-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g') len(target_df) # In[13]: # Source metaedge contributions (path_df .assign(target_edge = path_df.target_edge.map(lambda x: 'epilepsy syndrome—' + x.split('—')[-2])) .groupby('target_edge') .apply(summarize).reset_index() .sort_values('contribution', ascending=False) ) # ## Anatomy (intermediate node) contributions # In[14]: anatomy_df = path_df.query("metapath == 'CbGeAlD'").copy() anatomy_df['anatomy'] = anatomy_df.nodes.map(lambda x: x.split('—')[2]) anatomy_df = (anatomy_df .groupby('anatomy') .apply(summarize).reset_index() .sort_values('contribution', ascending=False) ) anatomy_df.head(2) # In[15]: anatomy_df.to_csv('data/anatomy-node-contributions.tsv', sep='\t', index=False, float_format='%.5g') len(anatomy_df) # ## Classes of predicted compounds # In[16]: def get_counts(df): s = pandas.Series() s['count'] = len(df) s['compounds'] = ', '.join(sorted(df.compound_name)) return s # ### Third-level ATC Codes # In[17]: # Anatomical Therapeutic Chemical Classification System # Read ATC Code to name mapping http://biology.stackexchange.com/a/55023/28907 url = 'https://github.com/OHDSI/Vocabulary-v4.5/raw/661804cf3c17add61b02e2e83e477f48acb011d5/21-ATC/atc_code.txt' atc_df = (pandas.read_table(url) .rename(columns={'Code': 'atc_code', 'Description': 'atc_name'}) ) df = tidy_split(top_compounds_df, 'atc_codes') print('{} compounds have at least 1 ATC code'.format(df.compound_name.nunique())) # The third level of the code indicates the therapeutic/pharmacological subgroup and consists of one letter. df['atc_code'] = df.pop('atc_codes').str.slice(0, 4) df = df.drop_duplicates() df = df.merge(atc_df, how='left') df.atc_name = df.atc_name.str.lower() df = df.groupby(['atc_code', 'atc_name']).apply(get_counts).reset_index().sort_values('count', ascending=False) df.to_csv('data/compounds-third-level-atc-codes.tsv', sep='\t', index=False, float_format='%.5g') df.head(2) # ### DrugBank Categories # In[18]: df = tidy_split(top_compounds_df, 'categories') print('{} compounds have at least 1 category'.format(df.compound_name.nunique())) df = df.rename(columns={'categories': 'category'}) df = df.groupby(['category']).apply(get_counts).reset_index().sort_values('count', ascending=False) df.to_csv('data/compounds-categories.tsv', sep='\t', index=False, float_format='%.5g') df.head(2) # ### DurgCentral Pharmacologic Classes # In[19]: class_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/classes.tsv' class_rel_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/drug-to-class.tsv' class_df = (pandas.read_table(class_url) .merge(pandas.read_table(class_rel_url)) .rename(columns={'drugbank_id': 'compound_id'}) [['compound_id', 'class_id', 'class_name', 'class_source', 'class_type']] ) class_df.head(2) # In[20]: df = top_compounds_df.merge(class_df) print('{} compounds have at least 1 pharmacologic class'.format(df.compound_name.nunique())) df = df.groupby(['class_id', 'class_name', 'class_source', 'class_type']).apply(get_counts).reset_index().sort_values('count', ascending=False) df.to_csv('data/compounds-pharmacologic-classes.tsv', sep='\t', index=False, float_format='%.5g') df.head(2) # ## Contribution by gene groups # In[21]: # Read Entrez Gene url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv' gene_df = pandas.read_table(url) symbol_to_name = dict(zip(gene_df.Symbol, gene_df.description)) # In[22]: # Split on comma not in parenthesis. http://stackoverflow.com/a/26634150/4651668 gene_split = re.compile(r',\s*(?![^()]*\))') def summarize(df): s = pandas.Series() s['paths'] = sum(df.paths) s['contribution'] = sum(df.contribution) s['gene_symbols'] = ', '.join(df.gene_symbol) return s def contributions_by_gene(df, edge_column): df['gene_symbol'] = df[edge_column].map(lambda x: x.split('—')[-1]) df['gene_name'] = df['gene_symbol'].map(symbol_to_name) df['gene_main_name'] = df.gene_name.map(lambda x: gene_split.split(x, 1)[0]) return df.groupby('gene_main_name').apply(summarize).reset_index().sort_values('contribution', ascending=False) # In[23]: source_bind_df = source_df[source_df.source_edge.str.startswith('Compound—binds—')].copy() source_bind_df = contributions_by_gene(source_bind_df, 'source_edge') source_bind_df.to_csv('data/source-edge-binds-contributions.tsv', sep='\t', index=False, float_format='%.5g') source_bind_df.head(2) # In[24]: target_bind_df = target_df[target_df.target_edge.str.contains('—associates—')].copy() target_bind_df = contributions_by_gene(target_bind_df, 'target_edge') target_bind_df.to_csv('data/target-edge-associates-contributions.tsv', sep='\t', index=False, float_format='%.5g') target_bind_df.head(2) # ## Contribution by Side Effect (source edges) # In[25]: side_effect_df = source_df[source_df.source_edge.str.contains('Compound—causes—')] side_effect_df.head()