#!/usr/bin/env python
# coding: utf-8

# # Collapsed source/target edge contributions to epilepsy predictions

# In[1]:


import re
import itertools
import collections

import pandas

from utilities import tidy_split


# In[2]:


# Read Project Rephetio DrugBank Info
url = 'https://github.com/dhimmel/drugbank/raw/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank-slim.tsv'
drugbank_df = (pandas.read_table(url)
    .rename(columns={'drugbank_id': 'compound_id', 'name': 'compound_name'})
    [['compound_id', 'compound_name', 'atc_codes', 'categories']]
)


# In[3]:


# Read top epilepsy predictions
top_compounds_df = (pandas.read_table('./data/windows.tsv')
    .rename(columns={'name': 'compound_name'})
    [['compound_name', 'disease_pctl', 'phcodb']]
    .merge(drugbank_df)
)
top_compounds_df.head(2)


# In[4]:


path_dfs = list()
for compound_id in top_compounds_df.compound_id:
    path = '../../het.io-rep-data/prediction-info/{}/DOID_1826/paths.tsv'.format(compound_id)
    path_dfs.append(pandas.read_table(path))
path_df = pandas.concat(path_dfs)
path_df.head(2)


# In[5]:


def summarize(df):
    s = pandas.Series()
    s['paths'] = len(df)
    s['contribution'] = sum(df.percent_of_prediction)
    return s


# ## Metapath contributions

# In[6]:


metapath_df = (path_df
    .groupby('metapath')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
metapath_df.head(2)


# In[7]:


metapath_df.to_csv('data/metapath-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(metapath_df)


# ## Source edge contributions

# In[8]:


source_df = (path_df
    .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—', 1)[1]))
    .groupby('source_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
source_df.head()


# In[9]:


source_df.to_csv('data/source-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(source_df)


# In[10]:


# Source metaedge contributions
(path_df
    .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—')[1]))
    .groupby('source_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)


# ## Target edge contributions

# In[11]:


target_df = (path_df
    .groupby('target_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
target_df.head()


# In[12]:


target_df.to_csv('data/target-edge-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(target_df)


# In[13]:


# Source metaedge contributions
(path_df
    .assign(target_edge = path_df.target_edge.map(lambda x: 'epilepsy syndrome—' + x.split('—')[-2]))
    .groupby('target_edge')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)


# ## Anatomy (intermediate node) contributions

# In[14]:


anatomy_df = path_df.query("metapath == 'CbGeAlD'").copy()
anatomy_df['anatomy'] = anatomy_df.nodes.map(lambda x: x.split('—')[2])
anatomy_df = (anatomy_df
    .groupby('anatomy')
    .apply(summarize).reset_index()
    .sort_values('contribution', ascending=False)
)
anatomy_df.head(2)


# In[15]:


anatomy_df.to_csv('data/anatomy-node-contributions.tsv', sep='\t', index=False, float_format='%.5g')
len(anatomy_df)


# ## Classes of predicted compounds

# In[16]:


def get_counts(df):
    s = pandas.Series()
    s['count'] = len(df)
    s['compounds'] = ', '.join(sorted(df.compound_name))
    return s


# ### Third-level ATC Codes

# In[17]:


# Anatomical Therapeutic Chemical Classification System

# Read ATC Code to name mapping http://biology.stackexchange.com/a/55023/28907
url = 'https://github.com/OHDSI/Vocabulary-v4.5/raw/661804cf3c17add61b02e2e83e477f48acb011d5/21-ATC/atc_code.txt'
atc_df = (pandas.read_table(url)
    .rename(columns={'Code': 'atc_code', 'Description': 'atc_name'})
)
df = tidy_split(top_compounds_df, 'atc_codes')
print('{} compounds have at least 1 ATC code'.format(df.compound_name.nunique()))

# The third level of the code indicates the therapeutic/pharmacological subgroup and consists of one letter.
df['atc_code'] = df.pop('atc_codes').str.slice(0, 4)
df = df.drop_duplicates()
df = df.merge(atc_df, how='left')
df.atc_name = df.atc_name.str.lower()
df = df.groupby(['atc_code', 'atc_name']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-third-level-atc-codes.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)


# ### DrugBank Categories

# In[18]:


df = tidy_split(top_compounds_df, 'categories')
print('{} compounds have at least 1 category'.format(df.compound_name.nunique()))
df = df.rename(columns={'categories': 'category'})
df = df.groupby(['category']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-categories.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)


# ### DurgCentral Pharmacologic Classes

# In[19]:


class_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/classes.tsv'
class_rel_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/drug-to-class.tsv'
class_df = (pandas.read_table(class_url)
    .merge(pandas.read_table(class_rel_url))
    .rename(columns={'drugbank_id': 'compound_id'})
    [['compound_id', 'class_id', 'class_name', 'class_source', 'class_type']]
)

class_df.head(2)


# In[20]:


df = top_compounds_df.merge(class_df)
print('{} compounds have at least 1 pharmacologic class'.format(df.compound_name.nunique()))
df = df.groupby(['class_id', 'class_name', 'class_source', 'class_type']).apply(get_counts).reset_index().sort_values('count', ascending=False)
df.to_csv('data/compounds-pharmacologic-classes.tsv', sep='\t', index=False, float_format='%.5g')
df.head(2)


# ## Contribution by gene groups

# In[21]:


# Read Entrez Gene
url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'
gene_df = pandas.read_table(url)
symbol_to_name = dict(zip(gene_df.Symbol, gene_df.description))


# In[22]:


# Split on comma not in parenthesis. http://stackoverflow.com/a/26634150/4651668
gene_split = re.compile(r',\s*(?![^()]*\))')

def summarize(df):
    s = pandas.Series()
    s['paths'] = sum(df.paths)
    s['contribution'] = sum(df.contribution)
    s['gene_symbols'] = ', '.join(df.gene_symbol)
    return s

def contributions_by_gene(df, edge_column):
    df['gene_symbol'] = df[edge_column].map(lambda x: x.split('—')[-1])
    df['gene_name'] = df['gene_symbol'].map(symbol_to_name)
    df['gene_main_name'] = df.gene_name.map(lambda x: gene_split.split(x, 1)[0])
    return df.groupby('gene_main_name').apply(summarize).reset_index().sort_values('contribution', ascending=False)


# In[23]:


source_bind_df = source_df[source_df.source_edge.str.startswith('Compound—binds—')].copy()
source_bind_df = contributions_by_gene(source_bind_df, 'source_edge')
source_bind_df.to_csv('data/source-edge-binds-contributions.tsv', sep='\t', index=False, float_format='%.5g')
source_bind_df.head(2)


# In[24]:


target_bind_df = target_df[target_df.target_edge.str.contains('—associates—')].copy()
target_bind_df = contributions_by_gene(target_bind_df, 'target_edge')
target_bind_df.to_csv('data/target-edge-associates-contributions.tsv', sep='\t', index=False, float_format='%.5g')
target_bind_df.head(2)


# ## Contribution by Side Effect (source edges)

# In[25]:


side_effect_df = source_df[source_df.source_edge.str.contains('Compound—causes—')]
side_effect_df.head()