#!/usr/bin/env python
# coding: utf-8

# # Bgee
# 
# Extracting data from [Bgee](http://bgee.unil.ch/). See [this Thinklab discussion](http://thinklab.com/discussion/tissue-specific-gene-expression-resources/81#278) for more information.

# In[1]:


import collections
import gzip

import pandas
import seaborn
import numpy
import matplotlib.pyplot as plt
import IPython

get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:


def get_groupby_counts(df, columns):
    """Group datagrame by columns and return the number of rows for each grouping."""
    grouped = df.groupby(columns)
    get_len = lambda df: pandas.Series({'count': len(df)})
    df = grouped.apply(get_len)
    df = df.sort('count', ascending=False)
    df = df.reset_index()
    return df


# ## Load entrez gene for ensembl conversion

# In[44]:


# Read Entrez Genes
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/genes-human.tsv'
entrez_df = pandas.read_table(url)
coding_genes = set(entrez_df.GeneID[entrez_df.type_of_gene == 'protein-coding'])

# Merge with entrez gene identifiers
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/xrefs-human.tsv'
entrez_map_df = pandas.read_table(url)    
entrez_map_df = entrez_map_df[entrez_map_df.resource == 'Ensembl']

ensembl_to_entrez = dict(zip(entrez_map_df.identifier, entrez_map_df.GeneID))


# ## Read and process presence of expression data

# In[30]:


# Read expression
presence_df = pandas.read_table('download/Homo_sapiens_expr-complete.tsv.gz', compression='gzip')
presence_df.head()


# In[31]:


# Apply filters for gene presence
present_df = presence_df[
    presence_df['Call quality'].isin({'high quality', 'low quality'}) &
    presence_df['Expression'].isin({'present', 'low ambiguity'})
]


# In[32]:


# Find genes present per developmental stage
stage_df = get_groupby_counts(present_df, ['Developmental stage ID', 'Developmental stage name'])
stage_df.head(12)


# In[34]:


# Find genes present per anatomical entity
anatomy_df = get_groupby_counts(present_df, ['Anatomical entity ID', 'Anatomical entity name'])
anatomy_df.head(10)


# ### Figure 1: Number of genes present by developmental stage and anatomical entity

# In[66]:


# Number of present genes per development stage -- anatomical entity pair
pairwise_df = get_groupby_counts(present_df, ['Developmental stage name', 'Anatomical entity name'])
rect_df = pairwise_df.pivot('Developmental stage name', 'Anatomical entity name', 'count').fillna(0)
rect_df = rect_df.loc[stage_df['Developmental stage name'][:30], anatomy_df['Anatomical entity name'][:50]]
IPython.core.pylabtools.figsize(14, 7)
seaborn.heatmap(rect_df);


# In[63]:


# filter to human adult stage (human)
adult_df = present_df[present_df['Developmental stage ID'] == 'HsapDv:0000087']

# pivot dataframe so each column is an anatomical entity
adult_df = adult_df.pivot('Gene ID', 'Anatomical entity ID', 'Expression')
adult_df = (adult_df == 'present').astype(int)

# convert to entrez GeneIDs
adult_df = adult_df.groupby(ensembl_to_entrez).max()
adult_df = adult_df.reset_index()
adult_df = adult_df.rename(columns=({'index': 'GeneID'}))
adult_df.GeneID = adult_df.GeneID.astype(int)

# save dataframe
with gzip.open('data/present-in-adult.tsv.gz', 'wt') as write_file:
    adult_df.to_csv(write_file, sep='\t', index=False)

# see tail
adult_df.tail()


# In[64]:


# Coding genes in entrez dataset
len(set(adult_df.GeneID) & coding_genes)


# In[83]:


# Frequency of gene presence 
adult_df.drop('GeneID', 1).as_matrix().mean()


# ## Read and process differential expression data

# In[18]:


# Read simple differential expression by anatomy
diffex_df = pandas.read_table('download/Homo_sapiens_diffexpr-anatomy-simple.tsv.gz', compression='gzip')
diffex_df.head()


# In[20]:


# filter differential expression
diffex_df = diffex_df[
    diffex_df['Differential expression'].isin({'over-expression', 'under-expression'}) &
    diffex_df['Call quality'].isin({'low quality', 'high quality'})
]


# In[21]:


# Calculate counts per anatomy--DE pair
de_anatomy_df = get_groupby_counts(diffex_df, ['Anatomical entity ID', 'Anatomical entity name', 'Differential expression'])
de_anatomy_df.head()


# ### Figure 2: Number of differntially expressed genes present by anatomical entity

# In[29]:


# Plot DE counts
rect_df = de_anatomy_df.pivot('Anatomical entity name', 'Differential expression', 'count').fillna(0)
rect_df['differential expression'] = rect_df['under-expression'] + rect_df['over-expression']
rect_df = rect_df.sort('differential expression', ascending=False)
IPython.core.pylabtools.figsize(1, 16)
cmap = seaborn.cubehelix_palette(15, start=2.2, rot=1, gamma=1.6, hue=1, light=0.98, dark=0, as_cmap=True)
seaborn.heatmap(rect_df.drop('differential expression', axis=1), cmap=cmap);


# In[73]:


diffex_pivot_df = diffex_df.pivot('Gene ID', 'Anatomical entity ID', 'Differential expression')
diffex_pivot_df = diffex_pivot_df.replace({'under-expression': -1, 'over-expression': 1, numpy.NaN: 0})


# In[74]:


def round_diffex(value):
    if value <= -0.5:
        return -1
    if value >= 0.5:
        return 1
    return 0

diffex_pivot_df = diffex_pivot_df.groupby(ensembl_to_entrez).mean().applymap(round_diffex)


# In[75]:


diffex_pivot_df = diffex_pivot_df.reset_index()
diffex_pivot_df = diffex_pivot_df.rename(columns=({'index': 'GeneID'}))
diffex_pivot_df.GeneID = diffex_pivot_df.GeneID.astype(int)

# save dataframe
with gzip.open('data/diffex.tsv.gz', 'wt') as write_file:
    diffex_pivot_df.to_csv(write_file, sep='\t', index=False)

# see tail
diffex_pivot_df.tail()


# In[76]:


# Coding genes in entrez dataset
len(set(diffex_pivot_df.GeneID) & coding_genes)