#!/usr/bin/env python # coding: utf-8 # # Bgee # # Extracting data from [Bgee](http://bgee.unil.ch/). See [this Thinklab discussion](http://thinklab.com/discussion/tissue-specific-gene-expression-resources/81#278) for more information. # In[1]: import collections import gzip import pandas import seaborn import numpy import matplotlib.pyplot as plt import IPython get_ipython().run_line_magic('matplotlib', 'inline') # In[2]: def get_groupby_counts(df, columns): """Group datagrame by columns and return the number of rows for each grouping.""" grouped = df.groupby(columns) get_len = lambda df: pandas.Series({'count': len(df)}) df = grouped.apply(get_len) df = df.sort('count', ascending=False) df = df.reset_index() return df # ## Load entrez gene for ensembl conversion # In[44]: # Read Entrez Genes url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/genes-human.tsv' entrez_df = pandas.read_table(url) coding_genes = set(entrez_df.GeneID[entrez_df.type_of_gene == 'protein-coding']) # Merge with entrez gene identifiers url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/xrefs-human.tsv' entrez_map_df = pandas.read_table(url) entrez_map_df = entrez_map_df[entrez_map_df.resource == 'Ensembl'] ensembl_to_entrez = dict(zip(entrez_map_df.identifier, entrez_map_df.GeneID)) # ## Read and process presence of expression data # In[30]: # Read expression presence_df = pandas.read_table('download/Homo_sapiens_expr-complete.tsv.gz', compression='gzip') presence_df.head() # In[31]: # Apply filters for gene presence present_df = presence_df[ presence_df['Call quality'].isin({'high quality', 'low quality'}) & presence_df['Expression'].isin({'present', 'low ambiguity'}) ] # In[32]: # Find genes present per developmental stage stage_df = get_groupby_counts(present_df, ['Developmental stage ID', 'Developmental stage name']) stage_df.head(12) # In[34]: # Find genes present per anatomical entity anatomy_df = get_groupby_counts(present_df, ['Anatomical entity ID', 'Anatomical entity name']) anatomy_df.head(10) # ### Figure 1: Number of genes present by developmental stage and anatomical entity # In[66]: # Number of present genes per development stage -- anatomical entity pair pairwise_df = get_groupby_counts(present_df, ['Developmental stage name', 'Anatomical entity name']) rect_df = pairwise_df.pivot('Developmental stage name', 'Anatomical entity name', 'count').fillna(0) rect_df = rect_df.loc[stage_df['Developmental stage name'][:30], anatomy_df['Anatomical entity name'][:50]] IPython.core.pylabtools.figsize(14, 7) seaborn.heatmap(rect_df); # In[63]: # filter to human adult stage (human) adult_df = present_df[present_df['Developmental stage ID'] == 'HsapDv:0000087'] # pivot dataframe so each column is an anatomical entity adult_df = adult_df.pivot('Gene ID', 'Anatomical entity ID', 'Expression') adult_df = (adult_df == 'present').astype(int) # convert to entrez GeneIDs adult_df = adult_df.groupby(ensembl_to_entrez).max() adult_df = adult_df.reset_index() adult_df = adult_df.rename(columns=({'index': 'GeneID'})) adult_df.GeneID = adult_df.GeneID.astype(int) # save dataframe with gzip.open('data/present-in-adult.tsv.gz', 'wt') as write_file: adult_df.to_csv(write_file, sep='\t', index=False) # see tail adult_df.tail() # In[64]: # Coding genes in entrez dataset len(set(adult_df.GeneID) & coding_genes) # In[83]: # Frequency of gene presence adult_df.drop('GeneID', 1).as_matrix().mean() # ## Read and process differential expression data # In[18]: # Read simple differential expression by anatomy diffex_df = pandas.read_table('download/Homo_sapiens_diffexpr-anatomy-simple.tsv.gz', compression='gzip') diffex_df.head() # In[20]: # filter differential expression diffex_df = diffex_df[ diffex_df['Differential expression'].isin({'over-expression', 'under-expression'}) & diffex_df['Call quality'].isin({'low quality', 'high quality'}) ] # In[21]: # Calculate counts per anatomy--DE pair de_anatomy_df = get_groupby_counts(diffex_df, ['Anatomical entity ID', 'Anatomical entity name', 'Differential expression']) de_anatomy_df.head() # ### Figure 2: Number of differntially expressed genes present by anatomical entity # In[29]: # Plot DE counts rect_df = de_anatomy_df.pivot('Anatomical entity name', 'Differential expression', 'count').fillna(0) rect_df['differential expression'] = rect_df['under-expression'] + rect_df['over-expression'] rect_df = rect_df.sort('differential expression', ascending=False) IPython.core.pylabtools.figsize(1, 16) cmap = seaborn.cubehelix_palette(15, start=2.2, rot=1, gamma=1.6, hue=1, light=0.98, dark=0, as_cmap=True) seaborn.heatmap(rect_df.drop('differential expression', axis=1), cmap=cmap); # In[73]: diffex_pivot_df = diffex_df.pivot('Gene ID', 'Anatomical entity ID', 'Differential expression') diffex_pivot_df = diffex_pivot_df.replace({'under-expression': -1, 'over-expression': 1, numpy.NaN: 0}) # In[74]: def round_diffex(value): if value <= -0.5: return -1 if value >= 0.5: return 1 return 0 diffex_pivot_df = diffex_pivot_df.groupby(ensembl_to_entrez).mean().applymap(round_diffex) # In[75]: diffex_pivot_df = diffex_pivot_df.reset_index() diffex_pivot_df = diffex_pivot_df.rename(columns=({'index': 'GeneID'})) diffex_pivot_df.GeneID = diffex_pivot_df.GeneID.astype(int) # save dataframe with gzip.open('data/diffex.tsv.gz', 'wt') as write_file: diffex_pivot_df.to_csv(write_file, sep='\t', index=False) # see tail diffex_pivot_df.tail() # In[76]: # Coding genes in entrez dataset len(set(diffex_pivot_df.GeneID) & coding_genes)