#!/usr/bin/env python
# coding: utf-8

# Since our paper on [SpatialDE](https://github.com/teichlab/spatialde) was published today in [Nature Methods](https://www.nature.com/articles/nmeth.4636), I wanted to demonstrate how to use our method on a dataset which came out while the paper was under review.
# 
# I spent a few hours this afternoon (March 19), doing this analysis. The data is from [Asp et al](https://www.nature.com/articles/s41598-017-13462-5) published in Scientific Reports in October last year. The authors use the [Spatial Transcriptomics](www.spatialtranscriptomicsresearch.org) method to investigate spatial gene expression in cardiac biopsies.

# In[1]:


get_ipython().run_line_magic('pylab', 'inline')
import pandas as pd


# According to [Table 1 of Asp et al ](https://www.nature.com/articles/s41598-017-13462-5/tables/1), individual 3 right atrial appandage is covering 543 "spots", being one of the larger datasets. This data is available on the [ST website](http://www.spatialtranscriptomicsresearch.org/datasets/doi10-1038s41598-017-13462-5/), I downloaded both tables for `Individual3_RAA`.

# In[22]:


long_table = pd.read_csv('Individual3_RAA_R1.txt', sep='\t')


# In[32]:


long_table.head()


# This table is in "long" format, where each row is a combination of gene, x, y, and gene count. This is generally a pretty handy format, but for our analysis, it will be simpler to work on a wide expression matrix.

# In[34]:


counts = long_table \
    .pivot(index='barcode_id', columns='# gene_name', values='reads_count') \
    .fillna(0) \
    .astype(int)


# In[35]:


counts.iloc[:10, :5]


# In[36]:


counts.shape


# To keep track of spatial location of each sample, we create a sample information table.

# In[39]:


sample_info = long_table.groupby('barcode_id')['x', 'y'].first()


# In[46]:


# Just to make sure ecerything is sorted the same
sample_info = sample_info.sort_index()
counts = counts.sort_index()


# In[183]:


figsize(5, 5)
plt.scatter(sample_info.x, sample_info.y, c='k', s=50)
plt.gca().invert_yaxis();
plt.axis('equal');
plt.xlabel('Spatial x-axis')
plt.ylabel('Spatial y-axis');


# In[53]:


sample_info['total_count'] = counts.sum(1)


# In[63]:


# Filter practically unobserved genes, and remove practically empty "spots"
counts = counts.T[counts.sum(0) >= 3].T
sample_info = sample_info.query('total_count > 5').copy()
counts = counts.loc[sample_info.index]


# In[89]:


counts.shape


# After filtering, there are 530 spatial "spots", where 12,150 genes have been measured.

# In[184]:


figsize(5, 5)
plt.scatter(sample_info.x, sample_info.y, c=np.log10(sample_info.total_count), s=50)
plt.gca().invert_yaxis();
plt.axis('equal');
plt.xlabel('Spatial x-axis')
plt.ylabel('Spatial y-axis');


# Before running `SpatialDE` we need to do a couple of things: first variance stabilize the counts, then control for the variation in count depth for the "spots".

# In[72]:


import NaiveDE

# Apply Anscombe's transform for NB data
dfm = NaiveDE.stabilize(counts.T).T

# Extract residuals from regression
resids = NaiveDE.regress_out(sample_info, dfm.T, 'np.log(total_count)').T

# Put spatial coordinates in their own table
X = sample_info[['x', 'y']]


# Now we run SpatialDE, this takes about 3 minutes on my iMac

# In[73]:


get_ipython().run_cell_magic('time', '', 'import SpatialDE\n\nresults = SpatialDE.run(X, resids)\n')


# In[76]:


results.head(3).T


# In[88]:


results.shape


# In[77]:


import SpatialDE.plot


# In[80]:


# For numerical reasons, sometimes p-values are set to 0 of they are too small.
# Bring these back to reasonable numbers.
results['pval'] = results['pval'].clip_lower(results.query('pval > 0')['pval'].min() / 2)
results['qval'] = results['qval'].clip_lower(results.query('qval > 0')['qval'].min() / 2)


# In[81]:


SpatialDE.plot.FSV_sig(results)


# In[85]:


sig_res = results.query('qval < 0.05')


# In[86]:


sig_res.shape


# This means 73 genes are significantly spatially variable. We would like to investigate these, so we need to translate the ENSEMBL IDs in the table to gene names that are easier for people to parse. For this, I went to [http://uswest.ensembl.org/biomart](http://uswest.ensembl.org/biomart) and downloaded a table of gene ids and gene names for the human gene annotation. I don't know if this is the annotation Asp et al used, but in most cases the IDs haven't changed in a long time.

# In[92]:


gene_info = pd.read_csv('gene_ids.tsv', index_col=0, sep='\t')
gene_info.head()


# In[95]:


results['gene_name'] = results.g.map(gene_info['Gene name'])


# In[96]:


sig_res = results.query('qval < 0.05')


# What is the general distribution of length scales of significant genes?

# In[99]:


sig_res.l.value_counts()


# For the ST technology used in Asp et al, the length unit for spatial coordinates is 200 µm, so in real-world coordinates, a lengthscale of 0.5 corresponds to 100 µm, while 4.17 corresponds to 834 µm. These are the roughyl scales at which expression levels change in a meaningful way. Now let's see which genes are represented at these scales. We can probably ignore the results for the smallest lengthscale, since it is so much smaller than smallest observed distance, it is likely to indicate a couple of neighboring spots expressing the gene.

# In[129]:


grp = sig_res.groupby('l')
for l, group in grp:
    top_string = ', '.join(group.sort_values('pval').head(10)['gene_name'].values)
    if group.shape[0] > 10:
        top_string += ', ...'

    print(f'l = {l}:')
    print(top_string)
    print()


# At this point it is usually fun to google all these genes, and plot what they look like spatially. I leave that as an exercise to the reader, but plot the top gene at each lengthscale so you can get a feeling for how the different lengthscales look.

# In[179]:


figsize(10, 6)
top_list = ['PABPC3', 'ALMS1', 'HBA2', 'NPPA', 'DCN', 'MT-ND2', 'MYL7']
for i, g in enumerate(top_list):
    plt.subplot(2, 4, i + 1)
    g_id = results.query('gene_name == @g').g.iloc[0]
    l = results.query('gene_name == @g').l.iloc[0]
    plt.scatter(sample_info.x, sample_info.y, c=resids[g_id], s=25)
    plt.axis('equal');
    plt.gca().invert_yaxis();
    plt.title(f'l = {l:.2}: {g}')
    
plt.tight_layout()


# Now let's investigate what histological patterns in the heart tissue is defined by these spatially variable genes.
# 
# I'm excluding the genes with very small lengthscales, because they seem independent from eachother. I'm picking 8 patterns, and a lengthscale of 2.0 which should be compatible with the l=1.4 genes at the same time as using the spatial prior for genes with larger l.

# In[238]:


get_ipython().run_cell_magic('time', '', "histology_results, patterns = SpatialDE.spatial_patterns(X, res, sig_res.query('l > 0.8'), 8, 2.0, verbosity=1)\n")


# In[239]:


histology_results.pattern.value_counts()


# In[240]:


# Again put in interpretable gene names
histology_results['gene_name'] = histology_results.g.map(gene_info['Gene name'])


# In[241]:


for i in histology_results.sort_values('pattern').pattern.unique():
    
    print('Pattern {}'.format(i))
    print(', '.join(histology_results.query('pattern == @i').sort_values('membership')['gene_name'].tolist()))
    print()


# These groups of genes can then be directly related to underlying spatial patterns.

# In[245]:


figsize(13, 5.5)
for i, Ci in enumerate(histology_results.sort_values('pattern').pattern.unique()):
    C = patterns[Ci]
    plt.subplot(2, 4, i + 1)
    plt.scatter(sample_info.x, sample_info.y, c=C, s=25)
    plt.axis('equal');
    plt.gca().invert_yaxis();
    plt.title(f'Pattern {i}')
    
plt.tight_layout()


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]: