#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import pylab as plt


# The Shannon-Weiner index was developed from information theory and is based on measuring uncertainty. The degree of uncertainty of predicting the species of a random sample is related to the diversity of a community. If a community is dominated by one species (low diversity), the uncertainty of prediction is low; a randomly-sampled species is most likely going to be the dominant species. However, if diversity is high, uncertainty is high.
# 
# $$H' = \sum_i p_i \log(p_i)$$
# 
# This function calculates Shannon diversity:

# In[2]:


# Shannon index
shannon = lambda x: -np.sum([p*np.log(p) for p in x/float(sum(x)) if p])


# The Simpson index (1949) measures the degree of concentration when individuals are classified into types. It is based on the expected outcome of two random samples from the same community:
# 
# $$D = \frac{N(N-1)}{\sum_i n_i (n_i-1)}$$
# 
# This function calculates Simpson's index of diversity:

# In[3]:


simpsons = lambda x: sum(x)*(sum(x)-1)/np.sum([float(n-1)*n for n in x if n])


# ## Genus-level diversity

# In[4]:


genus_raw = pd.read_excel('OTU.xlsx', 'L5', index_col=0).unstack().reset_index()
genus_raw.head()


# In[5]:


taxa = genus_raw.Taxon.unique()
taxa_dict = dict(zip(taxa, range(len(taxa))))


# In[6]:


genus_raw.Taxon.replace(taxa_dict, inplace=True)


# In[7]:


genus_raw['nec'] = genus_raw.level_0.apply(lambda x: x.startswith('N'))
genus_raw['stool'] = genus_raw.level_0.apply(lambda x: x.endswith('s'))
genus_raw['patient'] = genus_raw.level_0.apply(lambda x: int(''.join([s for s in x if s.isdigit()])))


# In[8]:


genus_data = genus_raw.drop('level_0', axis=1)
genus_data.columns = 'taxon', 'count', 'nec', 'stool', 'patient'


# In[9]:


genus_data.tail()


# ## Comparison of tissue diversity in NEC vs control

# In[17]:


tissue_data = genus_data[genus_data.stool==False]
simpsons_tissue = tissue_data.groupby(['patient', 'nec'])['count'].aggregate(simpsons)
simpsons_tissue.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False)
plt.ylabel('Simpson\'s index');


# In[13]:


shannon_tissue = tissue_data.groupby(['patient', 'nec'])['count'].aggregate(shannon)
shannon_tissue.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False)
plt.ylabel('Shannon index');


# In[15]:


# Chao 1 index
chao = lambda x: len(x) + ((x==1).sum())**2/(2 * (x==2).sum())

chao_tissue = tissue_data.groupby(['patient', 'nec'])['count'].aggregate(chao)
chao_tissue.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False)
plt.ylabel('Chao index');


# ## Comparison of stool diversity in NEC vs control

# In[18]:


stool_data = genus_data[genus_data.stool==True]

simpsons_stool = stool_data.groupby(['patient', 'nec'])['count'].aggregate(simpsons)
simpsons_stool.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False)
plt.ylabel('Simpson\'s index');


# In[21]:


shannon_stool = stool_data.groupby(['patient', 'nec'])['count'].aggregate(shannon)
shannon_stool.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False)
plt.ylabel('Shannon index');


# In[22]:


chao_stool = stool_data.groupby(['patient', 'nec'])['count'].aggregate(chao)
chao_stool.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False)
plt.ylabel('Chao index');


# ## Line plots of tissue vs stool diversity in NEC samples

# In[60]:


simpsons_nec = genus_data[genus_data.nec==True].groupby(['patient','stool'])['count'].aggregate(simpsons)
simpsons_nec.unstack().rename(columns={False: 'Tissue', True: 'Stool'}).dropna()


# In[61]:


len(simpsons_nec)


# In[66]:


def plot_bipartite(data, label):
    axes = data.unstack().rename(columns={False: 'Tissue', True: 'Stool'}).dropna().T.plot(grid=False, 
                                                                                           color='k', 
                                                                                           style='.-',
                                                                                           legend=False)
    axes.set_xlim(-0.1, 1.1)
    axes.set_xticks([0,1])
    axes.set_xticklabels(['Tissue', 'Stool'])
    axes.set_ylabel(label)
    axes.set_xlabel('')


# In[67]:


plot_bipartite(simpsons_nec, 'Simpson\'s index')


# In[68]:


shannon_nec = genus_data[genus_data.nec==True].groupby(['patient','stool'])['count'].aggregate(shannon)
shannon_nec.unstack().rename(columns={False: 'Tissue', True: 'Stool'}).dropna()


# In[69]:


plot_bipartite(shannon_nec, 'Shannon index')


# In[70]:


chao_nec = genus_data[genus_data.nec==True].groupby(['patient','stool'])['count'].aggregate(chao)
chao_nec.unstack().rename(columns={False: 'Tissue', True: 'Stool'}).dropna()


# Chao was very unstable, so I am not attempting to generate a graph.