#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd import numpy as np import pylab as plt # The Shannon-Weiner index was developed from information theory and is based on measuring uncertainty. The degree of uncertainty of predicting the species of a random sample is related to the diversity of a community. If a community is dominated by one species (low diversity), the uncertainty of prediction is low; a randomly-sampled species is most likely going to be the dominant species. However, if diversity is high, uncertainty is high. # # $$H' = \sum_i p_i \log(p_i)$$ # # This function calculates Shannon diversity: # In[2]: # Shannon index shannon = lambda x: -np.sum([p*np.log(p) for p in x/float(sum(x)) if p]) # The Simpson index (1949) measures the degree of concentration when individuals are classified into types. It is based on the expected outcome of two random samples from the same community: # # $$D = \frac{N(N-1)}{\sum_i n_i (n_i-1)}$$ # # This function calculates Simpson's index of diversity: # In[3]: simpsons = lambda x: sum(x)*(sum(x)-1)/np.sum([float(n-1)*n for n in x if n]) # ## Genus-level diversity # In[4]: genus_raw = pd.read_excel('OTU.xlsx', 'L5', index_col=0).unstack().reset_index() genus_raw.head() # In[5]: taxa = genus_raw.Taxon.unique() taxa_dict = dict(zip(taxa, range(len(taxa)))) # In[6]: genus_raw.Taxon.replace(taxa_dict, inplace=True) # In[7]: genus_raw['nec'] = genus_raw.level_0.apply(lambda x: x.startswith('N')) genus_raw['stool'] = genus_raw.level_0.apply(lambda x: x.endswith('s')) genus_raw['patient'] = genus_raw.level_0.apply(lambda x: int(''.join([s for s in x if s.isdigit()]))) # In[8]: genus_data = genus_raw.drop('level_0', axis=1) genus_data.columns = 'taxon', 'count', 'nec', 'stool', 'patient' # In[9]: genus_data.tail() # ## Comparison of tissue diversity in NEC vs control # In[17]: tissue_data = genus_data[genus_data.stool==False] simpsons_tissue = tissue_data.groupby(['patient', 'nec'])['count'].aggregate(simpsons) simpsons_tissue.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False) plt.ylabel('Simpson\'s index'); # In[13]: shannon_tissue = tissue_data.groupby(['patient', 'nec'])['count'].aggregate(shannon) shannon_tissue.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False) plt.ylabel('Shannon index'); # In[15]: # Chao 1 index chao = lambda x: len(x) + ((x==1).sum())**2/(2 * (x==2).sum()) chao_tissue = tissue_data.groupby(['patient', 'nec'])['count'].aggregate(chao) chao_tissue.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False) plt.ylabel('Chao index'); # ## Comparison of stool diversity in NEC vs control # In[18]: stool_data = genus_data[genus_data.stool==True] simpsons_stool = stool_data.groupby(['patient', 'nec'])['count'].aggregate(simpsons) simpsons_stool.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False) plt.ylabel('Simpson\'s index'); # In[21]: shannon_stool = stool_data.groupby(['patient', 'nec'])['count'].aggregate(shannon) shannon_stool.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False) plt.ylabel('Shannon index'); # In[22]: chao_stool = stool_data.groupby(['patient', 'nec'])['count'].aggregate(chao) chao_stool.unstack().rename(columns={False: 'Control', True: 'NEC'}).boxplot(grid=False) plt.ylabel('Chao index'); # ## Line plots of tissue vs stool diversity in NEC samples # In[60]: simpsons_nec = genus_data[genus_data.nec==True].groupby(['patient','stool'])['count'].aggregate(simpsons) simpsons_nec.unstack().rename(columns={False: 'Tissue', True: 'Stool'}).dropna() # In[61]: len(simpsons_nec) # In[66]: def plot_bipartite(data, label): axes = data.unstack().rename(columns={False: 'Tissue', True: 'Stool'}).dropna().T.plot(grid=False, color='k', style='.-', legend=False) axes.set_xlim(-0.1, 1.1) axes.set_xticks([0,1]) axes.set_xticklabels(['Tissue', 'Stool']) axes.set_ylabel(label) axes.set_xlabel('') # In[67]: plot_bipartite(simpsons_nec, 'Simpson\'s index') # In[68]: shannon_nec = genus_data[genus_data.nec==True].groupby(['patient','stool'])['count'].aggregate(shannon) shannon_nec.unstack().rename(columns={False: 'Tissue', True: 'Stool'}).dropna() # In[69]: plot_bipartite(shannon_nec, 'Shannon index') # In[70]: chao_nec = genus_data[genus_data.nec==True].groupby(['patient','stool'])['count'].aggregate(chao) chao_nec.unstack().rename(columns={False: 'Tissue', True: 'Stool'}).dropna() # Chao was very unstable, so I am not attempting to generate a graph.