Import Ornstein-Uhlenbeck (OU) process parameters inferred from Subject B's time-series data.

In [1]:

```
import random

import numpy as np
import pylab as pl

import ExampleHelper # contains methods to facilitate this example
```

In [2]:

```
# Load per-OTU OU parameters; the simulation cell below reads the columns
# as mu, lambda, sigma, dt (in that order), one row per OTU.
ou_param_fn = './sim.ou_params.txt'
ou_param_M = np.genfromtxt(ou_param_fn)
```

In [3]:

```
# Simulate one OU process per OTU; fixed seed makes the run reproducible.
np.random.seed(1)
otu_d = {}  # will store ou process for each simulated OTU
otu_c = ou_param_M.shape[0]  # number of OTUs = rows of the parameter matrix
for otu_i in range(otu_c):
    # OU parameters for a given process (columns: mu, lambda, sigma, dt)
    mu_n = ou_param_M[otu_i, 0]
    lambda_n = ou_param_M[otu_i, 1]
    sigma_n = ou_param_M[otu_i, 2]
    dt_n = ou_param_M[otu_i, 3]
    # start each process at its mean (x0=mu_n) so there is no burn-in transient
    ou_v, t_v = ExampleHelper.OUProcess(x0=mu_n, mu_n=mu_n, sigma_n=sigma_n,
                                        lambda_n=lambda_n, dt_n=dt_n)
    # join processes
    otu_d[otu_i] = ou_v
#endfor
```

In [4]:

```
# Assemble the simulated community into a (time x OTU) abundance matrix;
# OU trajectories are in log10 space, so convert to linear scale here.
time_c = len(otu_d[0])  # number of time points per process
abun_M = np.zeros((time_c, otu_c))
for i in otu_d:
    abun_M[:, i] = np.power(10, otu_d[i])
#endfor
```

Plot the simulated microbiota time-series

In [5]:

```
# Linear scale
pl.figure()
pl.plot(abun_M)  # one line per OTU column
pl.xlabel('Day')
pl.ylabel('Abundance')
pl.title('Simulated microbiota time-series')
```

Out[5]:

In [6]:

```
# Log-scale: plot each OTU's raw OU trajectory (already log10 abundance)
pl.figure()
for otu_i in otu_d.keys():
    pl.plot(otu_d[otu_i])
#endfor
pl.xlabel('Day')
pl.ylabel('log10(Abundance)')
pl.title('Simulated microbiota time-series')
```

Out[6]:

In [7]:

```
# Load empirical sequencing depths; one entry is sampled per time point below.
read_count_fn = './sim.read_counts.txt'
read_count_v = np.genfromtxt(read_count_fn) # vector of read counts drawn from our sequencing data
```

In [8]:

```
# obtain matrix of fractional abundances
# (divide each row of abun_M by its row sum so every time point sums to 1)
sum_v = np.sum(abun_M,1)  # total community abundance per time point
sum_M = np.tile(sum_v,(len(otu_d),1)).T  # row sums replicated to abun_M's shape
frac_M = abun_M/sum_M
```

In [9]:

```
# draw uniformly from cumulative probability distribution to randomly generate reads
seq_M = np.zeros(frac_M.shape)  # simulated read counts (time x OTU)
cumsum_M = np.cumsum(frac_M, 1)  # per-row CDF over OTUs (last column ~= 1)
# step through each time point
for time_i in range(time_c):
    # randomly pick a number of reads to use (sampled from the empirical read counts)
    random.shuffle(read_count_v)
    read_n = int(read_count_v[0])
    # draw as many random numbers as chosen number of reads
    rand_v = np.random.rand(read_n)
    # for each random number/read, pick the OTU whose CDF bin contains it.
    # searchsorted(side='right') returns the first index with cumsum > rand,
    # identical to np.min(np.nonzero(cumsum > rand)[0]) but vectorized.
    otu_i_v = np.searchsorted(cumsum_M[time_i, :], rand_v, side='right')
    # guard against float rounding leaving cumsum[-1] slightly below 1,
    # which would otherwise yield an out-of-range index
    otu_i_v = np.minimum(otu_i_v, seq_M.shape[1] - 1)
    seq_M[time_i, :] += np.bincount(otu_i_v, minlength=seq_M.shape[1])
    print("Simulating time point " + str(time_i))
#endfor
```

In [10]:

```
import scipy.spatial.distance as ssd
# Pairwise Jensen-Shannon divergence between the OTU profiles of every pair of time points.
seq_sum_v = np.sum(seq_M,1)  # total simulated reads per time point
seq_frac_M = seq_M/np.tile(seq_sum_v,(otu_c,1)).T  # read counts -> fractions
jsd_M = ssd.squareform(ssd.pdist(seq_frac_M,ExampleHelper.JSD))
dist_M = np.sqrt(jsd_M)  # NOTE(review): dist_M appears unused below (weights use jsd_M) — confirm intent
```

Set up the weighting matrix using JSD, with a simple scheme based on the squared similarity (1 − JSD)².

In [11]:

```
# Build a (time x time) weighting matrix from JSD similarity; more similar
# time points receive larger weights.
weight_M = np.zeros((time_c, time_c)) + np.nan
time_v = list(range(time_c))  # list so it can be shuffled in-place later
for time_i in time_v:
    # similarity weight: squared (1 - JSD) against every other time point
    weight_v = np.power(1 - jsd_M[time_i, :], 2)
    weight_v = weight_v / np.sum(weight_v)
    weight_M[time_i, :] = weight_v
    # a time point never informs itself.
    # NOTE(review): zeroing the diagonal after normalization leaves each row
    # summing to slightly less than 1 — confirm WeightedMedian handles this
    weight_M[time_i, time_i] = 0.
#endfor
```

In [12]:

```
# Select the subset of abundant OTUs whose median fractional abundance
# covers roughly 90% of reads.
median_v = np.median(seq_M,0)  # median read count per OTU across time
frac_v = median_v/sum(median_v)
sort_v = np.flipud(np.sort(frac_v))  # fractions sorted descending
cumsum_v = np.cumsum(sort_v)
thresh_i = np.nonzero(cumsum_v > 0.9)[0][0]  # first rank where cumulative share exceeds 0.9
# NOTE(review): strict '>' excludes OTUs tied exactly at the threshold value — confirm intended
otu_subset_v = np.nonzero(frac_v > sort_v[thresh_i])[0]
```

Start the normalization using fractional abundances converted to log-space

In [13]:

```
# Work in log10 fractional-abundance space for the abundant-OTU subset;
# keep an unmodified copy to recover the scaling factors afterwards.
obs_M = np.log10(seq_frac_M[:,otu_subset_v])
start_M = obs_M.copy()
```

Normalize each time point, in random order

In [14]:

```
# Normalize each time point against its weighted-median expectation,
# visiting time points in random order.
random.shuffle(time_v)
for t in time_v:
    # observed abundances
    obs_v = obs_M[t, :]
    # expected abundances
    exp_v = ExampleHelper.WeightedMedian(obs_M, weight_M, t)
    # solve y=mx in log-space, using median, so as to avoid effects of outliers
    m_t = np.median(obs_v - exp_v)
    # update time-series (shift every OTU at time t by the same offset)
    obs_M[t, :] = obs_M[t, :] - m_t
#endfor
```

Now, rescale simulated sequence data using inferred scaling factors.

In [15]:

```
scale_v = start_M[:,0]-obs_M[:,0] # scaling factors for each time point
# (the normalization shifts every column at a time point by the same m_t,
# so column 0 suffices to recover the per-time-point factor)
scale_M = np.tile(scale_v,(seq_frac_M.shape[1],1)).T  # replicate across all OTUs
scale_frac_M = np.log10(seq_frac_M) - scale_M  # remove per-time-point scaling in log-space
inf_abun_M = np.power(10,scale_frac_M) # inferred OTU abundances in linear space
```

In [16]:

```
# estimate total community abundances at each time point.
true_abun_v = np.sum(abun_M,1)
inf_abun_v = np.sum(inf_abun_M,1)
# shift inferred abundances so that they share the same mean as the true abundances.
# this will not affect RELATIVE abundances between each time point
inf_abun_v += (np.mean(true_abun_v)-np.mean(inf_abun_v))
```

In [17]:

```
# Overlay inferred vs. true total-abundance trajectories.
pl.plot(inf_abun_v,'-ob',label='Inferred')  # blue circles + line
pl.plot(true_abun_v,'-or',label='True')  # red circles + line
pl.xlabel('Day')
pl.ylabel('Total bacterial abundances')
pl.title('True and inferred bacterial abundances over time')
pl.grid()
pl.legend()
```

Out[17]:

Scatter plot of inferred and true total bacterial abundances on each day

In [18]:

```
# Scatter of inferred vs. true total abundance per day (square axes for 1:1 comparison)
pl.figure(figsize=(4,4))
pl.plot(inf_abun_v,true_abun_v,'o')
pl.grid()
pl.xlim([0.7,1.3])
pl.ylim([0.7,1.3])
pl.xlabel('Inferred')
pl.ylabel('True')  # was pl.ylabel(True): boolean literal, not the intended label string
pl.title('Scatter plot of True and Inferred total bacterial abundances')
```

Out[18]:

In [19]:

```
# Simulated (ground-truth) abundances, linear scale
pl.figure()
pl.plot(abun_M)
pl.xlabel('Day')
pl.ylabel('Abundance')
pl.title('Simulated microbiota time-series')
# Fractional abundances (what sequencing alone would show)
pl.figure()
pl.plot(seq_frac_M);
pl.title('Fractional abundance time-series')
pl.xlabel('Day')
pl.ylabel('Abundance')
# Our normalized abundances
pl.figure()
pl.plot(inf_abun_M);
pl.title('Inferred microbiota time-series')
pl.xlabel('Day')
pl.ylabel('Abundance')
```

Out[19]: