#!/usr/bin/env python # coding: utf-8 # In[1]: import os os.environ['MKL_NUM_THREADS'] = '2' # In[2]: import pandas as pd import numpy as np import matplotlib.pyplot as plt import pymc3 as pm import seaborn as sb import theano import theano.tensor as tt get_ipython().run_line_magic('matplotlib', 'inline') # In[3]: counts = pd.read_csv('pasilla_gene_counts.tsv', sep='\t', index_col=0) # In[4]: counts.head() # In[5]: is_treated = np.array([False] * 4 + [True] * 3) # In[6]: counts = counts[counts.sum(1) > 500] # In[7]: len(counts) # In[8]: sb.distplot(np.log(counts.sum(1))) # In[9]: counts = counts.head(200) # In[11]: class ExpModNormal(pm.Continuous): def __init__(self, mu, sd, lambda_, *args, **kwargs): super(ExpModNormal, self).__init__(*args, **kwargs) self.mu = mu = tt.as_tensor_variable(mu) self.sd = sd = tt.as_tensor_variable(sd) self.lambda_ = lambda_ = tt.as_tensor_variable(lambda_) self.mean = mu + 1 / lambda_ def logp(self, value): mu, sd, lambda_ = self.mu, self.sd, self.lambda_ tmp = mu + sd * sd * lambda_ - value return (tt.log(lambda_ / 2) + lambda_ * (tmp - sd * sd * lambda_ / 2) + tt.log(tt.erfc(tmp / (np.sqrt(2) * sd)))) # In[12]: n_genes, n_samples = counts.shape with pm.Model() as model: gene = pm.Normal('gene', shape=n_genes, mu=8, sd=5, testval=np.log(counts + 1).mean(1).values.ravel()) #sample_raw = pm.Flat('sample_raw', shape=n_samples - 1, # testval=np.log(counts + 1).mean().values.ravel()[:-1] # - np.log(counts + 1).mean().mean()) #sample_last = - sample_raw.sum() #sample = tt.concatenate([sample_raw, sample_last.reshape((1,))]) #pm.Deterministic('sample', sample) # This is the source of quite some correcation. Reparametrization would help. sample_ps = np.log(counts + 1).mean().values.ravel() - np.log(counts + 1).mean().mean() sample = pm.Normal('sample', shape=n_samples, testval=sample_ps) pm.Potential('sample_pot', pm.Normal.dist(mu=0, sd=3).logp(sample)) mu = pm.Normal('gene_sample_lsd_mu', mu=-3, sd=1) sd = pm.Bound(pm.HalfNormal, lower=0.05, upper=3)('gene_sample_lsd_sd', sd=0.3) #sd = pm.HalfNormal('gene_sample_lsd_sd', sd=0.3) lambda_inv = pm.HalfNormal('gene_sample_lsd_lambda_inv', sd=1) #gene_sample_lsd = ExpModNormal('gene_sample_lsd', mu=mu, sd=sd, # lambda_=1/lambda_inv, shape=n_genes) gene_sample_lsd = ExpModNormal('gene_sample_lsd', mu=0, sd=1, lambda_=1/lambda_inv, shape=n_genes) gene_sample_lsd = gene_sample_lsd * sd + mu gene_sample_sd = tt.exp(gene_sample_lsd) pm.Deterministic('gene_sample_sd', gene_sample_sd) # Sparse prior for the effects #tau = pm.HalfStudentT('effect_tau', nu=3, sd=0.1, testval=0.01) #c = pm.InverseGamma('effect_c', alpha=2.5, beta=10) #theta_raw = pm.Uniform('effect_theta_raw', lower=0, upper=np.pi / 2, shape=n_genes) #theta = tt.tan(theta_raw) #lmbda = c * theta / tt.sqrt(c ** 2 + tau ** 2 * theta ** 2) #eta = pm.Normal('effect_eta', shape=n_genes) #effect = tau * lmbda * eta #pm.Deterministic('effect', effect) effect_sd = pm.HalfNormal('effect_sd', sd=1) effect = pm.Normal('effect', mu=0, sd=effect_sd, shape=n_genes) mu = gene[:, None] + sample[None, :] + effect[:, None] * (is_treated - 0.5) pm.Deterministic('mu', mu) gene_sample = pm.StudentT('gene_sample', nu=7, mu=mu, sd=gene_sample_sd[:, None], shape=(n_genes, n_samples)) pm.Poisson('counts', log_mu=gene_sample, observed=counts) # In[13]: model.ndim # In[14]: with model: trace = pm.sample( #chains=4, init='advi+adapt_diag', tune=1000, chains=4, tune=1000, draws=10000, cores=2, target_accept=0.9 ) # In[15]: import covadapt.eigvals_lw import covadapt.potential # In[16]: with model: pot = covadapt.potential.EigvalsAdapt( model.ndim, np.zeros(model.ndim), covadapt.eigvals_lw.eigh_lw_samples_grads, eigvalsfunc_kwargs=dict( n_eigs=15, n_eigs_grad=15, n_final=30, ), display=True, adaptation_window=200, ) step = pm.NUTS(potential=pot, target_accept=0.9) trace2 = pm.sample(step=step, draws=10000, tune=1100, chains=4, cores=2) # In[17]: {var: vals.min() for var, vals in pm.effective_n(trace).items()} # In[18]: {var: vals.min() for var, vals in pm.effective_n(trace2).items()} # In[19]: plt.plot(trace['step_size']) plt.plot(trace2['step_size']) # In[20]: plt.plot(trace['tree_size']) plt.plot(trace2['tree_size']) # In[ ]: # In[ ]: