Regularized NMF for automatic segmentation of odor maps in the mouse OB

This Notebook reproduces results of the manuscript http://dx.doi.org/10.1016/j.neuroimage.2014.04.041. It also allows to inspect the data in more detail.

In [1]:
import sys
import os
import cPickle as pickle
from collections import defaultdict

from scipy.spatial.distance import pdist,squareform
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib
%pylab inline

#the datamaker and ImageAnalysisComponents utilities reside two levels above this notebook.
utility_path = os.path.realpath(os.path.join(os.path.pardir, os.path.pardir))
sys.path.append(utility_path)
from regnmf import datamaker
from regnmf import ImageAnalysisComponents as ia
from regnmf.progress_bar import ProgressBar
Populating the interactive namespace from numpy and matplotlib

Define the location of the data. Data can be downloaded from zenodo.org at http://dx.doi.org/10.5281/zenodo.12352 .

In [2]:
datapath = 'please set the data path' #on my machine it's '../../../../Data' 

Surrogate Data

For biological data it is generally difficult to asses factorization performance. To obtain nevertheless a detailed picture on the terms of performance for regularized NMF and sICA we constructed a parameterized surrogate dataset. With this dataset we could address two important questions: First, what is the influence of method inherent parameters and their optimal choice? And second, what is the application domain of both methods with respect to strength of pixel noise and size of measured stimuli set?

Create surrogate data

In [3]:
load_paperdata = True # load the surrogate dataset that was used in the manuscript
In [4]:
param = {'act_time': [0.01, 0.1, 0.3, 0.8, 1.0, 1.0],
         'cov': 0.3,
         'latents': 40,
         'mean': 0.2,
         'no_samples': 50,
         'noisevar': 0.2,
         'shape': (50, 50),
         'width':0.1,
         'var': 0.08}
In [5]:
if load_paperdata :
    data = pickle.load(open(os.path.join(datapath,'surrogate','data.pik')))
else:
    data = datamaker.Dataset(param)
    os.mkdir(datapath)

Define general layout for figures

In [6]:
# fontsizes
global_fs = 10
fs_num = 10

matplotlib.rcParams['axes.linewidth']=0.4
matplotlib.rcParams['axes.edgecolor']='k'

matplotlib.rcParams['xtick.major.size']=1     
matplotlib.rcParams['xtick.minor.size']=0.5      
matplotlib.rcParams['xtick.major.width']=0.3    
matplotlib.rcParams['xtick.minor.width']=0.3

matplotlib.rcParams['ytick.major.size']=1     
matplotlib.rcParams['ytick.minor.size']=0.5      
matplotlib.rcParams['ytick.major.width']=0.3    
matplotlib.rcParams['ytick.minor.width']=0.3  

matplotlib.rcParams['xtick.labelsize']=global_fs
matplotlib.rcParams['ytick.labelsize']=global_fs

matplotlib.rcParams['text.usetex']=False
matplotlib.rcParams['mathtext.default']='regular'
In [7]:
# parameter for visualizing sources
cmap_tmp = matplotlib.colors.LinearSegmentedColormap.from_list('tmp', ['w',np.array([147,112,219])/256.,'m','b'])
rgb_list = [cmap_tmp(i) for i in np.linspace(0,1,100)]+[plt.cm.jet(i) for i in np.linspace(0,1,101)]
cmap_bases = matplotlib.colors.LinearSegmentedColormap.from_list('bases', rgb_list)
param_imshow = {'cmap':cmap_bases, 'interpolation':'none', 'vmax':1, 'vmin':-1}

# axes layout for time courses
def mydecor(ax):
    ax.set_ylabel('activation', fontsize=global_fs, labelpad=-0.2)
    ax.yaxis.set_tick_params(labelsize=global_fs, labelright=True, labelleft=False)
    ax.yaxis.set_label_position('right')
    ax.xaxis.set_tick_params(labelsize=global_fs, pad=2)
    ax.set_xticks(range(60, 6*50, 60))
    
# timecourses plotter
splitnum = param['no_samples']
xparts = np.hsplit(np.arange(splitnum*len(param['act_time'])), splitnum)
def timeplot(ax, data):
    ax.plot(data, 'o', ms=1.5, mfc='k', mec='none')
    for x in xparts:
        ax.plot(x, data[x], '--',  dashes=(2,2), linewidth=0.4, color='k')
    #ax.set_ylabel('activation', fontsize=global_fs, labelpad=-0.2)
    ax.yaxis.set_tick_params(labelsize=global_fs, labelright=True, labelleft=False)
    ax.yaxis.set_label_position('right')
    ax.xaxis.set_tick_params(labelsize=global_fs, pad=2)
    ax.set_xticks(range(60, 6*50, 60))
    ax.set_yticks([0,1])

def no_ticks(ax):
    ax.set_xticks([])
    ax.set_yticks([])

def little_spines(ax):
    ax.spines['top'].set_color('none')
    ax.spines['right'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    
def no_spines(ax):
    ax.spines['top'].set_color('none')
    ax.spines['right'].set_color('none')
    ax.spines['bottom'].set_color('none')
    ax.spines['left'].set_color('none')
    no_ticks(ax)

Surrogate Data Construction

To obtain useful conclusions, we've constructed sources resembling the main characteristics of the biological case: Glomeruli are arranged side by side with slightly overlapping spatial signal distribution (Fig. 1a) The response spectra of surrogate glomeruli are narrowly tuned and correlated (Fig. 1b) and each glomeruli rises to peak activation with a model time-course (Fig. 1c). The data to enter factorization is the concurrent observation of forty glomeruli in response to $n_{stim}$ stimuli (e.g. odours) corrupted by additional pixel noise $\sigma_{\text{noise}}$.

In [8]:
fig_dim = np.array((3.54, 1.1))*4
fig1 = plt.figure(figsize=fig_dim)
fig_ratio = fig_dim[0]/fig_dim[1]
panel_width = 0.18 
panel_bottom = 0.23

# ================================================================================
# Panel (a)
# ================================================================================
ax = fig1.add_axes([0.05, panel_bottom, panel_width, panel_width*fig_ratio])
ax.text(0.08, 1.03, '(a)', transform=ax.transAxes,
      fontsize=fs_num, fontweight='bold', va='bottom', ha='right')
# contours
for c in data.spt_sources:
    ax.contour(c.reshape((50,50))[::-1], [0.5] ,colors=['k'], label='0.3', linewidths=0.4)
# centers
p = ax.scatter(np.array(data.points)[:param['latents'],1], 49-np.array(data.points)[:param['latents'],0], 
           marker='x', s=4, label='max', linewidths=0.4)
ax.set_xlim([0.5,49.5])
ax.set_ylim([-0.5,48.5])
ax.set_xticks([])
ax.set_yticks([])

# =================================================================================
# Panel (b)
# =================================================================================
leftpos = 0.365
ax = fig1.add_axes([leftpos, panel_bottom,0.2, panel_width*fig_ratio])
ax.text(-0.3, 1.03, '(b)', transform=ax.transAxes,
      fontsize=fs_num, fontweight='bold', va='bottom', ha='right')
# plot marginal
ax.plot(np.arange(1E-2,4,0.001), datamaker.adjusted_gamma(param['mean'], param['var']).pdf(np.arange(1E-2,4,0.001)))
ax.set_yscale('log')
ax.set_xlabel('$a_{peak}$', fontsize=global_fs, labelpad=0.5)
ax.set_xticks([0,1,2,3,4])
ax.set_ylabel('pdf', fontsize=global_fs, labelpad=0.5)

# plot covariance
ax = fig1.add_axes([leftpos+0.1, 0.5, 0.12, 0.12*fig_ratio])
im = ax.imshow(data.cov, interpolation='none', vmin=0, vmax=1, cmap=plt.cm.binary)
ax.set_xticks([])
ax.set_yticks([])
axbar = fig1.add_axes([leftpos+0.11+0.12,0.5,0.012,0.12*fig_ratio])
# plot covariance colorscale
cbar = fig1.colorbar(im, axbar)
cbar.set_ticks([0,1])
cbar.set_label('$\\rho$', size=global_fs, labelpad=-0.5)

# ===================================================================================
# panel (c)
# ===================================================================================
ax = fig1.add_axes([0.77,panel_bottom,0.2,panel_width*fig_ratio])
ax.text(-0.3, 1.03, '(c)', transform=ax.transAxes,
      fontsize=fs_num, fontweight='bold', va='bottom', ha='right')

ax.plot(param['act_time'], 'o--', ms=2, mfc='k', linewidth=0.6, color='k', dashes=(2,2))
ax.set_xlabel('sample points', fontsize=global_fs, labelpad=1)
ax.set_ylim((-0.05, 1.05))
ax.set_ylabel('% of $a_{peak}$', fontsize=global_fs, labelpad=-3)
ax.set_yticks([0,0.5,1])
ax.set_yticklabels([0,50,100])

plt.show()

Fig. 1 Surrogate data (a) 40 Gaussian shaped sources were randomly placed on a regular $9\times 9$ grid in a $50\times 50\text{px}$ image. Crosses mark centres of pixel participation and lines indicate half maximum (HM). (b) The peak activations for each stimulus were drawn from a gamma distribution ($\mu=0.2$, $\sigma=0.28$). Inset shows covariance $\rho$ introduced via a Gaussian copula into the sources' activation. (c) Each stimulus activation was extended to a 6 point model time-course.

In [9]:
fig_dim = np.array((3.54, 1.8))*4
fig = plt.figure(figsize=fig_dim, dpi=200)
for i in range(50):
    ax = fig.add_subplot(5,10,i+1)
    ax.imshow(data.observed[(i+1)*len(param['act_time'])-1].reshape(param['shape']), interpolation='none', vmin=-2.5, vmax=2.5)
    ax.set_xlabel('t=%s'%((i+1)*6), size=global_fs, labelpad=0.5)
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()

Fig. 2 Observation of the combined signal at peak activation for different stimuli. Signal is corrupted with pixel noise ($\sigma_{\text{noise}}=0.2$).

Examplatory Factorizations

We started our analysis with a dataset roughly mimicking the properties of our IOS data with $n_{stim}=50$ stimulus observations and a noise level of $\sigma_{noise}=0.2$. Figure 3 shows exemplary recovered sources from both regularized NMF ($\alpha_{sm}=2$, $\alpha_{sp}=0.5$) and sICA, illustrating the general characteristics of the methods. Regularized NMF indeed showed the desired properties of a localized, sparse and smooth pixel participation, accurately reproducing the spatial and temporal characteristics of the source. In contrast plain sICA (with no additional processing applied) generates more holistic pixel participations, containing global noise contributions besides the local source contribution. This implies a more noisy reconstruction of the activation courses by sICA, especially for weaker signals.

In [10]:
# general NMF param
anal_param = {'sparse_param': 0.5,
              'factors': 80,
              'smooth_param': 2,
              'init':'convex', 
              'sparse_fct':'global_sparse',
              'verbose':0
              }
In [12]:
input_data = ia.TimeSeries(data.observed, shape=param['shape'])
    
nmf = ia.NNMF(maxcount=50, num_components=anal_param['factors'], **anal_param)
nmf_out = nmf(input_data)

sica = ia.sICA(num_components=anal_param['factors']) 
sica_out = sica(input_data)  
In [13]:
nmf_match, nmf_cor, _ = data.cor2source(nmf_out)
sica_match, sica_cor, _ = data.cor2source(sica_out)
In [14]:
figsize = np.array((20,40))
fig = plt.figure(figsize=figsize)
gs = gridspec.GridSpec(40,3)
gs.update(left = 0.01, top = 0.96, bottom = 0.01)

for source_ind in range(40):

    gs_temp = matplotlib.gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs[source_ind,0], width_ratios=[1,4], wspace=0.01)
    ax = fig.add_subplot(gs_temp[0])
    im = ax.imshow(data.spt_sources[source_ind].reshape((50,50)), **param_imshow)
    ax.set_axis_off()
    
    ax = fig.add_subplot(gs_temp[1])
    timeplot(ax, data.activation[:,source_ind])

    
    gs_temp = matplotlib.gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs[source_ind,1], width_ratios=[1,4], wspace=0.01)
    ax = fig.add_subplot(gs_temp[0])
    im = ax.imshow(nmf_out.base.shaped2D()[nmf_match[source_ind]], **param_imshow)
    ax.set_axis_off()
    
    ax = fig.add_subplot(gs_temp[1])
    timeplot(ax, nmf_out._series[:,nmf_match[source_ind]])
    ax.text(10, ax.get_ylim()[1], r'$r^{tmp} = %.2f$'%nmf_cor[source_ind], size=7, va='top')
    
    gs_temp = matplotlib.gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs[source_ind,2], width_ratios=[1,4], wspace=0.01)
    ax = fig.add_subplot(gs_temp[0])
    im = ax.imshow(sica_out.base.shaped2D()[sica_match[source_ind]], **param_imshow)
    ax.set_axis_off()
    
    ax = fig.add_subplot(gs_temp[1])
    timeplot(ax, sica_out._series[:,sica_match[source_ind]])
    ax.text(10, ax.get_ylim()[1], r'$r^{tmp} = %.2f$'%sica_cor[source_ind], size=7, va='top')
    
# colorbar
axbar = fig.add_axes([0.01, 0.992, 0.2, 0.003])
cbar = fig.colorbar(im, axbar, orientation='horizontal')
cbar.set_ticks([-1,0,1])
cbar.set_label('pixel participation', size=global_fs, labelpad=2)
cbar.ax.xaxis.set_tick_params(labelsize=global_fs, pad=2)

fig.text(0.15,0.97,'source')
fig.text(0.45,0.97,'NMF')
fig.text(0.75,0.97,'sICA')

plt.show()