#!/usr/bin/env python # coding: utf-8 # In[5]: get_ipython().run_line_magic('matplotlib', 'inline') from numpy import linspace, log, exp from numpy.random import normal from scipy.stats import chisqprob from lmfit import Model import seaborn as sns sns.set_style('ticks') sns.set_context('notebook', font_scale=1) # # The likelihood ratio test: Python implementation # For two models, one nested in the other (meaning that the nested model estimated parameters are a subset of the nesting model), the test statistic $D$ is (based on [this](http://www.stat.sc.edu/~habing/courses/703/GLRTExample.pdf)): # # $$ # \Lambda = \Big( \Big(\frac{\sum{(X_i - \hat{X_i}(\theta_1))^2}}{\sum{(X_i - \hat{X_i}(\theta_0))^2}}\Big)^{n/2} \Big) \\ # D = -2 log \Lambda \\ # lim_{n \to \infty} D \sim \chi^2_{df=\Delta} # $$ # # where $\Lambda$ is the likelihood ratio, $D$ is the statistic, $X_{i}$ are the data points, $\hat{X_i}(\theta)$ is the model prediction with parameters $\theta$, $\theta_i$ is the parameters estimation for model $i$, $n$ is the number of data points and $\Delta$ is the difference in number of parameters between the models. # The python implementation below compares between two `lmfit.ModelFit` objects. These are the results of fitting models to the same data set using the [`lmfit` package](lmfit.github.io/lmfit-py/). # # The function compares between model fit `m0` and `m1` and assumes that `m0` is nested in `m1`, meaning that the set of varying parameters of `m0` is a subset of the varying parameters of `m1`. The property `chisqr` of the `ModelFit` objects is the sum of the square of the residuals of the fit. `ndata` is the number of data points. `nvarys` is the number of varying parameters. # In[31]: def lrtest(m0, m1, alfa=0.05): """Perform a likelihood ratio test on two nested models. Input: `m0` is nested in `m1`. `alfa` is the test significance level (default 5%). Output: prefer_m1 - a boolean, should we prefer `m1` over `m0`. pval - the test p-value D - the test statistic ddf - the number of degrees of freedom """ n0 = m0.ndata k0 = m0.nvarys chisqr0 = m0.chisqr assert chisqr0 > 0 n1 = m1.ndata assert n0 == n1 k1 = m1.nvarys chisqr1 = m1.chisqr assert chisqr1 > 0 Lambda = (m1.chisqr / m0.chisqr)**(n0 / 2.) D = -2 * log( Lambda ) assert D > 0 ddf = k1 - k0 assert ddf > 0 pval = chisqprob(D, ddf) prefer_m1 = pval < alfa return prefer_m1, pval, D, ddf # ## Test on a simple model # # We test the function on data generated from the equation: # # $$ # y = b + e^{-a t} + N(0, \sigma^2) # $$ # # where $a$ and $b$ are the parameters, $t$ is the variable, and $N$ is the normal distribution. # # We fit a nesting model `model_fit1` (the alternative hypothesis of the test). This model estimates both $a$ and $b$. # We also fit a nested model `model_fit0` (the nul hypothesis of the test) in which either $a$ or $b$ is fixed at an initial value. # We than plot both model fits, print the estimated parameters, the test statistic and the p-value of the test. # In[34]: # test lrt def lrtest_test(a, b, a_init, b_init, a_vary, b_vary, sig=0.1, N=100, alfa=0.05): assert a_vary or b_vary assert not(a_vary and b_vary) t = linspace(0,1,N) f = lambda t,a,b: b + exp(-a*t) data = f(t,a,b) + normal(0, sig, N) fig, ax = subplots(1,2,sharex=True,sharey=True,figsize=(15,5)) print "real:", a,b model = Model(f) params = model.make_params(a=a_init, b=b_init) two_var_fit = model.fit(data, t=t, params=params) two_var_fit.plot_fit(ax=ax[1]) ax[1].set_title("Two variables") #model_fit1.plot_residuals(ax[1]) print "Two variables:",two_var_fit.best_values params['a'].set(vary=a_vary) params['b'].set(vary=b_vary) one_var_fit = model.fit(data, t=t, params=params) one_var_fit.plot_fit(ax=ax[0]) #model_fit0.plot_residuals(ax[0]) ax[0].set_title("One variable") print "One variable:",one_var_fit.best_values prefer_m1,pval,D,ddf = lrtest(one_var_fit, two_var_fit, alfa) print "pval=%.2g, D=%.2g, ddf=%d" % (pval, D, ddf) print "Decision with significance level %.2g:" % alfa, if prefer_m1: print "Two variable model" else: print "One variable model" return t,data # In[35]: lrtest_test(1,1,1,1,False,True); # In[36]: lrtest_test(1,1,1,1,True,False); # In[37]: lrtest_test(1,1,0,0,True,False); # In[38]: lrtest_test(1,1,0,0,False,True); # In[39]: lrtest_test(1,1,0,1,False,True); # In[40]: lrtest_test(1,1,0,1,True,False); # In[41]: lrtest_test(1,1,1,0,True,False); # In[42]: lrtest_test(1,1,1,0,False,True); # In[43]: lrtest_test(1,1,2,1,False,True); # ## Compare with R # # Here we compare the Python implementation with the R implementation - `lmtest::lrtest`. The data is generated from a quadratic polynomial: # # $$ # y = at^2 + bt + c + N(0,\sigma^2) # $$ # # and we fit two models - the nested model is a linear function and the nesting model is a quadratic polynomial. # # ### Note # # In order to get this to work you will need to make sure the `lmtest` package is installed (don't install via RStudio - use regular command line R) and that the `R_HOME` and `R_USER` environment variable are set to where R is running from the command line and where the packages are installed to when running from the command line. # In[44]: get_ipython().run_line_magic('load_ext', 'rpy2.ipython') # In[45]: get_ipython().system('echo %R_HOME%') get_ipython().system('echo %R_USER%') # In[47]: for _ in range(10): N = 100 sig = 0.1 a,b,c = 0.1,1,1 t = linspace(0,1,N) f = lambda t,a,b,c: c + b*t + a*t**2 data = f(t,a,b,c) + normal(0, sig, N) model = Model(f) params = model.make_params(a=0, b=0, c=0) model_fit1 = model.fit(data, t=t, params=params) params['a'].set(vary=False) model_fit0 = model.fit(data, t=t, params=params) prefer_m1,pval,D,ddf = lrtest(model_fit0, model_fit1) print "py", print pval print "R ", get_ipython().run_line_magic('Rpush', 't data') get_ipython().run_line_magic('R', 'library(lmtest)') get_ipython().run_line_magic('R', 'lm1=lm(data~t)') get_ipython().run_line_magic('R', 'lm2=lm(data~t+I(t^2))') get_ipython().run_line_magic('R', 'pvalue = lrtest(lm2,lm1)[2,5]') get_ipython().run_line_magic('Rpull', 'pvalue') print pvalue[0] # ## License # # This notebook was written by [Yoav Ram](http://www.yoavram.com) and [Uri Obolski](https://sites.google.com/site/hadanylab/people/uri-obolski). # The notebook was written to facilitate the discussion on [lmfit mailing-list](https://groups.google.com/forum/?fromgroups#!topic/lmfit-py/NuQsmZ6D4vY). # The latest version can be found at [ipython.yoavram.com](http://nbviewer.ipython.org/github/yoavram/ipython-notebooks/blob/master/likelihood%20ratio%20test.ipynb). # # The code is released with a CC-BY-SA 3.0 license.