import numpy as np import scipy as sp import pandas as pd import patsy import seaborn as sns import statsmodels.formula.api as smf import statsmodels.api as sm import matplotlib.pyplot as plt import itertools %matplotlib inline sns.set_style("white") gf = pd.read_csv('../Datasets/gforces.txt', sep='\t') # make all columns lower case: gf.columns = [c.lower() for c in gf.columns] print(gf) fit_lm = smf.glm('signs ~ age', gf).fit() print(fit_lm.summary()) # Let's have a look at the model predictions sns.regplot('age', 'signs', gf) sns.despine(offset=10); def logit(x): return(1. / (1. + np.exp(-x))) x = np.linspace(-5, 5) # imagine this is the linear predictor (inner product of X and beta) y = logit(x) plt.plot(x, y) sns.despine(offset=10) plt.vlines(0, 0, 1, linestyles='--') plt.hlines(0.5, -5, 5, linestyles='--') plt.yticks([0, 0.5, 1]) plt.xlabel('Linear predictor value') plt.ylabel('Transformed value'); logit(-np.inf) logit(np.inf) # code hacked from here: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.binom.html n, p = 50, 0.5 fig, ax = plt.subplots(1, 1) x = np.arange(sp.stats.distributions.binom.ppf(0.001, n, p), sp.stats.distributions.binom.ppf(0.999, n, p)) ax.plot(x, sp.stats.distributions.binom.pmf(x, n, p), 'bo', ms=8, label='binom pmf') ax.vlines(x, 0, sp.stats.distributions.binom.pmf(x, n, p), colors='b', lw=5, alpha=0.5) sns.despine(offset=10); plt.xlabel('Number of successes') plt.xlim(0, 50) plt.ylabel('Probability mass'); n, p = 50, 0.9 fig, ax = plt.subplots(1, 1) x = np.arange(sp.stats.distributions.binom.ppf(0.001, n, p), sp.stats.distributions.binom.ppf(0.999, n, p)) ax.plot(x, sp.stats.distributions.binom.pmf(x, n, p), 'bo', ms=8, label='binom pmf') ax.vlines(x, 0, sp.stats.distributions.binom.pmf(x, n, p), colors='b', lw=5, alpha=0.5) sns.despine(offset=10); plt.xlabel('Number of successes') plt.xlim(0, 50) plt.ylabel('Probability mass'); fit = smf.glm('signs ~ age', data=gf, 
family=sm.families.Binomial(link=sm.families.links.logit)).fit() # note we don't have to specify the logit link above, since it's the default for the binomial distribution. print(fit.summary()) print(fit_lm.summary()) age = 28 linear_predictor = fit.params[0] + fit.params[1]*age linear_predictor pred = logit(linear_predictor) pred fit.predict(exog={'age':65}) fit_lm.predict(exog={'age': 65}) # define expand grid function: def expand_grid(data_dict): """ A port of R's expand.grid function for use with Pandas dataframes. from http://pandas.pydata.org/pandas-docs/stable/cookbook.html?highlight=expand%20grid """ rows = itertools.product(*data_dict.values()) return pd.DataFrame.from_records(rows, columns=data_dict.keys()) # build a new matrix with expand grid: preds = expand_grid({'age': np.linspace(10, 80, num=100)}) # put a line of ages in, to plot a smooth curve. preds['yhat'] = fit.predict(preds) # add to data: merged = gf.append(preds) merged.tail() g = sns.FacetGrid(merged, size=4) g.map(plt.plot, 'age', 'yhat') # plot model predictions g.map(plt.scatter, 'age', 'signs') sns.despine(offset=10); plt.ylim([-.1, 1.1]) plt.yticks([0, 0.5, 1]); nat_ims = pd.read_csv('../Datasets/psychophysics.txt', sep='\t') # columns to lowercase: nat_ims.columns = [c.lower() for c in nat_ims.columns] nat_ims.head() nat_ims.groupby(['observer']).size() np.random.seed(12345) # seed rng so these results are reproducible. 
# Subsample the psychophysics data, fit logistic models per condition,
# then move on to a Poisson GLM for neural spike counts.

rows = np.random.choice(nat_ims.index, size=5000, replace=False)
# NOTE: .ix was removed from pandas; .loc does the label-based row
# selection here (index labels drawn from nat_ims.index, so labels are valid).
dat = nat_ims.loc[rows].copy()
dat.groupby(['observer', 'eccent']).size()

# Capture the grid returned by lmplot so the subplot adjustment below
# acts on THIS figure (previously `g` still pointed at an older FacetGrid).
g = sns.lmplot('patchsize', 'correct', data=dat, hue='eccent',
               col='observer', logistic=True, y_jitter=.05, size=4)
sns.despine(offset=10)
g.fig.subplots_adjust(wspace=0.5)
plt.xlabel('log patch size')
plt.ylabel('Proportion correct')
plt.yticks([0, 0.5, 1]);

# the data get easier to see if we bin trials along the x-axis:
g = sns.lmplot('patchsize', 'correct', data=dat, hue='eccent',
               col='observer', logistic=True, x_bins=6, size=4)
sns.despine(offset=10)
g.fig.subplots_adjust(wspace=0.5)
plt.xlabel('log patch size')
plt.ylabel('Proportion correct')
plt.yticks([0, 0.5, 1]);

# Binomial GLM: accuracy as a function of patch size plus eccentricity
# (eccentricity treated as categorical via C()).
fit = smf.glm('correct ~ patchsize + C(eccent)', data=nat_ims,
              family=sm.families.Binomial(link=sm.families.links.logit)).fit()
# note we don't have to specify the logit link above, since it's the default
# for the binomial distribution.
print(fit.summary())

# Third dataset: spike counts from a neuron at several contrasts/orientations.
spk = pd.read_csv('../Datasets/neuron2.txt', sep='\t', index_col=False)
spk.head()
sns.factorplot("contrast", "spikes", "orientation", spk, kind='point')
sns.despine(offset=10);

# Counts are non-negative integers -> Poisson GLM with a log link,
# allowing a contrast-by-orientation interaction.
fit = smf.glm('spikes ~ contrast * orientation', data=spk,
              family=sm.families.Poisson(link=sm.families.links.log)).fit()
print(fit.summary())

# build a new matrix with expand grid:
preds = expand_grid({'contrast': [1, 10], 'orientation': [0, 90]})
preds_2 = preds.copy()
preds['spikes'] = fit.predict(preds)

# also fit a model with no interaction for comparison:
fit_2 = smf.glm('spikes ~ contrast + orientation', data=spk,
                family=sm.families.Poisson(link=sm.families.links.log)).fit()
preds_2['spikes'] = fit_2.predict(preds_2)

# to plot model predictions against data, we should merge them together:
# see answer (https://stackoverflow.com/questions/28239359/showing-data-and-model-predictions-in-one-plot-using-seaborn-and-statsmodels)
merged = pd.concat(dict(data=spk, linear=preds_2, interact=preds),
                   names=['type']).reset_index()
g = sns.factorplot('contrast', 'spikes', 'orientation', data=merged,
                   col='type', kind='point', size=3.5)
g.fig.subplots_adjust(wspace=0.3)
sns.despine(offset=10);
print(fit_2.summary())

# Do an expanded prediction for many contrasts, compare to linear model:
preds_3 = expand_grid({'contrast': np.linspace(0.01, 20),
                       'orientation': [0, 90]})
preds_4 = preds_3.copy()
preds_3['yhat'] = fit.predict(preds_3)
fit_lm = smf.glm('spikes ~ contrast + orientation', data=spk).fit()
# predict on preds_4 (its own frame) -- same predictor values as preds_3,
# but keeps each frame self-contained.
preds_4['yhat'] = fit_lm.predict(preds_4)

# to plot model predictions against data, we should merge them together:
# see answer (https://stackoverflow.com/questions/28239359/showing-data-and-model-predictions-in-one-plot-using-seaborn-and-statsmodels)
merged = pd.concat(dict(data=spk, poisson=preds_3, linear=preds_4),
                   names=['type']).reset_index()
merged.tail()

g = sns.FacetGrid(merged, hue='type', col='orientation')
g.map(plt.scatter, 'contrast', 'spikes')
g.map(plt.plot, 'contrast', 'yhat')
g.fig.subplots_adjust(wspace=0.3)
sns.despine(offset=10)
g.add_legend();

from IPython.core.display import HTML


def css_styling():
    """Load the notebook's custom CSS and return it as displayable HTML."""
    # context manager ensures the file handle is closed.
    with open("../custom_style.css", "r") as f:
        styles = f.read()
    return HTML(styles)


css_styling()