#!/usr/bin/env python # coding: utf-8 # # Analysis of Bridget Jones's Diary # # ## Trend Lines From A Sample # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') import json import warnings warnings.filterwarnings('ignore') book1 = 'book_1_gist_subsampled.json' data = json.loads(open(book1).read()) import matplotlib import numpy as np import matplotlib.pyplot as plt import scipy.stats as stats day_nos = [] weights = [] for key in sorted(data.iterkeys(), key=int): day_nos.append(int(key)) weights.append(data[key]['weight']) weights = np.asarray(weights) day_nos = np.asarray(day_nos) def lin_reg(x, xp, yp): """ OLS """ m = (np.dot(xp,yp) - (np.sum(xp) * np.sum(yp))/float(yp.size))/(np.dot(xp,xp) - np.sum(xp)**2/float(xp.size)) c = (np.sum(yp) - m * np.sum(xp))/float(xp.size) return m * x + c trend_line = lin_reg(day_nos, day_nos, weights) r = trend_line - weights r_m = np.abs(r) fig,ax = plt.subplots(nrows=1,ncols=3, gridspec_kw={'width_ratios':[2,1,1]}, facecolor='white') fig.suptitle('Bridget Jones\'s Weight',fontsize=15) fig.subplots_adjust(wspace=0.25) fig.set_size_inches(24, 6) ax[0].scatter( day_nos, weights, s=50, marker='o', alpha=0.4) ax[0].plot(day_nos, trend_line, color="blue", linewidth=2.0, linestyle="--", label = "Trend Line") ax[0].set_yticks([weights.min(), weights.max()]) ax[0].set_yticklabels([str(weights.min()) + ' lbs', str(weights.max()) + ' lbs']) ax[0].set_xticks([1,365]) ax[0].set_xticklabels(['Day 1', 'Day 365']) ax[0].spines['right'].set_color('none') ax[0].spines['top'].set_color('none') ax[0].spines['left'].set_color('grey') ax[0].spines['bottom'].set_color('grey') ax[1].scatter( np.asarray(day_nos), np.asarray(r_m), s=25, c="red", marker='o', alpha='0.8', label="Residual Magnitudes") ax[1].set_yticks([r_m.min(), r_m.max()]) ax[1].set_yticklabels([str(round(r_m.min())), str(round(r_m.max()))]) ax[1].set_xticks([1,365]) ax[1].set_xticklabels(['Day 1', 'Day 365']) ax[1].spines['right'].set_color('none') ax[1].spines['top'].set_color('none') ax[1].spines['left'].set_color('grey') ax[1].spines['bottom'].set_color('black') ax[1].set_title('Residual Magnitudes') res=stats.probplot(r, dist="norm", plot=ax[2]) ax[2].set_title('QQ Plot') ax[2].spines['right'].set_color('none') ax[2].spines['top'].set_color('none') ax[2].spines['left'].set_color('grey') ax[2].spines['bottom'].set_color('grey') ax[2].set_ylabel('Ordered Residuals') ax[2].set_xticks([-2.0,0,2.0]) ax[2].set_xticklabels([r'$-2.0$', r'$0$', r'$2.0$']) ax[2].set_yticks([-8.0,0,8.0]) ax[2].set_yticklabels([r'$-8.0$', r'$0$', r'$8.0$']) plt.show() # There is a small downward trend when fitting a trend line. Note that applying linear regression is possible only when the conditional error, i.e. the spread you would see if you weighed her every day was normally distributed with a constant variance across days. # # The spread of the residuals (_actual weight minus the estimated weight for the day_) is by far not constant. However, it isn't apparent to me that the spread is out of line. I am not quite sure how to find. The $\mathbf{QQ}$ Plot compares the $\mathbf{CDF}$ of the residuals against that of a Standard Normal distribution. This is supposed to lie very close to that line. Again, I am not sure what _very close_ means. # ## Trends From More Days # In[5]: import pandas as pd bf = pd.read_json('book_1_full.json') bf = bf.transpose() bf.reset_index(inplace=True) bf.rename(columns={'index':'doy'}, inplace=True) bfc = bf.dropna() bfc[['doy', 'weight']] = bfc[['doy', 'weight']].apply(pd.to_numeric) fig,ax = plt.subplots(nrows=2,ncols=1,facecolor='white') fig.suptitle('Bridget Jones\'s Weight',fontsize=30) fig.subplots_adjust(hspace=0.5) fig.set_size_inches(24, 24) def loess( x, h, xp, yp ): from numpy import * w = exp(-0.5*((x-xp)/h)**2)/sqrt(2*pi*h**2) b = np.dot(w,xp)*np.dot(w,yp) - np.sum(w)*np.sum(w*xp*yp) b /= sum(w*xp)**2 - sum(w)*sum(w*xp**2) a = ( np.dot(w,yp) - b*np.dot(w,xp) )/np.sum(w) return a + b*x for ii in xrange(0, 2): ax[ii].spines['right'].set_color('none') ax[ii].spines['top'].set_color('none') ax[ii].spines['left'].set_color('grey') ax[ii].spines['bottom'].set_color('grey') ax[ii].set_ylim((bfc.weight.min() -5.0 ,bfc.weight.max() + 5.0)) ax[ii].set_yticks([bfc.weight.min(),bfc.weight.max()]) ax[ii].set_yticklabels([str(round(bfc.weight.min())) + ' lbs', str(round(bfc.weight.max())) + ' lbs']) ax[ii].set_xticks([1,365]) ax[ii].set_xticklabels(['Day 1', 'Day 365']) colors = [u'#4878cf', u'#6acc65', u'#d65f5f', u'#b47cc7', u'#c4ad66'] # Let us take 5 samples of 20 each samples = [] for ii in xrange(0,5): t = bfc[['doy', 'date','weight']].sample(frac=.25) t.sort_values('doy', ascending=True, inplace=True) t['lfit'] = lin_reg(t['doy'], t.doy.values, t.weight.values) doyl = t.doy.values.tolist() cfit = [] for doy in doyl: cfit.append(loess(doy, 25.0, t.doy.values, t.weight.values)) t['cfit'] = cfit samples.append(t) ax[0].scatter(t.doy.values, t.weight.values, color=colors[ii], s=50, marker='o', alpha=0.75) ax[0].plot(t.doy.values, t.lfit.values, color=colors[ii], linewidth=2.0, linestyle="--", label = "Trend Line %d" % (ii)) ax[0].set_title('Trend Lines For 25% of Samples', fontsize=20) ax[1].scatter(t.doy.values, t.weight.values, color=colors[ii], s=50, marker='o', alpha=0.75) ax[1].plot(t.doy.values, t.cfit.values, color=colors[ii], linewidth=2.0, linestyle="-", label = "Trend Line %d" % (ii), alpha=0.5) ax[1].set_title('Nonparametric Trend Lines For 25% of Samples', fontsize=20) plt.show() # Conclusion: # # 1. Why are trend lines varying? # 2. Is the variance high? # 3. Is this sign that the linear estimator is high variance? # 4. how do i find out? # In[ ]: