%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import scipy.stats as ss
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('notebook')
This lecture is the solution to an analysis we did in class. I'll try to put each iteration we tried in lecture so that you can see how each cell was improved. We're trying to answer three questions:
daily_data = pd.read_csv('Fitness Data/day.csv')
daily_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 102 entries, 0 to 101 Data columns (total 33 columns): Date 102 non-null object Steps 102 non-null int64 Calories 102 non-null int64 HR_Lowest 102 non-null int64 HR_Highest 102 non-null int64 HR_Average 102 non-null int64 Total_Miles_Moved 102 non-null float64 Active_Hours 102 non-null int64 Floors_Climbed 102 non-null int64 UV_Exposure_Minutes 102 non-null int64 Total_Seconds_All_Activities 85 non-null float64 Total_Calories_All_Activities 85 non-null float64 Sleep_Events 85 non-null float64 Sleep_Total_Calories 85 non-null float64 Total_Seconds_Slept 85 non-null float64 Run_Events 85 non-null float64 Run_Total_Seconds 85 non-null float64 Total_Miles_Run 85 non-null float64 Run_Total_Calories 85 non-null float64 Bike_Events 85 non-null float64 Bike_Total_Seconds 85 non-null float64 Total_Miles_Biked 85 non-null float64 Bike_Total_Calories 85 non-null float64 Exercise_Events 85 non-null float64 Exercise_Total_Seconds 85 non-null float64 Exercise_Total_Calories 85 non-null float64 Guided_Workout_Events 85 non-null float64 Guided_Workout_Total_Seconds 85 non-null float64 Guided_Workout_Total_Calories 85 non-null float64 Golf_Events 85 non-null float64 Golf_Total_Seconds 85 non-null float64 Total_Miles_Golfed 85 non-null float64 Golf_Total_Calories 85 non-null float64 dtypes: float64(24), int64(8), object(1) memory usage: 27.1+ KB
#Attempt 1
plt.plot(daily_data.Date, daily_data.HR_Average)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-48-4c960599580a> in <module>() 1 #Attempt 1 ----> 2 plt.plot(daily_data.Date, daily_data.HR_Average) /opt/conda/lib/python3.5/site-packages/matplotlib/pyplot.py in plot(*args, **kwargs) 3152 ax.hold(hold) 3153 try: -> 3154 ret = ax.plot(*args, **kwargs) 3155 finally: 3156 ax.hold(washold) /opt/conda/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs) 1810 warnings.warn(msg % (label_namer, func.__name__), 1811 RuntimeWarning, stacklevel=2) -> 1812 return func(ax, *args, **kwargs) 1813 pre_doc = inner.__doc__ 1814 if pre_doc is None: /opt/conda/lib/python3.5/site-packages/matplotlib/axes/_axes.py in plot(self, *args, **kwargs) 1423 1424 for line in self._get_lines(*args, **kwargs): -> 1425 self.add_line(line) 1426 lines.append(line) 1427 /opt/conda/lib/python3.5/site-packages/matplotlib/axes/_base.py in add_line(self, line) 1706 line.set_clip_path(self.patch) 1707 -> 1708 self._update_line_limits(line) 1709 if not line.get_label(): 1710 line.set_label('_line%d' % len(self.lines)) /opt/conda/lib/python3.5/site-packages/matplotlib/axes/_base.py in _update_line_limits(self, line) 1728 Figures out the data limit of the given line, updating self.dataLim. 
1729 """ -> 1730 path = line.get_path() 1731 if path.vertices.size == 0: 1732 return /opt/conda/lib/python3.5/site-packages/matplotlib/lines.py in get_path(self) 923 """ 924 if self._invalidy or self._invalidx: --> 925 self.recache() 926 return self._path 927 /opt/conda/lib/python3.5/site-packages/matplotlib/lines.py in recache(self, always) 610 x = ma.asarray(xconv, np.float_).filled(np.nan) 611 else: --> 612 x = np.asarray(xconv, np.float_) 613 x = x.ravel() 614 else: /opt/conda/lib/python3.5/site-packages/numpy/core/numeric.py in asarray(a, dtype, order) 472 473 """ --> 474 return array(a, dtype, copy=False, order=order) 475 476 def asanyarray(a, dtype=None, order=None): ValueError: could not convert string to float: '2016-04-11'
#Problem: The dates were treated as strings
#Solution: Convert them
dates = pd.to_datetime(daily_data.Date)
plt.plot(dates, daily_data.HR_Average)
[<matplotlib.lines.Line2D at 0x7fb6a6367f60>]
#Problem: The x-axis is crowded
#Solution: Websearch reveals there is a method for this
dates = pd.to_datetime(daily_data.Date)
plt.plot(dates, daily_data.HR_Average)
plt.gcf().autofmt_xdate()
activities = pd.read_csv('Fitness Data/activity.csv')
activities.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 133 entries, 0 to 132 Data columns (total 50 columns): Date 133 non-null object Start_Time 133 non-null object Event_Type 133 non-null object Duration_Seconds 133 non-null int64 Seconds_Paused 133 non-null int64 Calories_Burned 133 non-null int64 Calories_Burned_Carbs 35 non-null float64 Calories_Burned_Fats 35 non-null float64 HR_Lowest 131 non-null float64 HR_Peak 131 non-null float64 HR_Average 131 non-null float64 UV_Exposure_Minutes 133 non-null int64 Total_Miles_Moved 35 non-null float64 Cardio_Benefit 35 non-null object Minutes_Under_50%_HR 35 non-null float64 Minutes_In_HRZ_Very_Light_50%_60% 35 non-null float64 Minutes_In_HRZ_Light_60%_70% 35 non-null float64 Minutes_In_HRZ_Moderate_70%_80% 35 non-null float64 Minutes_In_HRZ_Hard_80%_90% 35 non-null float64 Minutes_In_HRZ_Very_Hard_90%_Plus 35 non-null float64 HR_Finish 35 non-null float64 HR_Recovery_Rate_1_Min 24 non-null float64 HR_Recovery_Rate_2_Min 19 non-null float64 Recovery_Time_Seconds 35 non-null float64 Bike_Average_MPH 0 non-null float64 Bike_Max_MPH 0 non-null float64 Elevation_Highest_Feet 20 non-null float64 Elevation_Lowest_Feet 20 non-null float64 Elevation_Gain_Feet 20 non-null float64 Elevation_Loss_Feet 20 non-null float64 Wake_Up_Time 93 non-null object Seconds_Awake 98 non-null float64 Seconds_Asleep_Total 98 non-null float64 Seconds_Asleep_Restful 98 non-null float64 Seconds_Asleep_Light 98 non-null float64 Wake_Ups 98 non-null float64 Seconds_to_Fall_Asleep 98 non-null float64 Sleep_Efficiency 98 non-null float64 Sleep_Restoration 92 non-null object Sleep_HR_Resting 93 non-null float64 Sleep_Auto_Detect 98 non-null object GW_Plan_Name 0 non-null float64 GW_Reps_Performed 0 non-null float64 GW_Rounds_Performed 0 non-null float64 Golf_Course_Name 0 non-null float64 Golf_Course_Par 0 non-null float64 Golf_Total_Score 0 non-null float64 Golf_Par_or_Better 0 non-null float64 Golf_Pace_of_Play_Minutes 0 non-null float64 
Golf_Longest_Drive_Yards 0 non-null float64 dtypes: float64(39), int64(4), object(7) memory usage: 53.0+ KB
adates = pd.to_datetime(activities.Date)
plt.plot(adates, activities.Seconds_Asleep_Total)
plt.gcf().autofmt_xdate()
#Problem: Some of the data is missing?
#Solution: Analyze the raw data and we see that each row is an activity. For a "running" activity no "sleep" data is recorded
print(activities.Seconds_Asleep_Total[0:10])
print(activities.Event_Type[0:10])
print(activities.Date[0:10])
print(activities.Start_Time[0:10])
0 23210 1 22929 2 21746 3 16135 4 NaN 5 20749 6 NaN 7 20224 8 15875 9 27613 Name: Seconds_Asleep_Total, dtype: float64 0 Sleep 1 Sleep 2 Sleep 3 Sleep 4 Run 5 Sleep 6 Run 7 Sleep 8 Sleep 9 Sleep Name: Event_Type, dtype: object 0 2016-01-01 1 2016-01-02 2 2016-01-04 3 2016-01-04 4 2016-01-04 5 2016-01-05 6 2016-01-06 7 2016-01-07 8 2016-01-07 9 2016-01-08 Name: Date, dtype: object 0 2016-01-01 00:46:55 1 2016-01-02 23:04:51 2 2016-01-04 23:07:02 3 2016-01-04 00:09:01 4 2016-01-04 07:27:36 5 2016-01-05 23:31:01 6 2016-01-06 09:00:08 7 2016-01-07 22:57:57 8 2016-01-07 03:20:11 9 2016-01-08 23:52:26 Name: Start_Time, dtype: object
#Problem: How to remove the extra rows
#Solution: Build one boolean mask that drops the rows with no sleep reading (NaN)
#and any sleep of 500 seconds or less, then index the dates and durations with it
seconds_asleep = activities.Seconds_Asleep_Total
sleep_valid = ~np.isnan(seconds_asleep) & (seconds_asleep > 500)
sleep_dates = adates[sleep_valid]
#convert seconds to hours
sleep_hours = seconds_asleep[sleep_valid] / 3600
plt.plot(sleep_dates, sleep_hours)
plt.ylabel('Hours Asleep')
plt.xlabel('Date')
plt.gcf().autofmt_xdate()
#Problem: Some of these sleep times seem unreasonable. Let's see what time I went to bed at
sleep_start_times = pd.to_datetime(activities.Start_Time)[sleep_valid]
print(sleep_start_times[0:5])
print(sleep_dates[0:5])
0 2016-01-01 00:46:55 1 2016-01-02 23:04:51 2 2016-01-04 23:07:02 3 2016-01-04 00:09:01 5 2016-01-05 23:31:01 Name: Start_Time, dtype: datetime64[ns] 0 2016-01-01 1 2016-01-02 2 2016-01-04 3 2016-01-04 5 2016-01-05 Name: Date, dtype: datetime64[ns]
#Problem: If we went to bed after midnight, we assign that to the following day
#Solution: We will write a function that takes in a sleep time and returns what the actual day should be
#Let's see how to move a date
t = sleep_start_times[0]
print(t)
print(pd.DateOffset(-1) + t)
2016-01-01 00:46:55 2015-12-31 00:46:55
#Problem: That moves the sleep 24 hours. We actually don't want the time to bed, just the date
#Solution: Use normalize
print( (pd.DateOffset(-1) + t).normalize())
2015-12-31 00:00:00
def get_nearest_sleepytime(date, cutoff_hour=6):
    """Return the night (as a midnight timestamp) a sleep start time belongs to.

    A sleep that starts in the early morning is attributed to the previous
    calendar day, because it is really the end of that day's evening. Any
    other start time keeps its own calendar day. The time of day is always
    dropped.

    Args:
        date: ``pandas.Timestamp`` at which the sleep started.
        cutoff_hour: latest hour of the day (inclusive) that is still counted
            as belonging to the previous day. The default of 6 means start
            times from 00:00:00 through 06:59:59 are moved back one day.

    Returns:
        ``pandas.Timestamp`` normalized to midnight of the assigned day.
    """
    if 0 <= date.hour <= cutoff_hour:
        #went to bed after midnight: shift back a day, then drop the time
        return (pd.DateOffset(-1) + date).normalize()
    return date.normalize()
print(t, get_nearest_sleepytime(t))
2016-01-01 00:46:55 2015-12-31 00:00:00
#Now apply it to all dates: one corrected night per recorded sleep start time
sleep_dates = np.array([get_nearest_sleepytime(st) for st in sleep_start_times])
print(sleep_dates[:5])
print(sleep_start_times[:5])
[Timestamp('2015-12-31 00:00:00') Timestamp('2016-01-02 00:00:00') Timestamp('2016-01-04 00:00:00') Timestamp('2016-01-03 00:00:00') Timestamp('2016-01-05 00:00:00')] 0 2016-01-01 00:46:55 1 2016-01-02 23:04:51 2 2016-01-04 23:07:02 3 2016-01-04 00:09:01 5 2016-01-05 23:31:01 Name: Start_Time, dtype: datetime64[ns]
#Problem: Now our sleeps are out of order!
#If we try to correlate how much sleep I have tonight vs last night, we will have a problem
#
#Solution: Sort them based on date and re-order them
o_sleep_dates = np.sort(sleep_dates)
#Problem: I can sort my dates, but how do I rearrange my sleep_hours?
#Solution: Use the argsort command
reorder = np.argsort(sleep_dates)
print(reorder[0:5])
[0 1 3 2 4]
o_sleep_dates = sleep_dates[reorder]
o_sleep_hours = sleep_hours[reorder]
print(o_sleep_hours[0:5])
print(o_sleep_dates[0:5])
0 6.447222 1 6.369167 3 4.481944 2 6.040556 4 NaN Name: Seconds_Asleep_Total, dtype: float64 [Timestamp('2015-12-31 00:00:00') Timestamp('2016-01-02 00:00:00') Timestamp('2016-01-03 00:00:00') Timestamp('2016-01-04 00:00:00') Timestamp('2016-01-05 00:00:00')]
#Problem: We have NaNS again!
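#Solution: Our sleep_hours is a Pandas Series, not a Numpy array. It remembers the original indices, so sleep_hours[reorder] looks rows up by label instead of by position.
#Remember we removed all our NaNs previously, so some labels no longer exist. We can convert to a Numpy array using the values attribute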
o_sleep_hours = sleep_hours.values[reorder]
#Problem: We want to know if we have duplicates
#Solution: Get the unique set and see if it's the same length
print(len(np.unique(o_sleep_dates)))
print(len(o_sleep_dates))
91 93
#Problem: We have duplicates!
#Solution: Identify the non-unique elements by seeing which unique elements occur more than once
#np.unique returns the distinct dates together with how many times each one occurs
uniq, counts = np.unique(o_sleep_dates, return_counts=True)
#peek at the first few counts; a value above 1 marks a night with several sleep events
print(counts[0:10])
#keep only the dates that appear more than once
non_uniq = uniq[counts > 1]
print(non_uniq)
[1 1 1 1 1 1 1 1 1 1] [Timestamp('2016-01-19 00:00:00') Timestamp('2016-01-22 00:00:00')]
#Problem: Can we grab all the repeats for the non-unique elements?
#Solution: Mask the ordered hours with each repeated date in turn
for repeated_date in non_uniq:
    same_night = o_sleep_dates == repeated_date
    print(o_sleep_hours[same_night])
[ 0.40555556 4.77138889] [ 7.06666667 0.58083333]
#Problem: We now need to make a new dataset that has no repeats
#Solution: Add up the hours of every sleep that was assigned to the same night,
#so a night recorded as several separate sleep events becomes a single entry
clean_sleep_dates = np.unique(o_sleep_dates)
clean_sleep_hours = np.array(
    [np.sum(o_sleep_hours[o_sleep_dates == night]) for night in clean_sleep_dates],
    dtype=float
)
#Just set them equal to our new ordered, cleaned data
sleep_dates = clean_sleep_dates
sleep_hours = clean_sleep_hours
Build a confidence interval
#Standard error of the mean from the sample standard deviation (ddof=1)
serror = np.std(sleep_hours, ddof=1) / np.sqrt(len(sleep_hours))
#Half-width of a two-sided 95% interval using the normal quantile
margin = ss.norm.ppf(0.975) * serror
mean_hours = np.mean(sleep_hours)
print('I sleep {:.3} +/- {:.2} hours per night'.format(mean_hours, margin))
I sleep 5.85 +/- 0.26 hours per night
See if there is a correlation between tonight and last night
plt.plot(sleep_hours[:-1], sleep_hours[1:], 'o')
plt.xlabel('Hours Slept Last Night')
plt.ylabel('Hours Asleep')
plt.show()
ss.spearmanr(sleep_hours[:-1], sleep_hours[1:])
SpearmanrResult(correlation=-0.18116639914392724, pvalue=0.087480218009651625)
There is no correlation. Perhaps if I don't sleep much for multiple nights, I sleep more afterwards. Let's try the average of the last few nights.
#Problem: Need a way to compute a running mean. We also
#have gaps in our sleep data
#Solution: We'll write a function
def runningMean(t,x, N):
    """Forward-looking running mean of ``x`` over a window of ``N`` days.

    For each sample ``i`` the mean is taken over every sample whose date lies
    in ``[t[i], t[i] + N days)``. Because the window is defined in days and
    not in number of samples, gaps in the dates shorten the window instead of
    pulling in samples from further away.

    Args:
        t: dates of the samples in ascending order; subtracting two elements
            must give an object with a ``days`` attribute (e.g. pandas
            Timestamps).
        x: sample values, same length as ``t``.
        N: window length in days.

    Returns:
        ``numpy.ndarray`` of ``len(x)`` means. Every window contains at least
        the sample itself.

    Side effects:
        Prints 'Shortened running mean at ...' for each sample whose window
        does not end on a sample exactly ``N`` days later (a gap in the dates
        or the end of the data).
    """
    count = len(x)
    y = np.zeros(count)
    for ctr in range(count):
        #stop is one past the last sample inside the window. Starting at
        #ctr + 1 guarantees the sample itself is always included.
        stop = ctr + 1
        while stop < count and (t[stop] - t[ctr]).days < N:
            stop += 1
        #x[ctr:stop] already excludes the sample at stop, so there is no need
        #to step back when that sample is more than N days away
        #Give warning of shortened mean
        if stop == count or (t[stop] - t[ctr]).days != N:
            print('Shortened running mean at {} for length {}'.format(ctr, N))
        y[ctr] = np.mean(x[ctr:stop])
    return y
forward_three = runningMean(sleep_dates, sleep_hours, 3)
print(sleep_dates[0:4])
print(sleep_hours[0:4])
print(forward_three[:2])
Shortened running mean at 26 for length 3 Shortened running mean at 27 for length 3 Shortened running mean at 30 for length 3 Shortened running mean at 34 for length 3 Shortened running mean at 51 for length 3 Shortened running mean at 52 for length 3 Shortened running mean at 88 for length 3 Shortened running mean at 89 for length 3 Shortened running mean at 90 for length 3 [Timestamp('2015-12-31 00:00:00') Timestamp('2016-01-02 00:00:00') Timestamp('2016-01-03 00:00:00') Timestamp('2016-01-04 00:00:00')] [ 6.44722222 6.36916667 4.48194444 6.04055556] [ 6.40819444 5.63055556]
plt.plot(sleep_dates, forward_three)
plt.plot(sleep_dates, sleep_hours)
plt.xlabel('Date')
plt.ylabel('Hours Asleep')
plt.gcf().autofmt_xdate()
#Problem: Why is our average on the first night different than how much we slept?
#Solution: The running mean function looks forward by N on the first night.
#We care about the average of the last few nights
plt.plot(sleep_dates[3:], forward_three[:-3])
plt.plot(sleep_dates, sleep_hours)
plt.xlabel('Date')
plt.ylabel('Hours Asleep')
plt.gcf().autofmt_xdate()
#Problem: The graph looks awful
#Solution: Make it look nice
line = plt.plot(sleep_dates, sleep_hours, linewidth=0.4)
plt.plot(sleep_dates[3:], forward_three[:-3], color=line[0].get_color())
plt.xlabel('Date')
plt.ylabel('Hours Asleep')
plt.gcf().autofmt_xdate()
plt.plot(forward_three[:-3], sleep_hours[3:], 'o')
plt.show()
ss.spearmanr(forward_three[:-3], sleep_hours[3:])
SpearmanrResult(correlation=-0.078616462963615227, pvalue=0.4665701528891194)
#Problem: Still no correlation. Can we check all possible running-averages?
#Solution: Repeat the rank-correlation test for every window from 1 to 14 days
#and plot the p-value against the window length
days_back = list(range(1, 15))
pval = []
for N in days_back:
    running = runningMean(sleep_dates, sleep_hours, N)
    #pair the mean starting at night i with the sleep N entries later
    result = ss.spearmanr(running[:-N], sleep_hours[N:])
    pval.append(result.pvalue)
    print(N, result)
plt.plot(days_back, pval)
plt.ylabel('P-Value')
plt.xlabel('Days Back Included in Analysis')
plt.show()
Shortened running mean at 0 for length 1 Shortened running mean at 28 for length 1 Shortened running mean at 30 for length 1 Shortened running mean at 31 for length 1 Shortened running mean at 32 for length 1 Shortened running mean at 34 for length 1 Shortened running mean at 35 for length 1 Shortened running mean at 53 for length 1 Shortened running mean at 90 for length 1 1 SpearmanrResult(correlation=-0.18116639914392724, pvalue=0.087480218009651625) Shortened running mean at 27 for length 2 Shortened running mean at 28 for length 2 Shortened running mean at 29 for length 2 Shortened running mean at 31 for length 2 Shortened running mean at 33 for length 2 Shortened running mean at 52 for length 2 Shortened running mean at 53 for length 2 Shortened running mean at 89 for length 2 Shortened running mean at 90 for length 2 2 SpearmanrResult(correlation=-0.20287708546135511, pvalue=0.056551433321119331) Shortened running mean at 26 for length 3 Shortened running mean at 27 for length 3 Shortened running mean at 30 for length 3 Shortened running mean at 34 for length 3 Shortened running mean at 51 for length 3 Shortened running mean at 52 for length 3 Shortened running mean at 88 for length 3 Shortened running mean at 89 for length 3 Shortened running mean at 90 for length 3 3 SpearmanrResult(correlation=-0.078616462963615227, pvalue=0.4665701528891194) Shortened running mean at 25 for length 4 Shortened running mean at 26 for length 4 Shortened running mean at 29 for length 4 Shortened running mean at 30 for length 4 Shortened running mean at 31 for length 4 Shortened running mean at 32 for length 4 Shortened running mean at 33 for length 4 Shortened running mean at 50 for length 4 Shortened running mean at 51 for length 4 Shortened running mean at 87 for length 4 Shortened running mean at 88 for length 4 Shortened running mean at 89 for length 4 Shortened running mean at 90 for length 4 4 SpearmanrResult(correlation=-0.085386746373113651, 
pvalue=0.4316608474463457) Shortened running mean at 24 for length 5 Shortened running mean at 25 for length 5 Shortened running mean at 28 for length 5 Shortened running mean at 29 for length 5 Shortened running mean at 49 for length 5 Shortened running mean at 50 for length 5 Shortened running mean at 86 for length 5 Shortened running mean at 87 for length 5 Shortened running mean at 88 for length 5 Shortened running mean at 89 for length 5 Shortened running mean at 90 for length 5 5 SpearmanrResult(correlation=-0.10156139440539648, pvalue=0.35212858518921475) Shortened running mean at 23 for length 6 Shortened running mean at 24 for length 6 Shortened running mean at 27 for length 6 Shortened running mean at 30 for length 6 Shortened running mean at 32 for length 6 Shortened running mean at 48 for length 6 Shortened running mean at 49 for length 6 Shortened running mean at 85 for length 6 Shortened running mean at 86 for length 6 Shortened running mean at 87 for length 6 Shortened running mean at 88 for length 6 Shortened running mean at 89 for length 6 Shortened running mean at 90 for length 6 6 SpearmanrResult(correlation=-0.11612272816103186, pvalue=0.2899039910119301) Shortened running mean at 22 for length 7 Shortened running mean at 23 for length 7 Shortened running mean at 26 for length 7 Shortened running mean at 28 for length 7 Shortened running mean at 29 for length 7 Shortened running mean at 31 for length 7 Shortened running mean at 47 for length 7 Shortened running mean at 48 for length 7 Shortened running mean at 84 for length 7 Shortened running mean at 85 for length 7 Shortened running mean at 86 for length 7 Shortened running mean at 87 for length 7 Shortened running mean at 88 for length 7 Shortened running mean at 89 for length 7 Shortened running mean at 90 for length 7 7 SpearmanrResult(correlation=0.0044952920927407106, pvalue=0.96762842232263013) Shortened running mean at 21 for length 8 Shortened running mean at 22 for length 8 Shortened 
running mean at 25 for length 8 Shortened running mean at 27 for length 8 Shortened running mean at 28 for length 8 Shortened running mean at 46 for length 8 Shortened running mean at 47 for length 8 Shortened running mean at 83 for length 8 Shortened running mean at 84 for length 8 Shortened running mean at 85 for length 8 Shortened running mean at 86 for length 8 Shortened running mean at 87 for length 8 Shortened running mean at 88 for length 8 Shortened running mean at 89 for length 8 Shortened running mean at 90 for length 8 8 SpearmanrResult(correlation=0.01679190630116284, pvalue=0.88023439963072803) Shortened running mean at 20 for length 9 Shortened running mean at 21 for length 9 Shortened running mean at 24 for length 9 Shortened running mean at 26 for length 9 Shortened running mean at 27 for length 9 Shortened running mean at 30 for length 9 Shortened running mean at 31 for length 9 Shortened running mean at 45 for length 9 Shortened running mean at 46 for length 9 Shortened running mean at 82 for length 9 Shortened running mean at 83 for length 9 Shortened running mean at 84 for length 9 Shortened running mean at 85 for length 9 Shortened running mean at 86 for length 9 Shortened running mean at 87 for length 9 Shortened running mean at 88 for length 9 Shortened running mean at 89 for length 9 Shortened running mean at 90 for length 9 9 SpearmanrResult(correlation=-0.03897432548622675, pvalue=0.72810940434431681) Shortened running mean at 19 for length 10 Shortened running mean at 20 for length 10 Shortened running mean at 23 for length 10 Shortened running mean at 25 for length 10 Shortened running mean at 26 for length 10 Shortened running mean at 28 for length 10 Shortened running mean at 29 for length 10 Shortened running mean at 44 for length 10 Shortened running mean at 45 for length 10 Shortened running mean at 81 for length 10 Shortened running mean at 82 for length 10 Shortened running mean at 83 for length 10 Shortened running mean at 84 for 
length 10 Shortened running mean at 85 for length 10 Shortened running mean at 86 for length 10 Shortened running mean at 87 for length 10 Shortened running mean at 88 for length 10 Shortened running mean at 89 for length 10 Shortened running mean at 90 for length 10 10 SpearmanrResult(correlation=0.0068879855465221315, pvalue=0.95133607283110755) Shortened running mean at 18 for length 11 Shortened running mean at 19 for length 11 Shortened running mean at 22 for length 11 Shortened running mean at 24 for length 11 Shortened running mean at 25 for length 11 Shortened running mean at 27 for length 11 Shortened running mean at 30 for length 11 Shortened running mean at 43 for length 11 Shortened running mean at 44 for length 11 Shortened running mean at 80 for length 11 Shortened running mean at 81 for length 11 Shortened running mean at 82 for length 11 Shortened running mean at 83 for length 11 Shortened running mean at 84 for length 11 Shortened running mean at 85 for length 11 Shortened running mean at 86 for length 11 Shortened running mean at 87 for length 11 Shortened running mean at 88 for length 11 Shortened running mean at 89 for length 11 Shortened running mean at 90 for length 11 11 SpearmanrResult(correlation=-0.060103141115799348, pvalue=0.5963911655184273) Shortened running mean at 17 for length 12 Shortened running mean at 18 for length 12 Shortened running mean at 21 for length 12 Shortened running mean at 23 for length 12 Shortened running mean at 24 for length 12 Shortened running mean at 26 for length 12 Shortened running mean at 29 for length 12 Shortened running mean at 42 for length 12 Shortened running mean at 43 for length 12 Shortened running mean at 79 for length 12 Shortened running mean at 80 for length 12 Shortened running mean at 81 for length 12 Shortened running mean at 82 for length 12 Shortened running mean at 83 for length 12 Shortened running mean at 84 for length 12 Shortened running mean at 85 for length 12 Shortened running 
mean at 86 for length 12 Shortened running mean at 87 for length 12 Shortened running mean at 88 for length 12 Shortened running mean at 89 for length 12 Shortened running mean at 90 for length 12 12 SpearmanrResult(correlation=-0.058519961051606624, pvalue=0.60845114820743817) Shortened running mean at 16 for length 13 Shortened running mean at 17 for length 13 Shortened running mean at 20 for length 13 Shortened running mean at 22 for length 13 Shortened running mean at 23 for length 13 Shortened running mean at 25 for length 13 Shortened running mean at 28 for length 13 Shortened running mean at 41 for length 13 Shortened running mean at 42 for length 13 Shortened running mean at 78 for length 13 Shortened running mean at 79 for length 13 Shortened running mean at 80 for length 13 Shortened running mean at 81 for length 13 Shortened running mean at 82 for length 13 Shortened running mean at 83 for length 13 Shortened running mean at 84 for length 13 Shortened running mean at 85 for length 13 Shortened running mean at 86 for length 13 Shortened running mean at 87 for length 13 Shortened running mean at 88 for length 13 Shortened running mean at 89 for length 13 Shortened running mean at 90 for length 13 13 SpearmanrResult(correlation=-0.090023900150482422, pvalue=0.43314192915958449) Shortened running mean at 15 for length 14 Shortened running mean at 16 for length 14 Shortened running mean at 19 for length 14 Shortened running mean at 21 for length 14 Shortened running mean at 22 for length 14 Shortened running mean at 24 for length 14 Shortened running mean at 27 for length 14 Shortened running mean at 40 for length 14 Shortened running mean at 41 for length 14 Shortened running mean at 77 for length 14 Shortened running mean at 78 for length 14 Shortened running mean at 79 for length 14 Shortened running mean at 80 for length 14 Shortened running mean at 81 for length 14 Shortened running mean at 82 for length 14 Shortened running mean at 83 for length 14 
Shortened running mean at 84 for length 14 Shortened running mean at 85 for length 14 Shortened running mean at 86 for length 14 Shortened running mean at 87 for length 14 Shortened running mean at 88 for length 14 Shortened running mean at 89 for length 14 Shortened running mean at 90 for length 14 14 SpearmanrResult(correlation=-0.050134076449865922, pvalue=0.66501358741064176)
running = runningMean(sleep_dates, sleep_hours, 2)
plt.plot(running[:-2], sleep_hours[2:], 'o')
plt.ylabel('Hours Asleep')
plt.xlabel('Average Hours Asleep from Last Two Days')
plt.show()
Shortened running mean at 27 for length 2 Shortened running mean at 28 for length 2 Shortened running mean at 29 for length 2 Shortened running mean at 31 for length 2 Shortened running mean at 33 for length 2 Shortened running mean at 52 for length 2 Shortened running mean at 53 for length 2 Shortened running mean at 89 for length 2 Shortened running mean at 90 for length 2
So the strongest correlation appears with two days back, although with p ≈ 0.057 it is not significant at the 0.05 level.
We'll regress tonight's sleep on the sleep from each of the last two nights. We'll ignore the difficulty of non-contiguous data.
import numpy.linalg as lin
#Response: sleep on each night from the third entry onwards
y = sleep_hours[2:]
#Design matrix columns: intercept, sleep two entries back, sleep one entry back
x_mat = np.column_stack( (np.ones(len(y)), sleep_hours[:-2], sleep_hours[1:-1]) )
#Ordinary least squares: solve the normal equations (X^T X) beta = X^T y.
#Solving the linear system avoids forming the explicit inverse, which is
#slower and less accurate numerically.
x_t = x_mat.transpose()
beta = lin.solve(x_t.dot(x_mat), x_t.dot(y))
y_hat = x_mat.dot(beta)
plt.plot(y_hat, y, 'o')
#reference line where predicted equals observed
plt.plot(np.linspace(4,8,10), np.linspace(4,8,10))
plt.xlabel('Predicted Sleep')
plt.ylabel('Observed Sleep')
plt.show()
resids = y - y_hat
#Shapiro-Wilk test on the residuals
ss.shapiro(resids)
(0.9835946559906006, 0.32443320751190186)
def predicted_sleep(last_night, last_last_night):
    """Predict tonight's hours of sleep from the two previous nights.

    Uses the module-level coefficient vector ``beta``: element 0 is the
    intercept, element 1 multiplies ``last_last_night`` and element 2
    multiplies ``last_night``.
    """
    intercept, coef_last_last, coef_last = beta[0], beta[1], beta[2]
    return intercept + coef_last_last * last_last_night + coef_last * last_night
print(predicted_sleep(0, 0))
print(predicted_sleep(6, 6))
print(predicted_sleep(3, 2))
print(predicted_sleep(8, 8))
8.47651398408 5.76085393616 7.31057033458 4.85563392018
#Problem: Let's do a confidence interval for prediction. How do we get the standard error in prediction?
#Solution: It's the same as the standard error of the residuals, since a residual is observed - prediction
resids_se2 = np.sum(resids**2) / (len(sleep_hours[2:]) - 3)
def predicted_sleep_interval(last_night, last_last_night, confidence=0.90):
    """Return a two-sided interval for tonight's predicted sleep as text.

    The center is the regression prediction from the module-level ``beta``
    (intercept, coefficient for ``last_last_night``, coefficient for
    ``last_night``). The half-width is a Student-t critical value times the
    residual standard error ``sqrt(resids_se2)``; it does not include the
    uncertainty of the fitted coefficients, so it is the same for every input.

    Args:
        last_night: hours slept last night.
        last_last_night: hours slept the night before last.
        confidence: total probability covered by the two-sided interval.

    Returns:
        A string of the form ``'<center> +/- <width>'``.
    """
    center = beta[0] + beta[1] * last_last_night + beta[2] * last_night
    #degrees of freedom: number of fitted points minus the 3 coefficients
    dof = len(sleep_hours[2:]) - 3
    #The interval is reported as +/- width, so split (1 - confidence) evenly
    #between the two tails. ppf(confidence) would be a one-sided quantile and
    #give an interval that only covers 2 * confidence - 1.
    t_crit = ss.t.ppf((1 + confidence) / 2, dof)
    width = t_crit * np.sqrt(resids_se2)
    return '{} +/- {}'.format(center, width)
print(predicted_sleep_interval(0, 0))
print(predicted_sleep_interval(6, 6))
print(predicted_sleep_interval(3, 2))
print(predicted_sleep_interval(8, 8))
8.476513984080393 +/- 1.5804994596185127 5.760853936158573 +/- 1.5804994596185127 7.310570334577289 +/- 1.5804994596185127 4.855633920184634 +/- 1.5804994596185127
#Problem: How do I get the day of week?
#Solution: Websearch reveals this:
plt.plot(pd.to_datetime(sleep_dates).dayofweek, sleep_hours, 'o')
plt.show()
#Problem: Points are hard to see
#Solution: Use swarm plot (from Seaborn)
#x and y are passed by keyword: current seaborn releases no longer accept
#them as positional arguments
sns.swarmplot(x=pd.to_datetime(sleep_dates).dayofweek, y=sleep_hours)
plt.ylabel('Hours Asleep')
plt.xlabel('Day of Week')
plt.show()
#Problem: The day of week is an integer
#Solution: Set the x labels
#pandas numbers the days Monday=0 ... Sunday=6, so the labels must start on Monday
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
#order fixes one category per weekday even if a weekday has no data, which
#keeps the seven labels aligned with the ticks
sns.swarmplot(x=pd.to_datetime(sleep_dates).dayofweek, y=sleep_hours, order=list(range(7)))
plt.gca().set_xticklabels(day_labels)
plt.ylabel('Hours Asleep')
plt.xlabel('Day of Week')
plt.show()
#Problem: Can we provide more information than just the points
#Solution: Use a violin plot. It shows the shape of the distribution for each day;
#with seaborn's default settings the inner marks are a small box plot (median and quartiles)
#pandas numbers the days Monday=0 ... Sunday=6, so the labels must start on Monday
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
#note: violinplot returns the matplotlib Axes it drew on, not a Figure
fig = sns.violinplot(x=pd.to_datetime(sleep_dates).dayofweek, y=sleep_hours, order=list(range(7)))
plt.gca().set_xticklabels(day_labels)
plt.ylabel('Hours Asleep')
plt.xlabel('Day of Week')
plt.show()
This part requires combining two datasets.
#Problem: How will pandas know how to join the two datasets?
#Solution: Create a series and set the index to be the dates. We'll use that for our join
sleep_series = pd.Series(sleep_hours, index=pd.to_datetime(sleep_dates))
calories_series = pd.Series(daily_data.Calories.values, index=pd.to_datetime(daily_data.Date))
print(sleep_series[0:3])
print(calories_series[0:3])
2015-12-31 6.447222 2016-01-02 6.369167 2016-01-03 4.481944 dtype: float64 Date 2016-01-01 2005 2016-01-02 1930 2016-01-03 2024 dtype: int64
#join two datasets
#The inner join means that days missing from either series are discarded
joined_data = pd.concat([sleep_series, calories_series], axis=1, join='inner', keys=['hours','calories'])
plt.plot(joined_data.calories, joined_data.hours, 'o')
plt.xlabel('Calories')
plt.ylabel('Hours Asleep')
#rank correlation between calories burned and hours slept on the same date
ss.spearmanr(joined_data.calories, joined_data.hours)
SpearmanrResult(correlation=-0.076426934059877299, pvalue=0.47401442828725215)