#!/usr/bin/env python
# coding: utf-8

# # Analysis of 5K race results from races run by both Jesse and Hugh
# This notebook analyzes 5K races run by Jesse Bloom and Hugh Haddox.
# 
# ### import modules

# In[1]:


# Render matplotlib figures inline in the notebook output; this runs before any plotting call.
get_ipython().run_line_magic('matplotlib', 'inline')
import scipy.stats
import pandas
import matplotlib
import matplotlib.pyplot as plt
import seaborn
# Use seaborn's 'notebook' plotting context with font elements scaled by a factor of 2.
seaborn.set_context('notebook', font_scale=2)


# ### Read in race results
# First, we read in a table of race results manually compiled from the internet.

# In[2]:


# Load the race table from the CSV that sits next to this notebook.
results = pandas.read_csv(filepath_or_buffer='race_results.csv')

# Echo the full table as plain text, leaving out the row-index column.
results_table_text = results.to_string(index=False)
print(results_table_text)


# ### Plot race results over time
# Next, we look at how the race results have changed over time for each runner, drawing a dashed line at the critical threshold of 20 minutes.

# In[3]:


# Reshape the wide table (one column per runner) into long form with one row
# per (year, runner) pair, which is the layout seaborn's categorical plots expect.
results_melt = pandas.melt(
    results,
    id_vars=['year'],
    value_vars=['Hugh', 'Jesse'],
    var_name='runner',
    value_name='time',
)

# Plot each runner's time per year. `fit_reg` is a regplot/lmplot option and
# not a pointplot parameter, so it is not passed here: seaborn releases that
# forward extra keywords to matplotlib reject it.
ax = seaborn.pointplot(x='year', y='time', hue='runner', data=results_melt)
ax.set_ylabel('time (minutes)')

# Dashed reference line at the 20-minute threshold.
ax.axhline(y=20, color='black', linestyle='--', linewidth=1)
plt.show()


# ### Examine if there is a significant difference between runners
# We perform statistical tests to determine if the runners have significantly different times.
# 
# First, we plot the distributions for the two runners.

# In[4]:


# One marker per race, grouped by runner, to compare the two time distributions.
strip_ax = seaborn.stripplot(x='runner', y='time', data=results_melt, s=10)
strip_ax.set_ylabel('time (minutes)')
plt.show()


# Then we test if they are significantly different using the non-parametric [Mann-Whitney test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test).

# In[5]:


# Ask for the two-sided alternative explicitly. The default for `alternative`
# changed between SciPy releases (older versions reported a one-sided p-value
# when it was omitted), so spelling it out keeps the reported p-value the same
# regardless of the installed SciPy version.
# NOTE(review): each row appears to hold both runners' times for the same
# race, i.e. paired samples -- consider scipy.stats.wilcoxon as a paired
# alternative; confirm against the data before changing the test.
scipy.stats.mannwhitneyu(results['Hugh'], results['Jesse'], alternative='two-sided')


# Therefore, we can conclude with *P < 0.05* that Jesse's times are significantly faster than Hugh's.

# In[ ]: