from IPython.display import HTML
# Emit a <style> block in the notebook output: CSS classes .lev1 to .lev6
# get increasing left margins (80px up to 180px).
HTML("""<style>
.lev1 {margin-left: 80px}
.lev2 {margin-left: 100px}
.lev3 {margin-left: 120px}
.lev4 {margin-left: 140px}
.lev5 {margin-left: 160px}
.lev6 {margin-left: 180px}
</style>""")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
from scipy import stats
import seaborn as sns
sns.set_context('notebook')
%config InlineBackend.figure_format = 'retina'
# Container filled step by step below and finally exported as JSON
# to build the visualization
lausanne_marathon_viz = dict()
The data used were parsed from https://services.datasport.com/2016/lauf/lamara/
# Load the parsed race results
trial_dataset = pd.read_csv('../datasets/lausanne_marathon_2016_stefano.csv')
# 'Unnamed: 0' is presumably the index column written when the CSV was saved
trial_dataset = trial_dataset.drop('Unnamed: 0', axis=1)
# remove useless features
trial_dataset = trial_dataset.drop(['rang', 'retard'], axis=1)
# missing fields are stored as the string 'False': turn them into NaN
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2.0)
trial_dataset = trial_dataset.replace('False', np.nan)
print('dataset shape:', trial_dataset.shape)
trial_dataset.head()
dataset shape: (12059, 7)
cat | sex | nom | an | lieu | temps | pace | |
---|---|---|---|---|---|---|---|
0 | 21 | M | Abaidia Jilani | 1966 | St-Légier-La Chiésaz | 1:45.28,4 | 4.59 |
1 | 21 | F | Abaidia Sandrine | 1972 | St-Légier | 1:49.40,8 | 5.11 |
2 | NaN | F | Abaidia Selma | 2006 | St-Légier-La Chiésaz | 7.12,2 | 4.48 |
3 | 21 | M | Abb Jochen | 1948 | Ernen | 2:50.40,7 | 8.05 |
4 | 10 | M | Abbas Dhia | 1961 | Lausanne | 1:13.04,1 | 7.18 |
Also some of us were there :)
# Show the row(s) whose 'nom' column matches this exact name
trial_dataset.loc[trial_dataset.nom=='Lazzari Gianrocco',:]
cat | sex | nom | an | lieu | temps | pace | |
---|---|---|---|---|---|---|---|
6409 | 10 | M | Lazzari Gianrocco | 1988 | Ecublens VD | 42.59,1 | 4.17 |
Counting NaN (`False`) values:
# Number of missing (NaN) entries in each column
trial_dataset.isnull().sum()
cat 812 sex 499 nom 0 an 0 lieu 0 temps 0 pace 0 dtype: int64
(running cells in this section will take longer - better to skip it)
Note that runner names are not unique: several rows can share the same name.
# True only when the number of distinct names equals the number of rows
len(trial_dataset.nom.unique())==trial_dataset.shape[0]
False
Therefore we compute the distribution of rows/runner:
...and it seems there are 38 names with 2 'copies' and even 1 with 3 (runners with same names):
# Inner value_counts: rows per name; outer value_counts: how many names have that many rows
trial_dataset.nom.value_counts().value_counts()
1 11980 2 38 3 1 Name: nom, dtype: int64
Here we print those names that are not unique:
# Print every name that does not appear exactly once.
# The occurrences are counted in a single pass; re-scanning the whole column
# for each name (as done before) is quadratic in the number of rows.
name_counts = trial_dataset.nom.value_counts()
for r in trial_dataset.nom.unique():
    if name_counts.get(r, 0) != 1:
        print(r)
Allaz Vincent Bach Philippe Demont Christophe Di Rosa Giuseppe Dutruy Simon Favre Guillaume Favre Pauline Ferreira Fernanda Ferri Fabio Fournier Pierre Gay Julien Genoud Patrick Gerber Patrick Gerber Thomas Graf Stéphanie Guignard Romain Jalloh Ibrahim Jaquier Philippe Keller Christian Lemay Virginie Lizner Petr Longchamp Philippe Michel Julien Michel Patrick Nyanduga Nzumbe Pittet Nicolas Pittet Olivier Ribeiro Antonio Robert Vincent Rochat Daniel Schneeberger Christian Schwab Philippe Tabone Vincent Trolliet Camille Uldry Marc Verdier Thomas Vouilloz Cédric Weber Julien Weber Nicolas
here is an example:
# All rows registered under this name
trial_dataset[trial_dataset.nom=='Allaz Vincent']
cat | sex | nom | an | lieu | temps | pace | |
---|---|---|---|---|---|---|---|
159 | 10 | M | Allaz Vincent | 1972 | Epalinges | 45.35,3 | 4.33 |
160 | 42 | M | Allaz Vincent | 1988 | Villars-le-Terroir | 3:31.40,5 | 5.00 |
(name,age) is NOT a unique descriptor:
# Print the names shared by several rows that all have the same birth year.
# Names are counted once up front, so only the few repeated names need a
# row selection (the previous version scanned the whole column per name).
name_counts = trial_dataset.nom.value_counts()
for r in trial_dataset.nom.unique():
    if name_counts.get(r, 0) == 1:
        continue
    same_name = trial_dataset[trial_dataset.nom == r]
    if len(same_name['an'].unique()) == 1:
        print(r)
Fournier Pierre Gay Julien Jalloh Ibrahim Lemay Virginie Michel Patrick Nyanduga Nzumbe
... six names are shared by runners with the same name and birth year:
# All rows registered under this name
trial_dataset[trial_dataset.nom=='Michel Patrick']
cat | sex | nom | an | lieu | temps | pace | |
---|---|---|---|---|---|---|---|
7547 | 21 | M | Michel Patrick | 1983 | La Chaux-de-Fonds | 1:56.19,9 | 5.30 |
7548 | 10 | M | Michel Patrick | 1983 | Bulle | 1:06.08,5 | 6.36 |
(name,cat) is also NOT a unique descriptor:
# Print the names shared by several rows that all have the same race category.
# Names are counted once up front, so only the few repeated names need a
# row selection (the previous version scanned the whole column per name).
name_counts = trial_dataset.nom.value_counts()
for r in trial_dataset.nom.unique():
    if name_counts.get(r, 0) == 1:
        continue
    same_name = trial_dataset[trial_dataset.nom == r]
    # unique() keeps NaN as a value of its own, as in the original check
    if len(same_name['cat'].unique()) == 1:
        print(r)
Di Rosa Giuseppe Favre Pauline Genoud Patrick Gerber Thomas Graf Stéphanie Guignard Romain Jaquier Philippe Lizner Petr Pittet Olivier Ribeiro Antonio Schwab Philippe Verdier Thomas Weber Julien Weber Nicolas
example:
# All rows registered under this name
trial_dataset[trial_dataset.nom=='Pittet Olivier']
cat | sex | nom | an | lieu | temps | pace | |
---|---|---|---|---|---|---|---|
8858 | 10 | M | Pittet Olivier | 1967 | Cheseaux-sur-Lausanne | 51.20,9 | 5.08 |
8859 | 10 | NaN | Pittet Olivier | 1999 | Vufflens-la-Ville | 51.13,5 | 5.07 |
(name,age,cat) is a unique descriptor!!:
# Print the names shared by several rows that agree on BOTH race category and
# birth year; nothing printed means (name, year, category) identifies a row.
# Names are counted once up front, so only the few repeated names need a
# row selection (the previous version scanned the whole column per name).
name_counts = trial_dataset.nom.value_counts()
for r in trial_dataset.nom.unique():
    if name_counts.get(r, 0) == 1:
        continue
    same_name = trial_dataset[trial_dataset.nom == r]
    if (len(same_name['cat'].unique()) == 1
            and len(same_name['an'].unique()) == 1):
        print(r)
The following table summarizes correlation study among features
(√ : done, - : not really relevant)
Define the logical conditions for sex/race, useful for dataframe subsetting during this study:
For simplicity we consider only the three main races: 10Km, 1/2 marathon and marathon
# Boolean row masks, one per sex/race combination, built from shared parts
_is_female = trial_dataset['sex'] == 'F'
_is_male = trial_dataset['sex'] == 'M'
_ran_10 = trial_dataset['cat'] == '10'
_ran_21 = trial_dataset['cat'] == '21'
_ran_42 = trial_dataset['cat'] == '42'
f10 = _is_female & _ran_10
f21 = _is_female & _ran_21
f42 = _is_female & _ran_42
m10 = _is_male & _ran_10
m21 = _is_male & _ran_21
m42 = _is_male & _ran_42
# Number of rows per race category (NaN categories are not counted)
trial_dataset.cat.value_counts()
10 5515 21 4414 42 1318 Name: cat, dtype: int64
# Number of rows per sex (NaN entries are not counted)
trial_dataset.sex.value_counts()
M 6905 F 4655 Name: sex, dtype: int64
# Number of runners for every (sex, race category) pair
grouped_df = trial_dataset.groupby(['sex', 'cat'])['nom'].count()
grouped_df
sex cat F 10 2584 21 1449 42 219 M 10 2537 21 2870 42 1089 Name: nom, dtype: int64
Mind that we chose the following palette to be colorblind-friendly
(source: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette )
cbPalette = [
    "#999999", "#E69F00", "#56B4E9",
    "#009E73", "#D55E00", "#CC79A7",
]
# One wedge per (sex, race) group, annotated with its percentage
plt.pie(grouped_df, colors=cbPalette, labels=grouped_df.index,
        autopct='%1.1f%%')
plt.axis('equal');
# Save data for visualization
# order of count lists : 10km, 21km, 42km
counts = {
    label: [int(value) for value in grouped_df[sex_code]]
    for label, sex_code in (("men", "M"), ("women", "F"))
}
lausanne_marathon_viz["counts"] = counts
# Age of each runner: 2016 minus the value of the 'an' column
overall_age_dist = 2016 - trial_dataset['an']
# Fraction of runners observed at every age
_overall_fractions = overall_age_dist.value_counts() / len(overall_age_dist)
plt.plot(_overall_fractions, linestyle='', marker='.')
plt.xlabel('runner\'s age', size=16)
plt.ylabel('fraction of runners', size=16);
# Save data for visualization
age_distrib = overall_age_dist.value_counts().div(len(overall_age_dist))
_overall_entry = {
    "ages": list(map(int, age_distrib.index)),
    "counts": list(map(float, age_distrib.values)),
}
lausanne_marathon_viz["age_distribution"] = {"overall": _overall_entry}
Preparing data:
# Runner ages (2016 minus the 'an' column) for each sex/race subset
age_female_10 = 2016 - trial_dataset.loc[f10, 'an']
age_female_21 = 2016 - trial_dataset.loc[f21, 'an']
age_female_42 = 2016 - trial_dataset.loc[f42, 'an']
age_male_10 = 2016 - trial_dataset.loc[m10, 'an']
age_male_21 = 2016 - trial_dataset.loc[m21, 'an']
age_male_42 = 2016 - trial_dataset.loc[m42, 'an']
# One dotted series per race: fraction of women runners at every age
for _ages, _legend in ((age_female_10, 'women 10Km'),
                       (age_female_21, 'women 1/2 marathon'),
                       (age_female_42, 'women marathon')):
    plt.plot(_ages.value_counts() / len(_ages),
             label=_legend, linestyle='', marker='.')
plt.xlabel('runner\'s age', size=16)
plt.ylabel('fraction of women runners', size=16)
plt.legend();
# Save data for visualization
_women_by_race = {}
for _race, _ages in (("10km", age_female_10),
                     ("21km", age_female_21),
                     ("42km", age_female_42)):
    # fraction of the subset found at each age
    _fractions = _ages.value_counts() / len(_ages)
    _women_by_race[_race] = {
        "ages": [int(age) for age in _fractions.index],
        "counts": [float(count) for count in _fractions.values],
    }
lausanne_marathon_viz["age_distribution"]["women"] = _women_by_race
Statistical test on women's age, by category:
# Two-sample Kolmogorov-Smirnov test on women's ages: 1/2 marathon vs 10 km
stats.ks_2samp(age_female_21,age_female_10)
Ks_2sampResult(statistic=0.069158670333121799, pvalue=0.00025867599203266173)
# Two-sample Kolmogorov-Smirnov test on women's ages: marathon vs 1/2 marathon
stats.ks_2samp(age_female_42,age_female_21)
Ks_2sampResult(statistic=0.082361950140389606, pvalue=0.14421995770930032)
# Two-sample Kolmogorov-Smirnov test on women's ages: marathon vs 10 km
stats.ks_2samp(age_female_42,age_female_10)
Ks_2sampResult(statistic=0.13244659796146291, pvalue=0.0014767471293253094)
Conclusion: only women who run the 10Km are significantly younger than the ones who run 21Km and 42Km:
# Mean age of the women in each race, rounded to two decimals
for _race, _ages in (('10km', age_female_10),
                     ('21km', age_female_21),
                     ('42km', age_female_42)):
    print('women mean age %s:' % _race, np.round(_ages.mean(), 2))
women mean age 10km: 36.11 women mean age 21km: 37.48 women mean age 42km: 39.16
# One dotted series per race: fraction of men runners at every age
for _ages, _legend in ((age_male_10, 'men 10Km'),
                       (age_male_21, 'men 1/2 marathon'),
                       (age_male_42, 'men marathon')):
    plt.plot(_ages.value_counts() / len(_ages),
             label=_legend, linestyle='', marker='.')
plt.xlabel('runner\'s age', size=16)
plt.ylabel('fraction of men runners', size=16)
plt.legend();
# Save data for visualization
_men_by_race = {}
for _race, _ages in (("10km", age_male_10),
                     ("21km", age_male_21),
                     ("42km", age_male_42)):
    # fraction of the subset found at each age
    _fractions = _ages.value_counts() / len(_ages)
    _men_by_race[_race] = {
        "ages": [int(age) for age in _fractions.index],
        "counts": [float(count) for count in _fractions.values],
    }
lausanne_marathon_viz["age_distribution"]["men"] = _men_by_race
Statistical test on men's age, by category:
# Two-sample Kolmogorov-Smirnov test on men's ages: 1/2 marathon vs 10 km
stats.ks_2samp(age_male_21,age_male_10)
Ks_2sampResult(statistic=0.032486447956996067, pvalue=0.11434962823975953)
# Two-sample Kolmogorov-Smirnov test on men's ages: marathon vs 1/2 marathon
stats.ks_2samp(age_male_42,age_male_21)
Ks_2sampResult(statistic=0.11752974790668802, pvalue=5.5649755760621104e-10)
# Two-sample Kolmogorov-Smirnov test on men's ages: marathon vs 10 km
stats.ks_2samp(age_male_42,age_male_10)
Ks_2sampResult(statistic=0.11826437956082847, pvalue=9.1469336246093692e-10)
Conclusion: only men who run the 42Km are significantly older than the ones who run 10Km and 21Km:
# Mean age of the men in each race, rounded to two decimals
for _race, _ages in (('10km', age_male_10),
                     ('21km', age_male_21),
                     ('42km', age_male_42)):
    print('men mean age %s:' % _race, np.round(_ages.mean(), 2))
men mean age 10km: 40.26 men mean age 21km: 39.88 men mean age 42km: 42.15
# Women-vs-men KS test on ages, one result row per race
_age_pairs = [(age_female_10, age_male_10),
              (age_female_21, age_male_21),
              (age_female_42, age_male_42)]
age_sex_df = pd.DataFrame([stats.ks_2samp(_women, _men)
                           for _women, _men in _age_pairs])
# displayed with the race names as row labels (age_sex_df itself is unchanged)
age_sex_df.set_index([['10Km','21Km','42Km']])
statistic | pvalue | |
---|---|---|
10Km | 0.142575 | 3.487088e-23 |
21Km | 0.096879 | 2.448399e-08 |
42Km | 0.144672 | 8.376947e-04 |
Conclusion: women are significantly younger than men, in all competitions
# Mean ages side by side (women first, then men) for each race
for _race, _women, _men in (('10km', age_female_10, age_male_10),
                            ('21km', age_female_21, age_male_21),
                            ('42km', age_female_42, age_male_42)):
    print('mean for %s f/m:' % _race,
          np.round(_women.mean(), 2), np.round(_men.mean(), 2))
mean for 10km f/m: 36.11 40.26 mean for 21km f/m: 37.48 39.88 mean for 42km f/m: 39.16 42.15
# Inner value_counts: runners per town; outer: number of towns of each size
lieu_distr = trial_dataset['lieu'].value_counts().value_counts()
plt.loglog(lieu_distr, linestyle='', marker='.')
plt.xlabel('number of runners, per town', size=16)
plt.ylabel('number of towns', size=16);
# Save data for visualization
lausanne_marathon_viz["towns"] = {
    "num_runners": list(map(int, lieu_distr.index)),
    "num_towns": list(map(int, lieu_distr.values)),
}
# NOTE(review): third-party package, imported here instead of with the other imports
import powerlaw
# Number of runners for each distinct value of 'lieu'
data_for_fit = trial_dataset.lieu.value_counts()
# Fit with the data declared as discrete
fit_runners_race = powerlaw.Fit(data_for_fit,discrete=True)
# Report the fitted parameters under the labels used in the printed output
print('fitted exponent:',fit_runners_race.alpha)
print('fitting error:',fit_runners_race.sigma)
print('xmin of the fit:',fit_runners_race.xmin)
print('xmax of the fit:',fit_runners_race.xmax)
fitted exponent: 1.90231050208 fitting error: 0.0289267466596 xmin of the fit: 2.0 xmax of the fit: None
Calculating best minimal value for power law fit /Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/powerlaw.py:692: RuntimeWarning: invalid value encountered in true_divide (Theoretical_CDF * (1 - Theoretical_CDF))
# Overlay the PDF of the data (dots) and the PDF of the fitted power law
powerlaw.plot_pdf(data_for_fit,marker='.',linestyle='',label='data')
fit_runners_race.power_law.plot_pdf(label='p-l fit')
plt.legend()
plt.xlabel('number of runners, per town',size=16)
plt.ylabel('number of towns',size=16);
Define helpful function for mean of datetime objects:
def my_dt_mean(race, sex):
    """Return the mean finishing time of one race/sex group as 'HH:MM.SS'.

    Args:
        race: value compared with the ``cat`` column (e.g. '21', '42').
        sex: value compared with the ``sex`` column ('F' or 'M').

    Returns:
        The mean of the ``temps`` values of the selected rows, formatted with
        '%H:%M.%S'.

    Raises:
        ValueError: if no row matches, or if a time does not follow the
            '%H:%M.%S,%f' layout.
    """
    selection = trial_dataset[(trial_dataset.cat == race) &
                              (trial_dataset.sex == sex)]
    finish_times = [datetime.datetime.strptime(my_t, '%H:%M.%S,%f')
                    for my_t in selection.temps]
    if not finish_times:
        raise ValueError('no runner found for race %r and sex %r'
                         % (race, sex))
    # datetimes cannot be added together, so average the offsets from the
    # earliest one; the minimum is computed once, not once per element
    earliest = min(finish_times)
    total_offset = sum((my_dt - earliest for my_dt in finish_times),
                       datetime.timedelta(0))
    my_mean = earliest + total_offset / len(finish_times)
    return my_mean.strftime('%H:%M.%S')
# Marathon finishing times of each sex, kept as datetime.time objects
times_42_male = pd.to_datetime(trial_dataset[m42].temps,
                               format='%H:%M.%S,%f').dt.time
times_42_female = pd.to_datetime(trial_dataset[f42].temps,
                                 format='%H:%M.%S,%f').dt.time
# density=True normalizes the histogram; it replaces the `normed` keyword,
# which current matplotlib releases no longer accept
times_42_male.hist(label='male marathon', density=True, alpha=0.3)
times_42_female.hist(label='female marathon', density=True, alpha=0.3)
plt.xlabel('time to complete the race', size=16)
plt.ylabel('fraction of runners', size=16)
plt.legend();
def time_to_seconds(dt_time):
    """Convert an object with hour/minute/second attributes to whole seconds.

    Anything finer than the second (e.g. microseconds) is ignored.
    """
    whole_minutes = 60 * dt_time.hour + dt_time.minute
    return 60 * whole_minutes + dt_time.second
# Save data for visualization
_marathon_seconds = {
    "men": [time_to_seconds(t) for t in times_42_male],
    "women": [time_to_seconds(t) for t in times_42_female],
}
lausanne_marathon_viz["time_distribution"] = {"42km": _marathon_seconds}
Statistical test on marathon time, by sex :
# Two-sample Kolmogorov-Smirnov test on marathon times: women vs men
stats.ks_2samp(times_42_female,times_42_male)
Ks_2sampResult(statistic=0.24675144974024177, pvalue=2.9816388254975668e-10)
Conclusion: men are significantly faster on marathon
# Mean marathon time of women, then of men
print('times on marathon f/m:',my_dt_mean('42','F'),my_dt_mean('42','M'))
times on marathon f/m: 04:16.47 03:54.59
# 1/2 marathon finishing times of each sex, kept as datetime.time objects
times_21_male = pd.to_datetime(trial_dataset[m21].temps,
                               format='%H:%M.%S,%f').dt.time
times_21_female = pd.to_datetime(trial_dataset[f21].temps,
                                 format='%H:%M.%S,%f').dt.time
# density=True normalizes the histogram; it replaces the `normed` keyword,
# which current matplotlib releases no longer accept
times_21_male.hist(label='male 1/2 marathon', density=True, alpha=0.3)
times_21_female.hist(label='female 1/2 marathon', density=True, alpha=0.3)
# plt.xticks(rotation=30)
plt.xlabel('time to complete the race', size=16)
plt.ylabel('fraction of runners', size=16)
plt.legend();
# Save data for visualization
_half_marathon_seconds = {
    "men": [time_to_seconds(t) for t in times_21_male],
    "women": [time_to_seconds(t) for t in times_21_female],
}
lausanne_marathon_viz["time_distribution"]["21km"] = _half_marathon_seconds
Statistical test on 1/2 marathon time, by sex :
# Two-sample Kolmogorov-Smirnov test on 1/2 marathon times: women vs men
stats.ks_2samp(times_21_female,times_21_male)
Ks_2sampResult(statistic=0.36126512817923212, pvalue=1.8953260558079772e-110)
Conclusion: men are significantly faster also on 1/2 marathon
# Mean 1/2 marathon time of women, then of men
print('times on 1/2 marathon f/m:',my_dt_mean('21','F'),my_dt_mean('21','M'))
times on 1/2 marathon f/m: 02:03.28 01:50.10
(As some runners took more than 1 h to finish the race, the times come in 2 different formats, which makes the parsing a little more challenging. For the following solution we give credit to http://stackoverflow.com/questions/23581128/how-to-format-date-string-via-multiple-formats-in-python)
def parsing_10km(my_list):
    """Parse finishing times written either as 'H:MM.SS,f' or as 'MM.SS,f'.

    Args:
        my_list: iterable of time strings.

    Returns:
        A list of ``datetime.datetime`` objects, in input order. A string that
        matches neither layout is skipped, so the result can be shorter than
        the input.
    """
    formats = ('%H:%M.%S,%f', '%M.%S,%f')
    temp_list = []
    for my_t in my_list:
        for fmt in formats:
            try:
                parsed = datetime.datetime.strptime(my_t, fmt)
            except ValueError:
                # not this layout: try the next one
                continue
            temp_list.append(parsed)
            # the first matching layout wins; without this break the other
            # layouts were still tried (and failed) for every value
            break
    return temp_list
# 10 km finishing times of each sex, parsed into datetime objects
times_10_male = parsing_10km(trial_dataset[m10].temps)
times_10_female = parsing_10km(trial_dataset[f10].temps)
# density=True normalizes the histogram; it replaces the `normed` keyword,
# which current matplotlib releases no longer accept
plt.hist(times_10_male, label='male 10km', density=True, alpha=0.3)
plt.hist(times_10_female, label='female 10km', density=True, alpha=0.3)
plt.xticks(rotation=30)
plt.xlabel('time to complete the race', size=16)
plt.ylabel('fraction of runners', size=16)
plt.legend();
# Save data for visualization
_ten_km_seconds = {
    "men": [time_to_seconds(t) for t in times_10_male],
    "women": [time_to_seconds(t) for t in times_10_female],
}
lausanne_marathon_viz["time_distribution"]["10km"] = _ten_km_seconds
Statistical test on 10km time, by sex :
# Two-sample Kolmogorov-Smirnov test on 10 km times: women vs men
stats.ks_2samp(times_10_female,times_10_male)
Ks_2sampResult(statistic=0.40039535615918465, pvalue=6.5151475523402082e-180)
Conclusion: men are significantly faster also on 10Km
# Mean 10 km time per sex. datetimes cannot be added together, so the offsets
# from the earliest time are averaged; that minimum is computed once per list
# (it used to be recomputed for every element, which is quadratic).
_earliest_f10 = min(times_10_female)
f10_mean = _earliest_f10 + sum((my_dt - _earliest_f10
                                for my_dt in times_10_female),
                               datetime.timedelta(0)) / len(times_10_female)
f10_mean = f10_mean.strftime('%H:%M.%S')
_earliest_m10 = min(times_10_male)
m10_mean = _earliest_m10 + sum((my_dt - _earliest_m10
                                for my_dt in times_10_male),
                               datetime.timedelta(0)) / len(times_10_male)
m10_mean = m10_mean.strftime('%H:%M.%S')
print('times on 10Km f/m:', f10_mean, m10_mean)
times on 10Km f/m: 00:59.03 00:51.18
Define helpful function for mean of datetime objects:
def mean_pace(race, my_sex):
    """Return the mean pace of one race/sex group formatted as 'MM.SS'.

    Args:
        race: value compared with the ``cat`` column (e.g. '10', '21', '42').
        my_sex: value compared with the ``sex`` column ('F' or 'M').

    Returns:
        The mean of the ``pace`` values of the selected rows, formatted with
        '%M.%S'.

    Raises:
        ValueError: if no row matches or a pace cannot be parsed.
    """
    def _pace_text(value):
        # A pace is written minutes.seconds. Once stored as a float, 5.10
        # prints as '5.1', which '%M.%S' reads as 5 min 1 s: force two
        # decimals so that the seconds keep both digits.
        if isinstance(value, str):
            return value
        return '{:.2f}'.format(value)

    selection = trial_dataset[(trial_dataset.cat == race) &
                              (trial_dataset.sex == my_sex)]
    my_list = [datetime.datetime.strptime(_pace_text(my_t), '%M.%S')
               for my_t in selection.pace]
    if not my_list:
        raise ValueError('no runner found for race %r and sex %r'
                         % (race, my_sex))
    # average the offsets from the smallest value (datetimes cannot be
    # summed); the minimum is computed once, not once per element
    smallest = min(my_list)
    total_offset = sum((my_dt - smallest for my_dt in my_list),
                       datetime.timedelta(0))
    my_mean = smallest + total_offset / len(my_list)
    return my_mean.strftime('%M.%S')
# Men's pace per race as datetime.time objects. The pace is written
# minutes.seconds, so it is first rendered with two decimals: a float such as
# 5.10 would otherwise be read as '5.1', i.e. 5 min 1 s.
pace_10_male = pd.to_datetime(
    trial_dataset[m10].pace.astype(float).map('{:.2f}'.format),
    format='%M.%S').dt.time
pace_21_male = pd.to_datetime(
    trial_dataset[m21].pace.astype(float).map('{:.2f}'.format),
    format='%M.%S').dt.time
pace_42_male = pd.to_datetime(
    trial_dataset[m42].pace.astype(float).map('{:.2f}'.format),
    format='%M.%S').dt.time
# density=True normalizes the histogram; it replaces the `normed` keyword,
# which current matplotlib releases no longer accept
pace_10_male.hist(alpha=0.2, density=True, label='men 10Km')
pace_21_male.hist(alpha=0.2, density=True, label='men 1/2 marathon')
pace_42_male.hist(alpha=0.2, density=True, label='men marathon')
plt.xlabel('average pace (min/Km)', size=16)
plt.ylabel('fraction of runners', size=16)
plt.legend();
# Save data for visualization
_men_pace_seconds = {
    "10km": [time_to_seconds(p) for p in pace_10_male],
    "21km": [time_to_seconds(p) for p in pace_21_male],
    "42km": [time_to_seconds(p) for p in pace_42_male],
}
lausanne_marathon_viz["pace_distribution"] = {"men": _men_pace_seconds}
Statistical test on men's pace, by race :
# Two-sample Kolmogorov-Smirnov test on men's pace: 10 km vs 1/2 marathon
stats.ks_2samp(pace_10_male,pace_21_male)
Ks_2sampResult(statistic=0.09338418582676733, pvalue=1.0774039160931041e-10)
# Two-sample Kolmogorov-Smirnov test on men's pace: marathon vs 1/2 marathon
stats.ks_2samp(pace_42_male,pace_21_male)
Ks_2sampResult(statistic=0.18151166399503427, pvalue=3.2336052932226477e-23)
# Two-sample Kolmogorov-Smirnov test on men's pace: 10 km vs marathon
stats.ks_2samp(pace_10_male,pace_42_male)
Ks_2sampResult(statistic=0.23316585788367061, pvalue=9.9261501006407161e-37)
Conclusion: men's pace increases significantly with race distance (for all races)
# Mean pace of the men in the 10 km, 1/2 marathon and marathon, in that order
print('men pace (min/Km) for 10/21/42 Km:',mean_pace('10','M'),mean_pace('21','M'),
mean_pace('42','M'))
men pace (min/Km) for 10/21/42 Km: 05.05 05.10 05.31
# Women's pace per race as datetime.time objects. The pace is written
# minutes.seconds, so it is first rendered with two decimals: a float such as
# 5.10 would otherwise be read as '5.1', i.e. 5 min 1 s.
pace_10_female = pd.to_datetime(
    trial_dataset[f10].pace.astype(float).map('{:.2f}'.format),
    format='%M.%S').dt.time
pace_21_female = pd.to_datetime(
    trial_dataset[f21].pace.astype(float).map('{:.2f}'.format),
    format='%M.%S').dt.time
pace_42_female = pd.to_datetime(
    trial_dataset[f42].pace.astype(float).map('{:.2f}'.format),
    format='%M.%S').dt.time
# density=True normalizes the histogram; it replaces the `normed` keyword,
# which current matplotlib releases no longer accept
pace_10_female.hist(alpha=0.2, density=True, label='women 10Km')
pace_21_female.hist(alpha=0.2, density=True, label='women 1/2 marathon')
pace_42_female.hist(alpha=0.2, density=True, label='women marathon')
plt.xlabel('average pace (min/Km)', size=16)
plt.ylabel('fraction of runners', size=16)
plt.legend();
# Save data for visualization
_women_pace_seconds = {
    "10km": [time_to_seconds(p) for p in pace_10_female],
    "21km": [time_to_seconds(p) for p in pace_21_female],
    "42km": [time_to_seconds(p) for p in pace_42_female],
}
lausanne_marathon_viz["pace_distribution"]["women"] = _women_pace_seconds
Statistical test on women's pace, by race :
# Two-sample Kolmogorov-Smirnov test on women's pace: 10 km vs 1/2 marathon
stats.ks_2samp(pace_10_female,pace_21_female)
Ks_2sampResult(statistic=0.042865315462569442, pvalue=0.064161356872410827)
# Two-sample Kolmogorov-Smirnov test on women's pace: 10 km vs marathon
stats.ks_2samp(pace_10_female,pace_42_female)
Ks_2sampResult(statistic=0.13355810961731485, pvalue=0.001307741132536468)
# Two-sample Kolmogorov-Smirnov test on women's pace: 1/2 marathon vs marathon
stats.ks_2samp(pace_21_female,pace_42_female)
Ks_2sampResult(statistic=0.16766404794993239, pvalue=3.7082010609103116e-05)
Conclusion: women's pace is significantly higher only on the marathon
# Mean pace of the women in the 10 km, 1/2 marathon and marathon, in that order
print('women pace (min/Km) for 10/21/42 Km:',mean_pace('10','F'),
mean_pace('21','F'),mean_pace('42','F'))
women pace (min/Km) for 10/21/42 Km: 05.51 05.48 06.02
First we make a copy of the dataframe and transform the pace into a useful time object
# Work on an explicit copy of the rows that have a race category: assigning
# columns on a plain selection triggers pandas' SettingWithCopyWarning and may
# not behave as intended.
copy_df = trial_dataset[trial_dataset.cat.notnull()].copy()
# replace the 'an' column by 2016 minus its value (the runner's age)
copy_df['an'] = 2016 - copy_df['an']
# The pace is written minutes.seconds; render it with two decimals before
# parsing, otherwise a float such as 5.10 becomes '5.1' and is read as 5 min 1 s
copy_df['pace'] = [
    datetime.datetime.strptime('{:.2f}'.format(float(my_t)), '%M.%S').time()
    for my_t in copy_df['pace']
]
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/generic.py:2773: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self[name] = value
# Grid of panels: one column per 'cat' value, one row per 'sex' value,
# colored by 'cat'
g = sns.FacetGrid(copy_df,col='cat',row='sex',hue='cat')
# In every panel, scatter the 'pace' column against the 'an' column
g.map(plt.scatter,'an','pace',s=2)
# g = sns.factorplot(data=copy_df,x='an',y='pace',kind='box',
# col='cat',row='sex',hue='cat')
g.set_ylabels('average pace',size=16)
g.set_xlabels('runner\'s age',size=16);
Age VS Pace correlation
# Pearson correlation between runner's age and average pace, per sex/race.
# The 'an' column is the birth year (ages elsewhere are 2016 - an), so the
# age is derived first: correlating on 'an' directly inverts the sign.
_runner_age = 2016 - trial_dataset.an
# The pace is written minutes.seconds (5.10 means 5 min 10 s), so it is
# converted to seconds rather than used as a decimal number.
_pace_minutes = np.floor(trial_dataset.pace)
_pace_in_seconds = (_pace_minutes * 60
                    + np.round((trial_dataset.pace - _pace_minutes) * 100))
_subsets = [('f10', f10), ('f21', f21), ('f42', f42),
            ('m10', m10), ('m21', m21), ('m42', m42)]
ageVSpace = pd.DataFrame(
    [tuple(stats.pearsonr(_runner_age[_mask], _pace_in_seconds[_mask]))
     for _, _mask in _subsets],
    index=[_name for _name, _ in _subsets],
    columns=['correlation', 'p-value'])
ageVSpace
correlation | p-value | |
---|---|---|
f10 | -0.056288 | 4.207625e-03 |
f21 | -0.064283 | 1.438986e-02 |
f42 | -0.167475 | 1.307315e-02 |
m10 | -0.258214 | 6.346756e-40 |
m21 | -0.195661 | 3.694399e-26 |
m42 | -0.237944 | 1.752555e-15 |
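We can reasonably conclude that the linear correlation between average pace and runner's age is weak (the magnitude of the coefficient stays below about 0.26) for all competitions and for both sexes, although the p-values (all below 0.05) show that it is statistically significant.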
### Run only if necessary
import json
with open('../../website/_data/lausanne_viz.json', 'w') as out_file:
json.dump(lausanne_marathon_viz, out_file)