import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
# Load the wellness data from the remote CSV into dfWell, then print the
# per-column dtypes and non-null counts to spot missing values.
dfWell = pd.read_csv('https://www.dropbox.com/s/170bc3dimgn8ru8/wellness.csv?dl=1')
dfWell.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5011 entries, 0 to 5010 Data columns (total 19 columns): Date 5011 non-null object PlayerID 5011 non-null int64 Fatigue 5011 non-null int64 Soreness 5011 non-null int64 Desire 5011 non-null int64 Irritability 5011 non-null int64 BedTime 5011 non-null object WakeTime 5011 non-null object SleepHours 5011 non-null float64 SleepQuality 5011 non-null int64 MonitoringScore 5011 non-null int64 Pain 5011 non-null object Illness 5011 non-null object Menstruation 4995 non-null object Nutrition 4174 non-null object NutritionAdjustment 4266 non-null object USGMeasurement 4843 non-null object USG 629 non-null float64 TrainingReadiness 5011 non-null object dtypes: float64(2), int64(7), object(10) memory usage: 743.9+ KB
# Preview the first rows of the raw data.
dfWell.head()
Date | PlayerID | Fatigue | Soreness | Desire | Irritability | BedTime | WakeTime | SleepHours | SleepQuality | MonitoringScore | Pain | Illness | Menstruation | Nutrition | NutritionAdjustment | USGMeasurement | USG | TrainingReadiness | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-07-21 | 1 | 3 | 3 | 2 | 3 | 23:00:00 | 07:00:00 | 8.00 | 2 | 13 | No | No | Yes | Excellent | Yes | No | NaN | 0% |
1 | 2018-07-21 | 2 | 4 | 3 | 4 | 4 | 23:00:00 | 07:00:00 | 8.00 | 4 | 19 | Yes | No | Yes | NaN | NaN | Yes | 1.010 | 0% |
2 | 2018-07-21 | 3 | 3 | 3 | 5 | 4 | 22:30:00 | 06:30:00 | 8.00 | 4 | 19 | No | No | No | NaN | NaN | Yes | 1.016 | 100% |
3 | 2018-07-21 | 4 | 2 | 3 | 5 | 4 | 00:30:00 | 07:00:00 | 6.50 | 1 | 15 | No | No | Yes | Excellent | Yes | Yes | 1.025 | 95% |
4 | 2018-07-21 | 5 | 5 | 3 | 4 | 4 | 23:45:00 | 07:00:00 | 7.25 | 4 | 20 | No | No | No | Okay | Yes | Yes | 1.022 | 100% |
# Frequency of each Menstruation answer (missing values are not counted).
dfWell.Menstruation.value_counts()
No 4285 Yes 710 Name: Menstruation, dtype: int64
# Frequency of each Nutrition answer (missing values are not counted).
dfWell.Nutrition.value_counts()
Excellent 2713 Okay 1398 Poor 63 Name: Nutrition, dtype: int64
# Frequency of each NutritionAdjustment answer (missing values are not counted).
dfWell.NutritionAdjustment.value_counts()
Yes 3727 No 439 I Don't Know 100 Name: NutritionAdjustment, dtype: int64
# Menstruation, Nutrition and NutritionAdjustment are categorical, so fill
# their missing values with each column's most frequent value (the mode).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute-accessed Series: that chained in-place
# form is deprecated in recent pandas and leaves dfWell unchanged under
# copy-on-write.
dfWell['Menstruation'] = dfWell['Menstruation'].fillna(dfWell['Menstruation'].mode()[0])
dfWell['Nutrition'] = dfWell['Nutrition'].fillna(dfWell['Nutrition'].mode()[0])
dfWell['NutritionAdjustment'] = dfWell['NutritionAdjustment'].fillna(
    dfWell['NutritionAdjustment'].mode()[0]
)
# Confirm the three columns no longer contain nulls.
dfWell.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5011 entries, 0 to 5010 Data columns (total 19 columns): Date 5011 non-null object PlayerID 5011 non-null int64 Fatigue 5011 non-null int64 Soreness 5011 non-null int64 Desire 5011 non-null int64 Irritability 5011 non-null int64 BedTime 5011 non-null object WakeTime 5011 non-null object SleepHours 5011 non-null float64 SleepQuality 5011 non-null int64 MonitoringScore 5011 non-null int64 Pain 5011 non-null object Illness 5011 non-null object Menstruation 5011 non-null object Nutrition 5011 non-null object NutritionAdjustment 5011 non-null object USGMeasurement 4843 non-null object USG 629 non-null float64 TrainingReadiness 5011 non-null object dtypes: float64(2), int64(7), object(10) memory usage: 743.9+ KB
# Re-check the Menstruation distribution after the mode imputation.
dfWell.Menstruation.value_counts()
No 4301 Yes 710 Name: Menstruation, dtype: int64
# Re-check the Nutrition distribution after the mode imputation.
dfWell.Nutrition.value_counts()
Excellent 3550 Okay 1398 Poor 63 Name: Nutrition, dtype: int64
# Re-check the NutritionAdjustment distribution after the mode imputation.
dfWell.NutritionAdjustment.value_counts()
Yes 4472 No 439 I Don't Know 100 Name: NutritionAdjustment, dtype: int64
# USG is missing for roughly 87% of the rows, so imputing it with random or
# mean values would mostly be invented data. Remove both USG-related columns
# instead, then preview the result.
dfWell = dfWell.drop(columns=['USGMeasurement', 'USG'])
dfWell.head()
Date | PlayerID | Fatigue | Soreness | Desire | Irritability | BedTime | WakeTime | SleepHours | SleepQuality | MonitoringScore | Pain | Illness | Menstruation | Nutrition | NutritionAdjustment | TrainingReadiness | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-07-21 | 1 | 3 | 3 | 2 | 3 | 23:00:00 | 07:00:00 | 8.00 | 2 | 13 | No | No | Yes | Excellent | Yes | 0% |
1 | 2018-07-21 | 2 | 4 | 3 | 4 | 4 | 23:00:00 | 07:00:00 | 8.00 | 4 | 19 | Yes | No | Yes | Excellent | Yes | 0% |
2 | 2018-07-21 | 3 | 3 | 3 | 5 | 4 | 22:30:00 | 06:30:00 | 8.00 | 4 | 19 | No | No | No | Excellent | Yes | 100% |
3 | 2018-07-21 | 4 | 2 | 3 | 5 | 4 | 00:30:00 | 07:00:00 | 6.50 | 1 | 15 | No | No | Yes | Excellent | Yes | 95% |
4 | 2018-07-21 | 5 | 5 | 3 | 4 | 4 | 23:45:00 | 07:00:00 | 7.25 | 4 | 20 | No | No | No | Okay | Yes | 100% |
# BedTime and WakeTime are not needed either: SleepHours is used in their place.
dfWell = dfWell.drop(columns=['BedTime', 'WakeTime'])
# Convert TrainingReadiness from a percentage string (e.g. '95%') to a number
# on a 0-7 scale: strip the trailing '%', cast to float, divide by 100 and
# multiply by 7 (so '100%' becomes 7.0 and '0%' becomes 0.0).
# NOTE(review): the factor 7 presumably aligns this column with the range of
# the other survey items — confirm.
dfWell['TrainingReadiness'] = dfWell['TrainingReadiness'].str.rstrip('%').astype('float') / 100.0 * 7
# Inspect the Pain answers before encoding them numerically.
dfWell.Pain.value_counts()
No 4423 Yes 588 Name: Pain, dtype: int64
# Inspect the Illness answers before encoding them numerically.
dfWell.Illness.value_counts()
No 4456 Slightly Off 421 Yes 134 Name: Illness, dtype: int64
# Encode the categorical answers as integers where a larger number is the
# more favourable answer.
# Two-level columns: 'No' -> 1, anything else -> 0.
dfWell['Pain'] = np.where(dfWell['Pain'] == 'No', 1, 0)
dfWell['Menstruation'] = np.where(dfWell['Menstruation'] == 'No', 1, 0)
# Three-level columns: the two listed answers map to 0 and 1, every other
# value falls through to 2.
dfWell['Nutrition'] = np.select(
    [dfWell['Nutrition'] == 'Poor', dfWell['Nutrition'] == 'Okay'],
    [0, 1],
    default=2,
)
dfWell['Illness'] = np.select(
    [dfWell['Illness'] == 'Yes', dfWell['Illness'] == 'Slightly Off'],
    [0, 1],
    default=2,
)
dfWell['NutritionAdjustment'] = np.select(
    [dfWell['NutritionAdjustment'] == 'No', dfWell['NutritionAdjustment'] == 'I Don\'t Know'],
    [0, 1],
    default=2,
)
dfWell.head()
Date | PlayerID | Fatigue | Soreness | Desire | Irritability | SleepHours | SleepQuality | MonitoringScore | Pain | Illness | Menstruation | Nutrition | NutritionAdjustment | TrainingReadiness | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-07-21 | 1 | 3 | 3 | 2 | 3 | 8.00 | 2 | 13 | 1 | 2 | 0 | 2 | 2 | 0.00 |
1 | 2018-07-21 | 2 | 4 | 3 | 4 | 4 | 8.00 | 4 | 19 | 0 | 2 | 0 | 2 | 2 | 0.00 |
2 | 2018-07-21 | 3 | 3 | 3 | 5 | 4 | 8.00 | 4 | 19 | 1 | 2 | 1 | 2 | 2 | 7.00 |
3 | 2018-07-21 | 4 | 2 | 3 | 5 | 4 | 6.50 | 1 | 15 | 1 | 2 | 0 | 2 | 2 | 6.65 |
4 | 2018-07-21 | 5 | 5 | 3 | 4 | 4 | 7.25 | 4 | 20 | 1 | 2 | 1 | 1 | 2 | 7.00 |
# Bartlett's test of sphericity: checks whether the variables are correlated
# at all (i.e. whether the correlation matrix differs from the identity
# matrix). A p-value near zero supports running a factor analysis.
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
# Factor-analysis input: every column except the identifiers (Date, PlayerID)
# and MonitoringScore.
df = dfWell[dfWell.columns.difference(['Date','PlayerID','MonitoringScore'])]
chi_square_value, p_value = calculate_bartlett_sphericity(df)
chi_square_value, p_value
(10209.44506669854, 0.0)
# Kaiser-Meyer-Olkin (KMO) measure of sampling adequacy for the same input.
# kmo_all holds the per-variable values, kmo_model the overall value that is
# displayed below.
from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all, kmo_model = calculate_kmo(df)
kmo_model
0.7230727699430893
# Create factor analysis object and perform an unrotated factor analysis
# with 12 factors, so that the eigenvalues can be inspected.
# NOTE(review): `analyze(data, n_factors, rotation=...)` looks like the pre-0.3
# factor_analyzer API; newer releases appear to use
# `FactorAnalyzer(n_factors=..., rotation=...).fit(data)` — confirm against
# the installed version before upgrading.
fa = FactorAnalyzer()
fa.analyze(df, 12, rotation=None)
# Check Eigenvalues (ev is displayed; v is not used in the visible code)
ev, v = fa.get_eigenvalues()
ev
Original_Eigenvalues | |
---|---|
0 | 2.867840 |
1 | 1.550185 |
2 | 1.163748 |
3 | 1.079430 |
4 | 0.990381 |
5 | 0.955426 |
6 | 0.885087 |
7 | 0.663733 |
8 | 0.566753 |
9 | 0.500336 |
10 | 0.445287 |
11 | 0.331794 |
# Scree plot with matplotlib: eigenvalue against factor number
# (1 .. number of columns in df), drawn as points joined by a line.
factor_numbers = range(1, df.shape[1] + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()
# Re-run the factor analysis with 4 factors and a varimax rotation, then show
# the loadings of each variable on each factor.
# NOTE(review): 4 was presumably chosen because four eigenvalues above exceed
# 1 — confirm.
fa.analyze(df, 4, rotation="varimax")
fa.loadings
Factor1 | Factor2 | Factor3 | Factor4 | |
---|---|---|---|---|
Desire | 0.297775 | 0.014830 | -0.018263 | 0.826713 |
Fatigue | 0.680681 | -0.005057 | 0.027876 | 0.481921 |
Illness | 0.083225 | 0.071866 | 0.125495 | 0.110502 |
Irritability | 0.469493 | 0.046278 | 0.004567 | 0.365455 |
Menstruation | 0.022113 | 0.063481 | 0.020381 | 0.033328 |
Nutrition | 0.047010 | 0.675161 | 0.051184 | -0.029770 |
NutritionAdjustment | -0.015880 | 0.726706 | -0.021174 | -0.000230 |
Pain | 0.030785 | 0.050629 | 0.995425 | 0.039629 |
SleepHours | 0.334700 | 0.057831 | 0.020863 | -0.088249 |
SleepQuality | 0.713932 | 0.088429 | 0.078114 | 0.116014 |
Soreness | 0.382954 | -0.096350 | 0.098340 | 0.393524 |
TrainingReadiness | -0.057163 | 0.057130 | 0.055586 | 0.305239 |
# Variance explained by each factor (SS loadings, proportion, cumulative).
fa.get_factor_variance()
Factor1 | Factor2 | Factor3 | Factor4 | |
---|---|---|---|---|
SS Loadings | 1.554890 | 1.021801 | 1.030533 | 1.334318 |
Proportion Var | 0.129574 | 0.085150 | 0.085878 | 0.111193 |
Cumulative Var | 0.129574 | 0.214724 | 0.300602 | 0.411795 |
# Project every row of the factor-analysis input onto the four rotated loading
# vectors (raw data x loadings) and name the columns after the interpreted
# factors. The scores reuse df's index so that the later column-wise concat
# with dfWell aligns row-for-row even when the index is not the default
# 0..n-1 range.
# NOTE(review): Pain is encoded with 1 for 'No' and loads positively on the
# third factor, so a higher "Discomfort" score presumably means *less* pain —
# confirm the label direction.
df_fact = pd.DataFrame(
    np.dot(df, fa.loadings),
    index=df.index,
    columns=["Energy", "Nourishment", "Discomfort", "Determination"],
)
df_fact
Energy | Nourishment | Discomfort | Determination | |
---|---|---|---|---|
0 | 9.559896 | 3.501880 | 1.985388 | 5.102797 |
1 | 12.702699 | 3.698991 | 1.142109 | 7.796001 |
2 | 11.972553 | 4.232901 | 2.500879 | 10.350426 |
3 | 8.645920 | 3.802447 | 2.167529 | 9.512673 |
4 | 12.738104 | 3.489422 | 2.508064 | 10.583512 |
... | ... | ... | ... | ... |
5006 | 18.614064 | 1.087246 | 2.825409 | 15.476140 |
5007 | 10.961311 | 3.214301 | 2.515009 | 9.804929 |
5008 | 12.822358 | 2.503752 | 2.477125 | 11.418906 |
5009 | 12.524053 | 3.412587 | 2.583743 | 10.473052 |
5010 | 14.886535 | 3.679469 | 2.736708 | 11.853125 |
5011 rows × 4 columns
# Append the four factor-score columns to dfWell (column-wise concat, aligned
# on the row index) and confirm the new columns are fully populated.
dfWell = pd.concat([dfWell, df_fact], axis=1)
dfWell.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5011 entries, 0 to 5010 Data columns (total 19 columns): Date 5011 non-null object PlayerID 5011 non-null int64 Fatigue 5011 non-null int64 Soreness 5011 non-null int64 Desire 5011 non-null int64 Irritability 5011 non-null int64 SleepHours 5011 non-null float64 SleepQuality 5011 non-null int64 MonitoringScore 5011 non-null int64 Pain 5011 non-null int32 Illness 5011 non-null int32 Menstruation 5011 non-null int32 Nutrition 5011 non-null int32 NutritionAdjustment 5011 non-null int32 TrainingReadiness 5011 non-null float64 Energy 5011 non-null float64 Nourishment 5011 non-null float64 Discomfort 5011 non-null float64 Determination 5011 non-null float64 dtypes: float64(6), int32(5), int64(7), object(1) memory usage: 646.1+ KB
# Keep only the identifiers, MonitoringScore and the four factor scores; the
# original survey columns are dropped here.
dfWell = dfWell[["Date","PlayerID","MonitoringScore","Energy","Nourishment","Discomfort","Determination"]]
dfWell
Date | PlayerID | MonitoringScore | Energy | Nourishment | Discomfort | Determination | |
---|---|---|---|---|---|---|---|
0 | 2018-07-21 | 1 | 13 | 9.559896 | 3.501880 | 1.985388 | 5.102797 |
1 | 2018-07-21 | 2 | 19 | 12.702699 | 3.698991 | 1.142109 | 7.796001 |
2 | 2018-07-21 | 3 | 19 | 11.972553 | 4.232901 | 2.500879 | 10.350426 |
3 | 2018-07-21 | 4 | 15 | 8.645920 | 3.802447 | 2.167529 | 9.512673 |
4 | 2018-07-21 | 5 | 20 | 12.738104 | 3.489422 | 2.508064 | 10.583512 |
... | ... | ... | ... | ... | ... | ... | ... |
5006 | 2017-08-01 | 8 | 33 | 18.614064 | 1.087246 | 2.825409 | 15.476140 |
5007 | 2017-08-01 | 10 | 18 | 10.961311 | 3.214301 | 2.515009 | 9.804929 |
5008 | 2017-08-01 | 12 | 21 | 12.822358 | 2.503752 | 2.477125 | 11.418906 |
5009 | 2017-08-01 | 13 | 20 | 12.524053 | 3.412587 | 2.583743 | 10.473052 |
5010 | 2017-08-01 | 14 | 24 | 14.886535 | 3.679469 | 2.736708 | 11.853125 |
5011 rows × 7 columns
# Correlate MonitoringScore with the four factor scores (Date and PlayerID
# are excluded) and draw the matrix as an annotated heatmap.
# Note that this rebinds df, which previously held the factor-analysis input.
df = dfWell[dfWell.columns.difference(['Date','PlayerID'])]
ax = sns.heatmap(df.corr(), annot=True, fmt=".2f")
# NOTE(review): shifting each y-limit by half a cell looks like the usual
# workaround for the matplotlib 3.1.1 bug that clipped the first/last heatmap
# rows — confirm it is still needed with the installed version.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()
# MonitoringScore is removed once the correlation check is done, leaving the
# identifiers plus the four factor scores.
dfWell = dfWell.drop(columns=['MonitoringScore'])
dfWell.head()
Date | PlayerID | Energy | Nourishment | Discomfort | Determination | |
---|---|---|---|---|---|---|
0 | 2018-07-21 | 1 | 9.559896 | 3.501880 | 1.985388 | 5.102797 |
1 | 2018-07-21 | 2 | 12.702699 | 3.698991 | 1.142109 | 7.796001 |
2 | 2018-07-21 | 3 | 11.972553 | 4.232901 | 2.500879 | 10.350426 |
3 | 2018-07-21 | 4 | 8.645920 | 3.802447 | 2.167529 | 9.512673 |
4 | 2018-07-21 | 5 | 12.738104 | 3.489422 | 2.508064 | 10.583512 |