Name: Jaime Avendaño
Date: 5/21/2021
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import matplotlib.ticker as mtick
import seaborn as sns
import plotly.graph_objects as go
from plotly.colors import n_colors
import janitor
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
df = pd.read_parquet('teams_spending_df.parquet')
details_df = pd.read_parquet('teams_detail_df.parquet')
logos_df = pd.read_parquet('teams_logos_df.parquet')
df.shape, details_df.shape, logos_df.shape
((288, 15), (256, 5), (32, 4))
teams = df.team.unique().astype('str')
teams.sort()
positions = df.columns[1:-2]
positions
Index(['qb', 'rb', 'wr', 'te', 'ol', 'offense', 'idl', 'edge', 'lb', 's', 'cb', 'defense'], dtype='object')
positions_pct = [f'{pos}_pct' for pos in positions if pos not in ['defense', 'offense']]
positions_pct
['qb_pct', 'rb_pct', 'wr_pct', 'te_pct', 'ol_pct', 'idl_pct', 'edge_pct', 'lb_pct', 's_pct', 'cb_pct']
def getImage(path):
return OffsetImage(plt.imread(path), zoom=0.35)
Preparing the totals and percentate of cap columns.
df['total'] = df['offense'] + df['defense']
for pos in positions:
df[f'{pos}_pct'] = df[pos] / df.cap
df.head()
team | qb | rb | wr | te | ol | offense | idl | edge | lb | ... | wr_pct | te_pct | ol_pct | offense_pct | idl_pct | edge_pct | lb_pct | s_pct | cb_pct | defense_pct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Eagles | 13385137 | 10203112 | 19241989 | 5509036 | 25664899 | 74004173 | 5264666 | 10241101 | 10004817 | ... | 0.156439 | 0.044789 | 0.208658 | 0.601660 | 0.042802 | 0.083261 | 0.081340 | 0.056261 | 0.049047 | 0.312711 |
1 | Seahawks | 1557085 | 10799653 | 16831423 | 12778788 | 27955261 | 69922210 | 7701509 | 25013832 | 7377232 | ... | 0.136841 | 0.103893 | 0.227279 | 0.568473 | 0.062614 | 0.203364 | 0.059977 | 0.069753 | 0.028519 | 0.424228 |
2 | Titans | 6336958 | 15376098 | 12686896 | 6979500 | 26721984 | 68101436 | 6496528 | 8776293 | 5740835 | ... | 0.103145 | 0.056744 | 0.217252 | 0.553670 | 0.052817 | 0.071352 | 0.046673 | 0.087618 | 0.078069 | 0.336530 |
3 | Broncos | 18716295 | 5070632 | 10120554 | 8013902 | 24318758 | 66240141 | 6880138 | 5313554 | 12951882 | ... | 0.082281 | 0.065154 | 0.197713 | 0.538538 | 0.055936 | 0.043200 | 0.105300 | 0.047922 | 0.142758 | 0.395115 |
4 | Giants | 21998400 | 5036739 | 8677626 | 2998913 | 24235900 | 62947578 | 5118995 | 9523813 | 7916847 | ... | 0.070550 | 0.024381 | 0.197040 | 0.511769 | 0.041618 | 0.077429 | 0.064365 | 0.103293 | 0.091074 | 0.377778 |
5 rows × 28 columns
@interact(team=widgets.Dropdown(options=teams, description='Select team: '))
def f(team):
df_long = df.loc[df.team == team, ['team', 'year','offense_pct','defense_pct']]
df_long = df_long.melt(id_vars=['team', 'year'], value_vars=['offense_pct', 'defense_pct'],
var_name='position', value_name='cap_pct')
plt.figure(figsize=(15, 10))
sns.lineplot(data=df_long, x='year', y='cap_pct', hue='position')
return None;
C:\Users\jaime.avendano\Anaconda3\lib\site-packages\traitlets\traitlets.py:586: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison silent = bool(old_value == new_value)
interactive(children=(Dropdown(description='Select team: ', options=('49ers', 'Bears', 'Bengals', 'Bills', 'Br…
def get_single_year_data(df, year):
df_year = df[df.year == year][['team', 'offense_pct', 'defense_pct']]
df_year = df_year.join(logos_df.set_index('team'), on='team')
return df_year
@interact(plot_year=widgets.Dropdown(options=range(2013, 2021), description='Select year: '))
def f(plot_year):
plot_df = get_single_year_data(df, plot_year)
plt.figure(figsize=(12, 12))
sns.set_style('darkgrid')
ax = sns.scatterplot(data=df, x='offense_pct', y='defense_pct', s=4)
ax.xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
plt.xlim((0.2, 0.6))
plt.ylim((0.2, 0.6))
for x0, y0, path in zip(plot_df.offense_pct, plot_df.defense_pct, plot_df.path):
ab = AnnotationBbox(getImage(path), (x0, y0), frameon=False, fontsize=4)
ax.add_artist(ab)
plt.suptitle(f'NFL - Cap Spending ({ plot_year } highlighted)', fontsize=20, y=0.93)
plt.title('Does not include dead money for the year.', fontsize=15)
plt.xlabel('% of Cap spent on Offense', fontsize=18)
plt.ylabel('% of Cap spent on Defense', fontsize=18)
plt.show()
return None;
interactive(children=(Dropdown(description='Select year: ', options=(2013, 2014, 2015, 2016, 2017, 2018, 2019,…
team_avg_spending = df[['team', 'offense_pct', 'defense_pct']].groupby('team').mean().reset_index()
team_avg_spending.describe()
offense_pct | defense_pct | |
---|---|---|
count | 32.000000 | 32.000000 |
mean | 0.448416 | 0.418107 |
std | 0.032713 | 0.035482 |
min | 0.378094 | 0.343014 |
25% | 0.434256 | 0.392952 |
50% | 0.449247 | 0.427205 |
75% | 0.468599 | 0.443904 |
max | 0.508775 | 0.479979 |
team_avg_spending.sort_values('offense_pct').head()
team | offense_pct | defense_pct | |
---|---|---|---|
24 | Ravens | 0.378094 | 0.441465 |
3 | Bills | 0.388025 | 0.450515 |
12 | Dolphins | 0.390519 | 0.453997 |
16 | Jaguars | 0.402557 | 0.440993 |
4 | Broncos | 0.408136 | 0.478347 |
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score,v_measure_score
positions_pct
['qb_pct', 'rb_pct', 'wr_pct', 'te_pct', 'ol_pct', 'idl_pct', 'edge_pct', 'lb_pct', 's_pct', 'cb_pct']
#kmeans_data = df.join(details_df.set_index(['year', 'team']), on=['year', 'team'], how='inner')[positions_pct + ['win_pct']].copy()
#kmeans_data = df.join(details_df.set_index(['year', 'team']), on=['year', 'team'], how='inner')[['offense_pct', 'defense_pct']].copy()
kmeans_data = df.join(details_df.set_index(['year', 'team']), on=['year', 'team'], how='inner')[positions_pct].copy()
kmeans_data.head()
qb_pct | rb_pct | wr_pct | te_pct | ol_pct | idl_pct | edge_pct | lb_pct | s_pct | cb_pct | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.108822 | 0.082952 | 0.156439 | 0.044789 | 0.208658 | 0.042802 | 0.083261 | 0.081340 | 0.056261 | 0.049047 |
1 | 0.012659 | 0.087802 | 0.136841 | 0.103893 | 0.227279 | 0.062614 | 0.203364 | 0.059977 | 0.069753 | 0.028519 |
2 | 0.051520 | 0.125009 | 0.103145 | 0.056744 | 0.217252 | 0.052817 | 0.071352 | 0.046673 | 0.087618 | 0.078069 |
3 | 0.152165 | 0.041225 | 0.082281 | 0.065154 | 0.197713 | 0.055936 | 0.043200 | 0.105300 | 0.047922 | 0.142758 |
4 | 0.178849 | 0.040949 | 0.070550 | 0.024381 | 0.197040 | 0.041618 | 0.077429 | 0.064365 | 0.103293 | 0.091074 |
kmeans_data['skill_pos_pct'] = kmeans_data['rb_pct'] + kmeans_data['wr_pct'] + kmeans_data['te_pct']
kmeans_data['def_front_pct'] = kmeans_data['idl_pct'] + kmeans_data['edge_pct'] + kmeans_data['lb_pct']
kmeans_data['def_backs_pct'] = kmeans_data['s_pct'] + kmeans_data['cb_pct']
kmeans_data = kmeans_data.drop(columns=['rb_pct', 'wr_pct', 'te_pct', 'idl_pct', 'edge_pct', 'lb_pct', 's_pct', 'cb_pct'])
kmeans_data.describe()
qb_pct | ol_pct | skill_pos_pct | def_front_pct | def_backs_pct | |
---|---|---|---|---|---|
count | 256.000000 | 256.000000 | 256.000000 | 256.000000 | 256.000000 |
mean | 0.094615 | 0.158892 | 0.184358 | 0.259330 | 0.149787 |
std | 0.048310 | 0.041257 | 0.045228 | 0.059617 | 0.049788 |
min | 0.007666 | 0.040623 | 0.076744 | 0.089453 | 0.050437 |
25% | 0.053108 | 0.130671 | 0.156896 | 0.215515 | 0.111904 |
50% | 0.095976 | 0.154496 | 0.184788 | 0.259532 | 0.151131 |
75% | 0.133372 | 0.190164 | 0.212091 | 0.305886 | 0.182531 |
max | 0.237667 | 0.281236 | 0.328535 | 0.404949 | 0.332771 |
kmeans_kwargs = {
'init': 'k-means++',
'n_init': 10,
'max_iter': 100,
'random_state': 42
}
km_scores= []
km_silhouette = []
db_score = []
for k in range(2, 11):
km = KMeans(n_clusters=k, **kmeans_kwargs)
km.fit(kmeans_data)
preds = km.predict(kmeans_data)
km_scores.append(-km.score(kmeans_data))
km_silhouette.append(silhouette_score(kmeans_data, preds))
db_score.append(davies_bouldin_score(kmeans_data, preds))
fig, axs = plt.subplots(3, sharex=True)
fig.set_figheight(10)
fig.set_figwidth(15)
axs[0].scatter(x=range(2, 11), y=km_scores)
axs[0].set_title('The elbow method with Score')
axs[0].set(ylabel='WCSS')
axs[1].scatter(x=range(2, 11), y=km_silhouette)
axs[1].set_title('KMeans with Silhouette Score')
axs[1].set(ylabel='Silhouette Score')
axs[2].scatter(x=range(2, 11), y=db_score)
axs[2].set_title('KMeans with Davies-Bouldin Score')
axs[2].set(ylabel='DB Score')
plt.show();
km = KMeans(n_clusters=6, **kmeans_kwargs)
nfl_pred = km.fit_predict(kmeans_data)
df_grouped = df[df.year <= 2020].copy()
df_grouped['group'] = nfl_pred
df_grouped['teams'] = df_grouped[df_grouped.year == 2019][['group', 'team']].reset_index().groupby(['group'])['team'].transform(lambda x: ', '.join(x))
group_2019_df = df_grouped[df_grouped.year == 2019][['group', 'teams']].drop_duplicates()
group_2019_df
group | teams | |
---|---|---|
0 | 2 | Packers, Chargers, Giants |
1 | 4 | Buccaneers, Rams, Bills |
3 | 0 | Washington, Raiders, Titans, Lions, Patriots, ... |
4 | 1 | Bengals, Browns, Cowboys, Colts, Chiefs, Eagle... |
7 | 5 | 49ers, Saints, Falcons, Seahawks, Steelers, Pa... |
13 | 3 | Vikings, Texans, Bears |
km_centers_df = pd.DataFrame(km.cluster_centers_, columns=['qb_pct', 'ol_pct', 'skill_pos_pct', 'def_front_pct', 'def_backs_pct'])
km_centers_df
qb_pct | ol_pct | skill_pos_pct | def_front_pct | def_backs_pct | |
---|---|---|---|---|---|
0 | 0.119513 | 0.150984 | 0.158227 | 0.186586 | 0.196191 |
1 | 0.044367 | 0.191741 | 0.201702 | 0.232452 | 0.155091 |
2 | 0.135194 | 0.161479 | 0.200706 | 0.222426 | 0.116409 |
3 | 0.071422 | 0.130103 | 0.189287 | 0.301982 | 0.183293 |
4 | 0.068724 | 0.165448 | 0.217353 | 0.326246 | 0.097044 |
5 | 0.119417 | 0.172083 | 0.127278 | 0.299409 | 0.134926 |
km_table = km_centers_df.join(group_2019_df.set_index('group'))
km_table['qb_rank'] = km_table.qb_pct.rank(method='first', ascending=True).astype('int') - 1
km_table['ol_rank'] = km_table.ol_pct.rank(method='first', ascending=True).astype('int') - 1
km_table['skill_pos_rank'] = km_table.skill_pos_pct.rank(method='first', ascending=True).astype('int') - 1
km_table['def_front_rank'] = km_table.def_front_pct.rank(method='first', ascending=True).astype('int') - 1
km_table['def_backs_rank'] = km_table.def_backs_pct.rank(method='first', ascending=True).astype('int') - 1
km_table
qb_pct | ol_pct | skill_pos_pct | def_front_pct | def_backs_pct | teams | qb_rank | ol_rank | skill_pos_rank | def_front_rank | def_backs_rank | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.119513 | 0.150984 | 0.158227 | 0.186586 | 0.196191 | Washington, Raiders, Titans, Lions, Patriots, ... | 4 | 1 | 1 | 0 | 5 |
1 | 0.044367 | 0.191741 | 0.201702 | 0.232452 | 0.155091 | Bengals, Browns, Cowboys, Colts, Chiefs, Eagle... | 0 | 5 | 4 | 2 | 3 |
2 | 0.135194 | 0.161479 | 0.200706 | 0.222426 | 0.116409 | Packers, Chargers, Giants | 5 | 2 | 3 | 1 | 1 |
3 | 0.071422 | 0.130103 | 0.189287 | 0.301982 | 0.183293 | Vikings, Texans, Bears | 2 | 0 | 2 | 4 | 4 |
4 | 0.068724 | 0.165448 | 0.217353 | 0.326246 | 0.097044 | Buccaneers, Rams, Bills | 1 | 3 | 5 | 5 | 0 |
5 | 0.119417 | 0.172083 | 0.127278 | 0.299409 | 0.134926 | 49ers, Saints, Falcons, Seahawks, Steelers, Pa... | 3 | 4 | 0 | 3 | 2 |
colors = np.array(n_colors('rgb(225, 225, 255)', 'rgb(125, 125, 255)', 6, colortype='rgb'))
fig = go.Figure(data=[go.Table(
header=dict(
values=['<b>Cluster</b>', '<b>Teams</b>', '<b>QB %</b>', '<b>OL %</b>', '<b>Skill Pos %</b>', '<b>Def Front %</b>', '<b>Def Backs %</b>'],
align=['left', 'left', 'center'],
font=dict(color='black', size=15),
fill_color='white',
line_color='black'
),
columnwidth=[5,30,10,10,10,10,10],
cells=dict(
values=[km_table.index.tolist(), km_table.teams, km_table.qb_pct, km_table.ol_pct, km_table.skill_pos_pct, km_table.def_front_pct, km_table.def_backs_pct],
format=[None, None, '.3p'],
line_color='black',
fill_color=[colors[km_table.index.tolist()],
colors[km_table.index.tolist()],
colors[km_table.qb_rank],
colors[km_table.ol_rank],
colors[km_table.skill_pos_rank],
colors[km_table.def_front_rank],
colors[km_table.def_backs_rank]],
align=['left', 'left', 'center'],
font=dict(color='black', size=13)
)),
])
fig.update_layout(title=dict(text='NFL Cap Spending for 2019', y=0.95),
margin=dict(l=20, r=20, t=55, b=20),
height=250)
fig.show()
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
df.shape
(288, 28)
win_stats_df = df[['team', 'year'] + positions_pct].join(details_df.set_index(['year', 'team']), on=['year', 'team'], how='inner')
win_stats_df.head()
team | year | qb_pct | rb_pct | wr_pct | te_pct | ol_pct | idl_pct | edge_pct | lb_pct | s_pct | cb_pct | win_pct | points_for | points_against | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Eagles | 2013 | 0.108822 | 0.082952 | 0.156439 | 0.044789 | 0.208658 | 0.042802 | 0.083261 | 0.081340 | 0.056261 | 0.049047 | 0.625 | 442 | 382 |
1 | Seahawks | 2013 | 0.012659 | 0.087802 | 0.136841 | 0.103893 | 0.227279 | 0.062614 | 0.203364 | 0.059977 | 0.069753 | 0.028519 | 0.813 | 417 | 231 |
2 | Titans | 2013 | 0.051520 | 0.125009 | 0.103145 | 0.056744 | 0.217252 | 0.052817 | 0.071352 | 0.046673 | 0.087618 | 0.078069 | 0.438 | 362 | 381 |
3 | Broncos | 2013 | 0.152165 | 0.041225 | 0.082281 | 0.065154 | 0.197713 | 0.055936 | 0.043200 | 0.105300 | 0.047922 | 0.142758 | 0.813 | 606 | 399 |
4 | Giants | 2013 | 0.178849 | 0.040949 | 0.070550 | 0.024381 | 0.197040 | 0.041618 | 0.077429 | 0.064365 | 0.103293 | 0.091074 | 0.438 | 294 | 383 |
corr = win_stats_df.drop(columns=['team', 'year']).corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
f, ax = plt.subplots(figsize=(18, 15))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.set(font_scale = 1.5)
sns.heatmap(corr, mask=mask, cmap=cmap,
annot=True, fmt='.2f', annot_kws={'size': 14},
vmax=1, vmin=-1, center=0,
square=True, linewidths=.5,
cbar_kws={'shrink': 0.5})
plt.title('Correlation between Positional Spending and Win Percentage.', fontsize=20)
max_year = win_stats_df.year.max()
max_year
2020
# Dividing train and test by max_year so that season doesn't creep into the data.
X_train = win_stats_df.loc[win_stats_df.year != max_year, positions_pct]
y_train = win_stats_df.loc[win_stats_df.year != max_year, 'win_pct']
X_test = win_stats_df.loc[win_stats_df.year == max_year, positions_pct]
y_test = win_stats_df.loc[win_stats_df.year == max_year, 'win_pct']
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((224, 10), (224,), (32, 10), (32,))
linear_reg = LinearRegression(fit_intercept = True)
linear_reg.fit(X_train,y_train)
LinearRegression()
y_predict = linear_reg.predict(X_test)
y_predict[:5]
array([0.5885631 , 0.71246849, 0.45979239, 0.58286972, 0.52654104])
linear_r2 = r2_score(y_test, y_predict)
linear_rmse = np.sqrt(mean_squared_error(y_test, y_predict))
print(f'Linear Regression: R2 = {linear_r2:.4f}')
print(f'Linear Regression: RMSE = {linear_rmse:.4f}')
Linear Regression: R2 = 0.2395 Linear Regression: RMSE = 0.1857
Tested with degree=2 and degree=3. Both cases result in negative R2.
poly_regressor = PolynomialFeatures(degree=2)
X_columns = poly_regressor.fit_transform(X_train)
poly_reg = LinearRegression()
poly_reg.fit(X_columns, y_train)
LinearRegression()
y_predict = poly_reg.predict(poly_regressor.transform(X_test))
y_predict[:5]
array([0.14269698, 0.77947322, 0.36924222, 0.53488198, 0.42912787])
poly_r2 = r2_score(y_test, y_predict)
poly_rmse = np.sqrt(mean_squared_error(y_test, y_predict))
print(f'Polynomial Regression: R2 = {poly_r2:.4f}')
print(f'Polynomial Regression: RMSE = {poly_rmse:.4f}')
Polynomial Regression: R2 = -0.2126 Polynomial Regression: RMSE = 0.2345
dec_tree_regressor = DecisionTreeRegressor(random_state=42)
dec_tree_regressor.fit(X_train,y_train)
DecisionTreeRegressor(random_state=42)
y_predict = dec_tree_regressor.predict(X_test)
y_predict[:5]
array([0.438, 0.813, 0.25 , 0.813, 0.375])
dec_tree_r2 = r2_score(y_test, y_predict)
dec_tree_rmse = np.sqrt(mean_squared_error(y_test, y_predict))
print(f'Decision Tree Regression: R2 = {dec_tree_r2:.4f}')
print(f'Decision Tree Regression: RMSE = {dec_tree_rmse:.4f}')
Decision Tree Regression: R2 = -0.5154 Decision Tree Regression: RMSE = 0.2622
There is some correlation between the spending, with QB, TE, Edge, and Safety spending somewhat more correlated than the other positions.
However, none of the model produced any actionable predictions. With nine independent variables, we likely need more than just 288 observations to really find a pattern.