Name: Jaime Avendaño
Date: 5/20/2021
Data: https://overthecap.com/positional-spending/
NFL Logos: https://raw.githubusercontent.com/statsbylopez/BlogPosts/master/nfl_teamlogos.csv
This notebook scrapes the data from overthecap.com and stored a parquet file to be used for analysis.
NFL Logos are also pulled and resized for use in visualizations.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import urllib.request
import requests
from bs4 import BeautifulSoup
from PIL import Image
import janitor
from sportsipy.nfl.teams import Teams
spending_url = 'https://overthecap.com/positional-spending/'
page = requests.get(spending_url)
page
<Response [200]>
soup = BeautifulSoup(page.content, 'html.parser')
position_spending_content = soup.find_all('table')
dfs = pd.read_html(str(position_spending_content))
year = 2013
for df in dfs:
df['year'] = year
year += 1
nfl_df = pd.concat(dfs)
nfl_df.shape
(383, 14)
nfl_df = nfl_df.clean_names()\
.filter_on('year <= 2021')
nfl_df.shape
(288, 14)
nfl_df.loc[:, nfl_df.columns[1:-1]] = nfl_df[nfl_df.columns[1:-1]].replace('[\$,]', '', regex=True).astype(int)
nfl_df.head()
C:\Users\jaime.avendano\Anaconda3\lib\site-packages\pandas\core\indexing.py:1717: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy isetter(loc, v)
team | qb | rb | wr | te | ol | offense | idl | edge | lb | s | cb | defense | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Eagles | 13385137 | 10203112 | 19241989 | 5509036 | 25664899 | 74004173 | 5264666 | 10241101 | 10004817 | 6920158 | 6032738 | 38463480 | 2013 |
1 | Seahawks | 1557085 | 10799653 | 16831423 | 12778788 | 27955261 | 69922210 | 7701509 | 25013832 | 7377232 | 8579619 | 3507877 | 52180069 | 2013 |
2 | Titans | 6336958 | 15376098 | 12686896 | 6979500 | 26721984 | 68101436 | 6496528 | 8776293 | 5740835 | 10777066 | 9602477 | 41393199 | 2013 |
3 | Broncos | 18716295 | 5070632 | 10120554 | 8013902 | 24318758 | 66240141 | 6880138 | 5313554 | 12951882 | 5894346 | 17559225 | 48599145 | 2013 |
4 | Giants | 21998400 | 5036739 | 8677626 | 2998913 | 24235900 | 62947578 | 5118995 | 9523813 | 7916847 | 12704990 | 11202110 | 46466755 | 2013 |
# Data from https://en.wikipedia.org/wiki/Salary_cap
nfl_salary_cap = [[2013, 123], [2014, 133], [2015, 143.28], [2016, 155.27], [2017, 167], [2018, 177.2], [2019, 188.2], [2020, 198.2], [2021, 182.5]]
cap_df = pd.DataFrame(nfl_salary_cap, columns=['year', 'cap'])
cap_df.cap = cap_df.cap * 1e6
cap_df
year | cap | |
---|---|---|
0 | 2013 | 123000000.0 |
1 | 2014 | 133000000.0 |
2 | 2015 | 143280000.0 |
3 | 2016 | 155270000.0 |
4 | 2017 | 167000000.0 |
5 | 2018 | 177200000.0 |
6 | 2019 | 188200000.0 |
7 | 2020 | 198200000.0 |
8 | 2021 | 182500000.0 |
nfl_df = nfl_df.join(cap_df.set_index('year'), on='year')
nfl_df.team = nfl_df.team.astype('category')
nfl_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 288 entries, 0 to 31 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 team 288 non-null category 1 qb 288 non-null int32 2 rb 288 non-null int32 3 wr 288 non-null int32 4 te 288 non-null int32 5 ol 288 non-null int32 6 offense 288 non-null int32 7 idl 288 non-null int32 8 edge 288 non-null int32 9 lb 288 non-null int32 10 s 288 non-null int32 11 cb 288 non-null int32 12 defense 288 non-null int32 13 year 288 non-null int64 14 cap 288 non-null float64 dtypes: category(1), float64(1), int32(12), int64(1) memory usage: 22.0 KB
nfl_df.to_parquet('teams_spending_df.parquet')
Code based on: https://gist.github.com/Deryck97/dff8d33e9f841568201a2a0d5519ac5e
teams_logos_df = pd.read_csv('https://raw.githubusercontent.com/statsbylopez/BlogPosts/master/nfl_teamlogos.csv')
teams_logos_df.head()
team | team_code | url | |
---|---|---|---|
0 | Arizona Cardinals | ARI | https://upload.wikimedia.org/wikipedia/en/thum... |
1 | Atlanta Falcons | ATL | https://upload.wikimedia.org/wikipedia/en/thum... |
2 | Baltimore Ravens | BAL | https://upload.wikimedia.org/wikipedia/en/thum... |
3 | Buffalo Bills | BUF | https://upload.wikimedia.org/wikipedia/en/thum... |
4 | Carolina Panthers | CAR | https://upload.wikimedia.org/wikipedia/en/thum... |
teams_logos_df.loc[:, 'team'] = teams_logos_df.team.str.split().str[-1]
teams_logos_df.loc[teams_logos_df.team == 'Team', 'team'] = 'Washington'
teams_logos_df['path'] = os.getcwd() + '\\Logos\\' + teams_logos_df.team + '.png'
teams_logos_df = teams_logos_df.drop_duplicates(subset='team')
teams_logos_df.shape
(32, 4)
for i in range(0,len(teams_logos_df)):
img_path = teams_logos_df.path.iloc[i]
urllib.request.urlretrieve(teams_logos_df.url.iloc[i], img_path)
img = Image.open(img_path)
img.thumbnail((100, 100), Image.ANTIALIAS)
img.save(img_path)
teams_logos_df.to_parquet('teams_logos_df.parquet')
team_info = []
for year in range(2013, 2021):
for team in Teams(year):
team_info.append([year, team.name.split()[-1], team.win_percentage, team.points_for, team.points_against])
team_info[:5]
[[2013, 'Broncos', 0.813, 606, 399], [2013, 'Bears', 0.5, 445, 478], [2013, 'Patriots', 0.75, 444, 338], [2013, 'Eagles', 0.625, 442, 382], [2013, 'Cowboys', 0.5, 439, 432]]
team_detail_df = pd.DataFrame(team_info, columns=['year', 'team', 'win_pct', 'points_for', 'points_against'])
team_detail_df.loc[team_detail_df.team == 'Redskins', 'team'] = 'Washington'
team_detail_df.loc[team_detail_df.team == 'Team', 'team'] = 'Washington'
team_detail_df.head()
year | team | win_pct | points_for | points_against | |
---|---|---|---|---|---|
0 | 2013 | Broncos | 0.813 | 606 | 399 |
1 | 2013 | Bears | 0.500 | 445 | 478 |
2 | 2013 | Patriots | 0.750 | 444 | 338 |
3 | 2013 | Eagles | 0.625 | 442 | 382 |
4 | 2013 | Cowboys | 0.500 | 439 | 432 |
team_detail_df.to_parquet('teams_detail_df.parquet')
team_detail_df.shape
(256, 5)