In [311]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
# Load the player salary table from a local CSV export and preview the first rows.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
salaries_csv = '/Users/danielforsyth/Desktop/salaries.csv'
df = pd.read_csv(salaries_csv)
df.head()
Out[311]:
Rk Player Tm Cap Hit 2013-14 2014-15 2015-16 2016-17 2017-18 Signed Using Guaranteed
0 1 Kobe Bryant LAL 30453805 30453805 23500000 25000000 NaN NaN Bird Rights 78953805
1 2 Dirk Nowitzki DAL 22721381 22721381 NaN NaN NaN NaN Bird Rights 22721381
2 3 Amar'e Stoudemire NYK 21679893 21679893 23410988 NaN NaN NaN Bird Rights 45090881
3 4 Joe Johnson BRK 21466718 21466718 23180790 24894863 NaN NaN Bird Rights 69542371
4 5 Carmelo Anthony NYK 21388953 21388953 23333405 NaN NaN NaN Bird Rights 44722359

5 rows × 11 columns

In [313]:
# Keep only the player name and cap-hit columns; the rest of the salary table is dropped.
df = df.loc[:, ['Player', 'Cap Hit']]
df.head()
Out[313]:
Player Cap Hit
0 Kobe Bryant 30453805
1 Dirk Nowitzki 22721381
2 Amar'e Stoudemire 21679893
3 Joe Johnson 21466718
4 Carmelo Anthony 21388953

5 rows × 2 columns

In [282]:
# Load the per-player statistics table (includes PER, G and MP) and preview it.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
per_csv = '/Users/danielforsyth/Desktop/per.csv'
per = pd.read_csv(per_csv)
per.head()
Out[282]:
Rk Player Pos Age Tm G MP PER TS% eFG% FTr 3PAr ORB% DRB% TRB% AST% STL% BLK% TOV% USG%
0 1 DeAndre Liggins SG 25 MIA 1 1 128.3 1.000 1.000 0.000 0.000 100.0 0.0 62.5 0.0 0.0 0.0 0.0 47.5 ...
1 2 Tony Mitchell SF 24 MIL 3 10 31.0 0.600 0.600 0.000 0.200 11.0 0.0 5.7 22.7 5.2 0.0 0.0 22.6 ...
2 3 Kevin Durant SF 25 OKC 69 2654 30.4 0.639 0.564 0.488 0.285 2.2 19.4 11.2 27.2 1.7 1.6 12.4 32.9 ...
3 4 LeBron James PF 29 MIA 66 2489 29.1 0.649 0.612 0.420 0.221 3.6 18.6 11.4 31.7 2.3 0.8 14.4 30.4 ...
4 5 Kevin Love PF 25 MIN 66 2408 27.9 0.594 0.528 0.449 0.350 8.6 29.8 18.8 20.9 1.1 1.0 9.4 28.7 ...

5 rows × 26 columns

In [283]:
# Minutes per game: MP divided by G, computed element-wise for every row.
per['MPG'] = per['MP'].div(per['G'])
In [285]:
# Reduce the stats table to the name, the efficiency rating and minutes per game.
per_columns = ['Player', 'PER', 'MPG']
per = per[per_columns]
In [286]:
# Preview the trimmed stats table (first five rows).
# NOTE(review): the recorded output starts at index 2, which suggests rows were
# filtered in a cell that is not part of this notebook -- confirm before re-running.
per.head(5)
Out[286]:
Player PER MPG
2 Kevin Durant 30.4 38.463768
3 LeBron James 29.1 37.712121
4 Kevin Love 27.9 36.484848
5 Anthony Davis 27.2 36.245902
6 Chris Paul 26.1 34.826923

5 rows × 3 columns

In [287]:
# Join salaries and stats on the player name. An outer join keeps players that
# appear in only one of the two tables; their missing fields become NaN.
final = df.merge(per, how='outer', on='Player')
final.head()
Out[287]:
Rk Player Cap Hit PER MPG
0 1 Kobe Bryant 30453805 10.9 29.500000
1 2 Dirk Nowitzki 22721381 23.2 32.318841
2 3 Amar'e Stoudemire 21679893 18.6 21.296296
3 4 Joe Johnson 21466718 15.1 32.895522
4 5 Carmelo Anthony 21388953 24.9 38.820896

5 rows × 5 columns

In [288]:
# Keep players averaging more than 6.09 minutes per game. Rows whose MPG is NaN
# fail the comparison and are removed as well.
final = final.loc[final['MPG'] > 6.09]
In [290]:
# Drop rows that have a missing value in any column.
# dropna() returns a new frame rather than modifying `final` in place, so the
# result must be rebound; the bare call previously discarded it.
final = final.dropna()
final.head()
Out[290]:
Player Cap Hit PER MPG
0 Kobe Bryant 30453805 10.9 29.500000
1 Dirk Nowitzki 22721381 23.2 32.318841
2 Amar'e Stoudemire 21679893 18.6 21.296296
3 Joe Johnson 21466718 15.1 32.895522
4 Carmelo Anthony 21388953 24.9 38.820896

5 rows × 4 columns

In [314]:
from matplotlib import rcParams

# Plot styling. The pandas `display.mpl_style` option only exists in old pandas
# releases; on versions where it has been removed the assignment raises an
# OptionError (a subclass of AttributeError and KeyError), so it is skipped
# there and matplotlib's own default style is used instead.
try:
    pd.options.display.mpl_style = 'default'
except (AttributeError, KeyError):
    pass

# Default figure size (inches) and resolution for every figure that follows.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
In [315]:
# Scatter of efficiency rating (x) against cap hit (y), one point per player.
# Axis labels are set first so that the scatter call stays the cell's last
# expression and the figure is readable on its own.
plt.xlabel('PER')
plt.ylabel('Cap Hit')
plt.scatter(final['PER'], final['Cap Hit'])
Out[315]:
<matplotlib.collections.PathCollection at 0x112302450>
In [293]:
# Pairwise scatter plots of the three numeric columns, with a kernel density
# estimate on the diagonal.
smaller_frame = final[['Cap Hit', 'PER', 'MPG']]

# scatter_matrix lives in pandas.plotting on current pandas; the older
# pandas.tools.plotting location is kept as a fallback for legacy installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

axeslist = scatter_matrix(smaller_frame, alpha=0.8, figsize=(12, 12), diagonal="kde")

# Turn the grid off on every subplot.
for ax in axeslist.flatten():
    ax.grid(False)
In [294]:
# Pairwise correlation of the numeric columns only. The text column `Player`
# is excluded explicitly because newer pandas no longer drops non-numeric
# columns silently inside corr().
final.select_dtypes(include=['number']).corr()
Out[294]:
Cap Hit PER MPG
Cap Hit 1.000000 0.502899 0.574893
PER 0.502899 1.000000 0.654308
MPG 0.574893 0.654308 1.000000

3 rows × 3 columns

In [295]:
# Show the dtype of every column in the merged frame.
final.dtypes
Out[295]:
Player      object
Cap Hit    float64
PER        float64
MPG        float64
dtype: object
In [296]:
# Remove every row that still contains a missing value before building the
# regression inputs.
final = final.dropna(how='any')
In [297]:
# Build (n, 1) column vectors for scikit-learn. The new axis is added on the
# underlying ndarray (`.values`) because indexing a Series with
# `[:, np.newaxis]` is not supported by current pandas.
# NOTE: `per` is rebound here from the stats DataFrame to a plain array; later
# cells rely on the array form.
cap = final['Cap Hit'].values[:, np.newaxis]
per = final['PER'].values[:, np.newaxis]
In [298]:
# train_test_split lives in sklearn.model_selection on current scikit-learn;
# the older sklearn.cross_validation module is kept as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split PER (feature) and cap hit (target) into train and test parts.
# A fixed random_state makes the split, and therefore the fitted coefficients,
# reproducible across runs. The outputs recorded in this notebook came from an
# unseeded split, so re-running will give slightly different numbers.
X_train, X_test, y_train, y_test = train_test_split(per, cap, random_state=0)
In [299]:
# Fit a linear regression of cap hit on PER using the training part only, then
# predict for both the training and the test part.
clf1 = LinearRegression()
clf1.fit(X_train, y_train)
predicted_train = clf1.predict(X_train)
predicted_test = clf1.predict(X_test)

# 1-D copies of the feature columns, used as x-values when plotting.
trains = X_train.flatten()
tests = X_test.flatten()

# Slope and intercept of the fitted line. Written as a single formatted string
# so the statement is valid, and prints the same text, on Python 2 and 3.
print("%s %s" % (clf1.coef_, clf1.intercept_))
[[ 524954.09866003]] [-2557366.02426694]
In [316]:
# All players in red, with the line fitted on the training split drawn over
# them in translucent blue.
plt.scatter(x=per, y=cap, c='r')
plt.plot(trains, predicted_train, alpha=0.5, c='b')
Out[316]:
[<matplotlib.lines.Line2D at 0x111a9cf90>]
In [301]:
# Refit the regression on every player (no train/test split); this is the model
# used for the college predictions further down.
lr = LinearRegression()
lr.fit(per, cap)

b_0 = lr.intercept_
coeff = lr.coef_

# Predicted cap hit for a PER of 33.68. predict() expects a 2-D array of shape
# (n_samples, n_features), so the single value is wrapped as one row with one
# column instead of being passed as a bare scalar.
pred = lr.predict([[33.68]])
pred
Out[301]:
array([[ 14560859.24396323]])
In [302]:
# Load the college player table and keep players with more than 6.09 minutes
# per game, the same cut-off applied to the merged salary/PER frame.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
ncaa_csv = '/Users/danielforsyth/Desktop/ncaa.csv'
ncaa = pd.read_csv(ncaa_csv)
ncaa = ncaa.loc[ncaa['MPG'] > 6.09]
ncaa.head()
Out[302]:
RK PLAYER GP MPG AST TO USG ORR DRR REBR PER
0 1 Doug McDermott, CREI 33 33.6 6.6 7.7 27.8 5.7 18.5 12.1 33.68
1 2 Alan Williams, UCSB 26 31.7 5.4 10.2 27.1 13.2 28.4 20.8 33.37
2 3 De'Mon Brooks, DAV 26 26.8 7.8 12.9 24.2 10.4 22.6 16.5 32.13
3 4 T.J. Warren, NCST 33 35.3 4.5 9.6 28.0 10.9 13.1 12.0 32.06
4 5 Javonte Green, RAD 30 26.1 6.1 10.8 23.0 9.7 27.7 18.7 30.61

5 rows × 11 columns

In [303]:
# Keep only the player label and PER. An explicit copy makes `ncaa` an
# independent frame, so adding a column to it later does not trigger pandas'
# SettingWithCopyWarning.
ncaa = ncaa[['PLAYER', 'PER']].copy()
ncaa.head()
Out[303]:
PLAYER PER
0 Doug McDermott, CREI 33.68
1 Alan Williams, UCSB 33.37
2 De'Mon Brooks, DAV 32.13
3 T.J. Warren, NCST 32.06
4 Javonte Green, RAD 30.61

5 rows × 2 columns

In [304]:
# (n, 1) column vectors of the college PER values and player labels. The new
# axis is added on the underlying ndarray (`.values`) because indexing a Series
# with `[:, np.newaxis]` is not supported by current pandas.
ncaa_per = ncaa['PER'].values[:, np.newaxis]
ncaa_player = ncaa['PLAYER'].values[:, np.newaxis]
In [305]:
# Predict a salary for every college player with one vectorised call instead of
# one predict() call per row. The input is forced to shape (n, 1) because
# predict() requires a 2-D array, and the result is flattened to one value per
# player.
ncaa_features = np.asarray(ncaa_per, dtype=float).reshape(-1, 1)
predicted_salaries = np.ravel(lr.predict(ncaa_features))

# Keep the original representation: each prediction as a string with two decimals.
predictions = [format(float(salary), '.2f') for salary in predicted_salaries]
In [320]:
# Add the formatted predictions as a new column and show the first 15 players.
salary_column = 'Predicted Salary'
ncaa[salary_column] = predictions
ncaa.head(n=15)
Out[320]:
PLAYER PER Predicted Salary
0 Doug McDermott, CREI 33.68 14560859.24
1 Alan Williams, UCSB 33.37 14406288.32
2 De'Mon Brooks, DAV 32.13 13788004.62
3 T.J. Warren, NCST 32.06 13753101.51
4 Javonte Green, RAD 30.61 13030108.47
5 John Brown, HP 30.56 13005177.68
6 Javon McCrea, BUFF 30.30 12875537.55
7 Frank Kaminsky, WIS 29.73 12591326.49
8 Cameron Bairstow, UNM 29.68 12566395.70
9 Jabari Parker, DUKE 29.46 12456700.20
10 Troy Huff, UND 29.44 12446727.89
11 Billy Baron, CAN 29.38 12416810.93
12 Justin Sears, YALE 28.98 12217364.58
13 Jordan Parks, NCCU 28.90 12177475.31
14 Brad Waldow, SMC 28.83 12142572.19

15 rows × 3 columns