# load the player statistics CSV from the URL into a DataFrame,
# using the file's first column as the row index
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT4-students/master/kerry/Final/NBA_players_2015.csv'
nba = pd.read_csv(filepath_or_buffer=url, index_col=0)
# examine the columns: as the last expression of a notebook cell, this
# displays the Index of column labels read from the CSV
nba.columns
Index([u'season_end', u'player', u'pos', u'age', u'bref_team_id', u'g', u'gs', u'mp', u'fg', u'fga', u'fg_', u'x3p', u'x3pa', u'x3p_', u'x2p', u'x2pa', u'x2p_', u'ft', u'fta', u'ft_', u'orb', u'drb', u'trb', u'ast', u'stl', u'blk', u'tov', u'pf', u'pts', u'G', u'MP', u'PER', u'TS%', u'3PAr', u'FTr', u'TRB%', u'AST%', u'STL%', u'BLK%', u'TOV%', u'USG%', u'OWS', u'DWS', u'WS', u'WS/48', u'OBPM', u'DBPM', u'BPM', u'VORP'], dtype='object')
# tally how many players are listed at each position
nba['pos'].value_counts()
G 200 F 199 C 79 dtype: int64
Use the following features: assists, steals, blocks, turnovers, personal fouls
# encode each position letter as an integer class label
nba['pos_num'] = nba['pos'].map({'C': 0, 'F': 1, 'G': 2})
# build the feature matrix (X) from an explicit list of column names
feature_cols = ['ast', 'stl', 'blk', 'tov', 'pf']
X = nba[feature_cols]
# alternative construction of X: label-based slice from 'ast' through 'pf'
# (relies on these five columns being adjacent, in this order, in the file)
X = nba.loc[:, 'ast':'pf']
# the response vector (y) is the numeric position code
y = nba['pos_num']
# bring in the K-nearest-neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
# build the model with K=5
knn = KNeighborsClassifier(n_neighbors=5)
# train it on the features and the response
knn.fit(X=X, y=y)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_neighbors=5, p=2, weights='uniform')
Predict for a player with these statistics: 1 assist, 1 steal, 0 blocks, 1 turnover, 2 personal fouls
# create a list to represent a player, in feature order:
# assists, steals, blocks, turnovers, personal fouls
player = [1, 1, 0, 1, 2]
# make a prediction: predict() expects a 2D array-like of shape
# (n_samples, n_features), so wrap the single player in an outer list
# (a bare 1D list raises ValueError in current scikit-learn)
knn.predict([player])
array([2], dtype=int64)
# calculate predicted probabilities for each class; the input must be 2D
# (one row per sample), so the single player is wrapped in an outer list
knn.predict_proba([player])
array([[ 0. , 0.2, 0.8]])
# repeat for K=50
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X, y)
# predict() expects a 2D array-like of shape (n_samples, n_features),
# so pass the single player as a one-row input
knn.predict([player])
array([1], dtype=int64)
# calculate predicted probabilities for each class; the input must be 2D
# (one row per sample), so the single player is wrapped in an outer list
knn.predict_proba([player])
array([[ 0.06, 0.62, 0.32]])
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
# enlarge the default figure and font sizes so plots are easier to read
plt.rcParams.update({'figure.figsize': (6, 4), 'font.size': 14})
# description of assists grouped by position
# NOTE: in pandas >= 0.20, describe() on a grouped Series already returns a
# DataFrame with one row per position and the statistics as columns; a
# trailing unstack() would collapse that table into a Series, so it is omitted
nba.groupby('pos').ast.describe()
| pos | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| C | 79 | 0.945570 | 0.858263 | 0 | 0.40 | 0.80 | 1.15 | 4.4 |
| F | 199 | 1.173367 | 1.086252 | 0 | 0.45 | 0.90 | 1.50 | 7.3 |
| G | 200 | 2.729000 | 2.128287 | 0 | 1.10 | 2.25 | 3.80 | 10.2 |
# box plot of assists grouped by position:
# column='ast' selects the values to plot, by='pos' draws one box per position
nba.boxplot(column='ast', by='pos')
<matplotlib.axes._subplots.AxesSubplot at 0x1740b470>
# histogram of assists grouped by position:
# by='pos' draws one subplot per position, and sharex=True gives the
# subplots a common x-axis so the distributions can be compared directly
nba.hist(column='ast', by='pos', sharex=True)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001740B780>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000182EE7F0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000183102E8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000184730F0>]], dtype=object)