#!/usr/bin/env python
# coding: utf-8

# ## IDENTIFYING THE MOST COMPARABLE NBA PLAYERS FOR THE 2017 DRAFT CLASS USING PCA DIMENSIONALITY REDUCTION AND THE K NEAREST NEIGHBORS ALGORITHM
#
# **It's so common to hear talking heads tell us how Lonzo Ball looks like the next Jason Kidd, or how John Jackson is a better-shooting version of Kawhi Leonard. But there's a lot of inherent bias in those prognostications. One, they're limited to players we're familiar with; two, we choose to see certain aspects of a player's game when we want to describe someone as a great shooter, passer, or rebounder. The goal of this analysis is to strip away those biases and get the most accurate comparisons possible, using the best data we have available.**
#
# The approach is straightforward, and is outlined below before digging into the code.
#
# * Take every player who has been drafted since 2010 and who also played in the NCAA
# * Append their basic and advanced college stats from CBB Reference
# * Take all NCAA players from this season and retrieve their advanced stats as well
# * Since we have about 36 different statistics, there's a lot of covariance among our features, so we'll perform "dimensionality reduction" to reduce them to the fewest number of features that still explain the variance we see in the dataset
# * Take every player and measure their euclidean distance to every other player in the dataset
# * Limit the dataset to NCAA players compared to NBA players (this is the comparison we wanted to make from the beginning)
# * Return every NBA player and sort by ascending distance metric
# * Limit to Chad Ford's top 50 and export to CSV (a sketch of this last step appears at the end of the notebook)

# In[ ]:

#import necessary libraries
import pandas as pd
import numpy as np
from time import sleep
from scipy.spatial import distance
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
from sklearn.decomposition import PCA


# In[158]:

#read in datasets
nba = pd.read_csv("nba_draft_picks_final.csv")
df = pd.read_csv("ncaa_stats.csv")
df.drop('Unnamed: 0', axis=1, inplace=True)


# In[228]:

#scrape data from bbref if pulling data for the first time
#df = pd.read_html("http://www.sports-reference.com/cbb/play-index/psl_finder.cgi?request=1&match=single&year_min=2011&year_max=&conf_id=&school_id=&class_is_fr=Y&class_is_so=Y&class_is_jr=Y&class_is_sr=Y&pos_is_g=Y&pos_is_gf=Y&pos_is_fg=Y&pos_is_f=Y&pos_is_fc=Y&pos_is_cf=Y&pos_is_c=Y&games_type=A&qual=pts_per_g&c1stat=mp_per_g&c1comp=gt&c1val=10&c2stat=pts_per_g&c2comp=gt&c2val=5&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=bpm&order_by_asc=")[0]

#i = 100
#while(i < 15000):
#    print("Number of players retrieved:", str(i))
#    df = df.append(pd.read_html("http://www.sports-reference.com/cbb/play-index/psl_finder.cgi?request=1&match=single&year_min=2011&year_max=&conf_id=&school_id=&class_is_fr=Y&class_is_so=Y&class_is_jr=Y&class_is_sr=Y&pos_is_g=Y&pos_is_gf=Y&pos_is_fg=Y&pos_is_f=Y&pos_is_fc=Y&pos_is_cf=Y&pos_is_c=Y&games_type=A&qual=pts_per_g&c1stat=mp_per_g&c1comp=gt&c1val=10&c2stat=pts_per_g&c2comp=gt&c2val=5&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=bpm&order_by_asc=&offset="+str(i))[0])
#    i = i + 100
#    sleep(10)

#realign columns properly
#cols = ['Rk', 'Player', 'Class', 'Season',
#        'Pos', 'School', 'Conf', 'G', 'MP', 'MP.1', 'FG', 'FGA', '2P', '2PA',
#        '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
#        'TOV', 'PF', 'PTS', 'PER', 'TS%', 'eFG%', 'ORB%', 'DRB%', 'TRB%',
#        'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'PProd', 'ORtg', 'DRtg',
#        'OWS', 'DWS', 'WS', 'OBPM', 'DBPM', 'BPM', 'drop1', 'drop2', 'drop3']
#df.columns = cols
#df.drop(['drop1','drop2','drop3'], axis=1, inplace=True)

#drop the repeated header rows that bbref inserts into the table
#df = df.drop(df[df.Class == 'Advanced'].index)
#df = df.drop(df[df.Class == 'Class'].index)

#send to CSV for perpetuity
#df.to_csv("ncaa_stats.csv")


# In[229]:

#keep only each player's most recent season (one row per player)
df_new = df.groupby(['Player'])['Season'].transform(max) == df['Season']
stats = df[df_new]


# In[230]:

#limit to the few columns we need from the NBA dataset
nba = nba[['Player', 'Rd', 'Pk', 'Year']]


# In[231]:

#merge nba players and get their ncaa stats
draft_stats = pd.merge(nba, stats, on='Player', how='left')
draft_stats = draft_stats.dropna(subset=['Class'])


# In[233]:

#limit to the most recent NCAA season and drop the raw MP column (we only want Min/Gm)
test = stats[stats['Season'] == '2016-17']
test.drop('MP', axis=1, inplace=True)
draft_stats_test = draft_stats[draft_stats['Year'] < 2017]


# In[236]:

#drop unwanted columns
draft_stats_vars = draft_stats_test.drop(['Rd', 'Pk', 'Year', 'Rk', 'Class', 'Season', 'Pos', 'School', 'Conf', 'G', 'MP'], axis=1)
test_16_vars = test.drop(['Class', 'Rk', 'Season', 'Pos', 'School', 'Conf', 'G'], axis=1)


# In[237]:

#verify they have the same number of columns
print("test df shape:", test_16_vars.shape)
print("draft df shape:", draft_stats_vars.shape)


# In[238]:

#concat the two DFs together into one big DF
final = pd.concat([test_16_vars, draft_stats_vars])
final = final.reset_index()


# In[239]:

#finally, drop the player and additional index columns
df_final = final.drop(['Player', 'index'], axis=1)


# In[240]:

#normalize the data so that distance is equalized regardless of the scale of the metric
final_normal = (df_final - df_final.mean()) / df_final.std()


# In[241]:

#apply PCA dimensionality reduction to get orthogonal components that explain the variance in our matrix
pca = PCA()
transformed_pca_x = pca.fit_transform(final_normal)

#create component indices
component_names = ["component_" + str(comp) for comp in range(1, len(pca.explained_variance_) + 1)]

#generate new component dataframe
transformed_pca_x = pd.DataFrame(transformed_pca_x, columns=component_names)


# In[264]:

#generate component loadings on the original features
component_matrix = pd.DataFrame(pca.components_, index=component_names, columns=df_final.columns)

#add columns describing how much variance each component explains
component_matrix["explained_variance_ratio"] = pca.explained_variance_ratio_
component_matrix["eigenvalue"] = pca.explained_variance_

print("explained variance running sum by component:", component_matrix.explained_variance_ratio.cumsum())


# In[243]:

#so let's perform the KNN algorithm on components 1-14, since they explain 98.15% of the variance in the dataset
pca_final = transformed_pca_x.iloc[:, :14]
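# **Side note (a sketch, not part of the original run): the 14-component cutoff above is read off the printed running sum; it could also be derived programmatically from a chosen variance threshold. The 0.98 value below is an assumption that simply mirrors the 98.15% figure cited in the comment above.**

# In[ ]:

#derive the number of components needed to cross a cumulative explained-variance threshold
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_components = int(np.argmax(cum_var >= 0.98) + 1)   #first component count where the running sum crosses 0.98
print("components needed to explain 98% of the variance:", n_components)
#pca_final = transformed_pca_x.iloc[:, :n_components]   #equivalent to the hard-coded selection above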
# In[245]:

#get the distance between every observation in the final dataframe
distances_euclidean = pdist(pca_final, metric='euclidean')


# In[246]:

#create a pairwise matrix
distances_matrix = squareform(distances_euclidean)


# In[247]:

#transform that pairwise matrix into a dataframe and add an index field for joining
distances = pd.DataFrame(distances_matrix)
distances['id_a'] = range(0, len(distances))


# In[248]:

#pivot this data so that we have one row per player comparison
distances_final = pd.melt(distances, id_vars='id_a')

#rename columns
cols = ['player_a', 'player_b', 'eucl_dist']
distances_final.columns = cols


# In[251]:

#merge over the players' names
final1 = pd.merge(distances_final, final, how='inner', left_on='player_a', right_index=True)
final2 = pd.merge(final1, final, how='inner', left_on='player_b', right_index=True)
final2 = final2[['eucl_dist', 'Player_y', 'Player_x']]


# In[255]:

#create lookup tables
ncaa_lookup = test[['Player']]
ncaa_lookup['player_type'] = 'NCAA'
nba_lookup = draft_stats_test[['Player']]
nba_lookup['player_type'] = 'NBA'


# In[256]:

#flag whether each side of the comparison is a current NCAA player or an NBA draftee
final3 = pd.merge(final2, ncaa_lookup, how='left', left_on='Player_y', right_on='Player')
final3 = final3.drop('Player', axis=1)
final3.rename(columns={'player_type': 'player_y_type'}, inplace=True)
final4 = pd.merge(final3, ncaa_lookup, how='left', left_on='Player_x', right_on='Player')
final4.drop('Player', axis=1, inplace=True)
final4.rename(columns={'player_type': 'player_x_type'}, inplace=True)
final4.fillna('NBA', inplace=True)


# In[259]:

#create the final dataframe: each row compares an NCAA player to an NBA player
final_final = final4[(final4.player_y_type == 'NCAA') & (final4.player_x_type == 'NBA')]


# In[265]:

#examine the comparisons for Lonzo Ball
final_final[final_final['Player_y'] == 'Lonzo Ball'].sort_values(by='eucl_dist', ascending=True)


# In[263]:

#write this final table out to CSV
final_final.to_csv("player_comparison_2017_pca.csv")
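# **The outline above ends with limiting the comparisons to Chad Ford's top 50, but that list is never loaded in this notebook. Below is a sketch of that last step under the assumption that the rankings live in a one-column CSV of player names; the file name "chad_ford_top_50.csv" and its 'Player' column are placeholders, not real artifacts from this project.**

# In[ ]:

#hypothetical file of top-50 prospect names -- substitute wherever that list is actually stored
top_50 = pd.read_csv("chad_ford_top_50.csv")
top_50_comps = final_final[final_final['Player_y'].isin(top_50['Player'])]
top_50_comps.to_csv("player_comparison_2017_top50_pca.csv")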
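# **One possible refinement, shown only as a sketch: since we ultimately keep just the closest NBA comparisons for each NCAA prospect, sklearn's NearestNeighbors could return the k nearest NBA players directly instead of materializing the full pairwise matrix. This assumes the row order of `pca_final` still matches `final` (NCAA rows first, then NBA rows, per the concat above); n_neighbors=5 is an arbitrary choice.**

# In[ ]:

from sklearn.neighbors import NearestNeighbors

n_ncaa = len(test_16_vars)                  #NCAA rows come first in `final`
ncaa_components = pca_final.iloc[:n_ncaa]
nba_components = pca_final.iloc[n_ncaa:]

#fit on the NBA draftees, query with the current NCAA players
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(nba_components.values)
dist, idx = knn.kneighbors(ncaa_components.values)

#map the returned row positions back to NBA player names
nba_names = final['Player'].iloc[n_ncaa:].reset_index(drop=True)
nearest = pd.DataFrame({'ncaa_player': final['Player'].iloc[:n_ncaa].values})
for k in range(5):
    nearest['comp_' + str(k + 1)] = nba_names.iloc[idx[:, k]].values
    nearest['dist_' + str(k + 1)] = dist[:, k]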