#!/usr/bin/env python
# coding: utf-8

# ## IDENTIFYING THE MOST COMPARABLE NBA PLAYERS FOR THE 2017 DRAFT CLASS USING PCA DIMENSIONALITY REDUCTION AND THE K NEAREST NEIGHBORS ALGORITHM
#
# **It's so common to hear talking heads tell us how Lonzo Ball looks like the next Jason Kidd, or how John Jackson is a better-shooting version of Kawhi Leonard. But there's a lot of inherent bias in those prognostications. One, they're limited to players we're familiar with; two, we choose to see certain aspects of a player's game when we want to describe someone as a great shooter, passer, or rebounder. The goal of this analysis is to strip away those biases and get the most accurate comparisons possible, using the best data we have available.**
#
# The approach is straightforward, and is outlined below before digging into the code.
#
# * Take every player who has been drafted since 2010 and who also played in the NCAA
# * Append their basic and advanced college stats from CBB Reference
# * Take all NCAA players from this season and retrieve their advanced stats as well
# * Since we have about 36 different statistics, there's a lot of covariance among our features, so we'll perform "dimensionality reduction" to reduce them to the fewest number of features that still explain the variance we see in the dataset
# * Take every player and measure their euclidean distance to every other player in the dataset
# * Limit the dataset to NCAA players compared to NBA players (this is the comparison we wanted to make from the beginning)
# * Return every NBA player and sort by ascending distance metric
# * Limit to Chad Ford's top 50 and export to CSV (a sketch of this last step appears at the end of the notebook)

# In[ ]:

#import necessary libraries
import pandas as pd
import numpy as np
from time import sleep
from scipy.spatial import distance
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
from sklearn.decomposition import PCA


# In[158]:

#read in datasets
nba = pd.read_csv("nba_draft_picks_final.csv")
df = pd.read_csv("ncaa_stats.csv")
df.drop('Unnamed: 0', axis=1, inplace=True)


# In[228]:

#scrape data from bbref if pulling data for the first time
#df = pd.read_html("http://www.sports-reference.com/cbb/play-index/psl_finder.cgi?request=1&match=single&year_min=2011&year_max=&conf_id=&school_id=&class_is_fr=Y&class_is_so=Y&class_is_jr=Y&class_is_sr=Y&pos_is_g=Y&pos_is_gf=Y&pos_is_fg=Y&pos_is_f=Y&pos_is_fc=Y&pos_is_cf=Y&pos_is_c=Y&games_type=A&qual=pts_per_g&c1stat=mp_per_g&c1comp=gt&c1val=10&c2stat=pts_per_g&c2comp=gt&c2val=5&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=bpm&order_by_asc=")[0]

#i = 100
#while(i < 15000):
#    print("Number of players retrieved:", str(i))
#    df = df.append(pd.read_html("http://www.sports-reference.com/cbb/play-index/psl_finder.cgi?request=1&match=single&year_min=2011&year_max=&conf_id=&school_id=&class_is_fr=Y&class_is_so=Y&class_is_jr=Y&class_is_sr=Y&pos_is_g=Y&pos_is_gf=Y&pos_is_fg=Y&pos_is_f=Y&pos_is_fc=Y&pos_is_cf=Y&pos_is_c=Y&games_type=A&qual=pts_per_g&c1stat=mp_per_g&c1comp=gt&c1val=10&c2stat=pts_per_g&c2comp=gt&c2val=5&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=bpm&order_by_asc=&offset="+str(i))[0])
#    i = i + 100
#    sleep(10)

#realign columns properly
#cols = ['Rk', 'Player', 'Class', 'Season',
#        'Pos', 'School', 'Conf', 'G', 'MP', 'MP.1', 'FG', 'FGA', '2P', '2PA',
#        '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
#        'TOV', 'PF', 'PTS', 'PER', 'TS%', 'eFG%', 'ORB%', 'DRB%', 'TRB%',
#        'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'PProd', 'ORtg', 'DRtg',
#        'OWS', 'DWS', 'WS', 'OBPM', 'DBPM', 'BPM', 'drop1', 'drop2', 'drop3']
#df.columns = cols
#df.drop(['drop1','drop2','drop3'], axis=1, inplace=True)

#drop the repeated header rows that bbref inserts into the table
#df = df.drop(df[df.Class == 'Advanced'].index)
#df = df.drop(df[df.Class == 'Class'].index)

#send to CSV for perpetuity
#df.to_csv("ncaa_stats.csv")


# In[229]:

#keep only each player's most recent season (one row per player)
df_new = df.groupby(['Player'])['Season'].transform(max) == df['Season']
stats = df[df_new]


# In[230]:

#limit to the few columns we need from the NBA dataset
nba = nba[['Player', 'Rd', 'Pk', 'Year']]


# In[231]:

#merge nba players and get their ncaa stats
draft_stats = pd.merge(nba, stats, on='Player', how='left')
draft_stats = draft_stats.dropna(subset=['Class'])


# In[233]:

#limit to the most recent NCAA season and drop the raw MP column (we only want Min/Gm)
test = stats[stats['Season'] == '2016-17']
test.drop('MP', axis=1, inplace=True)
draft_stats_test = draft_stats[draft_stats['Year'] < 2017]


# In[236]:

#drop unwanted columns
draft_stats_vars = draft_stats_test.drop(['Rd', 'Pk', 'Year', 'Rk', 'Class', 'Season', 'Pos', 'School', 'Conf', 'G', 'MP'], axis=1)
test_16_vars = test.drop(['Class', 'Rk', 'Season', 'Pos', 'School', 'Conf', 'G'], axis=1)


# In[237]:

#verify they have the same number of columns
print("test df shape:", test_16_vars.shape)
print("draft df shape:", draft_stats_vars.shape)


# In[238]:

#concat the two DFs together into one big DF
final = pd.concat([test_16_vars, draft_stats_vars])
final = final.reset_index()


# In[239]:

#finally, drop the player and additional index columns
df_final = final.drop(['Player', 'index'], axis=1)


# In[240]:

#normalize the data so that distance is equalized regardless of the scale of the metric
final_normal = (df_final - df_final.mean()) / df_final.std()


# In[241]:

#apply PCA dimensionality reduction to get orthogonal components that explain the variance in our matrix
pca = PCA()
transformed_pca_x = pca.fit_transform(final_normal)

#create component indices
component_names = ["component_" + str(comp) for comp in range(1, len(pca.explained_variance_) + 1)]

#generate new component dataframe
transformed_pca_x = pd.DataFrame(transformed_pca_x, columns=component_names)


# In[264]:

#generate component loadings on the original features
component_matrix = pd.DataFrame(pca.components_, index=component_names, columns=df_final.columns)

#add columns describing how much variance each component explains
component_matrix["explained_variance_ratio"] = pca.explained_variance_ratio_
component_matrix["eigenvalue"] = pca.explained_variance_

print("explained variance running sum by component:", component_matrix.explained_variance_ratio.cumsum())


# In[243]:

#so let's perform the KNN algorithm on components 1-14, since they explain 98.15% of the variance in the dataset
pca_final = transformed_pca_x.iloc[:, :14]
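# **Side note (a sketch, not part of the original run): the 14-component cutoff above is read off the printed running sum; it could also be derived programmatically from a chosen variance threshold. The 0.98 value below is an assumption that simply mirrors the 98.15% figure cited in the comment above.**

# In[ ]:

#derive the number of components needed to cross a cumulative explained-variance threshold
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_components = int(np.argmax(cum_var >= 0.98) + 1)   #first component count where the running sum crosses 0.98
print("components needed to explain 98% of the variance:", n_components)
#pca_final = transformed_pca_x.iloc[:, :n_components]   #equivalent to the hard-coded selection above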
# In[245]:

#get the distance between every observation in the final dataframe
distances_euclidean = pdist(pca_final, metric='euclidean')


# In[246]:

#create a pairwise matrix
distances_matrix = squareform(distances_euclidean)


# In[247]:

#transform that pairwise matrix into a dataframe and add an index field for joining
distances = pd.DataFrame(distances_matrix)
distances['id_a'] = range(0, len(distances))


# In[248]:

#pivot this data so that we have one row per player comparison
distances_final = pd.melt(distances, id_vars='id_a')

#rename columns
cols = ['player_a', 'player_b', 'eucl_dist']
distances_final.columns = cols


# In[251]:

#merge over the players' names
final1 = pd.merge(distances_final, final, how='inner', left_on='player_a', right_index=True)
final2 = pd.merge(final1, final, how='inner', left_on='player_b', right_index=True)
final2 = final2[['eucl_dist', 'Player_y', 'Player_x']]


# In[255]:

#create lookup tables
ncaa_lookup = test[['Player']]
ncaa_lookup['player_type'] = 'NCAA'
nba_lookup = draft_stats_test[['Player']]
nba_lookup['player_type'] = 'NBA'


# In[256]:

#flag whether each side of the comparison is a current NCAA player or an NBA draftee
final3 = pd.merge(final2, ncaa_lookup, how='left', left_on='Player_y', right_on='Player')
final3 = final3.drop('Player', axis=1)
final3.rename(columns={'player_type': 'player_y_type'}, inplace=True)
final4 = pd.merge(final3, ncaa_lookup, how='left', left_on='Player_x', right_on='Player')
final4.drop('Player', axis=1, inplace=True)
final4.rename(columns={'player_type': 'player_x_type'}, inplace=True)
final4.fillna('NBA', inplace=True)


# In[259]:

#create the final dataframe: each row compares an NCAA player to an NBA player
final_final = final4[(final4.player_y_type == 'NCAA') & (final4.player_x_type == 'NBA')]


# In[265]:

#examine the comparisons for Lonzo Ball
final_final[final_final['Player_y'] == 'Lonzo Ball'].sort_values(by='eucl_dist', ascending=True)


# In[263]:

#write this final table out to CSV
final_final.to_csv("player_comparison_2017_pca.csv")
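# **The outline above ends with limiting the comparisons to Chad Ford's top 50, but that list is never loaded in this notebook. Below is a sketch of that last step under the assumption that the rankings live in a one-column CSV of player names; the file name "chad_ford_top_50.csv" and its 'Player' column are placeholders, not real artifacts from this project.**

# In[ ]:

#hypothetical file of top-50 prospect names -- substitute wherever that list is actually stored
top_50 = pd.read_csv("chad_ford_top_50.csv")
top_50_comps = final_final[final_final['Player_y'].isin(top_50['Player'])]
top_50_comps.to_csv("player_comparison_2017_top50_pca.csv")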
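# **One possible refinement, shown only as a sketch: since we ultimately keep just the closest NBA comparisons for each NCAA prospect, sklearn's NearestNeighbors could return the k nearest NBA players directly instead of materializing the full pairwise matrix. This assumes the row order of `pca_final` still matches `final` (NCAA rows first, then NBA rows, per the concat above); n_neighbors=5 is an arbitrary choice.**

# In[ ]:

from sklearn.neighbors import NearestNeighbors

n_ncaa = len(test_16_vars)                  #NCAA rows come first in `final`
ncaa_components = pca_final.iloc[:n_ncaa]
nba_components = pca_final.iloc[n_ncaa:]

#fit on the NBA draftees, query with the current NCAA players
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(nba_components.values)
dist, idx = knn.kneighbors(ncaa_components.values)

#map the returned row positions back to NBA player names
nba_names = final['Player'].iloc[n_ncaa:].reset_index(drop=True)
nearest = pd.DataFrame({'ncaa_player': final['Player'].iloc[:n_ncaa].values})
for k in range(5):
    nearest['comp_' + str(k + 1)] = nba_names.iloc[idx[:, k]].values
    nearest['dist_' + str(k + 1)] = dist[:, k]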