#!/usr/bin/env python
# coding: utf-8

# ## __Intro to Data Science:__
# ### Testing Intuition w/ Visualization

# # How data science workflows are supposed to go:
# 
# ### 1) Get data.
# 
# ### 2) Look at numbers.
# 
# ### 3) Do Math!
# 
# ### 4) Visualize results.

# # How they actually go:
# 
# ### 1) Figure out how you're going to get data.
# 
# ### 2) Spend a lot of time cleaning data.
# 
# ### 3) Look at what are supposed to be numbers.
# 
# ### 4) Spend more time cleaning and getting data that you found out you need.
# 
# ### 5) Visualize.
# 
# ### 6) Clean + Splice _creatively._
# 
# ### 7) DO MATH!
# 
# ### 8) Rinse and repeat steps 3-7 until deadline.
# 
# _For everyone's sake, we're not really going to get into steps 1 and 2._

# ## So the first (and usually hardest) step is to pick a problem. Make it one you're interested in - but don't tie yourself down to it. 
# 
# ### A good problem changes as you dig deeper into it. 
# 
# 
# I'm a (recovering) NBA addict, so I'm picking something basketball related.
# 
# ### What is playoff Lebron?

# ## Step 1: Get the data.
# 
# ### Forget what I said earlier - this is the hardest part.
# 
# ## Thankfully, smart people are working on problems like this, so for the purposes of this talk, it's going to be solved by nba_py: https://github.com/seemethere/nba_py

# In[4]:


import nba_helper_functions
from nba_py import  team, game
import pandas as pd
import numpy as  np
import plotly.plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import nbashots as nba # this will throw a warning if using matplotlib 1.5
import requests as r
get_ipython().run_line_magic('matplotlib', 'inline')


# In[5]:


# Team id handed to team.TeamGameLogs by lebron_data; presumably the Cleveland
# Cavaliers' identifier in the NBA stats API -- TODO confirm.
cavs_id = 1610612739


# # Step 2: Splice and clean the data:
# 
# Writing helper functions counts as cleaning.

# In[6]:


def lebron_data(season, season_type):
    """Return LeBron's player-tracking rows for every Cavs game of a season.

    Fetches the team game log for ``cavs_id``, then the player-tracking table
    of each listed game, keeping only the rows whose ``PLAYER_ID`` is 2544
    (LeBron).

    Args:
        season: Season string forwarded to ``team.TeamGameLogs``
            (the callers in this file pass values such as ``'2016-17'``).
        season_type: Season type forwarded to ``team.TeamGameLogs``
            (the callers in this file pass ``'Playoffs'`` or
            ``'Regular Season'``).

    Returns:
        A ``pandas.DataFrame`` restricted to the tracking columns selected
        below, with the original per-game row indices preserved. An empty
        DataFrame is returned when the game log lists no games.
    """
    lebron_id = 2544
    # Player tracking stats of interest: average speed, touches, passes,
    # assists, and some shot info.
    tracking_columns = [
        'GAME_ID', 'PLAYER_ID', 'MIN', 'SPD', 'TCHS', 'PASS', 'AST',
        'CFGM', 'CFGA', 'UFGM', 'UFGA',
    ]

    game_log = team.TeamGameLogs(cavs_id, season=season, season_type=season_type)
    game_ids = game_log.info()['Game_ID'].tolist()

    # Collect the per-game slices and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    frames = []
    for game_id in game_ids:
        all_players = game.PlayerTracking(game_id).info()[tracking_columns]
        frames.append(all_players.loc[all_players['PLAYER_ID'] == lebron_id])

    if not frames:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    return pd.concat(frames)


# In[7]:


# Seasons whose playoff games are pooled into one frame.
playoff_seasons = ['2014-15', '2015-16', '2016-17']

# Gather one frame per season and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the accumulated data
# on every call.
playoff_frames = []
for season in playoff_seasons:
    df = lebron_data(season, 'Playoffs')
    playoff_frames.append(df)
df_playoffs = pd.concat(playoff_frames)


# In[8]:


# Tracking stats for the 2016-17 regular season, the baseline the pooled
# playoff data is compared against further down.
df_regular_season = lebron_data(season='2016-17', season_type='Regular Season')


# In[9]:


# Peek at the first playoff rows (only rendered when run as a notebook cell).
df_playoffs.head()


# Now that we've gotten a good chunk, it's time to clean!

# In[11]:


# Filter out rest games: keep only the rows with a non-zero SPD value.
# The explicit copy makes the result an independent frame, so the MIN column
# assignment performed later does not trigger pandas' SettingWithCopyWarning.
df_regular_season = df_regular_season[df_regular_season['SPD'] != 0].copy()


# In[16]:


def hh_mm_ss2seconds(hh_mm_ss):
    """Convert a colon-separated clock string into a duration in minutes.

    Despite the name (kept so existing callers keep working), the result is
    expressed in *minutes*, not seconds: ``'35:30'`` becomes ``35.5``.

    Accepted layouts are ``'MM'``, ``'MM:SS'`` and ``'HH:MM:SS'``. The former
    reduce-based implementation divided the running value by 60 at every step,
    so it was only correct for the first two layouts.

    Args:
        hh_mm_ss: String of one to three integer fields separated by ``':'``.

    Returns:
        The duration in minutes as a float.

    Raises:
        ValueError: If a field is not an integer or more than three fields
            are given.
    """
    fields = [int(field) for field in hh_mm_ss.split(':')]

    if len(fields) == 1:
        hours, minutes, seconds = 0, fields[0], 0
    elif len(fields) == 2:
        hours = 0
        minutes, seconds = fields
    elif len(fields) == 3:
        hours, minutes, seconds = fields
    else:
        raise ValueError(
            'expected at most three colon-separated fields, got %r' % (hh_mm_ss,)
        )

    # Sum everything in whole seconds first, then convert once to minutes.
    return (hours * 3600 + minutes * 60 + seconds) / 60.0


# In[17]:


# Replace the MIN clock strings (presumably 'MM:SS' -- verify against the API
# output) with the numeric value produced by hh_mm_ss2seconds.
df_playoffs['MIN'] = df_playoffs['MIN'].map(hh_mm_ss2seconds)
df_regular_season['MIN'] = df_regular_season['MIN'].map(hh_mm_ss2seconds)


# ### Step 3: Numbers!

# In[18]:


# Summary statistics of the pooled playoff tracking data
# (only rendered when run as a notebook cell).
df_playoffs.describe()


# In[19]:


# Summary statistics of the 2016-17 regular-season tracking data.
df_regular_season.describe()


# ### Step 4: Some more splicing:

# In[20]:


# Element-wise difference of the two summaries: playoffs minus regular season.
df_playoffs.describe() - df_regular_season.describe() 


# ### (We're skipping step 5 for now- it's not fun.)

# ### Step 6: Time to clean+visualize (again):

# In[21]:


# Court outline shapes, reused as the ``shapes`` of the plotly layouts below.
court_shapes = nba_helper_functions.court_shapes()


# In[23]:


# Shot data for every playoff season, concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the accumulated data
# on every call.
playoff_shot_frames = []
for season in playoff_seasons:
    df = nba_helper_functions.get_shot_info(season=season)
    playoff_shot_frames.append(df)
playoff_df = pd.concat(playoff_shot_frames)


# In[25]:


# Split the playoff shots by outcome once instead of re-filtering the frame
# for every axis of every trace.
playoff_missed = playoff_df[playoff_df['EVENT_TYPE'] == 'Missed Shot']
playoff_made = playoff_df[playoff_df['EVENT_TYPE'] == 'Made Shot']

# Marker outlines use 'rgba(...)': the previous 'rgb(0, 0, 0, 1)' passed four
# components to rgb(), which is not a well-formed colour string.
missed_shot_trace = go.Scatter(
    x=playoff_missed['LOC_X'],
    y=playoff_missed['LOC_Y'],
    mode='markers',
    name='Missed Shot',
    marker=dict(
        size=5,
        color='rgba(255, 255, 0, .8)',
        line=dict(width=1, color='rgba(0, 0, 0, 1)'),
    ),
)
made_shot_trace = go.Scatter(
    x=playoff_made['LOC_X'],
    y=playoff_made['LOC_Y'],
    mode='markers',
    name='Made Shot',
    marker=dict(
        size=5,
        color='rgba(0, 200, 100, .8)',
        line=dict(width=1, color='rgba(0, 0, 0, 1)'),
    ),
)

# Fixed axis ranges with the grid hidden; the court outline is drawn from
# court_shapes.
layout = go.Layout(
    title='Shots by Lebron Last 3 Playoff Series',
    showlegend=True,
    xaxis=dict(showgrid=False, range=[-300, 300]),
    yaxis=dict(showgrid=False, range=[-100, 500]),
    height=600,
    width=650,
    shapes=court_shapes,
)

data = [missed_shot_trace, made_shot_trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Lebron Playoffs Shot Chart')


# In[26]:


# Regular-season shot data; the season itself is left to get_shot_info's
# default (presumably 2016-17, matching the chart title -- verify in the helper).
regular_season = nba_helper_functions.get_shot_info(season_type='Regular Season')


# In[28]:


# Split the shots by outcome once instead of re-filtering the frame for every
# axis of every trace.
regular_missed = regular_season[regular_season['EVENT_TYPE'] == 'Missed Shot']
regular_made = regular_season[regular_season['EVENT_TYPE'] == 'Made Shot']

# Marker outlines use 'rgba(...)': the previous 'rgb(0, 0, 0, 1)' passed four
# components to rgb(), which is not a well-formed colour string.
missed_shot_trace = go.Scatter(
    x=regular_missed['LOC_X'],
    y=regular_missed['LOC_Y'],
    mode='markers',
    name='Missed Shot',
    marker=dict(
        size=5,
        color='rgba(255, 255, 0, .8)',
        line=dict(width=1, color='rgba(0, 0, 0, 1)'),
    ),
)
made_shot_trace = go.Scatter(
    x=regular_made['LOC_X'],
    y=regular_made['LOC_Y'],
    mode='markers',
    name='Made Shot',
    marker=dict(
        size=5,
        color='rgba(0, 200, 100, .8)',
        line=dict(width=1, color='rgba(0, 0, 0, 1)'),
    ),
)

# Fixed axis ranges with the grid hidden; the court outline is drawn from
# court_shapes.
layout = go.Layout(
    title='Shots by Lebron 2016-2017 Regular Season',
    showlegend=True,
    xaxis=dict(showgrid=False, range=[-300, 300]),
    yaxis=dict(showgrid=False, range=[-100, 500]),
    height=600,
    width=650,
    shapes=court_shapes,
)

data = [missed_shot_trace, made_shot_trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Lebron Regular Season Shot Chart')


# ### Clearly, that showed everything that we were looking for.... so time for some _math._
# 
# #### 2-Dimensional binned statistics (a fancy way of saying 2D histogram) are useful for situations like this. 

# In[29]:


from scipy import stats


# In[30]:


# Mean of SHOT_MADE_FLAG inside each cell of a 35-bins-per-axis grid laid over
# the playoff shot locations.
mean, xedges, yedges, binnumber = stats.binned_statistic_2d(
    x=playoff_df['LOC_X'],
    y=playoff_df['LOC_Y'],
    values=playoff_df['SHOT_MADE_FLAG'],
    statistic='mean',
    bins=35,
)


# In[31]:


# The statistic is transposed before being handed to plotly as ``z``; the bin
# edges serve as the axis coordinates.
data = [
    go.Heatmap(
        x=xedges,
        y=yedges,
        z=mean.T,
        colorscale='YIOrRd',
        reversescale=True,
        opacity=.5,
    )
]

layout = go.Layout(
    shapes=court_shapes,
    title='Playoff Lebron Binned Heatmap Last 3 Years',
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='Playoff Lebron Binned Heatmap Last 3 Years')


# In[32]:


# Same binned mean of SHOT_MADE_FLAG (35 bins per axis), computed on the
# regular-season shot locations.
mean_reg, xedges_reg, yedges_reg, binnumber = stats.binned_statistic_2d(
    x=regular_season['LOC_X'],
    y=regular_season['LOC_Y'],
    values=regular_season['SHOT_MADE_FLAG'],
    statistic='mean',
    bins=35,
)

# The statistic is transposed before being handed to plotly as ``z``; the bin
# edges serve as the axis coordinates.
data = [
    go.Heatmap(
        x=xedges_reg,
        y=yedges_reg,
        z=mean_reg.T,
        colorscale='YIOrRd',
        reversescale=True,
        opacity=.5,
    )
]

layout = go.Layout(
    shapes=court_shapes,
    title='Lebron Binned Heatmap 2016-17',
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='Lebron Binned Heatmap 2016-17')


# In[45]:


# OffsetImage is used by the fourth-quarter cells further down to pin a photo
# onto the chart.
from matplotlib.offsetbox import OffsetImage

# Shot-location chart for the pooled playoff shots.
# NOTE(review): this cell calls shot_chart_jointplot while the sibling cells
# call shot_chart_jointgrid -- confirm the difference is intentional.
grid = nba.shot_chart_jointplot(
    playoff_df['LOC_X'],
    playoff_df['LOC_Y'],
    title="Lebron Last 3 Playoffs",
)

plt.show()


# In[46]:


# Shot-location chart for the regular-season shots.
grid = nba.shot_chart_jointgrid(
    regular_season['LOC_X'],
    regular_season['LOC_Y'],
    title="Lebron 2016-2017 Regular Season",
)

plt.show()


# ## Time for some math!
# 
# ### Everyone knows that _Gaussian kernel density estimation_ works well for low-dimensional vectors, right?

# In[52]:


# Regular-season scatter chart with a KDE layer drawn on top of the joint axes.
cmap = plt.cm.gist_heat_r
grid = nba.shot_chart_jointgrid(
    regular_season['LOC_X'],
    regular_season['LOC_Y'],
    title="Lebron 2016-2017 Regular Season",
    joint_type="scatter",
    joint_kde_shade=True,
    marginals_color=cmap(.7),
)

# plot_joint adds the seaborn KDE to the joint axes of the grid.
grid.plot_joint(sns.kdeplot, cmap=plt.cm.OrRd_r)
plt.show()


# In[ ]:


# In[54]:


# Playoff scatter chart with a KDE layer drawn on top of the joint axes.
cmap = plt.cm.gist_heat_r
grid = nba.shot_chart_jointgrid(
    playoff_df['LOC_X'],
    playoff_df['LOC_Y'],
    title="Lebron Last 3 Playoffs",
    joint_type="scatter",
    joint_kde_shade=True,
    marginals_color=cmap(.3),
)

# plot_joint adds the seaborn KDE to the joint axes of the grid.
grid.plot_joint(sns.kdeplot, cmap=plt.cm.OrRd_r)
plt.show()


# ## So we're seeing a lot here, but we're still not seeing a pronounced difference  (based on my hunch that there is one). 
# 
# #### One might argue that this is classic overfitting, but that one isn't the one talking through this.

# In[55]:


# Regular-season shots restricted to PERIOD == 4 (the "4th Q" of the title).
reg_season_fourth = regular_season[regular_season['PERIOD'] == 4]


# In[56]:


# Player image for id 2544, the same id lebron_data filters on.
img = nba.get_player_img(2544)

cmap = plt.cm.gist_heat_r
grid = nba.shot_chart_jointgrid(
    reg_season_fourth['LOC_X'],
    reg_season_fourth['LOC_Y'],
    title="Lebron 2016-2017 Regular Season 4th Q",
    joint_type="scatter",
    marginals_type="kde",
    marginals_color=cmap(.3),
)

# Add the KDE layer to the joint axes, then pin the player image at a fixed
# offset on the same axes.
grid.plot_joint(sns.kdeplot, cmap=plt.cm.OrRd_r)
offset_img = OffsetImage(img, zoom=0.6)
offset_img.set_offset((600, 584))
grid.ax_joint.add_artist(offset_img)
plt.show()


# In[57]:


# Playoff shots restricted to PERIOD == 4 (the "4th Q" of the chart title).
playoff_df_fourth = playoff_df[playoff_df['PERIOD'] == 4]


# In[58]:


# scipy.misc.imread was deprecated in SciPy 1.0 and removed in 1.2, so the
# image is read with matplotlib's reader instead. The name ``imread`` stays
# bound for any later use; decoding a JPEG still requires Pillow, as before.
from matplotlib.pyplot import imread
im = imread("Lebron-James-Stare_one.jpg")


# In[59]:


# Raise matplotlib's default figure size before drawing the final chart.
plt.rcParams['figure.figsize'] = (20, 18)

cmap = plt.cm.gist_heat_r
grid = nba.shot_chart_jointgrid(
    playoff_df_fourth['LOC_X'],
    playoff_df_fourth['LOC_Y'],
    title="Lebron Last 3 Playoffs 4th Q",
    joint_type="scatter",
    marginals_type="kde",
    marginals_color=cmap(.3),
)

# Add the KDE layer to the joint axes, then pin the photo held in ``im`` at a
# fixed offset on the same axes.
grid.plot_joint(sns.kdeplot, cmap=plt.cm.OrRd_r)
offset_img = OffsetImage(im, zoom=0.6)
offset_img.set_offset((600, 584))
grid.ax_joint.add_artist(offset_img)
plt.show()


# In[ ]: