#!/usr/bin/env python
# coding: utf-8

# Import libraries

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pylab as plt
import seaborn as sns
import holoviews as hv
import datashader as ds
import holoviews.operation.datashader as hd
from colorcet import fire, rainbow
import warnings
warnings.filterwarnings("ignore", module="theano")

hd.shade.cmap=["lightblue", "darkblue"]
hv.extension("bokeh", "matplotlib") 


# Import dataset

# In[2]:


DATA_DIR = './'
dataset = pd.read_csv(DATA_DIR + 'catcherframe.csv')
dataset.head()


# In[3]:


dataset.columns


# In[4]:


dataset.shape


# ## Question 1
# 
# > Produce a visualization illustrating how the strike zone differs for right-handed and left-handed batters.

# The first step is to extract the pitches that were takes, but neither pitchouts nor intentional balls. This will be the subset I will analyze.

# In[5]:


takes = dataset.query('(takes==1) & (pitchout==0) & (intentball==0)').copy()


# In[6]:


takes.ball.value_counts()


# Because the dataset is relatively large, I will use Holoviews (specifically, the DataShader module) to generate a visualization. It performs a binning of points rather than plotting hundreds of thousands of points that cannot be seen.

# In[7]:


class mean_datashade(hd.datashade):
    aggregator = ds.mean('calledstrike')
    cmap=rainbow


# In[8]:


hv.util.opts("HLine VLine (color='black' line_width=1 line_dash='dashed' level='annotation')")


# I will use these dimensions of the plate. The top and bottom were obtained empirically.

# In[9]:


SZ_BOTTOM = 1.59
SZ_TOP = 3.44
SZ_LEFT = -0.708
SZ_RIGHT = 0.708


# The heatmap below plots the observed probability of a strike, with "hotter" shades representing higher probabilities. The difference in the location of the highest probability region between batter hand is evident.

# In[10]:


pts_right = hv.Points(data=takes[takes.batside=='R'], 
                      kdims=['plate_x', 'plate_z'], 
                      label='Right-handed batter')
pts_left = hv.Points(data=takes[takes.batside=='L'], 
                     kdims=['plate_x', 'plate_z'],
                     label='Left-handed batter')
strike_zone = hv.VLine(SZ_RIGHT)*hv.VLine(SZ_LEFT)*hv.HLine(SZ_TOP)*hv.HLine(SZ_BOTTOM)
(mean_datashade(pts_right)*strike_zone + 
 mean_datashade(pts_left)*strike_zone).cols(1).redim.range(plate_x=(-4, 4), plate_z=(-1, 8))


# ## Question 2
# 
# > Which catcher was the most effective framer on a rate basis? Briefly describe (a paragraph or
# two) the process you used to come to this conclusion. Please include a .csv with every catcherid
# along with some measurement of framing skill (how you choose to convey this is up to you).

# I will use the `shapely` package for Python to perform a spatial analysis of the data. Specifically, I want to extract the pitches that were on or near the corners.
# 
# The first step is to encode the `plate_x` and `plate_y` variables as cartesian coordinates. This will be stored as a `location` variable.

# In[11]:


from shapely.geometry import Point

takes['location'] = list(zip(takes.plate_x, takes.plate_z))
takes['location'] = takes.location.apply(Point)
takes = gpd.GeoDataFrame(takes, geometry='location')


# Next, I will create a set of polygons representing areas near the corners of the strike zone. I have chosen a value of 3 inches as the radius around each corner that I will use to filter pitches; this value could be tuned.

# In[12]:


from shapely.geometry import MultiPolygon

corners = MultiPolygon([Point(SZ_LEFT, SZ_BOTTOM).buffer(0.25), Point(SZ_LEFT, SZ_TOP).buffer(0.25), 
           Point(SZ_RIGHT, SZ_TOP).buffer(0.25), Point(SZ_RIGHT, SZ_BOTTOM).buffer(0.25)])


# In[13]:


corners


# The `intersection` method performs a spatial query, flagging points that intersect with any of the four corner polygons. These will be extracted for use in the model.

# In[14]:


near_corners = takes[takes.location.intersects(corners)]


# In[15]:


pts = hv.Points(data=near_corners, kdims=['plate_x', 'plate_z'])
mean_datashade(pts)


# Next, I will filter out the catchers with fewer than 100 pitches in the corners, to avoid overfitting to small samples.

# In[16]:


candidate_catchers = near_corners.groupby('catcherid').size().where(lambda x: x>100).dropna().index
near_corners = near_corners[near_corners.catcherid.isin(candidate_catchers)]
n_catchers = len(candidate_catchers)


# I want to use the catcher ID as a random effect in a hierarchical model, so I will encode their IDs as consecutive integers.

# In[17]:


from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder.
le = LabelEncoder()
near_corners['catcher_ind']  = catcher_ind = le.fit_transform(near_corners.catcherid)


# In[18]:


strike=near_corners.calledstrike.values


# Here is my statistical model. It is a simple random effect model that partially pools catchers to provide realistic estimates of the probability of a strike. 

# In[19]:


import pymc3 as pm

with pm.Model() as framing_model:
    
    # Baseline probability
    μ = pm.Normal('μ', 2, sd=5)
    σ = pm.Exponential('σ', 1)
    z = pm.Normal('z', 0, 1, shape=n_catchers)
    # Non-centered parameterization
    θ = pm.Deterministic('θ', μ + z*σ)
    
    # Called strike rate as function of distance to strike zone
    p = pm.math.invlogit(θ[catcher_ind])
    
    called_strike = pm.Bernoulli('called_strike', p, observed=strike)
    

# In[20]:


with framing_model:
    samples = pm.sample(1000, tune=1000, njobs=2)


# These are the framing scores for all catchers, on the inverse-logit scale.

# In[21]:


plt.figure(figsize=(8, 14))
pm.forestplot(samples, varnames=['θ'])


# We can sort them based on their framing scores, and plot the best and poorest framers.

# In[22]:


sorted_catchers = np.argsort(samples['θ'].mean(0))


# In[23]:


fig, axes = plt.subplots(2, 5, figsize=(14,6), sharex=True, sharey=True)
legend = 'full'
for ax,ind in zip(axes.ravel(), sorted_catchers[:5].tolist() + sorted_catchers[-5:].tolist()):
    sns.scatterplot(x='plate_x', y='plate_z', hue='calledstrike', ax=ax, legend=legend, alpha=0.6,
                data=near_corners[near_corners.catcher_ind==ind])
    legend = False


# Export estimates to .csv

# In[24]:


called_strike_probs = pd.Series(1/(1 + np.exp(-samples['θ'].mean(0))), index=le.classes_).sort_values()
called_strike_probs.index.name = 'catcher_id'
called_strike_probs.name = 'called_strike_prob'
called_strike_probs.to_csv('called_strike_probs.csv')


# ## Question 3
# 
# > A coach asks you how the attributes of a pitch affect its likelihood of producing a called strike. In
# one sentence, how would you respond?

# Based on the data at hand, there does not appear to be a strong relationship between the available pitch attributes and the probability of getting a called strike, though there is a non-significant tendency for the called strike probability to to increase with lower than average release velocity and vertical break, and higher than average horizontal break.

# In[25]:


from shapely.geometry import Polygon

strike_zone_poly = Polygon(((SZ_LEFT, SZ_BOTTOM), (SZ_LEFT, SZ_TOP), (SZ_RIGHT, SZ_TOP), (SZ_RIGHT, SZ_BOTTOM)))


# I'm restricting the dataset to pitches within 6 inches of the strike zone

# In[26]:


takes['distance_to_zone'] = takes.location.distance(strike_zone_poly)


# In[27]:


within_6in = takes.query('distance_to_zone<0.25').copy()


# In[28]:


pts = hv.Points(data=within_6in, kdims=['plate_x', 'plate_z'])
mean_datashade(pts)


# In[29]:


analysis_subset = within_6in.loc[~within_6in.pitchtype.isin(['UN', 'KN']), 
                                 ['calledstrike', 'pitchtype', 'relspeed', 'hbreak', 'vbreak']].copy()
#analysis_subset['left_handed'] = (analysis_subset.pitcherhand=='L').astype(int)
analysis_subset['relspeed'] = analysis_subset.relspeed - analysis_subset.relspeed.mean()
analysis_subset['vbreak'] = analysis_subset.vbreak - analysis_subset.vbreak.mean()
analysis_subset['hbreak'] = analysis_subset.hbreak - analysis_subset.hbreak.mean()
#analysis_subset['break_by_hand'] = analysis_subset.hbreak*analysis_subset.left_handed
#analysis_subset['pitch_type'] = pitch_type
analysis_subset.head()


# In[30]:


analysis_subset.groupby(['calledstrike']).mean()


# In[31]:


y = analysis_subset.pop('calledstrike').values
pitch_type = analysis_subset.pop('pitchtype').values
X = analysis_subset.values


# Encode pitch types

# In[32]:


from sklearn.preprocessing import LabelEncoder

pitch_enc = LabelEncoder().fit(pitch_type)
pitch_type = pitch_enc.transform(pitch_type)


# In[33]:


pitch_enc.classes_


# I will use minibatches since the dataset is large

# In[34]:


X_batch = pm.Minibatch(X, batch_size=100)
y_batch = pm.Minibatch(y, batch_size=100)
pitch_batch = pm.Minibatch(pitch_type, batch_size=100)


# In[36]:


import theano.tensor as tt
from theano import shared

with pm.Model() as strike_model:
    
    # Baseline probability
    μ = pm.Normal('μ', 2, sd=5)
    σ = pm.Exponential('σ', 1)
    z = pm.Normal('z', 0, 1, shape=len(pitch_enc.classes_))
    # Non-centered parameterization
    θ = pm.Deterministic('θ', μ + z*σ)
    
    β = pm.Normal('β', 0, sd=5, shape=X.shape[1])
    p = pm.math.invlogit(θ[pitch_batch] + X_batch.dot(β))
    
    likelihood = pm.Bernoulli('likelihood', p, observed=y_batch, total_size=len(y))


# In[37]:


with strike_model:
    
    fit = pm.fit(30000)


# In[38]:


plt.plot(fit.hist)


# In[39]:


strike_samples = fit.sample(2000)


# In[40]:


pm.forestplot(strike_samples, varnames=['θ'], ylabels=pitch_enc.classes_)


# In[41]:


pm.forestplot(strike_samples, varnames=['β'], ylabels=analysis_subset.columns)


# ## Question 4
# 
# > Tell us one other interesting conclusion you drew from the dataset.

# The difference in strike zones that is apparent in the visualization from question 1 is smaller when it is restricted to the first pitch a given batter. The region of high called strike probability is shifted towards the center for both hands.

# In[42]:


pts_right = hv.Points(data=takes[(takes.batside=='R') & (takes.balls==0) & (takes.strikes==0)], 
                      kdims=['plate_x', 'plate_z'], 
                      label='Right-handed batter')
pts_left = hv.Points(data=takes[(takes.batside=='L') & (takes.balls==0) & (takes.strikes==0)], 
                     kdims=['plate_x', 'plate_z'],
                     label='Left-handed batter')
strike_zone = hv.VLine(SZ_RIGHT)*hv.VLine(SZ_LEFT)*hv.HLine(SZ_TOP)*hv.HLine(SZ_BOTTOM)
(mean_datashade(pts_right)*strike_zone + 
 mean_datashade(pts_left)*strike_zone).cols(1).redim.range(plate_x=(-4, 4), plate_z=(-1, 8))