#!/usr/bin/env python # coding: utf-8 # Import libraries # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import geopandas as gpd import matplotlib.pylab as plt import seaborn as sns import holoviews as hv import datashader as ds import holoviews.operation.datashader as hd from colorcet import fire, rainbow import warnings warnings.filterwarnings("ignore", module="theano") hd.shade.cmap=["lightblue", "darkblue"] hv.extension("bokeh", "matplotlib") # Import dataset # In[2]: DATA_DIR = './' dataset = pd.read_csv(DATA_DIR + 'catcherframe.csv') dataset.head() # In[3]: dataset.columns # In[4]: dataset.shape # ## Question 1 # # > Produce a visualization illustrating how the strike zone differs for right-handed and left-handed batters. # The first step is to extract the pitches that were takes, but neither pitchouts nor intentional balls. This will be the subset I will analyze. # In[5]: takes = dataset.query('(takes==1) & (pitchout==0) & (intentball==0)').copy() # In[6]: takes.ball.value_counts() # Because the dataset is relatively large, I will use Holoviews (specifically, the DataShader module) to generate a visualization. It performs a binning of points rather than plotting hundreds of thousands of points that cannot be seen. # In[7]: class mean_datashade(hd.datashade): aggregator = ds.mean('calledstrike') cmap=rainbow # In[8]: hv.util.opts("HLine VLine (color='black' line_width=1 line_dash='dashed' level='annotation')") # I will use these dimensions of the plate. The top and bottom were obtained empirically. # In[9]: SZ_BOTTOM = 1.59 SZ_TOP = 3.44 SZ_LEFT = -0.708 SZ_RIGHT = 0.708 # The heatmap below plots the observed probability of a strike, with "hotter" shades representing higher probabilities. The difference in the location of the highest probability region between batter hand is evident. # In[10]: pts_right = hv.Points(data=takes[takes.batside=='R'], kdims=['plate_x', 'plate_z'], label='Right-handed batter') pts_left = hv.Points(data=takes[takes.batside=='L'], kdims=['plate_x', 'plate_z'], label='Left-handed batter') strike_zone = hv.VLine(SZ_RIGHT)*hv.VLine(SZ_LEFT)*hv.HLine(SZ_TOP)*hv.HLine(SZ_BOTTOM) (mean_datashade(pts_right)*strike_zone + mean_datashade(pts_left)*strike_zone).cols(1).redim.range(plate_x=(-4, 4), plate_z=(-1, 8)) # ## Question 2 # # > Which catcher was the most effective framer on a rate basis? Briefly describe (a paragraph or # two) the process you used to come to this conclusion. Please include a .csv with every catcherid # along with some measurement of framing skill (how you choose to convey this is up to you). # I will use the `shapely` package for Python to perform a spatial analysis of the data. Specifically, I want to extract the pitches that were on or near the corners. # # The first step is to encode the `plate_x` and `plate_y` variables as cartesian coordinates. This will be stored as a `location` variable. # In[11]: from shapely.geometry import Point takes['location'] = list(zip(takes.plate_x, takes.plate_z)) takes['location'] = takes.location.apply(Point) takes = gpd.GeoDataFrame(takes, geometry='location') # Next, I will create a set of polygons representing areas near the corners of the strike zone. I have chosen a value of 3 inches as the radius around each corner that I will use to filter pitches; this value could be tuned. # In[12]: from shapely.geometry import MultiPolygon corners = MultiPolygon([Point(SZ_LEFT, SZ_BOTTOM).buffer(0.25), Point(SZ_LEFT, SZ_TOP).buffer(0.25), Point(SZ_RIGHT, SZ_TOP).buffer(0.25), Point(SZ_RIGHT, SZ_BOTTOM).buffer(0.25)]) # In[13]: corners # The `intersection` method performs a spatial query, flagging points that intersect with any of the four corner polygons. These will be extracted for use in the model. # In[14]: near_corners = takes[takes.location.intersects(corners)] # In[15]: pts = hv.Points(data=near_corners, kdims=['plate_x', 'plate_z']) mean_datashade(pts) # Next, I will filter out the catchers with fewer than 100 pitches in the corners, to avoid overfitting to small samples. # In[16]: candidate_catchers = near_corners.groupby('catcherid').size().where(lambda x: x>100).dropna().index near_corners = near_corners[near_corners.catcherid.isin(candidate_catchers)] n_catchers = len(candidate_catchers) # I want to use the catcher ID as a random effect in a hierarchical model, so I will encode their IDs as consecutive integers. # In[17]: from sklearn.preprocessing import LabelEncoder # Initialize the LabelEncoder. le = LabelEncoder() near_corners['catcher_ind'] = catcher_ind = le.fit_transform(near_corners.catcherid) # In[18]: strike=near_corners.calledstrike.values # Here is my statistical model. It is a simple random effect model that partially pools catchers to provide realistic estimates of the probability of a strike. # In[19]: import pymc3 as pm with pm.Model() as framing_model: # Baseline probability μ = pm.Normal('μ', 2, sd=5) σ = pm.Exponential('σ', 1) z = pm.Normal('z', 0, 1, shape=n_catchers) # Non-centered parameterization θ = pm.Deterministic('θ', μ + z*σ) # Called strike rate as function of distance to strike zone p = pm.math.invlogit(θ[catcher_ind]) called_strike = pm.Bernoulli('called_strike', p, observed=strike) # In[20]: with framing_model: samples = pm.sample(1000, tune=1000, njobs=2) # These are the framing scores for all catchers, on the inverse-logit scale. # In[21]: plt.figure(figsize=(8, 14)) pm.forestplot(samples, varnames=['θ']) # We can sort them based on their framing scores, and plot the best and poorest framers. # In[22]: sorted_catchers = np.argsort(samples['θ'].mean(0)) # In[23]: fig, axes = plt.subplots(2, 5, figsize=(14,6), sharex=True, sharey=True) legend = 'full' for ax,ind in zip(axes.ravel(), sorted_catchers[:5].tolist() + sorted_catchers[-5:].tolist()): sns.scatterplot(x='plate_x', y='plate_z', hue='calledstrike', ax=ax, legend=legend, alpha=0.6, data=near_corners[near_corners.catcher_ind==ind]) legend = False # Export estimates to .csv # In[24]: called_strike_probs = pd.Series(1/(1 + np.exp(-samples['θ'].mean(0))), index=le.classes_).sort_values() called_strike_probs.index.name = 'catcher_id' called_strike_probs.name = 'called_strike_prob' called_strike_probs.to_csv('called_strike_probs.csv') # ## Question 3 # # > A coach asks you how the attributes of a pitch affect its likelihood of producing a called strike. In # one sentence, how would you respond? # Based on the data at hand, there does not appear to be a strong relationship between the available pitch attributes and the probability of getting a called strike, though there is a non-significant tendency for the called strike probability to to increase with lower than average release velocity and vertical break, and higher than average horizontal break. # In[25]: from shapely.geometry import Polygon strike_zone_poly = Polygon(((SZ_LEFT, SZ_BOTTOM), (SZ_LEFT, SZ_TOP), (SZ_RIGHT, SZ_TOP), (SZ_RIGHT, SZ_BOTTOM))) # I'm restricting the dataset to pitches within 6 inches of the strike zone # In[26]: takes['distance_to_zone'] = takes.location.distance(strike_zone_poly) # In[27]: within_6in = takes.query('distance_to_zone<0.25').copy() # In[28]: pts = hv.Points(data=within_6in, kdims=['plate_x', 'plate_z']) mean_datashade(pts) # In[29]: analysis_subset = within_6in.loc[~within_6in.pitchtype.isin(['UN', 'KN']), ['calledstrike', 'pitchtype', 'relspeed', 'hbreak', 'vbreak']].copy() #analysis_subset['left_handed'] = (analysis_subset.pitcherhand=='L').astype(int) analysis_subset['relspeed'] = analysis_subset.relspeed - analysis_subset.relspeed.mean() analysis_subset['vbreak'] = analysis_subset.vbreak - analysis_subset.vbreak.mean() analysis_subset['hbreak'] = analysis_subset.hbreak - analysis_subset.hbreak.mean() #analysis_subset['break_by_hand'] = analysis_subset.hbreak*analysis_subset.left_handed #analysis_subset['pitch_type'] = pitch_type analysis_subset.head() # In[30]: analysis_subset.groupby(['calledstrike']).mean() # In[31]: y = analysis_subset.pop('calledstrike').values pitch_type = analysis_subset.pop('pitchtype').values X = analysis_subset.values # Encode pitch types # In[32]: from sklearn.preprocessing import LabelEncoder pitch_enc = LabelEncoder().fit(pitch_type) pitch_type = pitch_enc.transform(pitch_type) # In[33]: pitch_enc.classes_ # I will use minibatches since the dataset is large # In[34]: X_batch = pm.Minibatch(X, batch_size=100) y_batch = pm.Minibatch(y, batch_size=100) pitch_batch = pm.Minibatch(pitch_type, batch_size=100) # In[36]: import theano.tensor as tt from theano import shared with pm.Model() as strike_model: # Baseline probability μ = pm.Normal('μ', 2, sd=5) σ = pm.Exponential('σ', 1) z = pm.Normal('z', 0, 1, shape=len(pitch_enc.classes_)) # Non-centered parameterization θ = pm.Deterministic('θ', μ + z*σ) β = pm.Normal('β', 0, sd=5, shape=X.shape[1]) p = pm.math.invlogit(θ[pitch_batch] + X_batch.dot(β)) likelihood = pm.Bernoulli('likelihood', p, observed=y_batch, total_size=len(y)) # In[37]: with strike_model: fit = pm.fit(30000) # In[38]: plt.plot(fit.hist) # In[39]: strike_samples = fit.sample(2000) # In[40]: pm.forestplot(strike_samples, varnames=['θ'], ylabels=pitch_enc.classes_) # In[41]: pm.forestplot(strike_samples, varnames=['β'], ylabels=analysis_subset.columns) # ## Question 4 # # > Tell us one other interesting conclusion you drew from the dataset. # The difference in strike zones that is apparent in the visualization from question 1 is smaller when it is restricted to the first pitch a given batter. The region of high called strike probability is shifted towards the center for both hands. # In[42]: pts_right = hv.Points(data=takes[(takes.batside=='R') & (takes.balls==0) & (takes.strikes==0)], kdims=['plate_x', 'plate_z'], label='Right-handed batter') pts_left = hv.Points(data=takes[(takes.batside=='L') & (takes.balls==0) & (takes.strikes==0)], kdims=['plate_x', 'plate_z'], label='Left-handed batter') strike_zone = hv.VLine(SZ_RIGHT)*hv.VLine(SZ_LEFT)*hv.HLine(SZ_TOP)*hv.HLine(SZ_BOTTOM) (mean_datashade(pts_right)*strike_zone + mean_datashade(pts_left)*strike_zone).cols(1).redim.range(plate_x=(-4, 4), plate_z=(-1, 8))