Import libraries
%matplotlib inline
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pylab as plt
import seaborn as sns
import holoviews as hv
import datashader as ds
import holoviews.operation.datashader as hd
from colorcet import fire, rainbow
import warnings
warnings.filterwarnings("ignore", module="theano")
hd.shade.cmap=["lightblue", "darkblue"]
hv.extension("bokeh", "matplotlib")
Import dataset
DATA_DIR = './'
dataset = pd.read_csv(DATA_DIR + 'catcherframe.csv')
dataset.head()
parkid | bluejaysgameid | gamepitchsequence | half | balls | strikes | pitcherid | pitcherhand | batterid | batside | ... | calledstrike | ball | swings | takes | pitchout | intentball | inplay | pitcheventtype | hbp | ump_hp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 556C80A4-54E3-400E-9AD1-426D5FB679BB | 1 | 0 | 0 | 0 | 11198 | R | 171109 | R | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | ball | 0 | 142393.0 |
1 | 5 | 556C80A4-54E3-400E-9AD1-426D5FB679BB | 2 | 0 | 1 | 0 | 11198 | R | 171109 | R | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | called_strike | 0 | 142393.0 |
2 | 5 | 556C80A4-54E3-400E-9AD1-426D5FB679BB | 3 | 0 | 1 | 1 | 11198 | R | 171109 | R | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | ball | 0 | 142393.0 |
3 | 5 | 556C80A4-54E3-400E-9AD1-426D5FB679BB | 4 | 0 | 2 | 1 | 11198 | R | 171109 | R | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | foul | 0 | 142393.0 |
4 | 5 | 556C80A4-54E3-400E-9AD1-426D5FB679BB | 5 | 0 | 2 | 2 | 11198 | R | 171109 | R | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | foul | 0 | 142393.0 |
5 rows × 27 columns
dataset.columns
Index(['parkid', 'bluejaysgameid', 'gamepitchsequence', 'half', 'balls', 'strikes', 'pitcherid', 'pitcherhand', 'batterid', 'batside', 'catcherid', 'pitchtype', 'relspeed', 'hbreak', 'vbreak', 'plate_x', 'plate_z', 'calledstrike', 'ball', 'swings', 'takes', 'pitchout', 'intentball', 'inplay', 'pitcheventtype', 'hbp', 'ump_hp'], dtype='object')
dataset.shape
(342506, 27)
Produce a visualization illustrating how the strike zone differs for right-handed and left-handed batters.
The first step is to extract the pitches that were takes, but neither pitchouts nor intentional balls. This will be the subset I will analyze.
takes = dataset.query('(takes==1) & (pitchout==0) & (intentball==0)').copy()
takes.ball.value_counts()
1 123745 0 59062 Name: ball, dtype: int64
Because the dataset is relatively large, I will use Holoviews (specifically, the DataShader module) to generate a visualization. It performs a binning of points rather than plotting hundreds of thousands of points that cannot be seen.
class mean_datashade(hd.datashade):
aggregator = ds.mean('calledstrike')
cmap=rainbow
hv.util.opts("HLine VLine (color='black' line_width=1 line_dash='dashed' level='annotation')")
I will use these dimensions of the plate. The top and bottom were obtained empirically.
SZ_BOTTOM = 1.59
SZ_TOP = 3.44
SZ_LEFT = -0.708
SZ_RIGHT = 0.708
The heatmap below plots the observed probability of a strike, with "hotter" shades representing higher probabilities. The difference in the location of the highest probability region between batter hand is evident.
pts_right = hv.Points(data=takes[takes.batside=='R'],
kdims=['plate_x', 'plate_z'],
label='Right-handed batter')
pts_left = hv.Points(data=takes[takes.batside=='L'],
kdims=['plate_x', 'plate_z'],
label='Left-handed batter')
strike_zone = hv.VLine(SZ_RIGHT)*hv.VLine(SZ_LEFT)*hv.HLine(SZ_TOP)*hv.HLine(SZ_BOTTOM)
(mean_datashade(pts_right)*strike_zone +
mean_datashade(pts_left)*strike_zone).cols(1).redim.range(plate_x=(-4, 4), plate_z=(-1, 8))
Which catcher was the most effective framer on a rate basis? Briefly describe (a paragraph or
two) the process you used to come to this conclusion. Please include a .csv with every catcherid along with some measurement of framing skill (how you choose to convey this is up to you).
I will use the shapely
package for Python to perform a spatial analysis of the data. Specifically, I want to extract the pitches that were on or near the corners.
The first step is to encode the plate_x
and plate_y
variables as cartesian coordinates. This will be stored as a location
variable.
from shapely.geometry import Point
takes['location'] = list(zip(takes.plate_x, takes.plate_z))
takes['location'] = takes.location.apply(Point)
takes = gpd.GeoDataFrame(takes, geometry='location')
Next, I will create a set of polygons representing areas near the corners of the strike zone. I have chosen a value of 3 inches as the radius around each corner that I will use to filter pitches; this value could be tuned.
from shapely.geometry import MultiPolygon
corners = MultiPolygon([Point(SZ_LEFT, SZ_BOTTOM).buffer(0.25), Point(SZ_LEFT, SZ_TOP).buffer(0.25),
Point(SZ_RIGHT, SZ_TOP).buffer(0.25), Point(SZ_RIGHT, SZ_BOTTOM).buffer(0.25)])
corners
The intersection
method performs a spatial query, flagging points that intersect with any of the four corner polygons. These will be extracted for use in the model.
near_corners = takes[takes.location.intersects(corners)]
pts = hv.Points(data=near_corners, kdims=['plate_x', 'plate_z'])
mean_datashade(pts)
Next, I will filter out the catchers with fewer than 100 pitches in the corners, to avoid overfitting to small samples.
candidate_catchers = near_corners.groupby('catcherid').size().where(lambda x: x>100).dropna().index
near_corners = near_corners[near_corners.catcherid.isin(candidate_catchers)]
n_catchers = len(candidate_catchers)
I want to use the catcher ID as a random effect in a hierarchical model, so I will encode their IDs as consecutive integers.
from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder.
le = LabelEncoder()
near_corners['catcher_ind'] = catcher_ind = le.fit_transform(near_corners.catcherid)
/home/fonnesbeck/anaconda3/envs/dev/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """
strike=near_corners.calledstrike.values
Here is my statistical model. It is a simple random effect model that partially pools catchers to provide realistic estimates of the probability of a strike.
import pymc3 as pm
with pm.Model() as framing_model:
# Baseline probability
μ = pm.Normal('μ', 2, sd=5)
σ = pm.Exponential('σ', 1)
z = pm.Normal('z', 0, 1, shape=n_catchers)
# Non-centered parameterization
θ = pm.Deterministic('θ', μ + z*σ)
# Called strike rate as function of distance to strike zone
p = pm.math.invlogit(θ[catcher_ind])
called_strike = pm.Bernoulli('called_strike', p, observed=strike)
with framing_model:
samples = pm.sample(1000, tune=1000, njobs=2)
INFO:pymc3:Auto-assigning NUTS sampler... INFO:pymc3:Initializing NUTS using jitter+adapt_diag... INFO:pymc3:Multiprocess sampling (2 chains in 2 jobs) INFO:pymc3:NUTS: [z, σ, μ] Sampling 2 chains: 100%|██████████| 4000/4000 [01:49<00:00, 58.81draws/s]
These are the framing scores for all catchers, on the inverse-logit scale.
plt.figure(figsize=(8, 14))
pm.forestplot(samples, varnames=['θ'])
GridSpec(1, 2, width_ratios=[3, 1])
We can sort them based on their framing scores, and plot the best and poorest framers.
sorted_catchers = np.argsort(samples['θ'].mean(0))
fig, axes = plt.subplots(2, 5, figsize=(14,6), sharex=True, sharey=True)
legend = 'full'
for ax,ind in zip(axes.ravel(), sorted_catchers[:5].tolist() + sorted_catchers[-5:].tolist()):
sns.scatterplot(x='plate_x', y='plate_z', hue='calledstrike', ax=ax, legend=legend, alpha=0.6,
data=near_corners[near_corners.catcher_ind==ind])
legend = False
Export estimates to .csv
called_strike_probs = pd.Series(1/(1 + np.exp(-samples['θ'].mean(0))), index=le.classes_).sort_values()
called_strike_probs.index.name = 'catcher_id'
called_strike_probs.name = 'called_strike_prob'
called_strike_probs.to_csv('called_strike_probs.csv')
A coach asks you how the attributes of a pitch affect its likelihood of producing a called strike. In
one sentence, how would you respond?
Based on the data at hand, there does not appear to be a strong relationship between the available pitch attributes and the probability of getting a called strike, though there is a non-significant tendency for the called strike probability to to increase with lower than average release velocity and vertical break, and higher than average horizontal break.
from shapely.geometry import Polygon
strike_zone_poly = Polygon(((SZ_LEFT, SZ_BOTTOM), (SZ_LEFT, SZ_TOP), (SZ_RIGHT, SZ_TOP), (SZ_RIGHT, SZ_BOTTOM)))
I'm restricting the dataset to pitches within 6 inches of the strike zone
takes['distance_to_zone'] = takes.location.distance(strike_zone_poly)
within_6in = takes.query('distance_to_zone<0.25').copy()
pts = hv.Points(data=within_6in, kdims=['plate_x', 'plate_z'])
mean_datashade(pts)
analysis_subset = within_6in.loc[~within_6in.pitchtype.isin(['UN', 'KN']),
['calledstrike', 'pitchtype', 'relspeed', 'hbreak', 'vbreak']].copy()
#analysis_subset['left_handed'] = (analysis_subset.pitcherhand=='L').astype(int)
analysis_subset['relspeed'] = analysis_subset.relspeed - analysis_subset.relspeed.mean()
analysis_subset['vbreak'] = analysis_subset.vbreak - analysis_subset.vbreak.mean()
analysis_subset['hbreak'] = analysis_subset.hbreak - analysis_subset.hbreak.mean()
#analysis_subset['break_by_hand'] = analysis_subset.hbreak*analysis_subset.left_handed
#analysis_subset['pitch_type'] = pitch_type
analysis_subset.head()
calledstrike | pitchtype | relspeed | hbreak | vbreak | |
---|---|---|---|---|---|
1 | 1 | FA | 5.864484 | 11.604017 | 4.251008 |
7 | 0 | FA | 5.936084 | 12.206017 | 4.655908 |
8 | 1 | FA | 6.563284 | 11.882917 | 4.131908 |
12 | 0 | SL | -2.611216 | -3.426583 | -6.429892 |
17 | 1 | SL | -0.503816 | -3.362583 | -1.029592 |
analysis_subset.groupby(['calledstrike']).mean()
relspeed | hbreak | vbreak | |
---|---|---|---|
calledstrike | |||
0 | 0.203169 | -0.082841 | 0.038795 |
1 | -0.092355 | 0.037657 | -0.017635 |
y = analysis_subset.pop('calledstrike').values
pitch_type = analysis_subset.pop('pitchtype').values
X = analysis_subset.values
Encode pitch types
from sklearn.preprocessing import LabelEncoder
pitch_enc = LabelEncoder().fit(pitch_type)
pitch_type = pitch_enc.transform(pitch_type)
pitch_enc.classes_
array(['CH', 'CU', 'FA', 'FC', 'FS', 'SI', 'SL'], dtype=object)
I will use minibatches since the dataset is large
X_batch = pm.Minibatch(X, batch_size=100)
y_batch = pm.Minibatch(y, batch_size=100)
pitch_batch = pm.Minibatch(pitch_type, batch_size=100)
/home/fonnesbeck/repos/pymc3/pymc3/data.py:244: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. self.shared = theano.shared(data[in_memory_slc]) /home/fonnesbeck/repos/pymc3/pymc3/data.py:244: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. self.shared = theano.shared(data[in_memory_slc])
import theano.tensor as tt
from theano import shared
with pm.Model() as strike_model:
# Baseline probability
μ = pm.Normal('μ', 2, sd=5)
σ = pm.Exponential('σ', 1)
z = pm.Normal('z', 0, 1, shape=len(pitch_enc.classes_))
# Non-centered parameterization
θ = pm.Deterministic('θ', μ + z*σ)
β = pm.Normal('β', 0, sd=5, shape=X.shape[1])
p = pm.math.invlogit(θ[pitch_batch] + X_batch.dot(β))
likelihood = pm.Bernoulli('likelihood', p, observed=y_batch, total_size=len(y))
with strike_model:
fit = pm.fit(30000)
Average Loss = 62.294: 100%|██████████| 30000/30000 [01:19<00:00, 375.03it/s] INFO:pymc3.variational.inference:Finished [100%]: Average Loss = 62.307
plt.plot(fit.hist)
[<matplotlib.lines.Line2D at 0x7cc325d69a58>]
strike_samples = fit.sample(2000)
pm.forestplot(strike_samples, varnames=['θ'], ylabels=pitch_enc.classes_)
GridSpec(1, 1)
pm.forestplot(strike_samples, varnames=['β'], ylabels=analysis_subset.columns)
GridSpec(1, 1)
Tell us one other interesting conclusion you drew from the dataset.
The difference in strike zones that is apparent in the visualization from question 1 is smaller when it is restricted to the first pitch a given batter. The region of high called strike probability is shifted towards the center for both hands.
pts_right = hv.Points(data=takes[(takes.batside=='R') & (takes.balls==0) & (takes.strikes==0)],
kdims=['plate_x', 'plate_z'],
label='Right-handed batter')
pts_left = hv.Points(data=takes[(takes.batside=='L') & (takes.balls==0) & (takes.strikes==0)],
kdims=['plate_x', 'plate_z'],
label='Left-handed batter')
strike_zone = hv.VLine(SZ_RIGHT)*hv.VLine(SZ_LEFT)*hv.HLine(SZ_TOP)*hv.HLine(SZ_BOTTOM)
(mean_datashade(pts_right)*strike_zone +
mean_datashade(pts_left)*strike_zone).cols(1).redim.range(plate_x=(-4, 4), plate_z=(-1, 8))