import numpy as np import pandas as pd %matplotlib inline from ggplot import *
# Inspect data, the data is pitches tracked over a 2 month stretch in the 2013 # MLB season. baseball = pd.read_csv('./data/baseball-pitches-clean.csv') print baseball.shape, " pitches were tracked." baseball.head()
133601 pitches were tracked.
|0||2013-10-01 20:07:43 -0400||1||Top||Francisco Liriano||Shin-Soo Choo||B||78.97||164.92||93.2||85.3||3.10||1.53||11.01||6.47||0.628||1.547||1.757||50||5.472||-6.862||...|
|1||2013-10-01 20:07:57 -0400||1||Top||Francisco Liriano||Shin-Soo Choo||S||82.40||131.24||93.4||85.6||3.06||1.56||10.14||7.99||0.545||3.069||1.711||50||5.650||-6.693||...|
|2||2013-10-01 20:08:12 -0400||1||Top||Francisco Liriano||Shin-Soo Choo||S||96.14||161.47||89.1||82.8||3.25||1.53||3.11||4.95||0.120||1.826||1.559||50||5.792||-4.763||...|
|3||2013-10-01 20:08:31 -0400||1||Top||Francisco Liriano||Shin-Soo Choo||S||106.44||163.19||90.0||83.3||3.25||1.53||-0.38||2.15||-0.229||1.667||1.172||50||5.832||-3.519||...|
|4||2013-10-01 20:09:09 -0400||1||Top||Francisco Liriano||Ryan Ludwick||B||163.95||194.28||87.7||81.6||3.62||1.78||1.62||1.93||-1.917||0.438||0.194||50||5.578||-5.886||...|
5 rows × 36 columns
Index([u'pitch_time', u'inning', u'top_or_bottom', u'pitcher_name', u'hitter_name', u'pitch_type', u'x', u'y', u'start_speed', u'end_speed', u'sz_top', u'sz_bottom', u'pfx_x', u'pfx_z', u'px', u'pz', u'x0', u'y0', u'ax', u'ay', u'az', u'z0', u'vx0', u'vy0', u'vz0', u'break_y', u'break_angle', u'break_length', u'pitch_name', u'type_confidence', u'zone', u'nasty', u'spin_dir', u'spin_rate', u'comments', u'unk'], dtype='object')
# Which distinct values appear in the pitch_type column?
baseball['pitch_type'].unique()
array(['B', 'S', 'X'], dtype=object)
array(['Fastball', 'Slider', 'Changeup', 'Cut fastball', 'Curveball', 'Fastball (sinker|split-fingered)', 'Knuckleball', 'Eephus'], dtype=object)
# Number of distinct pitcher names in the dataset.
distinct_pitchers = baseball['pitcher_name'].unique()
len(distinct_pitchers)
8 rows × 2 columns
A start speed of 49.4 mph seems very very low, let's investigate this further.
# Find the row(s) whose start_speed equals the column minimum, then show who
# threw them.
slowest_speed = baseball['start_speed'].min(0)
is_slowest = baseball['start_speed'] == slowest_speed
slowest_pitch = baseball[is_slowest]
slowest_pitch['pitcher_name']
51404 Zack Wheeler Name: pitcher_name, dtype: object
zach_wheeler = baseball[baseball['pitcher_name'] == 'Zack Wheeler'] less_than_70 = zach_wheeler[zach_wheeler['start_speed'] < 70] print 'Number of pitches under 70 mph =', len(less_than_70) print 'Mean of Zach Wheeler\'s pitch speeds', round(zach_wheeler['start_speed'].mean(),2), 'MPH.'
Number of pitches under 70 mph = 1 Mean of Zach Wheeler's pitch speeds 90.22 MPH.
OK, so from what we see above, that 49 MPH pitch is definitely an error: there's no way a guy who's throwing 90 MPH on average is going to throw a 49 MPH pitch.
print len(baseball[baseball['start_speed'] < 60]), 'pitches are under 60 mph' # R.A. Dickey is a knuckleballer, one of only ones in the entire league dickey = baseball[baseball['pitcher_name'] == 'R.A. Dickey'] print 'R. A. Dickey has ', len(dickey[dickey['start_speed'] < 60]), 'under 60 mph'
7 pitches are under 60 mph R. A. Dickey has 0 under 60 mph
If Dickey, who's a knuckleballer, isn't throwing anything under 60 MPH, then it's pretty safe to say these pitches under 60 MPH are outliers.
# Boolean mask of pitches that started at 60 mph or faster; rows that fail the
# comparison are dropped from the working frame.
over_60 = baseball['start_speed'].ge(60)
baseball = baseball.loc[over_60]
Now that we've cleaned up the dataset a little, let's start visualizing it.
# Narrow the frame to the columns used in the rest of the analysis.
columns_to_keep = [
    'pitch_time', 'inning', 'pitcher_name', 'hitter_name', 'pitch_type',
    'px', 'pz', 'pitch_name', 'start_speed', 'end_speed', 'type_confidence',
]
# .copy() makes this an independent frame. Without it, the later column
# assignments (baseball['date'], baseball['pitch_count']) are made on a slice
# of the filtered frame and pandas raises SettingWithCopyWarning.
baseball = baseball[columns_to_keep].copy()
baseball.head()
|0||2013-10-01 20:07:43 -0400||1||Francisco Liriano||Shin-Soo Choo||B||0.628||1.547||Fastball||93.2||85.3||0.894|
|1||2013-10-01 20:07:57 -0400||1||Francisco Liriano||Shin-Soo Choo||S||0.545||3.069||Fastball||93.4||85.6||0.895|
|2||2013-10-01 20:08:12 -0400||1||Francisco Liriano||Shin-Soo Choo||S||0.120||1.826||Slider||89.1||82.8||0.931|
|3||2013-10-01 20:08:31 -0400||1||Francisco Liriano||Shin-Soo Choo||S||-0.229||1.667||Slider||90.0||83.3||0.926|
|4||2013-10-01 20:09:09 -0400||1||Francisco Liriano||Ryan Ludwick||B||-1.917||0.438||Slider||87.7||81.6||0.915|
5 rows × 11 columns
# Jittered scatter of every pitch at (px, pz), coloured by pitch name.
p = ggplot(aes(x='px', y='pz', color='pitch_name'), data=baseball)
p = p + geom_jitter()
p
That's a bit hard to see, so let's do a facet wrap.
# Same (px, pz) scatter, but as one blue-point panel per pitch name.
p = ggplot(aes(x='px', y='pz'), data=baseball)
p = p + geom_point(color='blue')
p = p + facet_wrap('pitch_name')
p
Ok so I watch baseball and I've literally never heard of the Eephus pitch. From the graph it looks like it's really unpredictable, but also that there's not much data on it. Let's take a look at the actual counts.
Fastball 68227 Slider 20714 Curveball 13798 Changeup 12900 Fastball (sinker|split-fingered) 10267 Cut fastball 7182 Knuckleball 447 Eephus 59 dtype: int64
# Each pitch name's share of all rows, expressed as a percentage.
pitch_name_counts = baseball['pitch_name'].value_counts()
pitch_name_counts / len(baseball) * 100
Fastball 51.070407 Slider 15.505187 Curveball 10.328308 Changeup 9.656122 Fastball (sinker|split-fingered) 7.685225 Cut fastball 5.375990 Knuckleball 0.334596 Eephus 0.044164 dtype: float64
There are only 59 Eephus pitches thrown in our entire dataset! Compare that with the 447 knuckleballs, which are a rarity in themselves. So what is an Eephus pitch, then?
from IPython.display import YouTubeVideo

# Embed the video with id 'uW0V6OsxDBo' at 600x338.
YouTubeVideo('uW0V6OsxDBo', width=600, height=338)
Let's check out the distribution of start speeds for each pitch type.
# Histogram of start_speed, one panel per pitch name.
p = ggplot(aes(x='start_speed'), data=baseball)
p = p + geom_histogram()
p = p + facet_wrap('pitch_name')
p
This rules out my suspicion that the Eephus pitch is similar to the Knuckleball. It's surprising that the Knuckleball distribution is centered where it is, in the high 70s. Traditionally, Knuckleballs are high-60s pitches. This might be due to R.A. Dickey being the dominant Knuckleball user in today's game. His are known to be faster than most.
# Let's see how many of these Dickey throws knuckles = baseball[baseball['pitch_name'] == 'Knuckleball'] dickey = knuckles[knuckles['pitcher_name'] == 'R.A. Dickey'] print 'Percentage of Knuckleballs belonging to Dickey', (len(dickey) / len(knuckles) * 100)
Percentage of Knuckleballs belonging to Dickey 100
Well it turns out all the Knuckleballs in our dataset are thrown by R.A. Dickey! Well that confirms the suspicion about the Knuckleball speeds.
We saw previously that it was pretty difficult to gain much insight into pitch types aside from general differences. This might be more meaningful if we analyzed a specific pitcher. Let's do Yu Darvish.
Darvish is known for having a wide array of pitches at his disposal and is one of the best current pitchers in baseball so he's a solid choice.
# Pull out Yu Darvish's pitches and show each pitch name's share of them as a
# percentage.
thrown_by_darvish = baseball['pitcher_name'] == 'Yu Darvish'
darvish = baseball[thrown_by_darvish]
darvish_counts = darvish['pitch_name'].value_counts()
darvish_counts / len(darvish) * 100
Fastball 36.311239 Slider 35.446686 Cut fastball 22.334294 Fastball (sinker|split-fingered) 3.314121 Curveball 2.593660 dtype: float64
Darvish's percentage pitch counts are drastically different from the average of the dataset; his approach is far more balanced. Over 50% of the pitches in the dataset are fastballs.
# Darvish's pitches at (px, pz), coloured by pitch name, with a linear smoothing
# line per pitch name on top of the semi-transparent jittered points.
p = (
    ggplot(aes(x='px', y='pz', color='pitch_name'), data=darvish)
    + geom_jitter(alpha=0.3)
    + ggtitle('Darvish Pitch Spread')
    + stat_smooth(method='lm')
)
p
It looks like Darvish's pitches all land in similar locations; looking further at the smoothing lines, we can see that the lines for his top 3 pitches (~94% of the pitches) are very similar.
Looking at the data, it's easy to see why Darvish is such a lethal pitcher. In summary, he has a wide array of pitches, and to the hitter they all look pretty much identical.
# Linear trend of Darvish's start speed across innings, one line per pitch name.
p = (
    ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=darvish)
    + stat_smooth(method='lm', size=5)
)
p
# The individual Darvish pitches behind those trends: start speed by inning,
# drawn as semi-transparent jittered points coloured by pitch name.
p = (
    ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=darvish)
    + geom_jitter(alpha=0.3)
)
p
Apart from his slider, there's no drastic change in pitch speeds. Further, if we take a look at his top 3 pitches (fastball, cut fastball and slider), we see that the distribution of the pitch speeds is consistent, and it stays consistent throughout the entire game.
If a hitter's hope was that Darvish would become weaker over the course of a game, it looks like they're out of luck.
David Price 762 Justin Verlander 755 Chris Tillman 727 Andy Pettitte 718 Ubaldo Jimenez 698 Yu Darvish 694 Jason Vargas 691 Wade Miley 677 Jon Lester 674 J.A. Happ 672 Adam Wainwright 669 Garrett Richards 666 C.J. Wilson 653 Francisco Liriano 653 Gio Gonzalez 649 ... Donnie Joseph 32 Darin Downs 32 Clay Rapada 31 Michael Stutes 30 Scott Rice 27 Tommy Layne 25 Robert Carson 24 Brett Cecil 20 Jeurys Familia 19 Cory Burns 16 Jeff Beliveau 11 Jeremy Affeldt 11 Mike Zagurski 10 Michael Bowden 5 Sam Fuld 5 Length: 513, dtype: int64
# Pull out Justin Verlander's pitches and preview the first rows.
thrown_by_verlander = baseball['pitcher_name'] == 'Justin Verlander'
verlander = baseball[thrown_by_verlander]
verlander.head()
|871||2013-09-29 13:16:29 -0400||1||Justin Verlander||Juan Pierre||B||-1.422||2.909||Fastball||91.8||83.5||2|
|872||2013-09-29 13:16:43 -0400||1||Justin Verlander||Juan Pierre||S||-0.868||2.379||Fastball||91.0||83.1||2|
|873||2013-09-29 13:17:06 -0400||1||Justin Verlander||Juan Pierre||X||0.033||1.891||Fastball||91.5||82.8||2|
|874||2013-09-29 13:17:51 -0400||1||Justin Verlander||Ed Lucas||S||0.670||3.067||Fastball||91.0||82.9||2|
|875||2013-09-29 13:18:06 -0400||1||Justin Verlander||Ed Lucas||S||0.702||1.819||Fastball||90.6||82.9||2|
5 rows × 11 columns
# Each pitch name's share of Verlander's pitches, as a percentage.
verlander_counts = verlander['pitch_name'].value_counts()
verlander_counts / len(verlander) * 100
Fastball 55.761589 Changeup 16.821192 Curveball 14.834437 Slider 12.582781 dtype: float64
Already we can see Verlander is a drastically different pitcher than Darvish, fastballs make up 55% of his routine, Darvish fastballs made up 36% of his routine. Verlander throws his 3 other pitches for around the same amount.
It's interesting to note that 94% of Darvish's routine was made up of fastballs, cut fastballs and sliders. Verlander's 2nd and 3rd pitches are Darvish's 4th and 5th, thrown ~32% of the time vs ~6%.
# Verlander's pitches at (px, pz), coloured by pitch name, with a linear
# smoothing line per pitch name on top of the semi-transparent jittered points.
p = (
    ggplot(aes(x='px', y='pz', color='pitch_name'), data=verlander)
    + geom_jitter(alpha=0.3)
    + ggtitle('Verlander Pitch Spread')
    + stat_smooth(method='lm')
)
p
Verlander's distribution is more predictable than Darvish's. We see that his fastballs end up in the upper portion of the strike zone while the other 3 pitches end up in the lower portion.
The changeup and curveball are similar in terms of their distribution, it would be difficult for a hitter to tell them apart.
All 3 secondary pitches follow the trend that the farther right in the strikezone you go the lower the pitch will likely be.
# Linear trend of Verlander's start speed across innings, one line per pitch
# name.
p = (
    ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=verlander)
    + stat_smooth(method='lm', size=5)
)
p
# The individual Verlander pitches behind those trends: start speed by inning,
# drawn as semi-transparent jittered points coloured by pitch name.
p = (
    ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=verlander)
    + geom_jitter(alpha=0.3)
)
p
Verlander's fastball becomes faster over the course of the game and his changeup slower. We can also see that Verlander isn't as consistent with his pitch speeds as Darvish. He's more consistent during the middle innings.
This makes sense intuitively since in the first couple of innings the pitcher is finding their "groove" and in the latter innings fatigue starts to set in.
I found it weird that Verlander's fastball gets faster over the course of the game. So I decided to compare it to the norm.
# League-wide linear trend of start speed across innings, one line per pitch
# name.
p = (
    ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=baseball)
    + stat_smooth(method='lm', size=5)
    + ggtitle('Pitch Speed vs Innings')
)
p
# League-wide linear trend of start speed across innings, all pitch names
# pooled into a single line.
p = (
    ggplot(aes(x='inning', y='start_speed'), data=baseball)
    + stat_smooth(method='lm', size=5)
    + ggtitle('Pitch speed vs Innings')
)
p
Overall this is super weird, at least to me. Shouldn't the speeds get slower as the game progresses?
A problem with the current approach is that it doesn't take into account switching the pitcher, pitch count would probably be a much better way to measure this.
# Date of each pitch: the first 10 characters of the pitch_time string.
baseball['date'] = baseball['pitch_time'].str[:10]

# Seed every row with 1, then take a running sum within each
# (pitcher, date) group so each pitch gets its ordinal number for that
# pitcher on that date.
# NOTE(review): the numbering follows the frame's row order; this assumes rows
# are chronological within a game. Confirm against the data.
baseball['pitch_count'] = 1
by_pitcher_and_date = baseball.groupby(['pitcher_name', 'date'])
baseball['pitch_count'] = by_pitcher_and_date['pitch_count'].cumsum()
Let's try it again with the pitch counts.
# League-wide linear trend of start speed against pitch_count, one line per
# pitch name.
p = (
    ggplot(aes(x='pitch_count', y='start_speed', color='pitch_name'), data=baseball)
    + stat_smooth(method='lm', size=5)
    + ggtitle('Pitch Speed vs Pitch Count')
)
p