in this project we will be working with a file from Major League Baseball games from Retrosheet. Retrosheet compile detailed statistics on baseball games from the 1800s through to today. The main file we will be working from game_log.csv, has been produced by combining 127 separate CSV files from retrosheet. The game log has hundreds of data points on each game which we will normalize into several separate table using SQL, providing a robust database of game-level statistics.
In addition to the main file, we have also included three 'helfer' files, also sourced from Retrosheet:
Let's explore the data
import sqlite3
import pandas as pd
import csv
#to prevent the Dataframe output from being truncated, given the size of the main game log file
pd.set_option('max_columns', 180)
pd.set_option('max_rows', 200000)
pd.set_option('max_colwidth', 5000)
game = pd.read_csv("game_log.csv", low_memory=False)
print(game.shape)
game.head()
(171907, 161)
date | number_of_game | day_of_week | v_name | v_league | v_game_number | h_name | h_league | h_game_number | v_score | h_score | length_outs | day_night | completion | forefeit | protest | park_id | attendance | length_minutes | v_line_score | h_line_score | v_at_bats | v_hits | v_doubles | v_triples | v_homeruns | v_rbi | v_sacrifice_hits | v_sacrifice_flies | v_hit_by_pitch | v_walks | v_intentional_walks | v_strikeouts | v_stolen_bases | v_caught_stealing | v_grounded_into_double | v_first_catcher_interference | v_left_on_base | v_pitchers_used | v_individual_earned_runs | v_team_earned_runs | v_wild_pitches | v_balks | v_putouts | v_assists | v_errors | v_passed_balls | v_double_plays | v_triple_plays | h_at_bats | h_hits | h_doubles | h_triples | h_homeruns | h_rbi | h_sacrifice_hits | h_sacrifice_flies | h_hit_by_pitch | h_walks | h_intentional_walks | h_strikeouts | h_stolen_bases | h_caught_stealing | h_grounded_into_double | h_first_catcher_interference | h_left_on_base | h_pitchers_used | h_individual_earned_runs | h_team_earned_runs | h_wild_pitches | h_balks | h_putouts | h_assists | h_errors | h_passed_balls | h_double_plays | h_triple_plays | hp_umpire_id | hp_umpire_name | 1b_umpire_id | 1b_umpire_name | 2b_umpire_id | 2b_umpire_name | 3b_umpire_id | 3b_umpire_name | lf_umpire_id | lf_umpire_name | rf_umpire_id | rf_umpire_name | v_manager_id | v_manager_name | h_manager_id | h_manager_name | winning_pitcher_id | winning_pitcher_name | losing_pitcher_id | losing_pitcher_name | saving_pitcher_id | saving_pitcher_name | winning_rbi_batter_id | winning_rbi_batter_id_name | v_starting_pitcher_id | v_starting_pitcher_name | h_starting_pitcher_id | h_starting_pitcher_name | v_player_1_id | v_player_1_name | v_player_1_def_pos | v_player_2_id | v_player_2_name | v_player_2_def_pos | v_player_3_id | v_player_3_name | v_player_3_def_pos | v_player_4_id | v_player_4_name | v_player_4_def_pos | v_player_5_id | v_player_5_name | v_player_5_def_pos | v_player_6_id | v_player_6_name | v_player_6_def_pos | v_player_7_id | v_player_7_name | v_player_7_def_pos | v_player_8_id | v_player_8_name | v_player_8_def_pos | v_player_9_id | v_player_9_name | v_player_9_def_pos | h_player_1_id | h_player_1_name | h_player_1_def_pos | h_player_2_id | h_player_2_name | h_player_2_def_pos | h_player_3_id | h_player_3_name | h_player_3_def_pos | h_player_4_id | h_player_4_name | h_player_4_def_pos | h_player_5_id | h_player_5_name | h_player_5_def_pos | h_player_6_id | h_player_6_name | h_player_6_def_pos | h_player_7_id | h_player_7_name | h_player_7_def_pos | h_player_8_id | h_player_8_name | h_player_8_def_pos | h_player_9_id | h_player_9_name | h_player_9_def_pos | additional_info | acquisition_info | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18710504 | 0 | Thu | CL1 | NaN | 1 | FW1 | NaN | 1 | 0 | 2 | 54.0 | D | NaN | NaN | NaN | FOR01 | 200.0 | 120.0 | 000000000 | 010010000 | 30.0 | 4.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 6.0 | 1.0 | NaN | -1.0 | NaN | 4.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 27.0 | 9.0 | 0.0 | 3.0 | 0.0 | 0.0 | 31.0 | 4.0 | 1.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 0.0 | 0.0 | NaN | -1.0 | NaN | 3.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 27.0 | 3.0 | 3.0 | 1.0 | 1.0 | 0.0 | boakj901 | John Boake | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | paboc101 | Charlie Pabor | lennb101 | Bill Lennon | mathb101 | Bobby Mathews | prata101 | Al Pratt | NaN | NaN | NaN | NaN | prata101 | Al Pratt | mathb101 | Bobby Mathews | whitd102 | Deacon White | 2.0 | kimbg101 | Gene Kimball | 4.0 | paboc101 | Charlie Pabor | 7.0 | allia101 | Art Allison | 8.0 | white104 | Elmer White | 9.0 | prata101 | Al Pratt | 1.0 | sutte101 | Ezra Sutton | 5.0 | carlj102 | Jim Carleton | 3.0 | bassj101 | John Bass | 6.0 | selmf101 | Frank Sellman | 5.0 | mathb101 | Bobby Mathews | 1.0 | foraj101 | Jim Foran | 3.0 | goldw101 | Wally Goldsmith | 6.0 | lennb101 | Bill Lennon | 2.0 | caret101 | Tom Carey | 4.0 | mince101 | Ed Mincher | 7.0 | mcdej101 | James McDermott | 8.0 | kellb105 | Bill Kelly | 9.0 | NaN | Y |
1 | 18710505 | 0 | Fri | BS1 | NaN | 1 | WS3 | NaN | 1 | 20 | 18 | 54.0 | D | NaN | NaN | NaN | WAS01 | 5000.0 | 145.0 | 107000435 | 640113030 | 41.0 | 13.0 | 1.0 | 2.0 | 0.0 | 13.0 | 0.0 | 0.0 | 0.0 | 18.0 | NaN | 5.0 | 3.0 | NaN | -1.0 | NaN | 12.0 | 1.0 | 6.0 | 6.0 | 1.0 | 0.0 | 27.0 | 13.0 | 10.0 | 1.0 | 2.0 | 0.0 | 49.0 | 14.0 | 2.0 | 0.0 | 0.0 | 11.0 | 0.0 | 0.0 | 0.0 | 10.0 | NaN | 2.0 | 1.0 | NaN | -1.0 | NaN | 14.0 | 1.0 | 7.0 | 7.0 | 0.0 | 0.0 | 27.0 | 20.0 | 10.0 | 2.0 | 3.0 | 0.0 | dobsh901 | Henry Dobson | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | wrigh101 | Harry Wright | younn801 | Nick Young | spala101 | Al Spalding | braia102 | Asa Brainard | NaN | NaN | NaN | NaN | spala101 | Al Spalding | braia102 | Asa Brainard | wrigg101 | George Wright | 6.0 | barnr102 | Ross Barnes | 4.0 | birdd102 | Dave Birdsall | 9.0 | mcvec101 | Cal McVey | 2.0 | wrigh101 | Harry Wright | 8.0 | goulc101 | Charlie Gould | 3.0 | schah101 | Harry Schafer | 5.0 | conef101 | Fred Cone | 7.0 | spala101 | Al Spalding | 1.0 | watef102 | Fred Waterman | 5.0 | forcd101 | Davy Force | 6.0 | mille105 | Everett Mills | 3.0 | allid101 | Doug Allison | 2.0 | hallg101 | George Hall | 7.0 | leona101 | Andy Leonard | 4.0 | braia102 | Asa Brainard | 1.0 | burrh101 | Henry Burroughs | 9.0 | berth101 | Henry Berthrong | 8.0 | HTBF | Y |
2 | 18710506 | 0 | Sat | CL1 | NaN | 2 | RC1 | NaN | 1 | 12 | 4 | 54.0 | D | NaN | NaN | NaN | RCK01 | 1000.0 | 140.0 | 610020003 | 010020100 | 49.0 | 11.0 | 1.0 | 1.0 | 0.0 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 1.0 | 0.0 | NaN | -1.0 | NaN | 10.0 | 1.0 | 0.0 | 0.0 | 2.0 | 0.0 | 27.0 | 12.0 | 8.0 | 5.0 | 0.0 | 0.0 | 36.0 | 7.0 | 2.0 | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 3.0 | 5.0 | NaN | -1.0 | NaN | 5.0 | 1.0 | 3.0 | 3.0 | 1.0 | 0.0 | 27.0 | 12.0 | 13.0 | 3.0 | 0.0 | 0.0 | mawnj901 | J.H. Manny | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | paboc101 | Charlie Pabor | hasts101 | Scott Hastings | prata101 | Al Pratt | fishc102 | Cherokee Fisher | NaN | NaN | NaN | NaN | prata101 | Al Pratt | fishc102 | Cherokee Fisher | whitd102 | Deacon White | 2.0 | kimbg101 | Gene Kimball | 4.0 | paboc101 | Charlie Pabor | 7.0 | allia101 | Art Allison | 8.0 | white104 | Elmer White | 9.0 | prata101 | Al Pratt | 1.0 | sutte101 | Ezra Sutton | 5.0 | carlj102 | Jim Carleton | 3.0 | bassj101 | John Bass | 6.0 | mackd101 | Denny Mack | 3.0 | addyb101 | Bob Addy | 4.0 | fishc102 | Cherokee Fisher | 1.0 | hasts101 | Scott Hastings | 8.0 | ham-r101 | Ralph Ham | 5.0 | ansoc101 | Cap Anson | 2.0 | sagep101 | Pony Sager | 6.0 | birdg101 | George Bird | 7.0 | stirg101 | Gat Stires | 9.0 | NaN | Y |
3 | 18710508 | 0 | Mon | CL1 | NaN | 3 | CH1 | NaN | 1 | 12 | 14 | 54.0 | D | NaN | NaN | NaN | CHI01 | 5000.0 | 150.0 | 101403111 | 077000000 | 46.0 | 15.0 | 2.0 | 1.0 | 2.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 1.0 | 0.0 | NaN | -1.0 | NaN | 7.0 | 1.0 | 6.0 | 6.0 | 0.0 | 0.0 | 27.0 | 15.0 | 11.0 | 6.0 | 0.0 | 0.0 | 43.0 | 11.0 | 2.0 | 0.0 | 0.0 | 8.0 | 0.0 | 0.0 | 0.0 | 4.0 | NaN | 2.0 | 1.0 | NaN | -1.0 | NaN | 6.0 | 1.0 | 4.0 | 4.0 | 0.0 | 0.0 | 27.0 | 14.0 | 7.0 | 2.0 | 0.0 | 0.0 | willg901 | Gardner Willard | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | paboc101 | Charlie Pabor | woodj106 | Jimmy Wood | zettg101 | George Zettlein | prata101 | Al Pratt | NaN | NaN | NaN | NaN | prata101 | Al Pratt | zettg101 | George Zettlein | whitd102 | Deacon White | 2.0 | kimbg101 | Gene Kimball | 4.0 | paboc101 | Charlie Pabor | 7.0 | allia101 | Art Allison | 8.0 | white104 | Elmer White | 9.0 | prata101 | Al Pratt | 1.0 | sutte101 | Ezra Sutton | 5.0 | carlj102 | Jim Carleton | 3.0 | bassj101 | John Bass | 6.0 | mcatb101 | Bub McAtee | 3.0 | kingm101 | Marshall King | 8.0 | hodec101 | Charlie Hodes | 2.0 | woodj106 | Jimmy Wood | 4.0 | simmj101 | Joe Simmons | 9.0 | folet101 | Tom Foley | 7.0 | duffe101 | Ed Duffy | 6.0 | pinke101 | Ed Pinkham | 5.0 | zettg101 | George Zettlein | 1.0 | NaN | Y |
4 | 18710509 | 0 | Tue | BS1 | NaN | 2 | TRO | NaN | 1 | 9 | 5 | 54.0 | D | NaN | NaN | NaN | TRO01 | 3250.0 | 145.0 | 000002232 | 101003000 | 46.0 | 17.0 | 4.0 | 1.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 2.0 | NaN | 0.0 | 1.0 | NaN | -1.0 | NaN | 12.0 | 1.0 | 2.0 | 2.0 | 0.0 | 0.0 | 27.0 | 12.0 | 5.0 | 0.0 | 1.0 | 0.0 | 36.0 | 9.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 3.0 | NaN | 0.0 | 2.0 | NaN | -1.0 | NaN | 7.0 | 1.0 | 3.0 | 3.0 | 1.0 | 0.0 | 27.0 | 11.0 | 7.0 | 3.0 | 0.0 | 0.0 | leroi901 | Isaac Leroy | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | wrigh101 | Harry Wright | pikel101 | Lip Pike | spala101 | Al Spalding | mcmuj101 | John McMullin | NaN | NaN | NaN | NaN | spala101 | Al Spalding | mcmuj101 | John McMullin | wrigg101 | George Wright | 6.0 | barnr102 | Ross Barnes | 4.0 | birdd102 | Dave Birdsall | 9.0 | mcvec101 | Cal McVey | 2.0 | wrigh101 | Harry Wright | 8.0 | goulc101 | Charlie Gould | 3.0 | schah101 | Harry Schafer | 5.0 | conef101 | Fred Cone | 7.0 | spala101 | Al Spalding | 1.0 | flync101 | Clipper Flynn | 9.0 | mcgem101 | Mike McGeary | 2.0 | yorkt101 | Tom York | 8.0 | mcmuj101 | John McMullin | 1.0 | kings101 | Steve King | 7.0 | beave101 | Edward Beavens | 4.0 | bells101 | Steve Bellan | 5.0 | pikel101 | Lip Pike | 3.0 | cravb101 | Bill Craver | 6.0 | HTBF | Y |
game.tail()
date | number_of_game | day_of_week | v_name | v_league | v_game_number | h_name | h_league | h_game_number | v_score | h_score | length_outs | day_night | completion | forefeit | protest | park_id | attendance | length_minutes | v_line_score | h_line_score | v_at_bats | v_hits | v_doubles | v_triples | v_homeruns | v_rbi | v_sacrifice_hits | v_sacrifice_flies | v_hit_by_pitch | v_walks | v_intentional_walks | v_strikeouts | v_stolen_bases | v_caught_stealing | v_grounded_into_double | v_first_catcher_interference | v_left_on_base | v_pitchers_used | v_individual_earned_runs | v_team_earned_runs | v_wild_pitches | v_balks | v_putouts | v_assists | v_errors | v_passed_balls | v_double_plays | v_triple_plays | h_at_bats | h_hits | h_doubles | h_triples | h_homeruns | h_rbi | h_sacrifice_hits | h_sacrifice_flies | h_hit_by_pitch | h_walks | h_intentional_walks | h_strikeouts | h_stolen_bases | h_caught_stealing | h_grounded_into_double | h_first_catcher_interference | h_left_on_base | h_pitchers_used | h_individual_earned_runs | h_team_earned_runs | h_wild_pitches | h_balks | h_putouts | h_assists | h_errors | h_passed_balls | h_double_plays | h_triple_plays | hp_umpire_id | hp_umpire_name | 1b_umpire_id | 1b_umpire_name | 2b_umpire_id | 2b_umpire_name | 3b_umpire_id | 3b_umpire_name | lf_umpire_id | lf_umpire_name | rf_umpire_id | rf_umpire_name | v_manager_id | v_manager_name | h_manager_id | h_manager_name | winning_pitcher_id | winning_pitcher_name | losing_pitcher_id | losing_pitcher_name | saving_pitcher_id | saving_pitcher_name | winning_rbi_batter_id | winning_rbi_batter_id_name | v_starting_pitcher_id | v_starting_pitcher_name | h_starting_pitcher_id | h_starting_pitcher_name | v_player_1_id | v_player_1_name | v_player_1_def_pos | v_player_2_id | v_player_2_name | v_player_2_def_pos | v_player_3_id | v_player_3_name | v_player_3_def_pos | v_player_4_id | v_player_4_name | v_player_4_def_pos | v_player_5_id | v_player_5_name | v_player_5_def_pos | v_player_6_id | v_player_6_name | v_player_6_def_pos | v_player_7_id | v_player_7_name | v_player_7_def_pos | v_player_8_id | v_player_8_name | v_player_8_def_pos | v_player_9_id | v_player_9_name | v_player_9_def_pos | h_player_1_id | h_player_1_name | h_player_1_def_pos | h_player_2_id | h_player_2_name | h_player_2_def_pos | h_player_3_id | h_player_3_name | h_player_3_def_pos | h_player_4_id | h_player_4_name | h_player_4_def_pos | h_player_5_id | h_player_5_name | h_player_5_def_pos | h_player_6_id | h_player_6_name | h_player_6_def_pos | h_player_7_id | h_player_7_name | h_player_7_def_pos | h_player_8_id | h_player_8_name | h_player_8_def_pos | h_player_9_id | h_player_9_name | h_player_9_def_pos | additional_info | acquisition_info | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
171902 | 20161002 | 0 | Sun | MIL | NL | 162 | COL | NL | 162 | 6 | 4 | 60.0 | D | NaN | NaN | NaN | DEN02 | 27762.0 | 203.0 | 0200000202 | 1100100010 | 39.0 | 10.0 | 4.0 | 1.0 | 2.0 | 6.0 | 0.0 | 0.0 | 1.0 | 4.0 | 0.0 | 12.0 | 2.0 | 1.0 | 0.0 | 0.0 | 8.0 | 7.0 | 4.0 | 4.0 | 1.0 | 0.0 | 30.0 | 12.0 | 1.0 | 0.0 | 0.0 | 0.0 | 41.0 | 13.0 | 4.0 | 0.0 | 1.0 | 4.0 | 1.0 | 0.0 | 1.0 | 3.0 | 0.0 | 11.0 | 0.0 | 1.0 | 0.0 | 0.0 | 12.0 | 5.0 | 6.0 | 6.0 | 0.0 | 0.0 | 30.0 | 13.0 | 0.0 | 0.0 | 0.0 | 0.0 | barrs901 | Scott Barry | woodt901 | Tom Woodring | randt901 | Tony Randazzo | ortir901 | Roberto Ortiz | NaN | NaN | NaN | NaN | counc001 | Craig Counsell | weisw001 | Walt Weiss | thort001 | Tyler Thornburg | rusic001 | Chris Rusin | knebc001 | Corey Knebel | susaa001 | Andrew Susac | cravt001 | Tyler Cravy | marqg001 | German Marquez | villj001 | Jonathan Villar | 5.0 | genns001 | Scooter Gennett | 4.0 | cartc002 | Chris Carter | 3.0 | santd002 | Domingo Santana | 9.0 | pereh001 | Hernan Perez | 8.0 | arcio002 | Orlando Arcia | 6.0 | susaa001 | Andrew Susac | 2.0 | elmoj001 | Jake Elmore | 7.0 | cravt001 | Tyler Cravy | 1.0 | blacc001 | Charlie Blackmon | 8.0 | dahld001 | David Dahl | 7.0 | arenn001 | Nolan Arenado | 5.0 | gonzc001 | Carlos Gonzalez | 9.0 | murpt002 | Tom Murphy | 2.0 | pattj005 | Jordan Patterson | 3.0 | valap001 | Pat Valaika | 4.0 | adamc001 | Cristhian Adames | 6.0 | marqg001 | German Marquez | 1.0 | NaN | Y |
171903 | 20161002 | 0 | Sun | NYN | NL | 162 | PHI | NL | 162 | 2 | 5 | 51.0 | D | NaN | NaN | NaN | PHI13 | 36935.0 | 159.0 | 000001100 | 00100031x | 33.0 | 8.0 | 3.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 9.0 | 1.0 | 1.0 | 1.0 | 0.0 | 6.0 | 6.0 | 3.0 | 3.0 | 0.0 | 0.0 | 24.0 | 12.0 | 3.0 | 1.0 | 2.0 | 0.0 | 33.0 | 10.0 | 1.0 | 0.0 | 0.0 | 3.0 | 0.0 | 1.0 | 0.0 | 2.0 | 0.0 | 3.0 | 0.0 | 0.0 | 2.0 | 0.0 | 7.0 | 5.0 | 2.0 | 2.0 | 0.0 | 0.0 | 27.0 | 7.0 | 0.0 | 0.0 | 1.0 | 0.0 | barkl901 | Lance Barksdale | herna901 | Angel Hernandez | barrt901 | Ted Barrett | littw901 | Will Little | NaN | NaN | NaN | NaN | collt801 | Terry Collins | mackp101 | Pete Mackanin | murrc002 | Colton Murray | goede001 | Erik Goeddel | nerih001 | Hector Neris | hernc005 | Cesar Hernandez | ynoag001 | Gabriel Ynoa | eickj001 | Jerad Eickhoff | granc001 | Curtis Granderson | 8.0 | cabra002 | Asdrubal Cabrera | 6.0 | brucj001 | Jay Bruce | 9.0 | dudal001 | Lucas Duda | 3.0 | johnk003 | Kelly Johnson | 4.0 | confm001 | Michael Conforto | 7.0 | campe001 | Eric Campbell | 5.0 | plawk001 | Kevin Plawecki | 2.0 | ynoag001 | Gabriel Ynoa | 1.0 | hernc005 | Cesar Hernandez | 4.0 | parej002 | Jimmy Paredes | 7.0 | herro001 | Odubel Herrera | 8.0 | franm004 | Maikel Franco | 5.0 | howar001 | Ryan Howard | 3.0 | ruppc001 | Cameron Rupp | 2.0 | blana001 | Andres Blanco | 6.0 | altha001 | Aaron Altherr | 9.0 | eickj001 | Jerad Eickhoff | 1.0 | NaN | Y |
171904 | 20161002 | 0 | Sun | LAN | NL | 162 | SFN | NL | 162 | 1 | 7 | 51.0 | D | NaN | NaN | NaN | SFO03 | 41445.0 | 184.0 | 000100000 | 23000002x | 30.0 | 4.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 7.0 | 0.0 | 0.0 | 1.0 | 0.0 | 4.0 | 7.0 | 7.0 | 7.0 | 0.0 | 0.0 | 24.0 | 5.0 | 1.0 | 0.0 | 0.0 | 0.0 | 39.0 | 16.0 | 3.0 | 1.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1.0 | 11.0 | 2.0 | 1.0 | 0.0 | 0.0 | 12.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 27.0 | 7.0 | 0.0 | 0.0 | 1.0 | 0.0 | knigb901 | Brian Knight | westj901 | Joe West | fleta901 | Andy Fletcher | danlk901 | Kerwin Danley | NaN | NaN | NaN | NaN | robed001 | Dave Roberts | bochb002 | Bruce Bochy | moorm003 | Matt Moore | maedk001 | Kenta Maeda | NaN | NaN | poseb001 | Buster Posey | maedk001 | Kenta Maeda | moorm003 | Matt Moore | kendh001 | Howie Kendrick | 7.0 | turnj001 | Justin Turner | 5.0 | seagc001 | Corey Seager | 6.0 | puigy001 | Yasiel Puig | 9.0 | gonza003 | Adrian Gonzalez | 3.0 | grany001 | Yasmani Grandal | 2.0 | pedej001 | Joc Pederson | 8.0 | utlec001 | Chase Utley | 4.0 | maedk001 | Kenta Maeda | 1.0 | spand001 | Denard Span | 8.0 | beltb001 | Brandon Belt | 3.0 | poseb001 | Buster Posey | 2.0 | pench001 | Hunter Pence | 9.0 | crawb001 | Brandon Crawford | 6.0 | pagaa001 | Angel Pagan | 7.0 | panij002 | Joe Panik | 4.0 | gillc001 | Conor Gillaspie | 5.0 | moorm003 | Matt Moore | 1.0 | NaN | Y |
171905 | 20161002 | 0 | Sun | PIT | NL | 162 | SLN | NL | 162 | 4 | 10 | 51.0 | D | NaN | NaN | NaN | STL10 | 44615.0 | 192.0 | 000020200 | 00100360x | 35.0 | 9.0 | 0.0 | 0.0 | 1.0 | 4.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 11.0 | 0.0 | 1.0 | 0.0 | 0.0 | 8.0 | 6.0 | 8.0 | 8.0 | 0.0 | 0.0 | 24.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 36.0 | 12.0 | 2.0 | 0.0 | 1.0 | 10.0 | 0.0 | 2.0 | 0.0 | 4.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 | 3.0 | 4.0 | 4.0 | 0.0 | 0.0 | 27.0 | 7.0 | 0.0 | 0.0 | 1.0 | 0.0 | cuzzp901 | Phil Cuzzi | ticht901 | Todd Tichenor | vanol901 | Larry Vanover | marqa901 | Alfonso Marquez | NaN | NaN | NaN | NaN | hurdc001 | Clint Hurdle | mathm001 | Mike Matheny | broxj001 | Jonathan Broxton | nicaj001 | Juan Nicasio | NaN | NaN | piscs001 | Stephen Piscotty | voger001 | Ryan Vogelsong | waina001 | Adam Wainwright | jasoj001 | John Jaso | 3.0 | polag001 | Gregory Polanco | 9.0 | mccua001 | Andrew McCutchen | 8.0 | kangj001 | Jung Ho Kang | 5.0 | joycm001 | Matt Joyce | 7.0 | hansa001 | Alen Hanson | 4.0 | fryee001 | Eric Fryer | 2.0 | florp001 | Pedro Florimon | 6.0 | voger001 | Ryan Vogelsong | 1.0 | carpm002 | Matt Carpenter | 3.0 | diaza003 | Aledmys Diaz | 6.0 | moliy001 | Yadier Molina | 2.0 | piscs001 | Stephen Piscotty | 9.0 | peraj001 | Jhonny Peralta | 5.0 | mossb001 | Brandon Moss | 7.0 | gyorj001 | Jedd Gyorko | 4.0 | gricr001 | Randal Grichuk | 8.0 | waina001 | Adam Wainwright | 1.0 | NaN | Y |
171906 | 20161002 | 0 | Sun | MIA | NL | 161 | WAS | NL | 162 | 7 | 10 | 51.0 | D | NaN | NaN | NaN | WAS11 | 28730.0 | 216.0 | 000230020 | 03023002x | 38.0 | 14.0 | 1.0 | 1.0 | 2.0 | 7.0 | 1.0 | 0.0 | 0.0 | 3.0 | 2.0 | 10.0 | 1.0 | 1.0 | 1.0 | 0.0 | 8.0 | 7.0 | 10.0 | 10.0 | 1.0 | 0.0 | 24.0 | 11.0 | 0.0 | 0.0 | 1.0 | 0.0 | 30.0 | 10.0 | 2.0 | 0.0 | 1.0 | 10.0 | 1.0 | 1.0 | 1.0 | 8.0 | 0.0 | 3.0 | 2.0 | 0.0 | 1.0 | 0.0 | 7.0 | 6.0 | 7.0 | 7.0 | 1.0 | 0.0 | 27.0 | 11.0 | 0.0 | 0.0 | 1.0 | 0.0 | tumpj901 | John Tumpane | porta901 | Alan Porter | onorb901 | Brian O'Nora | kellj901 | Jeff Kellogg | NaN | NaN | NaN | NaN | mattd001 | Don Mattingly | baked002 | Dusty Baker | schem001 | Max Scherzer | brica001 | Austin Brice | melam001 | Mark Melancon | difow001 | Wilmer Difo | koeht001 | Tom Koehler | schem001 | Max Scherzer | gordd002 | Dee Gordon | 4.0 | telit001 | Tomas Telis | 2.0 | pradm001 | Martin Prado | 5.0 | yelic001 | Christian Yelich | 8.0 | bourj002 | Justin Bour | 3.0 | scrux001 | Xavier Scruggs | 7.0 | hoodd001 | Destin Hood | 9.0 | hecha001 | Adeiny Hechavarria | 6.0 | koeht001 | Tom Koehler | 1.0 | turnt001 | Trea Turner | 8.0 | reveb001 | Ben Revere | 7.0 | harpb003 | Bryce Harper | 9.0 | zimmr001 | Ryan Zimmerman | 3.0 | drews001 | Stephen Drew | 5.0 | difow001 | Wilmer Difo | 4.0 | espid001 | Danny Espinosa | 6.0 | lobaj001 | Jose Lobaton | 2.0 | schem001 | Max Scherzer | 1.0 | NaN | Y |
the data in game log has record of more than 170,000 games. the games in data ordered chronologically and in the period of year 1871 and 2016.
From the data we have:
park = pd.read_csv('park_codes.csv')
print(park.shape)
park.head()
(252, 9)
park_id | name | aka | city | state | start | end | league | notes | |
---|---|---|---|---|---|---|---|---|---|
0 | ALB01 | Riverside Park | NaN | Albany | NY | 09/11/1880 | 05/30/1882 | NL | TRN:9/11/80;6/15&9/10/1881;5/16-5/18&5/30/1882 |
1 | ALT01 | Columbia Park | NaN | Altoona | PA | 04/30/1884 | 05/31/1884 | UA | NaN |
2 | ANA01 | Angel Stadium of Anaheim | Edison Field; Anaheim Stadium | Anaheim | CA | 04/19/1966 | NaN | AL | NaN |
3 | ARL01 | Arlington Stadium | NaN | Arlington | TX | 04/21/1972 | 10/03/1993 | AL | NaN |
4 | ARL02 | Rangers Ballpark in Arlington | The Ballpark in Arlington; Ameriquest Fl | Arlington | TX | 04/11/1994 | NaN | AL | NaN |
This data seems about the list of baseball park. There is park_id column that match with table game_log
person = pd.read_csv('person_codes.csv')
print(person.shape)
person.head()
(20494, 7)
id | last | first | player_debut | mgr_debut | coach_debut | ump_debut | |
---|---|---|---|---|---|---|---|
0 | aardd001 | Aardsma | David | 04/06/2004 | NaN | NaN | NaN |
1 | aaroh101 | Aaron | Hank | 04/13/1954 | NaN | NaN | NaN |
2 | aarot101 | Aaron | Tommie | 04/10/1962 | NaN | 04/06/1979 | NaN |
3 | aased001 | Aase | Don | 07/26/1977 | NaN | NaN | NaN |
4 | abada001 | Abad | Andy | 09/10/2001 | NaN | NaN | NaN |
This table seems to be about the list of people with ID. The ID are match with those in game log table, The ID for Managers, players, coaches and umpire.
team = pd.read_csv('team_codes.csv')
print(team.shape)
team.head()
(150, 8)
team_id | league | start | end | city | nickname | franch_id | seq | |
---|---|---|---|---|---|---|---|---|
0 | ALT | UA | 1884 | 1884 | Altoona | Mountain Cities | ALT | 1 |
1 | ARI | NL | 1998 | 0 | Arizona | Diamondbacks | ARI | 1 |
2 | BFN | NL | 1879 | 1885 | Buffalo | Bisons | BFN | 1 |
3 | BFP | PL | 1890 | 1890 | Buffalo | Bisons | BFP | 1 |
4 | BL1 | NaN | 1872 | 1874 | Baltimore | Canaries | BL1 | 1 |
this table seems about the list of baseball team. the team_id columns match with the one in game log. Let's explore other column in this table
team['franch_id'].value_counts().head()
BS1 4 TRN 3 BR3 3 SE1 3 PHA 3 Name: franch_id, dtype: int64
team[team['franch_id']=='BS1']
team_id | league | start | end | city | nickname | franch_id | seq | |
---|---|---|---|---|---|---|---|---|
21 | BS1 | NaN | 1871 | 1875 | Boston | Braves | BS1 | 1 |
22 | BSN | NL | 1876 | 1952 | Boston | Braves | BS1 | 2 |
23 | MLN | NL | 1953 | 1965 | Milwaukee | Braves | BS1 | 3 |
24 | ATL | NL | 1966 | 0 | Atlanta | Braves | BS1 | 4 |
team[team['franch_id']=='LAA']
team_id | league | start | end | city | nickname | franch_id | seq | |
---|---|---|---|---|---|---|---|---|
69 | LAA | AL | 1961 | 1964 | Los Angeles | Angels | LAA | 1 |
70 | CAL | AL | 1965 | 1996 | California | Angels | LAA | 2 |
71 | ANA | AL | 1997 | 0 | Anaheim | Angels | LAA | 3 |
the franch_id seems not unique, one franch_id can be in various league and city.
team['nickname'].value_counts().head()
Athletics 6 Reds 6 Nationals 6 Senators 5 Orioles 4 Name: nickname, dtype: int64
team[team['nickname']== 'Athletics']
team_id | league | start | end | city | nickname | franch_id | seq | |
---|---|---|---|---|---|---|---|---|
90 | PH1 | NaN | 1871 | 1875 | Philadelphia | Athletics | PH1 | 1 |
91 | PHN | NL | 1876 | 1876 | Philadelphia | Athletics | PH1 | 2 |
94 | PH4 | AA | 1882 | 1891 | Philadelphia | Athletics | PH4 | 1 |
96 | PHA | AL | 1901 | 1954 | Philadelphia | Athletics | PHA | 1 |
97 | KC1 | AL | 1955 | 1967 | Kansas City | Athletics | PHA | 2 |
98 | OAK | AL | 1968 | 0 | Oakland | Athletics | PHA | 3 |
team[team['nickname']=='Reds']
team_id | league | start | end | city | nickname | franch_id | seq | |
---|---|---|---|---|---|---|---|---|
25 | BSP | PL | 1890 | 1890 | Boston | Reds | BSP | 1 |
26 | BS2 | AA | 1891 | 1891 | Boston | Reds | BSP | 2 |
27 | BSU | UA | 1884 | 1884 | Boston | Reds | BSU | 1 |
44 | CN1 | NL | 1876 | 1880 | Cincinnati | Reds | CN1 | 1 |
45 | CN2 | AA | 1882 | 1889 | Cincinnati | Reds | CN2 | 1 |
46 | CIN | NL | 1890 | 0 | Cincinnati | Reds | CN2 | 2 |
It seems the nickname can be in different franch_id, league and team
Defensive Position
in the game log, there is defensive position number listed. From here, I can find what each number position refer to
League
From wikipedia we know that there are two leagues in the USA, The National League (NL) and American League(AL), with 15 teamns in each league. Let's see what leagues in the game log
game["h_league"].value_counts()
NL 88867 AL 74712 AA 5039 FL 1243 PL 532 UA 428 Name: h_league, dtype: int64
Most of the games in the game log belong to the two main leagues however, there are still 4 other leagues. Let's check further to get more info about the leagues
def league_info(league):
league_games = game[game['h_league']==league]
earliest = league_games["date"].min()
latest = league_games["date"].max()
print("{} went from {} to {}". format(league,earliest, latest))
for league in game["h_league"].unique():
league_info(league)
nan went from nan to nan NL went from 18760422 to 20161002 AA went from 18820502 to 18911006 UA went from 18840417 to 18841019 PL went from 18900419 to 18901004 AL went from 19010424 to 20161002 FL went from 19140413 to 19151003
From the period of the league, we can do some research.
# Create helfer function
#USe Dataframe.to_sql() to create tables for each of our dataframes
#in a new SQLite database, mlb.db
DB = "mlb.db"
def run_query(q):
with sqlite3.connect(DB) as conn:
return pd.read_sql(q,conn)
def run_command(c):
with sqlite3.connect(DB) as conn:
conn.execute('PRAGMA foreign_keys = ON;')
conn.isolation_level = None
conn.execute(c)
def show_tables():
q = '''
SELECT
name,
type
FROM sqlite_master
WHERE type in ("table","view");
'''
return run_query(q)
tables = {
"game_log" : game,
"person_codes": person,
"team_codes": team,
"park_codes": park
}
with sqlite3.connect(DB) as conn:
for name, data in tables.items():
conn.execute("DROP TABLE IF EXISTS {};".format(name))
data.to_sql(name,conn,index=False)
show_tables()
name | type | |
---|---|---|
0 | person | table |
1 | park | table |
2 | league | table |
3 | team | table |
4 | game | table |
5 | team_appearance | table |
6 | person_appearance | table |
7 | apperance_type | table |
8 | team_codes | table |
9 | person_codes | table |
10 | game_log | table |
11 | park_codes | table |
c1 = """
ALTER TABLE game_log
ADD COLUMN game_id TEXT;
"""
# try/except loop since ALTER TABLE
# doesn't support IF NOT EXISTS
try:
run_command(c1)
except:
pass
c2 = """
UPDATE game_log
SET game_id = date || h_name || number_of_game
/* WHERE prevents this if it has already been done */
WHERE game_id IS NULL;
"""
run_command(c2)
q= """
SELECT
game_id,
date,
h_name,
number_of_game
FROM game_log
LIMIT 5;
"""
run_query(q)
game_id | date | h_name | number_of_game | |
---|---|---|---|---|
0 | 18710504FW10 | 18710504 | FW1 | 0 |
1 | 18710505WS30 | 18710505 | WS3 | 0 |
2 | 18710506RC10 | 18710506 | RC1 | 0 |
3 | 18710508CH10 | 18710508 | CH1 | 0 |
4 | 18710509TRO0 | 18710509 | TRO | 0 |
The following are opportunities for normalization of our data:
Eliminate any redudant data
in person_codes table, all the debut dates will be able to be reproduced using game log data.
in park_codes, the start, end and sequence columns will be able to be reproduce using game log data
in park_codes, the start and end years will be able to be reproduced using game log data. While technically the state is an atrribute of the city, we might not want to have an incomplete city/state table so we will leave this in.
There are lots of palces in game log where we have a player ID followed by the palyers name. We will be able to remove this and use the name data in person_codes
In game_log, all offensive and defensive stats are repeated for the home team and visiting team. We could break tehse out and have a table that lists each game twice, one for each team, and cut out this column repetition.
Similarly, in game_log, we have a listing for 9 palyers on each team with their position- we can remove these and have one table that tracks player appearances and their positions
We can do a similar thing with the unpires from game_log, instead of listing all four positions as columns, we can put the umpires either in their own table or make one table for players, umpires and managers
We have several awards in game_log like winning pitcher and losing pitcher. We can either break these out into their own table, have a table for awards, or combine the awards in with general apperarances like the players and umpires
The following schema was planned using a schema designing tool like DbDesigner.net..
c1 = """
CREATE TABLE IF NOT EXISTS person (
person_id TEXT PRIMARY KEY,
first_name TEXT,
last_name TEXT
);
"""
c2 = """
INSERT OR IGNORE INTO person
SELECT
id,
first,
last
FROM person_codes;
"""
q = """
SELECT * FROM person
LIMIT 5;
"""
run_command(c1)
run_command(c2)
run_query(q)
person_id | first_name | last_name | |
---|---|---|---|
0 | aardd001 | David | Aardsma |
1 | aaroh101 | Hank | Aaron |
2 | aarot101 | Tommie | Aaron |
3 | aased001 | Don | Aase |
4 | abada001 | Andy | Abad |
c1= """
CREATE TABLE IF NOT EXISTS park(
park_id TEXT PRIMARY KEY,
name TEXT,
nickname TEXT,
city TEXT,
state TEXT,
notes TEXT
);
"""
c2 = """
INSERT OR IGNORE INTO park
SELECT
park_id,
name,
aka,
city,
state,
notes
FROM park_codes;
"""
q = """
SELECT * FROM park
LIMIT 5;
"""
run_command(c1)
run_command(c2)
run_query(q)
park_id | name | nickname | city | state | notes | |
---|---|---|---|---|---|---|
0 | ALB01 | Riverside Park | None | Albany | NY | TRN:9/11/80;6/15&9/10/1881;5/16-5/18&5/30/1882 |
1 | ALT01 | Columbia Park | None | Altoona | PA | None |
2 | ANA01 | Angel Stadium of Anaheim | Edison Field; Anaheim Stadium | Anaheim | CA | None |
3 | ARL01 | Arlington Stadium | None | Arlington | TX | None |
4 | ARL02 | Rangers Ballpark in Arlington | The Ballpark in Arlington; Ameriquest Fl | Arlington | TX | None |
c1 = """
CREATE TABLE IF NOT EXISTS league (
league_id TEXT PRIMARY KEY,
name TEXT
);
"""
c2 = """
INSERT OR IGNORE INTO league
VALUES
("NL", "National League"),
("AL", "America League"),
("AA", "American Association"),
("FL", "Federal League"),
("PL", "Players League"),
("UA", "Union Association")
;
"""
q = """
SELECT * FROM league
"""
run_command(c1)
run_command(c2)
run_query(q)
league_id | name | |
---|---|---|
0 | NL | National League |
1 | AL | America League |
2 | AA | American Association |
3 | FL | Federal League |
4 | PL | Players League |
5 | UA | Union Association |
c1 = "DROP TABLE IF EXISTS apperance_type;"
run_command(c1)
c2 = """
CREATE TABLE apperance_type (
appearance_type_id TEXT PRIMARY KEY,
name TEXT,
category TEXT
);
"""
run_command(c2)
apperance_type = pd.read_csv('appearance_type.csv')
with sqlite3.connect('mlb.db') as conn:
apperance_type.to_sql('apperance_type',
conn,
index=False,
if_exists='append')
q= """
SELECT * FROM apperance_type;
"""
run_query(q)
appearance_type_id | name | category | |
---|---|---|---|
0 | O1 | Batter 1 | offense |
1 | O2 | Batter 2 | offense |
2 | O3 | Batter 3 | offense |
3 | O4 | Batter 4 | offense |
4 | O5 | Batter 5 | offense |
5 | O6 | Batter 6 | offense |
6 | O7 | Batter 7 | offense |
7 | O8 | Batter 8 | offense |
8 | O9 | Batter 9 | offense |
9 | D1 | Pitcher | defense |
10 | D2 | Catcher | defense |
11 | D3 | 1st Base | defense |
12 | D4 | 2nd Base | defense |
13 | D5 | 3rd Base | defense |
14 | D6 | Shortstop | defense |
15 | D7 | Left Field | defense |
16 | D8 | Center Field | defense |
17 | D9 | Right Field | defense |
18 | D10 | Unknown Position | defense |
19 | UHP | Home Plate | umpire |
20 | U1B | First Base | umpire |
21 | U2B | Second Base | umpire |
22 | U3B | Third Base | umpire |
23 | ULF | Left Field | umpire |
24 | URF | Right Field | umpire |
25 | MM | Manager | manager |
26 | AWP | Winning Pitcher | award |
27 | ALP | Losing Pitcher | award |
28 | ASP | Saving Pitcher | award |
29 | AWB | Winning RBI Batter | award |
30 | PSP | Starting Pitcher | pitcher |
c1 = """
CREATE TABLE IF NOT EXISTS team (
team_id TEXT PRIMARY KEY,
league_id TEXT,
city TEXT,
nickname TEXT,
franch_id TEXT,
FOREIGN KEY (league_id) REFERENCES league(league_id)
);
"""
c2 = """
INSERT OR IGNORE INTO team
SELECT
team_id,
league,
city,
nickname,
franch_id
FROM team_codes;
"""
q = """
SELECT * FROM team
LIMIT 5;
"""
run_command(c1)
run_command(c2)
run_query(q)
team_id | league_id | city | nickname | franch_id | |
---|---|---|---|---|---|
0 | ALT | UA | Altoona | Mountain Cities | ALT |
1 | ARI | NL | Arizona | Diamondbacks | ARI |
2 | BFN | NL | Buffalo | Bisons | BFN |
3 | BFP | PL | Buffalo | Bisons | BFP |
4 | BL1 | None | Baltimore | Canaries | BL1 |
c1 = """DROP TABLE IF EXISTS game;"""
run_command(c1)
c1 = """
CREATE TABLE IF NOT EXISTS game (
game_id TEXT PRIMARY KEY,
date TEXT,
number_of_game INTEGER,
park_id TEXT,
length_outs INTEGER,
day BOOLEAN,
completion TEXT,
forefeit TEXT,
protest TEXT,
attendance INTEGER,
legnth_minutes INTEGER,
additional_info TEXT,
acquisition_info TEXT,
FOREIGN KEY (park_id) REFERENCES park (park_id)
);
"""
c2 = """
INSERT OR IGNORE INTO game
SELECT
game_id,
date,
number_of_game,
park_id,
length_outs,
CASE
WHEN day_night = "D" THEN 1
WHEN day_night = "N" THEN 0
ELSE NULL
END
AS day,
completion,
forefeit,
protest,
attendance,
length_minutes,
additional_info,
acquisition_info
FROM game_log;
"""
q = """
SELECT * FROM game
LIMIT 5;
"""
run_command(c1)
run_command(c2)
run_query(q)
c1 = """
CREATE TABLE IF NOT EXISTS team_appearance (
team_id TEXT,
game_id TEXT,
home BOOLEAN,
league_id TEXT,
score INTEGER,
line_score TEXT,
at_bats INTEGER,
hits INTEGER,
doubles INTEGER,
triples INTEGER,
homeruns INTEGER,
rbi INTEGER,
sacrifice_hits INTEGER,
sacrifice_flies INTEGER,
hit_by_pitch INTEGER,
walks INTEGER,
intentional_walks INTEGER,
strikeouts INTEGER,
stolen_bases INTEGER,
caught_stealing INTEGER,
grounded_into_double INTEGER,
first_catcher_interference INTEGER,
left_on_base INTEGER,
pitchers_used INTEGER,
individual_earned_runs INTEGER,
team_earned_runs INTEGER,
wild_pitches INTEGER,
balks INTEGER,
putouts INTEGER,
assists INTEGER,
errors INTEGER,
passed_balls INTEGER,
double_plays INTEGER,
triple_plays INTEGER,
PRIMARY KEY (team_id, game_id),
FOREIGN KEY (team_id) REFERENCES team(team_id),
FOREIGN KEY (game_id) REFERENCES game(game_id),
FOREIGN KEY (team_id) REFERENCES team(team_id)
);
"""
run_command(c1)
c2= """
INSERT OR IGNORE INTO team_appearance
SELECT
h_name,
game_id,
1 AS home,
h_league,
h_score,
h_line_score,
h_at_bats,
h_hits,
h_doubles,
h_triples,
h_homeruns,
h_rbi,
h_sacrifice_hits,
h_sacrifice_flies,
h_hit_by_pitch,
h_walks,
h_intentional_walks,
h_strikeouts,
h_stolen_bases,
h_caught_stealing,
h_grounded_into_double,
h_first_catcher_interference,
h_left_on_base,
h_pitchers_used,
h_individual_earned_runs,
h_team_earned_runs,
h_wild_pitches,
h_balks,
h_putouts,
h_assists,
h_errors,
h_passed_balls,
h_double_plays,
h_triple_plays
FROM game_log
UNION
SELECT
v_name,
game_id,
0 AS home,
v_league,
v_score,
v_line_score,
v_at_bats,
v_hits,
v_doubles,
v_triples,
v_homeruns,
v_rbi,
v_sacrifice_hits,
v_sacrifice_flies,
v_hit_by_pitch,
v_walks,
v_intentional_walks,
v_strikeouts,
v_stolen_bases,
v_caught_stealing,
v_grounded_into_double,
v_first_catcher_interference,
v_left_on_base,
v_pitchers_used,
v_individual_earned_runs,
v_team_earned_runs,
v_wild_pitches,
v_balks,
v_putouts,
v_assists,
v_errors,
v_passed_balls,
v_double_plays,
v_triple_plays
FROM game_log;
"""
run_command(c2)
q = """
SELECT * FROM team_appearance
WHERE game_id = (
SELECT MIN(game_id) FROM game
)
OR game_id = (
SELECT MAX(game_id) FROM game
)
ORDER BY game_id, home;
"""
run_query(q)
c0 = "DROP TABLE IF EXISTS person_appearance"
run_command(c0)
c1 = """
CREATE TABLE person_appearance (
appearance_id INTEGER PRIMARY KEY,
person_id TEXT,
team_id TEXT,
game_id TEXT,
appearance_type_id,
FOREIGN KEY (person_id) REFERENCES person(person_id),
FOREIGN KEY (team_id) REFERENCES team(team_id),
FOREIGN KEY (game_id) REFERENCES game(game_id),
FOREIGN KEY (appearance_type_id) REFERENCES appearance_type(appearance_type_id)
);
"""
c2 = """
INSERT OR IGNORE INTO person_appearance (
game_id,
team_id,
person_id,
appearance_type_id
)
SELECT
game_id,
NULL,
hp_umpire_id,
"UHP"
FROM game_log
WHERE hp_umpire_id IS NOT NULL
UNION
SELECT
game_id,
NULL,
[1b_umpire_id],
"U1B"
FROM game_log
WHERE "1b_umpire_id" IS NOT NULL
UNION
SELECT
game_id,
NULL,
[2b_umpire_id],
"U2B"
FROM game_log
WHERE [2b_umpire_id] IS NOT NULL
UNION
SELECT
game_id,
NULL,
[3b_umpire_id],
"U3B"
FROM game_log
WHERE [3b_umpire_id] IS NOT NULL
UNION
SELECT
game_id,
NULL,
lf_umpire_id,
"ULF"
FROM game_log
WHERE lf_umpire_id IS NOT NULL
UNION
SELECT
game_id,
NULL,
rf_umpire_id,
"URF"
FROM game_log
WHERE rf_umpire_id IS NOT NULL
UNION
SELECT
game_id,
v_name,
v_manager_id,
"MM"
FROM game_log
WHERE v_manager_id IS NOT NULL
UNION
SELECT
game_id,
h_name,
h_manager_id,
"MM"
FROM game_log
WHERE h_manager_id IS NOT NULL
UNION
SELECT
game_id,
CASE
WHEN h_score > v_score THEN h_name
ELSE v_name
END,
winning_pitcher_id,
"AWP"
FROM game_log
WHERE winning_pitcher_id IS NOT NULL
UNION
SELECT
game_id,
CASE
WHEN h_score < v_score THEN h_name
ELSE v_name
END,
losing_pitcher_id,
"ALP"
FROM game_log
WHERE losing_pitcher_id IS NOT NULL
UNION
SELECT
game_id,
CASE
WHEN h_score > v_score THEN h_name
ELSE v_name
END,
saving_pitcher_id,
"ASP"
FROM game_log
WHERE saving_pitcher_id IS NOT NULL
UNION
SELECT
game_id,
CASE
WHEN h_score > v_score THEN h_name
ELSE v_name
END,
winning_rbi_batter_id,
"AWB"
FROM game_log
WHERE winning_rbi_batter_id IS NOT NULL
UNION
SELECT
game_id,
v_name,
v_starting_pitcher_id,
"PSP"
FROM game_log
WHERE v_starting_pitcher_id IS NOT NULL
UNION
SELECT
game_id,
h_name,
h_starting_pitcher_id,
"PSP"
FROM game_log
WHERE h_starting_pitcher_id IS NOT NULL;
"""
template = """
INSERT INTO person_appearance (
game_id,
team_id,
person_id,
appearance_type_id
)
SELECT
game_id,
{hv}_name,
{hv}_player_{num}_id,
"O{num}"
FROM game_log
WHERE {hv}_player_{num}_id IS NOT NULL
UNION
SELECT
game_id,
{hv}_name,
{hv}_player_{num}_id,
"D" || CAST({hv}_player_{num}_def_pos AS INT)
FROM game_log
WHERE {hv}_player_{num}_id IS NOT NULL;
"""
run_command(c1)
run_command(c2)
for hv in ["h", "v"]:
for num in range(1,10):
query_vars = {
"hv":hv,
"num": num
}
run_command(template.format(**query_vars))
print(run_query("SELECT COUNT game_id) games_game FROM game"))
print(run_query("SELECT COUNT(DISTINCT game_id) games_person_appearance FROM person_appearance"))
q = """
SELECT
pa.*,
at.name,
at.category
FROM person_appearance pa
INNER JOIN appearance_type at on at.appearance_type_id = pa.appearance_type_id
WHERE PA.game_id = (
SELECT max(game_id)
FROM person_aapearance
)
ORDER BY team_id, appearance_type_id
"""
run_query(q)