%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt from bs4 import BeautifulSoup import requests from pattern import web import scipy.stats as stats import statsmodels.api as sm from scipy.stats import binom from __future__ import division import re from StringIO import StringIO from zipfile import ZipFile from pandas import read_csv #nice defaults for matplotlib from matplotlib import rcParams dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), (0.4, 0.4, 0.4)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.grid'] = True rcParams['axes.facecolor'] = '#eeeeee' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'none' zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content zip_files = StringIO() zip_files.write(zip_folder) csv_files = ZipFile(zip_files) teams = csv_files.open('Teams.csv') teams = read_csv(teams) players = csv_files.open('Batting.csv') players = read_csv(players) salaries = csv_files.open('Salaries.csv') salaries = read_csv(salaries) dat = teams[(teams['G'] == 162) & (teams['yearID']<2002) ] dat[["teamID","yearID", "H", "2B", "3B", "HR", "BB"]].head() players = players[players["yearID"]>=1947] def f(series): return series.index[np.where(series==min(series))][0] df = players[players["AB"]>502] grouped = df.groupby("playerID", as_index=False) rookie_idx = grouped["yearID"].aggregate({'min_index':f})['min_index'].values rookie = df.loc[rookie_idx][["playerID", "AB", "H"]] grouped = df.drop(rookie_idx).groupby("playerID", as_index=False) sophomore_idx = grouped["yearID"].aggregate({'min_index':f})['min_index'].values sophomore = df.loc[sophomore_idx][["playerID", "AB", "H"]] tab = pd.merge(rookie, sophomore, on='playerID', how='outer') tab = tab.dropna() tab["avg1"]=tab["H_x"]/tab["AB_x"]*1000 tab["avg2"]=tab["H_y"]/tab["AB_y"]*1000 plt.hist(tab["avg2"],bins=np.arange(200,380,10)) plt.ylabel("Frequency") plt.xlabel("Sophomore batting averages") plt.show() avgs = tab["avg2"] z = (avgs-np.mean(avgs))/np.std(avgs) stats.probplot(z, dist="norm", plot=plt) plt.title("Normal Q-Q plot") plt.show() plt.scatter(tab["avg1"], tab["avg2"]) plt.xlabel("Rookie") plt.ylabel("Sophomore") plt.show() plt.scatter(tab["avg1"], tab["avg2"]) plt.xlabel("Rookie") plt.ylabel("Sophomore") plt.vlines(295, 150 ,400, color = 'red', alpha = 0.5) plt.vlines(305, 150 ,400, color = 'red', alpha = 0.5) plt.ylim(150,400) plt.show() tab["rounded_avg1"] = np.array([round(a/10,0)*10 for a in tab["avg1"].values]) df = tab[tab["rounded_avg1"]==300] plt.hist(df["avg2"].values, bins=np.arange(200,380,10)) plt.vlines(np.mean(df["avg2"]), 0,20, color = 'Red', alpha = 0.7) plt.xlabel("Sophomore averages in strata") plt.show() tab.boxplot(column = "avg2", by = "rounded_avg1") plt.xlabel("Rookie average") plt.ylabel("Sophomore average") plt.xticks(rotation = 50) plt.show() grouped = tab.groupby("rounded_avg1", as_index=False) tmp = grouped["avg2"].aggregate({'mean':np.mean}) plt.scatter(tmp["rounded_avg1"], tmp["mean"], s=40) plt.plot(tmp["rounded_avg1"], tmp["rounded_avg1"], alpha = 0.6) plt.xlim(249,351) plt.ylim(249,351) plt.show() plt.boxplot([tab["avg2"], df["avg2"].values]) plt.xticks([1,2], ["All","Strata"]) plt.show() top10 = tab[["playerID","avg1", "avg2"]] top10["avg1"] = np.array([round(a) for a in top10["avg1"].values]) top10["avg2"] = np.array([round(a) for a in top10["avg2"].values]) top10 = top10.sort("avg1", ascending = 0) top10.head(10) bottom10 = top10.sort("avg1", ascending = 1) bottom10.head(10) # get all years with more than 162 games and before 2002 dat = teams[(teams["G"]==162) & (teams["yearID"]>=1947)] avg = dat["H"]/dat["AB"] plt.scatter(avg, dat["R"]/162) plt.xlabel("Batting average") plt.ylabel("Runs per game") plt.show() # compute lifetime totals for each player, then # compute AVG, HR rates and BB rates pdat = pd.merge(salaries, players, on=['yearID','playerID'], how='outer') pdat = pdat.sort(["playerID","yearID"]) #remove invalid data pdat = pdat.dropna(subset = ["teamID_x", "teamID_y"]) grouped = pdat.groupby("playerID", as_index=False) pdat = grouped[["AB", "H", "HR", "BB", "salary"]].aggregate(np.sum) pdat["salary"] = pdat["salary"]/1000000 pdat["AVG"] = np.array([min(max(240,pdat["H"][i]/pdat["AB"][i]*1000),320) for i in range(len(pdat))]) pdat["HRR"] = pdat["HR"]/(pdat["AB"]+pdat["BB"]) pdat["BBR"] = pdat["BB"]/(pdat["AB"]+pdat["BB"]) pdat = pdat[pdat["AB"]>1000] pdat["rounded_AVG"] = np.array([round(a,-1) for a in pdat["AVG"].values]) pdat.boxplot(column = "salary", by = "rounded_AVG") plt.xlabel("Lifetime Batting Average") plt.ylabel("Total Salary in Millions") plt.show() pdat["rounded_HRR"] = np.array([round(a*1000,-1) for a in pdat["HRR"].values]) pdat.boxplot(column = "salary", by = "rounded_HRR") plt.xlabel("Lifetime HR rate (per 1000 PA)") plt.ylabel("Total Salary in Millions") plt.show() dat = teams[(teams["G"]==162) & (teams["yearID"]>=1947)] avg = dat["H"]/dat["AB"] dat = dat[["R","H", "2B", "3B", "HR", "BB", "AB"]] dat["H"] = dat["H"]-dat["2B"]-dat["3B"]-dat["HR"] dat["PA"] = dat["BB"]+dat["AB"] plt.scatter(avg, dat["R"]/162) plt.xlabel("Average") plt.ylabel("Runs per game") plt.title("corr="+str(round(np.corrcoef(avg,dat["R"])[0][1],2))) plt.show() names = ["Singles","Doubles","Triples","HR","BB"] i = 0 for col in dat.columns[1:6]: plt.scatter(dat[col], dat["R"]) plt.xlabel(names[i]) i+=1 plt.ylabel("Runs per game") plt.title("corr="+str(round(np.corrcoef(dat[col],dat["R"])[0][1],2))) plt.show() pdat["rounded_BBR"] = np.array([int(round(a*1000,-1)) for a in pdat["BBR"].values]) pdat.boxplot(column = "salary", by = "rounded_BBR" ) plt.xlabel("Lifetime BB rate (per 1000 PA)") plt.ylabel("Total Salary in Millions") plt.xticks(rotation = 50) plt.show() dat = teams[(teams['G']==162) & (teams['yearID']>=1947)] plt.scatter(dat["yearID"], dat["R"]/162) plt.xlabel("Year") plt.ylabel("Runs per game") plt.show() dat["AVG"] = dat["H"]/dat["AB"] dat["1B"] = dat["H"]-dat["2B"]-dat["3B"]-dat["HR"] dat["PA"] = dat["BB"] + dat["AB"] #Fit before 2002 dat1 = dat[(dat["yearID"]>=1947) & (dat["yearID"]<=2001)] #test on 2002 dat2 = dat[dat["yearID"]==2002] fit = [] Y = dat1["R"].values X = np.transpose(dat1["AVG"].values) X = sm.add_constant(X) fit.append(sm.OLS(Y,X).fit()) X = np.transpose(dat1["HR"].values) X = sm.add_constant(X) fit.append(sm.OLS(Y,X).fit()) X = np.transpose([dat1["1B"].values,dat1["2B"].values, dat1["3B"].values, dat1["HR"].values, dat1["BB"].values]) X = sm.add_constant(X) fit.append(sm.OLS(Y,X).fit()) names = ["Model based on "+name for name in ["AVG alone: ","HR alone: ","all five statistics: "]] X = [] X.append(np.transpose(dat2["AVG"].values)) X.append(np.transpose(dat2["HR"].values)) X.append(np.transpose([dat2["1B"].values,dat2["2B"].values, dat2["3B"].values, dat2["HR"].values, dat2["BB"].values])) for i in range(3): X_new = sm.add_constant(X[i]) print names[i]+str(np.mean(fit[i].predict(X_new)-dat2["R"])**2) dat1["OPS"] = (dat1["BB"]+dat1["H"])/dat1["PA"]+(dat1["1B"]+ 2*dat1["2B"]+3*dat1["3B"]+4*dat1["HR"])/dat1["AB"] fitted = fit[2].predict(sm.add_constant(np.transpose([dat1["1B"].values,dat1["2B"].values, dat1["3B"].values, dat1["HR"].values, dat1["BB"].values]))) plt.scatter(dat1["OPS"], fitted) plt.xlabel("OPS") plt.ylabel("Our prediction") plt.title("correlation="+str(round(np.corrcoef(dat1["OPS"],fitted)[0][1],2))) plt.show() pdat = players[(players["AB"]>=502) & (players["yearID"]>=1996)] #players with 10 or more seasons with 502 at bats since 1996 grouped = pdat.groupby("playerID", as_index=False) count = grouped["yearID"].aggregate({"N":np.size}) count = count[count["N"]>=10]["playerID"].values pdat = pdat[pdat['playerID'].isin(count)] pdat["AVG"] = pdat["H"]/pdat["AB"] pdat["PA"] = pdat["BB"] + pdat["AB"] pdat["1B"] = pdat["H"] - pdat["2B"] - pdat["3B"] - pdat["HR"] pdat["OPS"] = (pdat["BB"]+pdat["H"])/pdat["PA"]+(pdat["1B"]+2*pdat["2B"]+3*pdat["3B"]+4*pdat["HR"])/pdat["AB"] y = pdat[pdat["playerID"]=="jeterde01"] plt.scatter(y["yearID"], y["AVG"]*1000, zorder = 1, s = 60) plt.plot(y["yearID"], y["AVG"]*1000, zorder = 0) plt.xlabel("Year") plt.ylabel("AVG") plt.title("Derek Jeter") plt.show() grouped = pdat.groupby("playerID", as_index = False) avg = grouped["AVG"].aggregate({"mean_AVG":np.mean}) avg = avg.sort("mean_AVG") idx = avg["playerID"].values boxes = [] for playerID in idx: df = pdat[pdat["playerID"]==playerID] boxes.append(df["AVG"].values) plt.boxplot(boxes) plt.xticks(np.arange(1,23,1), idx, rotation=70) plt.show() grouped = pdat.groupby("playerID", as_index = False) avg = grouped["OPS"].aggregate({"mean_OPS":np.mean}) avg = avg.sort("mean_OPS") idx = avg["playerID"].values boxes = [] for playerID in idx: df = pdat[pdat["playerID"]==playerID] boxes.append(df["OPS"].values) plt.boxplot(boxes) plt.xticks(np.arange(1,23,1), idx, rotation=70) plt.show()