import pandas as pd
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
from bokeh.charts import Bar
import numpy as np
import bokeh.plotting as bk
bk.output_notebook()
# Columns Definition
k_columns = ("Div", "Date", "HomeTeam", "AwayTeam")
p_columns = k_columns + ("FTHG", "FTAG", "FTR", "HTHG", "HTAG", "HTR")
s_columns = k_columns + ("Attendance", "HS", "AS", "HST", "AST", "HC", "AC", "HF", "AF", "HY", "AY", "HR", "AR")
d1_1516 = pd.read_csv("./data/D1_1516.csv")
sp1_1516 = pd.read_csv("./data/SP1_1516.csv")
d1_1516.ix[:, p_columns].head()
Div | Date | HomeTeam | AwayTeam | FTHG | FTAG | FTR | HTHG | HTAG | HTR | |
---|---|---|---|---|---|---|---|---|---|---|
0 | D1 | 14/08/15 | Bayern Munich | Hamburg | 5 | 0 | H | 1 | 0 | H |
1 | D1 | 15/08/15 | Augsburg | Hertha | 0 | 1 | A | 0 | 0 | D |
2 | D1 | 15/08/15 | Darmstadt | Hannover | 2 | 2 | D | 1 | 0 | H |
3 | D1 | 15/08/15 | Dortmund | M'gladbach | 4 | 0 | H | 3 | 0 | H |
4 | D1 | 15/08/15 | Leverkusen | Hoffenheim | 2 | 1 | H | 1 | 1 | D |
d1_1516.ix[:, s_columns].head()
Div | Date | HomeTeam | AwayTeam | Attendance | HS | AS | HST | AST | HC | AC | HF | AF | HY | AY | HR | AR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | D1 | 14/08/15 | Bayern Munich | Hamburg | NaN | 23 | 5 | 9 | 1 | 7 | 0 | 10 | 12 | 2 | 2 | 0 | 0 |
1 | D1 | 15/08/15 | Augsburg | Hertha | NaN | 20 | 11 | 3 | 4 | 7 | 4 | 20 | 22 | 1 | 2 | 1 | 1 |
2 | D1 | 15/08/15 | Darmstadt | Hannover | NaN | 11 | 14 | 4 | 5 | 5 | 9 | 21 | 22 | 1 | 2 | 0 | 0 |
3 | D1 | 15/08/15 | Dortmund | M'gladbach | NaN | 17 | 5 | 7 | 1 | 3 | 5 | 13 | 14 | 0 | 1 | 0 | 0 |
4 | D1 | 15/08/15 | Leverkusen | Hoffenheim | NaN | 25 | 6 | 9 | 2 | 13 | 5 | 12 | 18 | 1 | 0 | 0 | 0 |
d1_1516.ix[:, p_columns].describe()
FTHG | FTAG | HTHG | HTAG | |
---|---|---|---|---|
count | 306.000000 | 306.000000 | 306.000000 | 306.000000 |
mean | 1.565359 | 1.264706 | 0.696078 | 0.549020 |
std | 1.363689 | 1.130151 | 0.815295 | 0.676938 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
75% | 2.000000 | 2.000000 | 1.000000 | 1.000000 |
max | 6.000000 | 5.000000 | 4.000000 | 3.000000 |
d1_1516.ix[:, s_columns].describe()
Attendance | HS | AS | HST | AST | HC | AC | HF | AF | HY | AY | HR | AR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 0.0 | 306.000000 | 306.000000 | 306.000000 | 306.000000 | 306.000000 | 306.000000 | 306.000000 | 306.000000 | 306.000000 | 306.000000 | 306.000000 | 306.000000 |
mean | NaN | 14.035948 | 11.598039 | 5.173203 | 4.333333 | 5.287582 | 4.218954 | 14.183007 | 14.862745 | 1.790850 | 2.003268 | 0.052288 | 0.075163 |
std | NaN | 5.655870 | 4.557667 | 2.831799 | 2.337155 | 3.042159 | 2.413303 | 4.053637 | 4.290376 | 1.234231 | 1.299428 | 0.222971 | 0.264087 |
min | NaN | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | NaN | 10.000000 | 8.000000 | 3.000000 | 3.000000 | 3.000000 | 2.250000 | 11.000000 | 12.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
50% | NaN | 14.000000 | 11.000000 | 5.000000 | 4.000000 | 5.000000 | 4.000000 | 14.000000 | 15.000000 | 2.000000 | 2.000000 | 0.000000 | 0.000000 |
75% | NaN | 17.000000 | 15.000000 | 7.000000 | 6.000000 | 7.000000 | 6.000000 | 17.000000 | 18.000000 | 3.000000 | 3.000000 | 0.000000 | 0.000000 |
max | NaN | 36.000000 | 24.000000 | 14.000000 | 12.000000 | 18.000000 | 13.000000 | 26.000000 | 29.000000 | 6.000000 | 6.000000 | 1.000000 | 1.000000 |
def create_plot(title, hist, edges, fill_color="#00BCD4"):
"""Create a plot object.
Keyword arguments:
title -- A caption of plot.
hist --
edges --
"""
plot = figure(title=title,tools="save", background_fill_color="#E8DDCB")
plot.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color=fill_color, line_color="#424242")
return plot
def show_single_plot(plot):
"""Show a plot.
Keyword arguments:
plot -- plot object
"""
show(gridplot(plot, ncols=1, plot_width=400, plot_height=400, toolbar_location=None))
# Creating plots
hist_home, edges_home = np.histogram(d1_1516["FTHG"], bins=7)
plot_d1_hg = create_plot("Bundesliga D1 15/16 Home Team Goal", hist_home, edges_home)
hist_away, edges_away = np.histogram(d1_1516["FTAG"], bins=6)
plot_d1_ag = create_plot("Bundesliga D115/16 Away Team Goal", hist_away, edges_away)
# Goals
show(gridplot(plot_d1_hg, plot_d1_ag, ncols=2, plot_width=400, plot_height=400))
p = Bar(d1_1516, "FTHG", values="FTHG", agg="count", title="Bundesliga D1 15/16 Game Results" , legend="")
show_single_plot(p)
# Poisson Distribution
samples = np.random.poisson(lam=1.5, size=10000)
hist_dummy, edges_dummy = np.histogram(samples, density=True, bins=max(samples))
p3 = create_plot("Poisson Distribution", hist_dummy, edges_dummy)
show(gridplot(p3, ncols=1, plot_width=400, plot_height=400, toolbar_location=None))
# Shots
hist_shot, edges_shot = np.histogram(d1_1516["HS"], bins=15)
plot_shot = create_plot("Bundesliga D1 15/16 Shots", hist_shot, edges_shot)
show_single_plot(plot_shot)
p = Bar(d1_1516, "FTR", values="FTR", agg="count", title="Bundesliga D1 15/16 Game Results")
show_single_plot(p)
# Creating plots
hist_home, edges_home = np.histogram(sp1_1516["FTHG"], bins=10)
plot_sp1_hg = create_plot("Liga Española D1 15/16 Home Team Goal", hist_home, edges_home, "#FF9800")
hist_away, edges_away = np.histogram(sp1_1516["FTAG"], bins=9)
plot_sp1_ag = create_plot("Liga Española D1 15/16 Away Team Goal", hist_away, edges_away, "#FF9800")
show(gridplot(plot_d1_hg, plot_d1_ag, plot_sp1_hg, plot_sp1_ag , ncols=2, plot_width=400, plot_height=400))
sp1_1516.describe()
FTHG | FTAG | HTHG | HTAG | HS | AS | HST | AST | HF | AF | ... | BbAv<2.5 | BbAH | BbAHh | BbMxAHH | BbAvAHH | BbMxAHA | BbAvAHA | PSCH | PSCD | PSCA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | ... | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 | 380.000000 |
mean | 1.618421 | 1.126316 | 0.742105 | 0.505263 | 13.236842 | 10.526316 | 4.863158 | 3.757895 | 13.602632 | 13.594737 | ... | 2.095368 | 27.331579 | -0.403289 | 1.994000 | 1.935158 | 1.994263 | 1.936579 | 3.108974 | 4.817711 | 6.026421 |
std | 1.450749 | 1.148518 | 0.899613 | 0.709880 | 4.784177 | 4.448099 | 2.670509 | 2.107736 | 4.290567 | 4.346955 | ... | 0.746049 | 2.341673 | 1.033264 | 0.163487 | 0.152572 | 0.161801 | 0.150245 | 3.567728 | 3.080678 | 6.712190 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2.000000 | ... | 1.450000 | 21.000000 | -3.500000 | 1.630000 | 1.590000 | 1.620000 | 1.580000 | 1.050000 | 2.990000 | 1.090000 |
25% | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 10.000000 | 7.000000 | 3.000000 | 2.000000 | 11.000000 | 11.000000 | ... | 1.660000 | 26.000000 | -1.000000 | 1.880000 | 1.837500 | 1.880000 | 1.830000 | 1.597500 | 3.417500 | 2.640000 |
50% | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 13.000000 | 10.000000 | 4.500000 | 4.000000 | 13.000000 | 13.000000 | ... | 1.820000 | 28.000000 | -0.250000 | 1.980000 | 1.920000 | 1.990000 | 1.930000 | 2.120000 | 3.690000 | 3.850000 |
75% | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 16.000000 | 13.000000 | 6.000000 | 5.000000 | 16.000000 | 16.000000 | ... | 2.125000 | 29.000000 | 0.062500 | 2.090000 | 2.020000 | 2.080000 | 2.020000 | 2.915000 | 4.525000 | 6.390000 |
max | 10.000000 | 8.000000 | 5.000000 | 4.000000 | 31.000000 | 28.000000 | 15.000000 | 11.000000 | 29.000000 | 27.000000 | ... | 6.220000 | 33.000000 | 2.750000 | 2.550000 | 2.440000 | 2.550000 | 2.420000 | 35.500000 | 24.500000 | 46.000000 |
8 rows × 58 columns
threshold = 6
sp1_1516[(sp1_1516.FTHG > threshold) | (sp1_1516.FTAG > threshold)].ix[:, p_columns]
Div | Date | HomeTeam | AwayTeam | FTHG | FTAG | FTR | HTHG | HTAG | HTR | |
---|---|---|---|---|---|---|---|---|---|---|
157 | SP1 | 20/12/15 | Real Madrid | Vallecano | 10 | 2 | H | 4 | 2 | H |
272 | SP1 | 05/03/16 | Real Madrid | Celta | 7 | 1 | H | 1 | 0 | H |
333 | SP1 | 20/04/16 | La Coruna | Barcelona | 0 | 8 | A | 0 | 2 | A |
sp1_1516[(sp1_1516.HomeTeam == "Barcelona")].ix[:, p_columns].describe()
FTHG | FTAG | HTHG | HTAG | |
---|---|---|---|---|
count | 19.000000 | 19.000000 | 19.000000 | 19.000000 |
mean | 3.526316 | 0.736842 | 1.210526 | 0.368421 |
std | 1.806421 | 0.805682 | 1.031662 | 0.597265 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 2.000000 | 0.000000 | 0.500000 | 0.000000 |
50% | 4.000000 | 1.000000 | 1.000000 | 0.000000 |
75% | 5.000000 | 1.000000 | 2.000000 | 1.000000 |
max | 6.000000 | 2.000000 | 4.000000 | 2.000000 |
sp1_1516.groupby("HomeTeam").sum().ix[:, ("FTHG", )].sort_values(by="FTHG", ascending=False).head()
FTHG | |
---|---|
HomeTeam | |
Real Madrid | 70 |
Barcelona | 67 |
Sevilla | 38 |
Ath Bilbao | 35 |
Ath Madrid | 33 |
sp1_1516.groupby("AwayTeam").sum().ix[:, ("FTAG", )].sort_values(by="FTAG", ascending=False).head()
FTAG | |
---|---|
AwayTeam | |
Barcelona | 45 |
Real Madrid | 40 |
Ath Madrid | 30 |
Ath Bilbao | 23 |
Vallecano | 23 |
# ranking = sp1_1516.groupby("HomeTeam").sum().ix[:, ("FTHG",)].sort_values(by="FTHG", ascending=False).index
p = Bar(sp1_1516, label="HomeTeam", values="FTHG", agg="sum",
title="Liga Española D1 15/16 HomeTeam Goals", height=400, plot_width=900, legend="")
show(p)