%load_ext watermark
%watermark -v -m -p numpy,pandas,keras,tensorflow -g
Using TensorFlow backend.
CPython 3.5.3 IPython 5.1.0 numpy 1.11.3 pandas 0.19.2 keras 1.2.0 tensorflow 0.10.0rc0 compiler : GCC 4.4.7 20120313 (Red Hat 4.4.7-1) system : Linux release : 4.4.0-72-generic machine : x86_64 processor : x86_64 CPU cores : 4 interpreter: 64bit Git hash : 60c28751d01a6e854bbcdafc490acf97fa1c15da
import pandas as pd

# 2016 MLB game results: one row per game with away/home team names and scores.
df = pd.read_pickle('./2016_scores.pkl')
# Regression target: home margin of victory (negative when the away team won).
df['difference'] = df.home_score - df.away_score
df.head()
away_score | away_team | home_score | home_team | difference | |
---|---|---|---|---|---|
0 | 5 | Toronto Blue Jays | 3 | Tampa Bay Rays | -2 |
1 | 1 | St. Louis Cardinals | 4 | Pittsburgh Pirates | 3 |
2 | 3 | New York Mets | 4 | Kansas City Royals | 1 |
3 | 2 | Seattle Mariners | 3 | Texas Rangers | 1 |
4 | 5 | Toronto Blue Jays | 3 | Tampa Bay Rays | -2 |
# Build team <-> index lookup tables from the unique away-team names
# (every team plays on the road at some point, so this covers all teams).
#
# Fix: the original first cell evaluated
#     idx2teamid = {teamid2idx[key]: key for key in teamid2idx.keys()}
# BEFORE teamid2idx existed (NameError), then redefined everything a second
# time. Consolidated into a single, ordered definition.
#
# NOTE: despite the names, these map team *name* -> index and index -> name.
list_of_teams = df['away_team'].unique()
n_teams = len(list_of_teams)
teamid2idx = {team: i for i, team in enumerate(list_of_teams)}
idx2teamid = {i: team for team, i in teamid2idx.items()}
# Replace team names with their integer indices so they can feed an Embedding
# layer. __getitem__ (like the original lambda) raises KeyError on unknowns.
df['away_team'] = df['away_team'].apply(teamid2idx.__getitem__)
df['home_team'] = df['home_team'].apply(teamid2idx.__getitem__)
import numpy as np

# Random ~80/20 train/validation split (non-deterministic: no seed is set).
keep = np.random.rand(len(df)) < 0.8
trn, val = df[keep], df[~keep]
from keras import backend as K
from keras.layers import Input, Embedding, merge
from keras.layers.core import Flatten
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
n_factors = 10  # latent dimensionality of each team's embedding vector

# One integer input per side of the matchup.
away_in = Input(shape=(1,), dtype='int64', name='away_in')
home_in = Input(shape=(1,), dtype='int64', name='home_in')

# A single SHARED embedding: each team gets one latent vector regardless of
# whether it appears as the home or the away side.
# (Removed commented-out dead code that built two separate embeddings.)
embedding_layer = Embedding(n_teams, n_factors, input_length=1, W_regularizer=l2(1e-4))
a = embedding_layer(away_in)
h = embedding_layer(home_in)

# Dot product of the two team vectors predicts home_score - away_score.
x = merge([a, h], mode='dot')
x = Flatten()(x)
model = Model([away_in, home_in], x)
model.compile(Adam(0.001), loss='mse')
# Quick single-epoch fit to sanity-check the dot-product model.
model.fit(x=[trn.away_team, trn.home_team], y=trn.difference,
          batch_size=64, nb_epoch=1,
          validation_data=([val.away_team, val.home_team], val.difference))
Train on 1980 samples, validate on 483 samples Epoch 1/1 1980/1980 [==============================] - 0s - loss: 18.7924 - val_loss: 17.6625
<keras.callbacks.History at 0x7f436773c8d0>
# Raise the learning rate to 0.01.
# Fix: `model.optimizer.lr = 0.01` only rebinds the Python attribute to a
# float; the compiled training function keeps using the original tensor's
# value. K.set_value updates the underlying variable in place.
K.set_value(model.optimizer.lr, 0.01)
model.fit([trn.away_team, trn.home_team], trn.difference, batch_size=64, nb_epoch=3,
          validation_data=([val.away_team, val.home_team], val.difference))
Train on 1980 samples, validate on 483 samples Epoch 1/3 1980/1980 [==============================] - 0s - loss: 18.7898 - val_loss: 17.6625 Epoch 2/3 1980/1980 [==============================] - 0s - loss: 18.7872 - val_loss: 17.6631 Epoch 3/3 1980/1980 [==============================] - 0s - loss: 18.7837 - val_loss: 17.6641
<keras.callbacks.History at 0x7f4367642748>
# Drop the learning rate back to 0.001 for a longer fine-tuning run.
# Fix: attribute assignment does not reach the compiled train function;
# set the optimizer's learning-rate variable via the backend instead.
K.set_value(model.optimizer.lr, 0.001)
model.fit([trn.away_team, trn.home_team], trn.difference, batch_size=64, nb_epoch=6,
          validation_data=([val.away_team, val.home_team], val.difference))
Train on 1980 samples, validate on 483 samples Epoch 1/6 1980/1980 [==============================] - 0s - loss: 18.7789 - val_loss: 17.6653 Epoch 2/6 1980/1980 [==============================] - 0s - loss: 18.7726 - val_loss: 17.6668 Epoch 3/6 1980/1980 [==============================] - 0s - loss: 18.7640 - val_loss: 17.6695 Epoch 4/6 1980/1980 [==============================] - 0s - loss: 18.7535 - val_loss: 17.6725 Epoch 5/6 1980/1980 [==============================] - 0s - loss: 18.7397 - val_loss: 17.6758 Epoch 6/6 1980/1980 [==============================] - 0s - loss: 18.7234 - val_loss: 17.6792
<keras.callbacks.History at 0x7f43676b6518>
def embedding_input(name, n_in, n_out, reg):
    """Return (Input, embedded Input): an int64 scalar input fed through an
    L2-regularised (n_in x n_out) Embedding."""
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))
    return inp, emb(inp)
def create_bias(inp, n_in):
    """Per-item scalar bias: an unregularised 1-dim embedding of inp, flattened."""
    bias = Embedding(n_in, 1, input_length=1)(inp)
    return Flatten()(bias)
# Second model: shared team embeddings (dot product) plus per-role biases.
# (Removed commented-out dead code; the embedding_input helper is not used
# here because it would create two SEPARATE embeddings, while this model
# deliberately shares one embedding between the home and away inputs.)
away_in = Input(shape=(1,), dtype='int64', name='away_in')
home_in = Input(shape=(1,), dtype='int64', name='home_in')

# One shared latent vector per team, used for both home and away appearances.
embedding_layer = Embedding(n_teams, n_factors, input_length=1, W_regularizer=l2(1e-4))
a = embedding_layer(away_in)
h = embedding_layer(home_in)

# Separate scalar biases for the away and home roles.
ab = create_bias(away_in, n_teams)
hb = create_bias(home_in, n_teams)

# prediction = <away_vec, home_vec> + away_bias + home_bias
x = merge([a, h], mode='dot')
x = Flatten()(x)
x = merge([x, ab], mode='sum')
x = merge([x, hb], mode='sum')

model = Model([away_in, home_in], x)
model.compile(Adam(0.001), loss='mse')
# Single-epoch sanity fit of the bias-augmented model.
model.fit(x=[trn.away_team, trn.home_team], y=trn.difference,
          batch_size=64, nb_epoch=1,
          validation_data=([val.away_team, val.home_team], val.difference))
Train on 1980 samples, validate on 483 samples Epoch 1/1 1980/1980 [==============================] - 0s - loss: 18.7880 - val_loss: 17.6421
<keras.callbacks.History at 0x7f4367491518>
# Raise the learning rate to 0.01.
# Fix: `model.optimizer.lr = 0.01` only rebinds the Python attribute;
# update the backend variable so the compiled train function sees it.
K.set_value(model.optimizer.lr, 0.01)
model.fit([trn.away_team, trn.home_team], trn.difference, batch_size=64, nb_epoch=6,
          validation_data=([val.away_team, val.home_team], val.difference))
Train on 1980 samples, validate on 483 samples Epoch 1/6 1980/1980 [==============================] - 0s - loss: 17.9710 - val_loss: 17.7098 Epoch 2/6 1980/1980 [==============================] - 0s - loss: 17.9262 - val_loss: 17.7259 Epoch 3/6 1980/1980 [==============================] - 0s - loss: 17.8813 - val_loss: 17.7452 Epoch 4/6 1980/1980 [==============================] - 0s - loss: 17.8392 - val_loss: 17.7703 Epoch 5/6 1980/1980 [==============================] - 0s - loss: 17.7955 - val_loss: 17.7886 Epoch 6/6 1980/1980 [==============================] - 0s - loss: 17.7522 - val_loss: 17.8073: 17.65
<keras.callbacks.History at 0x7f4367867c18>
# Lower the learning rate to 0.001 for the long fine-tuning run.
# Fix: set the optimizer's learning-rate variable via K.set_value instead of
# plain attribute assignment, which the compiled train function ignores.
K.set_value(model.optimizer.lr, 0.001)
model.fit([trn.away_team, trn.home_team], trn.difference, batch_size=64, nb_epoch=15,
          validation_data=([val.away_team, val.home_team], val.difference))
Train on 1980 samples, validate on 483 samples Epoch 1/15 1980/1980 [==============================] - 0s - loss: 18.6135 - val_loss: 17.6012 Epoch 2/15 1980/1980 [==============================] - 0s - loss: 18.5813 - val_loss: 17.5995 Epoch 3/15 1980/1980 [==============================] - 0s - loss: 18.5480 - val_loss: 17.5970 Epoch 4/15 1980/1980 [==============================] - 0s - loss: 18.5118 - val_loss: 17.5975 Epoch 5/15 1980/1980 [==============================] - 0s - loss: 18.4739 - val_loss: 17.5984 Epoch 6/15 1980/1980 [==============================] - 0s - loss: 18.4330 - val_loss: 17.5997 Epoch 7/15 1980/1980 [==============================] - 0s - loss: 18.3895 - val_loss: 17.6087 Epoch 8/15 1980/1980 [==============================] - 0s - loss: 18.3459 - val_loss: 17.6110 Epoch 9/15 1980/1980 [==============================] - 0s - loss: 18.3017 - val_loss: 17.6212 Epoch 10/15 1980/1980 [==============================] - 0s - loss: 18.2525 - val_loss: 17.6283 Epoch 11/15 1980/1980 [==============================] - 0s - loss: 18.2077 - val_loss: 17.6425 Epoch 12/15 1980/1980 [==============================] - 0s - loss: 18.1599 - val_loss: 17.6507 Epoch 13/15 1980/1980 [==============================] - 0s - loss: 18.1129 - val_loss: 17.6628 Epoch 14/15 1980/1980 [==============================] - 0s - loss: 18.0640 - val_loss: 17.6769: 1 Epoch 15/15 1980/1980 [==============================] - 0s - loss: 18.0180 - val_loss: 17.6948
<keras.callbacks.History at 0x7f43673a8f60>
# Pull out the learned away-bias for each team and rank teams by it.
# A large positive away bias inflates the predicted home margin, i.e. the
# model thinks that team does badly on the road.
get_away_bias = Model(away_in, ab)
away_bias = get_away_bias.predict(df.away_team.unique())
away_rating = [(bias[0], idx2teamid[idx])
               for idx, bias in zip(df.away_team.unique(), away_bias)]
sorted(away_rating, key=lambda pair: pair[0], reverse=True)[:15]
[(0.3362835, 'Philadelphia Phillies'), (0.22890224, 'Cincinnati Reds'), (0.22622988, 'Milwaukee Brewers'), (0.20452867, 'Atlanta Braves'), (0.16434038, 'Oakland Athletics'), (0.15163806, "Arizona D'Backs"), (0.14097856, 'San Diego Padres'), (0.1341555, 'Kansas City Royals'), (0.11964823, 'Colorado Rockies'), (0.11323795, 'New York Yankees'), (0.094945267, 'Chicago White Sox'), (0.081414096, 'Miami Marlins'), (0.071450919, 'Houston Astros'), (0.062424611, 'Detroit Tigers'), (0.054559465, 'Baltimore Orioles')]
# The 15 most negative away biases: teams whose presence as the visitor most
# lowers the predicted home margin, i.e. the strongest road teams.
sorted(away_rating, key=lambda pair: pair[0])[:15]
[(-0.33137923, 'St. Louis Cardinals'), (-0.28681648, 'Washington Nationals'), (-0.27189431, 'Boston Red Sox'), (-0.27137929, 'Chicago Cubs'), (-0.26645699, 'Toronto Blue Jays'), (-0.24045761, 'Seattle Mariners'), (-0.14827146, 'San Francisco Giants'), (-0.095634982, 'Pittsburgh Pirates'), (-0.04807644, 'New York Mets'), (-0.023228975, 'LA Angels of Anaheim'), (-0.0085854754, 'Tampa Bay Rays'), (0.013221953, 'Texas Rangers'), (0.013871406, 'Los Angeles Dodgers'), (0.040301573, 'Cleveland Indians'), (0.049724065, 'Minnesota Twins')]