%matplotlib inline
import matplotlib.pyplot as plt
import zipfile
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
In this exercise we will load the Tennis dataset, choose one of the datasets, and fit a logistic model to try and predict if the player won a game.
filename = '../data/tennis.zip'
tennis_zip = zipfile.ZipFile(filename)
for f in tennis_zip.filelist:
print(f.filename)
data/Andy-Murray.csv data/Novak-Djokovic.csv data/Rafael-Nadal.csv data/Roger-Federer.csv
player = 'Rafael-Nadal'
path = 'data/{}.csv'
path = path.format(player.replace(' ', '-'))
with tennis_zip.open(path) as f:
df = pd.read_csv(f)
df.head()
year | tournament | start date | type | surface | draw | atp points | atp ranking | tournament prize money | round | ... | player2 2nd serve return points total | player2 break points converted won | player2 break points converted total | player2 return games played | player2 total service points won | player2 total service points total | player2 total return points won | player2 total return points total | player2 total points won | player2 total points total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2001 | Seville, Spain | 17.09.2001 | CH | Outdoor: Clay | Draw: 32 | 5 | NaN | $650 | R32 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2001 | Seville, Spain | 17.09.2001 | CH | Outdoor: Clay | Draw: 32 | 5 | NaN | $650 | R16 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2001 | Spain F10, Madrid | 10.09.2001 | FU | Outdoor: Hard | Draw: 32 | NaN | NaN | $117 | R32 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2002 | Spain F20, Gran Canaria | 25.11.2002 | FU | Outdoor: Carpet | Draw: 32 | 18 | 238.0 | $1,950 | R32 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2002 | Spain F20, Gran Canaria | 25.11.2002 | FU | Outdoor: Carpet | Draw: 32 | 18 | 238.0 | $1,950 | R16 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 70 columns
Step-by-step instructions:
df.columns
for ideas.player1 name
and winner
).pickle
model to dump the fitted model so it can be used again later for prediction or re-fitting.For bonus points:
sklearn.model_selection.train_test_split
(click the link for docs) to split the X
and y
into a training and testing set.Solution is given in solutions/logistic-tennis.py.
features = ['player1 aces', 'player1 double faults']
model_filename = 'logistic_tennis.model'
df['win'] = df['player1 name'] == df['winner']
target = 'win'
idx = np.isfinite(df[features]).all(axis=1)
df = df[idx]
df.head()
year | tournament | start date | type | surface | draw | atp points | atp ranking | tournament prize money | round | ... | player2 break points converted won | player2 break points converted total | player2 return games played | player2 total service points won | player2 total service points total | player2 total return points won | player2 total return points total | player2 total points won | player2 total points total | win | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
49 | 2002 | Mallorca, Spain | 29.04.2002 | WS | Outdoor: Clay | Draw: 32 | 15 | 762.0 | $5,850 | R32 | ... | 3.0 | 9.0 | 10.0 | 28.0 | 59.0 | 27.0 | 66.0 | 55.0 | 125.0 | True |
50 | 2002 | Mallorca, Spain | 29.04.2002 | WS | Outdoor: Clay | Draw: 32 | 15 | 762.0 | $5,850 | R16 | ... | 4.0 | 9.0 | 8.0 | 35.0 | 48.0 | 29.0 | 59.0 | 64.0 | 107.0 | False |
51 | 2003 | Basel, Switzerland | 20.10.2003 | WS | Indoor: Carpet | Draw: 32 | 5 | 48.0 | $10,000 | R32 | ... | 2.0 | 3.0 | 15.0 | 68.0 | 97.0 | 33.0 | 98.0 | 101.0 | 195.0 | False |
52 | 2003 | ATP Masters Series Madrid, Spain | 13.10.2003 | SU | Indoor: Hard | Draw: 48 | 5 | 49.0 | $7,500 | R64 | ... | 6.0 | 10.0 | 14.0 | 46.0 | 82.0 | 37.0 | 80.0 | 83.0 | 162.0 | False |
53 | 2003 | Lyon, France | 06.10.2003 | WS | Indoor: Carpet | Draw: 32 | 5 | 47.0 | $7,950 | R32 | ... | 4.0 | 8.0 | 10.0 | 39.0 | 55.0 | 34.0 | 72.0 | 73.0 | 127.0 | False |
5 rows × 71 columns
X_train, X_test, y_train, y_test = model_selection.train_test_split(
df[features], df[target], test_size=0.75)
model = LogisticRegression()
model.fit(X_train, y_train)
print("Accuracy:", model.score(X_test, y_test))
Accuracy: 0.8323586744639376
with open(model_filename, 'wb') as f:
pickle.dump(model, f)
print("Saving model to", model_filename)
Saving model to logistic_tennis.model
print('Prediction:')
print(X_test.iloc[0], y_test.iloc[0])
Prediction: player1 aces 2.0 player1 double faults 1.0 Name: 657, dtype: float64 False