import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Load the tips dataset, using the `UID` column as the row index.
# The file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    "./data/tips.csv",
    index_col="UID",
    encoding="ISO-8859-1",
)
# Preview the first few rows.
df.head()
UID | ID | Tipster | Date | Track | Horse | Bet Type | Odds | Result | TipsterActive
---|---|---|---|---|---|---|---|---|---
1 | 1 | Tipster A | 24/07/2015 | Ascot | Fredricka | Win | 8.00 | Lose | True
2 | 2 | Tipster A | 24/07/2015 | Thirsk | Spend A Penny | Win | 4.50 | Lose | True
3 | 3 | Tipster A | 24/07/2015 | York | Straightothepoint | Win | 7.00 | Lose | True
4 | 4 | Tipster A | 24/07/2015 | Newmarket | Miss Inga Sock | Win | 5.00 | Lose | True
5 | 5 | Tipster A | 25/07/2015 | Ascot | Peril | Win | 4.33 | Win | True
# Names of the columns whose dtype is `object`; these are the ones treated
# as categorical variables.
cat_var = df.columns[(df.dtypes == 'object').values]
cat_var
Index(['Tipster', 'Date', 'Track', 'Horse', 'Bet Type', 'Result'], dtype='object')
# Number of distinct values in each categorical column (a missing value,
# if present, is counted as its own distinct value).
df[cat_var].apply(lambda column: column.nunique(dropna=False))
Tipster        31
Date         1055
Track         116
Horse       15791
Bet Type        3
Result          2
dtype: int64
# Replace every categorical column, in place, with integer class labels.
#
# A separate LabelEncoder is fitted per column and kept in `encoders`
# (keyed by column name) so that each mapping stays available afterwards,
# e.g. encoders['Horse'].inverse_transform(...). Refitting one shared
# encoder would overwrite the mapping on every iteration and leave only the
# last column's classes recoverable.
encoders = {}
le = LabelEncoder()  # stays bound even if `cat_var` is empty
for var in cat_var:
    le = LabelEncoder()
    df[var] = le.fit_transform(df[var])
    encoders[var] = le
# After the loop `le` is the encoder fitted on the last column in `cat_var`,
# which matches what a single reused encoder would have held.
df.head()
UID | ID | Tipster | Date | Track | Horse | Bet Type | Odds | Result | TipsterActive
---|---|---|---|---|---|---|---|---|---
1 | 1 | 0 | 818 | 2 | 5158 | 1 | 8.00 | 0 | True
2 | 2 | 0 | 818 | 96 | 13108 | 1 | 4.50 | 0 | True
3 | 3 | 0 | 818 | 114 | 13411 | 1 | 7.00 | 0 | True
4 | 4 | 0 | 818 | 74 | 8976 | 1 | 5.00 | 0 | True
5 | 5 | 0 | 851 | 2 | 10554 | 1 | 4.33 | 1 | True
# Feature matrix: the five columns used as model inputs.
X = df.loc[:, ['Tipster', 'Track', 'Horse', 'Bet Type', 'Odds']]
# Target vector: the `Result` column as a NumPy array.
y = df['Result'].values
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# split, and every score computed from it, is reproducible across runs;
# without it each execution shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
def standard_pipe(algorism):
    """Build an unfitted pipeline: standard scaling followed by an estimator.

    Parameters
    ----------
    algorism : estimator instance
        Placed as the final step of the pipeline under the name ``'est'``.

    Returns
    -------
    Pipeline
        Pipeline whose steps are ``'scl'`` (a new ``StandardScaler``) and
        ``'est'`` (the given estimator).
    """
    steps = [
        ('scl', StandardScaler()),
        ('est', algorism),
    ]
    return Pipeline(steps)
def fit(algorism):
    """Wrap ``algorism`` in the standard pipeline and fit it on the training split.

    Reads the module-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    algorism : estimator instance
        Estimator passed on to ``standard_pipe``.

    Returns
    -------
    Pipeline
        The fitted pipeline.
    """
    # Pipeline.fit returns the pipeline itself, so the call can be chained.
    return standard_pipe(algorism).fit(X_train, y_train)
# Show the dimensions of the training features and training targets.
for part in (X_train, y_train):
    print(part.shape)
(30598, 5)
(30598,)
# Fit a scaled logistic-regression pipeline on the training split.
pipe = fit(LogisticRegression())
# NOTE: this is accuracy on the *training* split (the data the model was
# fitted on), not on the held-out X_test / y_test.
accuracy_score(y_true=y_train, y_pred=pipe.predict(X_train))
0.80011765474867635