In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the tips CSV (ISO-8859-1 encoded); the UID column becomes the row index.
tips_path = './data/tips.csv'
df = pd.read_csv(
    tips_path,
    index_col='UID',
    encoding="ISO-8859-1",
)
df.head(n=5)
Out[4]:
ID Tipster Date Track Horse Bet Type Odds Result TipsterActive
UID
1 1 Tipster A 24/07/2015 Ascot Fredricka Win 8.00 Lose True
2 2 Tipster A 24/07/2015 Thirsk Spend A Penny Win 4.50 Lose True
3 3 Tipster A 24/07/2015 York Straightothepoint Win 7.00 Lose True
4 4 Tipster A 24/07/2015 Newmarket Miss Inga Sock Win 5.00 Lose True
5 5 Tipster A 25/07/2015 Ascot Peril Win 4.33 Win True
In [6]:
# Names of every column whose dtype is 'object'.
column_dtypes = df.dtypes
is_object_column = column_dtypes == 'object'
cat_var = column_dtypes[is_object_column].index
cat_var
Out[6]:
Index(['Tipster', 'Date', 'Track', 'Horse', 'Bet Type', 'Result'], dtype='object')
In [7]:
# Number of distinct values per object column; dropna=False keeps a missing
# value counted as its own entry, matching len(x.unique()).
df[cat_var].nunique(dropna=False)
Out[7]:
Tipster        31
Date         1055
Track         116
Horse       15791
Bet Type        3
Result          2
dtype: int64
In [8]:
# Replace each object column with integer codes.
# A separate LabelEncoder is fitted per column and kept in `encoders`, so the
# codes of any column can later be mapped back to the original labels with
# encoders[column].inverse_transform(...). Refitting one shared encoder would
# discard the mapping of every column except the last one.
encoders = {}
le = LabelEncoder()  # keeps `le` defined even when cat_var is empty
for var in cat_var:
    le = LabelEncoder()
    df[var] = le.fit_transform(df[var])
    encoders[var] = le
# After the loop `le` is the encoder of the last column in cat_var.
In [12]:
# Preview the first five rows of the frame.
df.head(n=5)
Out[12]:
ID Tipster Date Track Horse Bet Type Odds Result TipsterActive
UID
1 1 0 818 2 5158 1 8.00 0 True
2 2 0 818 96 13108 1 4.50 0 True
3 3 0 818 114 13411 1 7.00 0 True
4 4 0 818 74 8976 1 5.00 0 True
5 5 0 851 2 10554 1 4.33 1 True
In [41]:
# Feature matrix: five of the frame's columns (ID, Date and TipsterActive are
# left out). Target: the Result column as a NumPy array.
feature_columns = ['Tipster', 'Track', 'Horse', 'Bet Type', 'Odds']
X = df[feature_columns]
y = df['Result'].values
In [58]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
In [59]:
# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# every score computed from it, is identical after Restart & Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)
In [60]:
def standard_pipe(algorism):
    """Build a two-step pipeline: standard scaling followed by an estimator.

    Parameters
    ----------
    algorism
        The estimator object placed as the final step, under the name 'est'.

    Returns
    -------
    Pipeline
        An unfitted pipeline with steps 'scl' (StandardScaler) and 'est'.
    """
    steps = [
        ('scl', StandardScaler()),
        ('est', algorism),
    ]
    return Pipeline(steps)
In [61]:
def fit(algorism, X=None, y=None):
    """Wrap an estimator in the standard pipeline and fit it.

    Parameters
    ----------
    algorism
        Estimator passed to ``standard_pipe`` as the final pipeline step.
    X, y : optional
        Training features and targets. When both are omitted the notebook
        globals ``X_train`` and ``y_train`` are used, which is the original
        behaviour. They must be supplied together.

    Returns
    -------
    Pipeline
        The fitted pipeline.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is given.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be provided together")
    if X is None:
        # Fall back to the global training split, as the one-argument call does.
        X, y = X_train, y_train
    pipe = standard_pipe(algorism)
    pipe.fit(X, y)
    return pipe
In [62]:
# Show the shapes of the training features and targets, one per line.
print(X_train.shape, y_train.shape, sep='\n')
(30598, 5)
(30598,)
In [63]:
# Fit the scaling + logistic-regression pipeline with default settings.
log_reg = LogisticRegression()
pipe = fit(log_reg)
In [64]:
# Accuracy on the training split (X_train / y_train), not on the held-out
# X_test / y_test.
train_predictions = pipe.predict(X_train)
accuracy_score(y_train, train_predictions)
Out[64]:
0.80011765474867635