In [4]:

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the tips CSV; the UID column becomes the row index. The file is decoded
# as ISO-8859-1 instead of pandas' default UTF-8.
df = pd.read_csv(
    './data/tips.csv',
    index_col='UID',
    encoding="ISO-8859-1",
)
# Preview the first rows to check how the columns were parsed.
df.head()

Out[4]:

	ID	Tipster	Date	Track	Horse	Bet Type	Odds	Result	TipsterActive
UID
1	1	Tipster A	24/07/2015	Ascot	Fredricka	Win	8.00	Lose	True
2	2	Tipster A	24/07/2015	Thirsk	Spend A Penny	Win	4.50	Lose	True
3	3	Tipster A	24/07/2015	York	Straightothepoint	Win	7.00	Lose	True
4	4	Tipster A	24/07/2015	Newmarket	Miss Inga Sock	Win	5.00	Lose	True
5	5	Tipster A	25/07/2015	Ascot	Peril	Win	4.33	Win	True

In [6]:

# Names of the columns whose dtype is 'object'; these are the ones that get
# label-encoded before modelling.
column_dtypes = df.dtypes
cat_var = column_dtypes[column_dtypes == 'object'].index
cat_var

Out[6]:

Index(['Tipster', 'Date', 'Track', 'Horse', 'Bet Type', 'Result'], dtype='object')

In [7]:

# Number of distinct values in each object-typed column. dropna=False keeps a
# missing value counted as its own level, as len(x.unique()) would.
df[cat_var].nunique(dropna=False)

Out[7]:

Tipster        31
Date         1055
Track         116
Horse       15791
Bet Type        3
Result          2
dtype: int64

In [8]:

# Replace every object-typed column of df, in place, with integer codes.
# A separate LabelEncoder is fitted and kept for each column so the codes can
# later be mapped back with encoders[col].inverse_transform(...). Re-fitting a
# single shared encoder would leave only the last column's mapping available.
encoders = {}
for var in cat_var:
    le = LabelEncoder()
    df[var] = le.fit_transform(df[var])
    encoders[var] = le

In [12]:

# Preview the frame again to confirm the object-typed columns now hold integer codes.
df.head(n=5)

Out[12]:

	ID	Tipster	Date	Track	Horse	Bet Type	Odds	Result	TipsterActive
UID
1	1	0	818	2	5158	1	8.00	0	True
2	2	0	818	96	13108	1	4.50	0	True
3	3	0	818	114	13411	1	7.00	0	True
4	4	0	818	74	8976	1	5.00	0	True
5	5	0	851	2	10554	1	4.33	1	True

In [41]:

# Model inputs: four label-encoded columns plus Odds.
feature_columns = ['Tipster', 'Track', 'Horse', 'Bet Type', 'Odds']
X = df[feature_columns]
# Target: the label-encoded Result column as a NumPy array.
y = df['Result'].values

In [58]:

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [59]:

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [60]:

def standard_pipe(algorism):
    """Return an unfitted Pipeline that scales features, then applies `algorism`.

    Parameters
    ----------
    algorism : estimator
        Final step of the pipeline, registered under the name 'est'.

    Returns
    -------
    Pipeline
        Two steps: 'scl' (a new StandardScaler) followed by 'est'.
    """
    steps = [('scl', StandardScaler())]
    steps.append(('est', algorism))
    return Pipeline(steps)

In [61]:

def fit(algorism, X=None, y=None):
    """Wrap `algorism` in the standard scaling pipeline and fit it.

    Parameters
    ----------
    algorism : estimator
        Passed to standard_pipe as the pipeline's final step.
    X, y : optional
        Training features and targets. When both are omitted, the notebook-level
        X_train / y_train are used, so fit(algorism) behaves as it always did.

    Returns
    -------
    The pipeline object after its fit method has been called.

    Raises
    ------
    ValueError
        If only one of X and y is supplied.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be provided together")
    if X is None:
        # Fall back to the global training split so existing calls keep working.
        X, y = X_train, y_train
    pipe = standard_pipe(algorism)
    pipe.fit(X, y)
    return pipe

In [62]:

# Sanity check: the feature matrix and target vector must have the same row count.
print(X_train.shape, y_train.shape, sep='\n')

(30598, 5)
(30598,)

In [63]:

# Fit the scaling pipeline with a default-configured logistic regression as
# its final step.
estimator = LogisticRegression()
pipe = fit(estimator)

In [64]:

# Accuracy on the rows the pipeline was fitted on. This is a training score,
# not a held-out estimate: X_test / y_test are not evaluated here.
train_predictions = pipe.predict(X_train)
accuracy_score(y_train, train_predictions)

Out[64]:

0.80011765474867635

In [ ]: