In [1]:

import pandas as pd
import numpy as np
from pandas.io.parsers import read_csv
%matplotlib inline

from matplotlib import pyplot as plt
import matplotlib as mpl

In [2]:

click_df = read_csv('data/yoochoose-clicks.dat', names=["sid", "timestamp", "iid", "category"], parse_dates=[1])
buy_df = read_csv("data/yoochoose-buys.dat", names=["sid", "timestamp", "iid", "price", "quantity"], parse_dates=[1])
test_click_df = read_csv("data/yoochoose-test.dat", names=["sid", "timestamp", "iid", "category"], parse_dates=[1])

/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [3]:

click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)
buy_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)
test_click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)

In [4]:

click_df['count'] = 1
click_count_df = click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()
click_count_df = click_count_df.reset_index()

In [5]:

test_click_df['count'] = 1
test_click_count_df = test_click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()
test_click_count_df = test_click_count_df.reset_index()

In [6]:

feature_df = click_df.merge(click_count_df, on=['sid', 'iid'], how='inner')

In [7]:

test_feature_df = test_click_df.merge(test_click_count_df, on=['sid', 'iid'], how='inner')

In [8]:

del feature_df['count_x']
feature_df.columns = [u'sid', u'timestamp', u'iid', u'category', u'count']

In [9]:

del test_feature_df['count_x']
test_feature_df.columns = ['sid', 'timestamp', 'iid', 'category', 'count']

In [10]:

feature_df.head()

Out[10]:

	sid	timestamp	iid	count
0	1	2014-04-07 10:51:09.277000	214536502	1
1	1	2014-04-07 10:54:09.868000	214536500	1
2	1	2014-04-07 10:54:46.998000	214536506	1
3	1	2014-04-07 10:57:00.306000	214577561	1
4	2	2014-04-07 13:56:37.614000	214662742	2

In [11]:

test_feature_df.head()

Out[11]:

	sid	timestamp	iid	count
0	5	2014-04-07 17:13:46.713000	214530776	3
1	5	2014-04-07 17:20:56.973000	214530776	3
2	5	2014-04-07 17:21:19.602000	214530776	3
3	10	2014-04-04 07:44:14.590000	214820942	1
4	10	2014-04-04 07:45:20.245000	214826810	1

It'll take around 3 minutes to get here.

In [12]:

session_click_n = click_df[['sid', 'iid']].groupby(['sid']).count()
session_click_n = session_click_n.reset_index()
session_click_n.columns = ['sid', 'session_count']

In [13]:

test_session_click_n = test_click_df[['sid', 'iid']].groupby(['sid']).count()
test_session_click_n = test_session_click_n.reset_index()
test_session_click_n.columns = ['sid', 'session_count']

In [14]:

feature_df = feature_df.merge(session_click_n, on=['sid'], how='inner')

In [15]:

test_feature_df = test_feature_df.merge(test_session_click_n, on=['sid'], how='inner')

In [16]:

feature_df.head()

Out[16]:

	sid	timestamp	iid	count	session_count
0	1	2014-04-07 10:51:09.277000	214536502	1	4
1	1	2014-04-07 10:54:09.868000	214536500	1	4
2	1	2014-04-07 10:54:46.998000	214536506	1	4
3	1	2014-04-07 10:57:00.306000	214577561	1	4
4	2	2014-04-07 13:56:37.614000	214662742	2	6

In [17]:

test_feature_df.head()

Out[17]:

	sid	timestamp	iid	count	session_count
0	5	2014-04-07 17:13:46.713000	214530776	3	3
1	5	2014-04-07 17:20:56.973000	214530776	3	3
2	5	2014-04-07 17:21:19.602000	214530776	3	3
3	10	2014-04-04 07:44:14.590000	214820942	1	2
4	10	2014-04-04 07:45:20.245000	214826810	1	2

Avoid working with timestamps. They are extremely slow. Instead, work with raw numpy arrays.

In [18]:

def timestamp_seconds(date):
    baseline = np.datetime64('2014-04-01')
    return (date - baseline) / np.timedelta64(1, 's')

In [19]:

click_df['timestamp_seconds'] = timestamp_seconds(click_df['timestamp'].values)

In [21]:

test_click_df['timestamp_seconds'] = timestamp_seconds(test_click_df['timestamp'].values)

In [22]:

g = click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']

In [23]:

test_g = test_click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']

In [24]:

%time session_duration = np.round(g.last() - g.first())

CPU times: user 1.83 s, sys: 469 ms, total: 2.3 s
Wall time: 2.14 s

In [25]:

test_session_duration = np.round(test_g.last() - test_g.first())

In [26]:

session_duration = pd.DataFrame(session_duration).reset_index()
session_duration.columns = ['sid', 'session_duration']

In [27]:

test_session_duration = pd.DataFrame(test_session_duration).reset_index()
test_session_duration.columns = ['sid', 'session_duration']

In [28]:

session_duration.head()

Out[28]:

	sid	session_duration
0	1	351
1	2	359
2	3	745
3	4	1034
4	6	246

In [29]:

test_session_duration.head()

Out[29]:

	sid	session_duration
0	5	453
1	10	66
2	15	327
3	20	163
4	25	41

In [30]:

feature_df = feature_df.merge(session_duration, on=['sid'], how='inner')

In [31]:

feature_df.head()

Out[31]:

	sid	timestamp	iid	count	session_count	session_duration
0	1	2014-04-07 10:51:09.277000	214536502	1	4	351
1	1	2014-04-07 10:54:09.868000	214536500	1	4	351
2	1	2014-04-07 10:54:46.998000	214536506	1	4	351
3	1	2014-04-07 10:57:00.306000	214577561	1	4	351
4	2	2014-04-07 13:56:37.614000	214662742	2	6	359

In [32]:

test_feature_df = test_feature_df.merge(test_session_duration, on=['sid'], how='inner')
test_feature_df.head()

Out[32]:

	sid	timestamp	iid	count	session_count	session_duration
0	5	2014-04-07 17:13:46.713000	214530776	3	3	453
1	5	2014-04-07 17:20:56.973000	214530776	3	3	453
2	5	2014-04-07 17:21:19.602000	214530776	3	3	453
3	10	2014-04-04 07:44:14.590000	214820942	1	2	66
4	10	2014-04-04 07:45:20.245000	214826810	1	2	66

In [33]:

def year(dates):
    "Return an array of the years given an array of datetime64s"
    return dates.astype('M8[Y]').astype('i8') + 1970

def month(dates):
    "Return an array of the months given an array of datetime64s"
    return dates.astype('M8[M]').astype('i8') % 12 + 1

def day(dates):
    "Return an array of the days of the month given an array of datetime64s"
    return (dates - dates.astype('M8[M]')) / np.timedelta64(1, 'D') + 1

def hour(dates):
    "Return an array of the hours of the day given an array of datetime64s"
    return np.floor((dates - dates.astype("M8[D]")) / np.timedelta64(1, 'h'))

def dayofweek(dates):
    "Return an array of the day of weeks given an array of datetime64s. Sunday is 0, Saturday is 6"
    baseline = np.datetime64('2014-04-01')  # Tuesday: 2
    return np.floor(((dates - baseline) / np.timedelta64(1, 'D') + 2) % 7)

In [34]:

%time feature_df['month'] = month(feature_df['timestamp'].values)

CPU times: user 1.62 s, sys: 831 ms, total: 2.45 s
Wall time: 2.48 s

In [35]:

%time feature_df['dayofweek'] = dayofweek(feature_df['timestamp'].values)

CPU times: user 765 ms, sys: 537 ms, total: 1.3 s
Wall time: 1.3 s

In [36]:

%time feature_df['hour'] = hour(feature_df['timestamp'].values)

CPU times: user 1.01 s, sys: 476 ms, total: 1.49 s
Wall time: 1.49 s

In [37]:

test_feature_df['month'] = month(test_feature_df['timestamp'].values)
test_feature_df['dayofweek'] = dayofweek(test_feature_df['timestamp'].values)
test_feature_df['hour'] = hour(test_feature_df['timestamp'].values)

In [38]:

feature_df.head()

Out[38]:

	sid	timestamp	iid	count	session_count	session_duration	month	dayofweek	hour
0	1	2014-04-07 10:51:09.277000	214536502	1	4	351	4	1	10
1	1	2014-04-07 10:54:09.868000	214536500	1	4	351	4	1	10
2	1	2014-04-07 10:54:46.998000	214536506	1	4	351	4	1	10
3	1	2014-04-07 10:57:00.306000	214577561	1	4	351	4	1	10
4	2	2014-04-07 13:56:37.614000	214662742	2	6	359	4	1	13

In [39]:

test_feature_df.head()

Out[39]:

	sid	timestamp	iid	count	session_count	session_duration	month	dayofweek	hour
0	5	2014-04-07 17:13:46.713000	214530776	3	3	453	4	1	17
1	5	2014-04-07 17:20:56.973000	214530776	3	3	453	4	1	17
2	5	2014-04-07 17:21:19.602000	214530776	3	3	453	4	1	17
3	10	2014-04-04 07:44:14.590000	214820942	1	2	66	4	5	7
4	10	2014-04-04 07:45:20.245000	214826810	1	2	66	4	5	7

Now for the list

label
Things to change to one-hot encoding
- iid
- category
- month
- dayofweek
- hour
list of items

Remeber, we need to combine training and test item list.

In [40]:

item_list = np.unique(np.append(click_df['iid'].unique(), test_click_df['iid'].unique()))

In [41]:

len(item_list)

Out[41]:

In [42]:

item_list.sort()

In [43]:

item_index_dict = {k: v for v, k in enumerate(item_list)}  # Beware that index starts at 0. This may cause some problems.

In [44]:

%time item_list_df = click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()
item_list_df.columns = ['sid', 'item_list']

CPU times: user 5min 36s, sys: 5.02 s, total: 5min 41s
Wall time: 5min 32s

In [45]:

%time test_item_list_df = test_click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()
test_item_list_df.columns = ['sid', 'item_list']

CPU times: user 1min 24s, sys: 1.06 s, total: 1min 25s
Wall time: 1min 23s

In [46]:

feature_df = feature_df.merge(item_list_df, on='sid', how='inner')

In [47]:

test_feature_df = test_feature_df.merge(test_item_list_df, on='sid', how='inner')

In [48]:

feature_df.head()

Out[48]:

	sid	timestamp	iid	count	session_count	session_duration	month	dayofweek	hour	item_list
0	1	2014-04-07 10:51:09.277000	214536502	1	4	351	4	1	10	[10040, 2097, 2095, 2096]
1	1	2014-04-07 10:54:09.868000	214536500	1	4	351	4	1	10	[10040, 2097, 2095, 2096]
2	1	2014-04-07 10:54:46.998000	214536506	1	4	351	4	1	10	[10040, 2097, 2095, 2096]
3	1	2014-04-07 10:57:00.306000	214577561	1	4	351	4	1	10	[10040, 2097, 2095, 2096]
4	2	2014-04-07 13:56:37.614000	214662742	2	6	359	4	1	13	[4943, 33274, 42167, 19891, 33280]

In [49]:

test_feature_df.head()

Out[49]:

	sid	timestamp	iid	count	session_count	session_duration	month	dayofweek	hour	item_list
0	5	2014-04-07 17:13:46.713000	214530776	3	3	453	4	1	17	[1007]
1	5	2014-04-07 17:20:56.973000	214530776	3	3	453	4	1	17	[1007]
2	5	2014-04-07 17:21:19.602000	214530776	3	3	453	4	1	17	[1007]
3	10	2014-04-04 07:44:14.590000	214820942	1	2	66	4	5	7	[42615, 41214]
4	10	2014-04-04 07:45:20.245000	214826810	1	2	66	4	5	7	[42615, 41214]

In [50]:

feature_df['item_index'] = np.array([item_index_dict[k] for k in feature_df['iid'].values])

In [51]:

test_feature_df['item_index'] = np.array([item_index_dict[k] for k in test_feature_df['iid'].values])

In [52]:

feature_df.head()

Out[52]:

	sid	timestamp	iid	count	session_count	session_duration	month	dayofweek	hour	item_list	item_index
0	1	2014-04-07 10:51:09.277000	214536502	1	4	351	4	1	10	[10040, 2097, 2095, 2096]	2096
1	1	2014-04-07 10:54:09.868000	214536500	1	4	351	4	1	10	[10040, 2097, 2095, 2096]	2095
2	1	2014-04-07 10:54:46.998000	214536506	1	4	351	4	1	10	[10040, 2097, 2095, 2096]	2097
3	1	2014-04-07 10:57:00.306000	214577561	1	4	351	4	1	10	[10040, 2097, 2095, 2096]	10040
4	2	2014-04-07 13:56:37.614000	214662742	2	6	359	4	1	13	[4943, 33274, 42167, 19891, 33280]	19891

In [54]:

test_feature_df.head()

Out[54]:

	sid	timestamp	iid	count	session_count	session_duration	month	dayofweek	hour	item_list	item_index
0	5	2014-04-07 17:13:46.713000	214530776	3	3	453	4	1	17	[1007]	1007
1	5	2014-04-07 17:20:56.973000	214530776	3	3	453	4	1	17	[1007]	1007
2	5	2014-04-07 17:21:19.602000	214530776	3	3	453	4	1	17	[1007]	1007
3	10	2014-04-04 07:44:14.590000	214820942	1	2	66	4	5	7	[42615, 41214]	41214
4	10	2014-04-04 07:45:20.245000	214826810	1	2	66	4	5	7	[42615, 41214]	42615

In [33]:

buy_df['target'] = 1

In [34]:

final_df = feature_df.merge(buy_df[['sid', 'iid', 'target']], on=['sid', 'iid'], how='left')

In [35]:

final_df['target'].fillna(-1, inplace=True)

In [36]:

print("sid: {}".format(final_df['sid'].nunique()))

sid: 9249729

In [37]:

print("item index: {}".format(final_df['item_index'].nunique()))
print("category: {}".format(final_df['category'].nunique()))
print("month: {}".format(final_df['month'].nunique()))
print("hour: {}".format(final_df['hour'].nunique()))
print("dayofweek: {}".format(final_df['dayofweek'].nunique()))

item index: 52739
category: 340
month: 6
hour: 24
dayofweek: 7

Create training/test data. Training will contain validation data

In [38]:

training_prob = 0.75
mask = np.random.binomial(1, training_prob, size=final_df['sid'].nunique())

In [39]:

mask_df = pd.DataFrame(mask, index=final_df['sid'].unique(), columns=['mask']).reset_index()
mask_df.columns = ['sid', 'mask']

In [40]:

mask_df.head()

Out[40]:

	sid	mask
0	1	0
1	2	1
2	3	1
3	4	1
4	6	1

In [41]:

final_df = final_df.merge(mask_df, on='sid', how='inner')

In [42]:

final_df.head()

Out[42]:

	sid	timestamp	iid	count	session_count	session_duration	month	dayofweek	hour	item_list	item_index	target	mask
0	1	2014-04-07 10:51:09.277000	214536502	1	4	351	4	1	10	[10040, 2097, 2095, 2096]	2096	-1	0
1	1	2014-04-07 10:54:09.868000	214536500	1	4	351	4	1	10	[10040, 2097, 2095, 2096]	2095	-1	0
2	1	2014-04-07 10:54:46.998000	214536506	1	4	351	4	1	10	[10040, 2097, 2095, 2096]	2097	-1	0
3	1	2014-04-07 10:57:00.306000	214577561	1	4	351	4	1	10	[10040, 2097, 2095, 2096]	10040	-1	0
4	2	2014-04-07 13:56:37.614000	214662742	2	6	359	4	1	13	[4943, 33274, 42167, 19891, 33280]	19891	-1	1

In [43]:

training_df = final_df[final_df['mask'] == 1]
cv_df = final_df[final_df['mask'] == 0]

In [44]:

print(len(training_df))
print(len(cv_df))

24932146
8302296

Buy probability per entry (not per session)

In [45]:

(final_df['target'].mean() + 1) / 2

Out[45]:

0.061387520813498242

In [46]:

(training_df['target'].mean() + 1) / 2

Out[46]:

0.061448781825679977

In [47]:

(cv_df['target'].mean() + 1) / 2

Out[47]:

0.061203551403129941

Buy probability per session