In [1]:
import pandas as pd
import numpy as np
from pandas.io.parsers import read_csv
%matplotlib inline

from matplotlib import pyplot as plt
import matplotlib as mpl
In [2]:
click_df = read_csv('data/yoochoose-clicks.dat', names=["sid", "timestamp", "iid", "category"], parse_dates=[1])
buy_df = read_csv("data/yoochoose-buys.dat", names=["sid", "timestamp", "iid", "price", "quantity"], parse_dates=[1])
test_click_df = read_csv("data/yoochoose-test.dat", names=["sid", "timestamp", "iid", "category"], parse_dates=[1])
/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
In [3]:
click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)
buy_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)
test_click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)
In [4]:
click_df['count'] = 1
click_count_df = click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()
click_count_df = click_count_df.reset_index()
In [5]:
test_click_df['count'] = 1
test_click_count_df = test_click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()
test_click_count_df = test_click_count_df.reset_index()
In [6]:
feature_df = click_df.merge(click_count_df, on=['sid', 'iid'], how='inner')
In [7]:
test_feature_df = test_click_df.merge(test_click_count_df, on=['sid', 'iid'], how='inner')
In [8]:
del feature_df['count_x']
feature_df.columns = [u'sid', u'timestamp', u'iid', u'category', u'count']
In [9]:
del test_feature_df['count_x']
test_feature_df.columns = ['sid', 'timestamp', 'iid', 'category', 'count']
In [10]:
feature_df.head()
Out[10]:
sid timestamp iid category count
0 1 2014-04-07 10:51:09.277000 214536502 0 1
1 1 2014-04-07 10:54:09.868000 214536500 0 1
2 1 2014-04-07 10:54:46.998000 214536506 0 1
3 1 2014-04-07 10:57:00.306000 214577561 0 1
4 2 2014-04-07 13:56:37.614000 214662742 0 2
In [11]:
test_feature_df.head()
Out[11]:
sid timestamp iid category count
0 5 2014-04-07 17:13:46.713000 214530776 0 3
1 5 2014-04-07 17:20:56.973000 214530776 0 3
2 5 2014-04-07 17:21:19.602000 214530776 0 3
3 10 2014-04-04 07:44:14.590000 214820942 0 1
4 10 2014-04-04 07:45:20.245000 214826810 0 1

It'll take around 3 minutes to get here.

In [12]:
session_click_n = click_df[['sid', 'iid']].groupby(['sid']).count()
session_click_n = session_click_n.reset_index()
session_click_n.columns = ['sid', 'session_count']
In [13]:
test_session_click_n = test_click_df[['sid', 'iid']].groupby(['sid']).count()
test_session_click_n = test_session_click_n.reset_index()
test_session_click_n.columns = ['sid', 'session_count']
In [14]:
feature_df = feature_df.merge(session_click_n, on=['sid'], how='inner')
In [15]:
test_feature_df = test_feature_df.merge(test_session_click_n, on=['sid'], how='inner')
In [16]:
feature_df.head()
Out[16]:
sid timestamp iid category count session_count
0 1 2014-04-07 10:51:09.277000 214536502 0 1 4
1 1 2014-04-07 10:54:09.868000 214536500 0 1 4
2 1 2014-04-07 10:54:46.998000 214536506 0 1 4
3 1 2014-04-07 10:57:00.306000 214577561 0 1 4
4 2 2014-04-07 13:56:37.614000 214662742 0 2 6
In [17]:
test_feature_df.head()
Out[17]:
sid timestamp iid category count session_count
0 5 2014-04-07 17:13:46.713000 214530776 0 3 3
1 5 2014-04-07 17:20:56.973000 214530776 0 3 3
2 5 2014-04-07 17:21:19.602000 214530776 0 3 3
3 10 2014-04-04 07:44:14.590000 214820942 0 1 2
4 10 2014-04-04 07:45:20.245000 214826810 0 1 2

Avoid operating on pandas Timestamp objects element by element; it is extremely slow. Work with the raw numpy datetime64 arrays instead (a short comparison sketch follows).
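
For comparison, here is a minimal sketch (toy timestamps, not the actual yoochoose files) of the per-element approach versus the vectorized numpy version used below:

import numpy as np
import pandas as pd

df = pd.DataFrame({'timestamp': pd.to_datetime(['2014-04-07 10:51:09.277',
                                                '2014-04-07 10:54:09.868'])})

# Slow: .apply touches each Timestamp object one at a time.
slow = df['timestamp'].apply(lambda t: (t - pd.Timestamp('2014-04-01')).total_seconds())

# Fast: arithmetic on the underlying datetime64[ns] array is fully vectorized.
fast = (df['timestamp'].values - np.datetime64('2014-04-01')) / np.timedelta64(1, 's')

assert np.allclose(slow.values, fast)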

In [18]:
def timestamp_seconds(date):
    baseline = np.datetime64('2014-04-01')
    return (date - baseline) / np.timedelta64(1, 's')
In [19]:
click_df['timestamp_seconds'] = timestamp_seconds(click_df['timestamp'].values)
In [21]:
test_click_df['timestamp_seconds'] = timestamp_seconds(test_click_df['timestamp'].values)
In [22]:
g = click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']
In [23]:
test_g = test_click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']
In [24]:
%time session_duration = np.round(g.last() - g.first())
CPU times: user 1.83 s, sys: 469 ms, total: 2.3 s
Wall time: 2.14 s
In [25]:
test_session_duration = np.round(test_g.last() - test_g.first())
In [26]:
session_duration = pd.DataFrame(session_duration).reset_index()
session_duration.columns = ['sid', 'session_duration']
In [27]:
test_session_duration = pd.DataFrame(test_session_duration).reset_index()
test_session_duration.columns = ['sid', 'session_duration']
In [28]:
session_duration.head()
Out[28]:
sid session_duration
0 1 351
1 2 359
2 3 745
3 4 1034
4 6 246
In [29]:
test_session_duration.head()
Out[29]:
sid session_duration
0 5 453
1 10 66
2 15 327
3 20 163
4 25 41
In [30]:
feature_df = feature_df.merge(session_duration, on=['sid'], how='inner')
In [31]:
feature_df.head()
Out[31]:
sid timestamp iid category count session_count session_duration
0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 351
1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 351
2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 351
3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 351
4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 359
In [32]:
test_feature_df = test_feature_df.merge(test_session_duration, on=['sid'], how='inner')
test_feature_df.head()
Out[32]:
sid timestamp iid category count session_count session_duration
0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 453
1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 453
2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 453
3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 66
4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 66
In [33]:
def year(dates):
    "Return an array of the years given an array of datetime64s"
    return dates.astype('M8[Y]').astype('i8') + 1970

def month(dates):
    "Return an array of the months given an array of datetime64s"
    return dates.astype('M8[M]').astype('i8') % 12 + 1

def day(dates):
    "Return an array of the days of the month given an array of datetime64s"
    return (dates - dates.astype('M8[M]')) / np.timedelta64(1, 'D') + 1

def hour(dates):
    "Return an array of the hours of the day given an array of datetime64s"
    return np.floor((dates - dates.astype("M8[D]")) / np.timedelta64(1, 'h'))

def dayofweek(dates):
    "Return an array of the day of weeks given an array of datetime64s. Sunday is 0, Saturday is 6"
    baseline = np.datetime64('2014-04-01')  # Tuesday: 2
    return np.floor(((dates - baseline) / np.timedelta64(1, 'D') + 2) % 7)
In [34]:
%time feature_df['month'] = month(feature_df['timestamp'].values)
CPU times: user 1.62 s, sys: 831 ms, total: 2.45 s
Wall time: 2.48 s
In [35]:
%time feature_df['dayofweek'] = dayofweek(feature_df['timestamp'].values)
CPU times: user 765 ms, sys: 537 ms, total: 1.3 s
Wall time: 1.3 s
In [36]:
%time feature_df['hour'] = hour(feature_df['timestamp'].values)
CPU times: user 1.01 s, sys: 476 ms, total: 1.49 s
Wall time: 1.49 s
In [37]:
test_feature_df['month'] = month(test_feature_df['timestamp'].values)
test_feature_df['dayofweek'] = dayofweek(test_feature_df['timestamp'].values)
test_feature_df['hour'] = hour(test_feature_df['timestamp'].values)
In [38]:
feature_df.head()
Out[38]:
sid timestamp iid category count session_count session_duration month dayofweek hour
0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 351 4 1 10
1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 351 4 1 10
2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 351 4 1 10
3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 351 4 1 10
4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 359 4 1 13
In [39]:
test_feature_df.head()
Out[39]:
sid timestamp iid category count session_count session_duration month dayofweek hour
0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 453 4 1 17
1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 453 4 1 17
2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 453 4 1 17
3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 66 4 5 7
4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 66 4 5 7

Now for the remaining pieces:

  • label
  • Things to change to one-hot encoding
    • iid
    • category
    • month
    • dayofweek
    • hour
  • list of items

Remember, we need to combine the training and test item lists into a single item vocabulary (a small sketch of the idea follows).
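
As a rough illustration (made-up item ids, not the real data), building the combined item vocabulary and one-hot encoding against it could look like this:

import numpy as np

train_iids = np.array([214536502, 214536500])
test_iids = np.array([214536500, 214530776])

# Combined, de-duplicated vocabulary over training and test items (np.unique also sorts).
item_vocab = np.unique(np.append(train_iids, test_iids))
vocab_index = {iid: idx for idx, iid in enumerate(item_vocab)}

def one_hot(iid):
    "One-hot encode a single item id against the combined vocabulary."
    vec = np.zeros(len(item_vocab), dtype=np.int8)
    vec[vocab_index[iid]] = 1
    return vec

print(one_hot(214536500))  # [0 1 0]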

In [40]:
item_list = np.unique(np.append(click_df['iid'].unique(), test_click_df['iid'].unique()))
In [41]:
len(item_list)
Out[41]:
54287
In [42]:
item_list.sort()
In [43]:
item_index_dict = {k: v for v, k in enumerate(item_list)}  # Beware that index starts at 0. This may cause some problems.
In [44]:
%time item_list_df = click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()
item_list_df.columns = ['sid', 'item_list']
CPU times: user 5min 36s, sys: 5.02 s, total: 5min 41s
Wall time: 5min 32s
In [45]:
%time test_item_list_df = test_click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()
test_item_list_df.columns = ['sid', 'item_list']
CPU times: user 1min 24s, sys: 1.06 s, total: 1min 25s
Wall time: 1min 23s
In [46]:
feature_df = feature_df.merge(item_list_df, on='sid', how='inner')
In [47]:
test_feature_df = test_feature_df.merge(test_item_list_df, on='sid', how='inner')
In [48]:
feature_df.head()
Out[48]:
sid timestamp iid category count session_count session_duration month dayofweek hour item_list
0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096]
1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096]
2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096]
3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096]
4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 359 4 1 13 [4943, 33274, 42167, 19891, 33280]
In [49]:
test_feature_df.head()
Out[49]:
sid timestamp iid category count session_count session_duration month dayofweek hour item_list
0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 453 4 1 17 [1007]
1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 453 4 1 17 [1007]
2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 453 4 1 17 [1007]
3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 66 4 5 7 [42615, 41214]
4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 66 4 5 7 [42615, 41214]
In [50]:
feature_df['item_index'] = np.array([item_index_dict[k] for k in feature_df['iid'].values])
In [51]:
test_feature_df['item_index'] = np.array([item_index_dict[k] for k in test_feature_df['iid'].values])
In [52]:
feature_df.head()
Out[52]:
sid timestamp iid category count session_count session_duration month dayofweek hour item_list item_index
0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096] 2096
1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096] 2095
2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096] 2097
3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096] 10040
4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 359 4 1 13 [4943, 33274, 42167, 19891, 33280] 19891
In [54]:
test_feature_df.head()
Out[54]:
sid timestamp iid category count session_count session_duration month dayofweek hour item_list item_index
0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 453 4 1 17 [1007] 1007
1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 453 4 1 17 [1007] 1007
2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 453 4 1 17 [1007] 1007
3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 66 4 5 7 [42615, 41214] 41214
4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 66 4 5 7 [42615, 41214] 42615
In [33]:
buy_df['target'] = 1
In [34]:
final_df = feature_df.merge(buy_df[['sid', 'iid', 'target']], on=['sid', 'iid'], how='left')
In [35]:
final_df['target'].fillna(-1, inplace=True)
In [36]:
print("sid: {}".format(final_df['sid'].nunique()))
sid: 9249729
In [37]:
print("item index: {}".format(final_df['item_index'].nunique()))
print("category: {}".format(final_df['category'].nunique()))
print("month: {}".format(final_df['month'].nunique()))
print("hour: {}".format(final_df['hour'].nunique()))
print("dayofweek: {}".format(final_df['dayofweek'].nunique()))
item index: 52739
category: 340
month: 6
hour: 24
dayofweek: 7

Create the training/test data. The labeled data is further split into training and validation (cv) sets, with whole sessions assigned to one side or the other (see the sketch below).
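
A small sketch of the idea with toy session ids: each sid draws a single mask value, so all clicks from a session land on the same side of the split.

import numpy as np
import pandas as pd

df = pd.DataFrame({'sid': [1, 1, 2, 3, 3, 3]})

np.random.seed(0)
sids = df['sid'].unique()
mask = np.random.binomial(1, 0.75, size=len(sids))  # 1 = training, 0 = validation
mask_df = pd.DataFrame({'sid': sids, 'mask': mask})

df = df.merge(mask_df, on='sid', how='inner')
# Every row of a given sid shares the same mask value.
assert (df.groupby('sid')['mask'].nunique() == 1).all()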

In [38]:
training_prob = 0.75
mask = np.random.binomial(1, training_prob, size=final_df['sid'].nunique())
In [39]:
mask_df = pd.DataFrame(mask, index=final_df['sid'].unique(), columns=['mask']).reset_index()
mask_df.columns = ['sid', 'mask']
In [40]:
mask_df.head()
Out[40]:
sid mask
0 1 0
1 2 1
2 3 1
3 4 1
4 6 1
In [41]:
final_df = final_df.merge(mask_df, on='sid', how='inner')
In [42]:
final_df.head()
Out[42]:
sid timestamp iid category count session_count session_duration month dayofweek hour item_list item_index target mask
0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096] 2096 -1 0
1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096] 2095 -1 0
2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096] 2097 -1 0
3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 351 4 1 10 [10040, 2097, 2095, 2096] 10040 -1 0
4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 359 4 1 13 [4943, 33274, 42167, 19891, 33280] 19891 -1 1
In [43]:
training_df = final_df[final_df['mask'] == 1]
cv_df = final_df[final_df['mask'] == 0]
In [44]:
print(len(training_df))
print(len(cv_df))
24932146
8302296

Buy probability per entry (not per session). The target is encoded as +1 for a buy and -1 otherwise, so the buy fraction is (mean(target) + 1) / 2; a toy check follows.
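
A toy check of that conversion (made-up targets):

import numpy as np

t = np.array([1, -1, -1, -1])   # one buy out of four entries
print((t.mean() + 1) / 2)       # 0.25, the buy fraction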

In [45]:
(final_df['target'].mean() + 1) / 2
Out[45]:
0.061387520813498242
In [46]:
(training_df['target'].mean() + 1) / 2
Out[46]:
0.061448781825679977
In [47]:
(cv_df['target'].mean() + 1) / 2
Out[47]:
0.061203551403129941

Buy probability per session: a session counts as a buy if any of its clicks has target = +1, hence the max over each session's targets.

In [48]:
(np.mean(final_df.groupby('sid')['target'].max()) + 1) / 2
Out[48]:
0.055103884665161529
In [49]:
(np.mean(training_df.groupby('sid')['target'].max()) + 1) / 2
Out[49]:
0.055144914996467675
In [50]:
(np.mean(cv_df.groupby('sid')['target'].max()) + 1) / 2
Out[50]:
0.054980584441226776
In [51]:
training_df[['target', 'sid', 'iid', 'count', 'session_count', 'session_duration',  # numerical data
            'item_index', 'category', 'month', 'hour', 'dayofweek',  # categorical data
           'item_list',  # categorical data, variable length
           ]].to_csv('training_data.tsv', sep='\t', index=False)
In [52]:
cv_df[['target', 'sid', 'iid', 'count', 'session_count', 'session_duration',  # numerical data
            'item_index', 'category', 'month', 'hour', 'dayofweek',  # categorical data
           'item_list',  # categorical data, variable length
           ]].to_csv('cv_data.tsv', sep='\t', index=False)
In [57]:
test_feature_df[['sid', 'iid', 'count', 'session_count', 'session_duration',  # numerical data
            'item_index', 'category', 'month', 'hour', 'dayofweek',  # categorical data
           'item_list',  # categorical data, variable length
           ]].to_csv('test_data.tsv', sep='\t', index=False)