import pandas as pd
import numpy as np
from pandas.io.parsers import read_csv
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl
click_df = read_csv('data/yoochoose-clicks.dat', names=["sid", "timestamp", "iid", "category"], parse_dates=[1])
buy_df = read_csv("data/yoochoose-buys.dat", names=["sid", "timestamp", "iid", "price", "quantity"], parse_dates=[1])
test_click_df = read_csv("data/yoochoose-test.dat", names=["sid", "timestamp", "iid", "category"], parse_dates=[1])
/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)
buy_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)
test_click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)
click_df['count'] = 1
click_count_df = click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()
click_count_df = click_count_df.reset_index()
test_click_df['count'] = 1
test_click_count_df = test_click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()
test_click_count_df = test_click_count_df.reset_index()
feature_df = click_df.merge(click_count_df, on=['sid', 'iid'], how='inner')
test_feature_df = test_click_df.merge(test_click_count_df, on=['sid', 'iid'], how='inner')
del feature_df['count_x']
feature_df.columns = [u'sid', u'timestamp', u'iid', u'category', u'count']
del test_feature_df['count_x']
test_feature_df.columns = ['sid', 'timestamp', 'iid', 'category', 'count']
feature_df.head()
sid | timestamp | iid | category | count | |
---|---|---|---|---|---|
0 | 1 | 2014-04-07 10:51:09.277000 | 214536502 | 0 | 1 |
1 | 1 | 2014-04-07 10:54:09.868000 | 214536500 | 0 | 1 |
2 | 1 | 2014-04-07 10:54:46.998000 | 214536506 | 0 | 1 |
3 | 1 | 2014-04-07 10:57:00.306000 | 214577561 | 0 | 1 |
4 | 2 | 2014-04-07 13:56:37.614000 | 214662742 | 0 | 2 |
test_feature_df.head()
sid | timestamp | iid | category | count | |
---|---|---|---|---|---|
0 | 5 | 2014-04-07 17:13:46.713000 | 214530776 | 0 | 3 |
1 | 5 | 2014-04-07 17:20:56.973000 | 214530776 | 0 | 3 |
2 | 5 | 2014-04-07 17:21:19.602000 | 214530776 | 0 | 3 |
3 | 10 | 2014-04-04 07:44:14.590000 | 214820942 | 0 | 1 |
4 | 10 | 2014-04-04 07:45:20.245000 | 214826810 | 0 | 1 |
It'll take around 3 minutes to get here.
session_click_n = click_df[['sid', 'iid']].groupby(['sid']).count()
session_click_n = session_click_n.reset_index()
session_click_n.columns = ['sid', 'session_count']
test_session_click_n = test_click_df[['sid', 'iid']].groupby(['sid']).count()
test_session_click_n = test_session_click_n.reset_index()
test_session_click_n.columns = ['sid', 'session_count']
feature_df = feature_df.merge(session_click_n, on=['sid'], how='inner')
test_feature_df = test_feature_df.merge(test_session_click_n, on=['sid'], how='inner')
feature_df.head()
sid | timestamp | iid | category | count | session_count | |
---|---|---|---|---|---|---|
0 | 1 | 2014-04-07 10:51:09.277000 | 214536502 | 0 | 1 | 4 |
1 | 1 | 2014-04-07 10:54:09.868000 | 214536500 | 0 | 1 | 4 |
2 | 1 | 2014-04-07 10:54:46.998000 | 214536506 | 0 | 1 | 4 |
3 | 1 | 2014-04-07 10:57:00.306000 | 214577561 | 0 | 1 | 4 |
4 | 2 | 2014-04-07 13:56:37.614000 | 214662742 | 0 | 2 | 6 |
test_feature_df.head()
sid | timestamp | iid | category | count | session_count | |
---|---|---|---|---|---|---|
0 | 5 | 2014-04-07 17:13:46.713000 | 214530776 | 0 | 3 | 3 |
1 | 5 | 2014-04-07 17:20:56.973000 | 214530776 | 0 | 3 | 3 |
2 | 5 | 2014-04-07 17:21:19.602000 | 214530776 | 0 | 3 | 3 |
3 | 10 | 2014-04-04 07:44:14.590000 | 214820942 | 0 | 1 | 2 |
4 | 10 | 2014-04-04 07:45:20.245000 | 214826810 | 0 | 1 | 2 |
Avoid working with timestamps. They are extremely slow. Instead, work with raw numpy arrays.
def timestamp_seconds(date):
baseline = np.datetime64('2014-04-01')
return (date - baseline) / np.timedelta64(1, 's')
click_df['timestamp_seconds'] = timestamp_seconds(click_df['timestamp'].values)
test_click_df['timestamp_seconds'] = timestamp_seconds(test_click_df['timestamp'].values)
g = click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']
test_g = test_click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']
%time session_duration = np.round(g.last() - g.first())
CPU times: user 1.83 s, sys: 469 ms, total: 2.3 s Wall time: 2.14 s
test_session_duration = np.round(test_g.last() - test_g.first())
session_duration = pd.DataFrame(session_duration).reset_index()
session_duration.columns = ['sid', 'session_duration']
test_session_duration = pd.DataFrame(test_session_duration).reset_index()
test_session_duration.columns = ['sid', 'session_duration']
session_duration.head()
sid | session_duration | |
---|---|---|
0 | 1 | 351 |
1 | 2 | 359 |
2 | 3 | 745 |
3 | 4 | 1034 |
4 | 6 | 246 |
test_session_duration.head()
sid | session_duration | |
---|---|---|
0 | 5 | 453 |
1 | 10 | 66 |
2 | 15 | 327 |
3 | 20 | 163 |
4 | 25 | 41 |
feature_df = feature_df.merge(session_duration, on=['sid'], how='inner')
feature_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | |
---|---|---|---|---|---|---|---|
0 | 1 | 2014-04-07 10:51:09.277000 | 214536502 | 0 | 1 | 4 | 351 |
1 | 1 | 2014-04-07 10:54:09.868000 | 214536500 | 0 | 1 | 4 | 351 |
2 | 1 | 2014-04-07 10:54:46.998000 | 214536506 | 0 | 1 | 4 | 351 |
3 | 1 | 2014-04-07 10:57:00.306000 | 214577561 | 0 | 1 | 4 | 351 |
4 | 2 | 2014-04-07 13:56:37.614000 | 214662742 | 0 | 2 | 6 | 359 |
test_feature_df = test_feature_df.merge(test_session_duration, on=['sid'], how='inner')
test_feature_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | |
---|---|---|---|---|---|---|---|
0 | 5 | 2014-04-07 17:13:46.713000 | 214530776 | 0 | 3 | 3 | 453 |
1 | 5 | 2014-04-07 17:20:56.973000 | 214530776 | 0 | 3 | 3 | 453 |
2 | 5 | 2014-04-07 17:21:19.602000 | 214530776 | 0 | 3 | 3 | 453 |
3 | 10 | 2014-04-04 07:44:14.590000 | 214820942 | 0 | 1 | 2 | 66 |
4 | 10 | 2014-04-04 07:45:20.245000 | 214826810 | 0 | 1 | 2 | 66 |
def year(dates):
"Return an array of the years given an array of datetime64s"
return dates.astype('M8[Y]').astype('i8') + 1970
def month(dates):
"Return an array of the months given an array of datetime64s"
return dates.astype('M8[M]').astype('i8') % 12 + 1
def day(dates):
"Return an array of the days of the month given an array of datetime64s"
return (dates - dates.astype('M8[M]')) / np.timedelta64(1, 'D') + 1
def hour(dates):
"Return an array of the hours of the day given an array of datetime64s"
return np.floor((dates - dates.astype("M8[D]")) / np.timedelta64(1, 'h'))
def dayofweek(dates):
"Return an array of the day of weeks given an array of datetime64s. Sunday is 0, Saturday is 6"
baseline = np.datetime64('2014-04-01') # Tuesday: 2
return np.floor(((dates - baseline) / np.timedelta64(1, 'D') + 2) % 7)
%time feature_df['month'] = month(feature_df['timestamp'].values)
CPU times: user 1.62 s, sys: 831 ms, total: 2.45 s Wall time: 2.48 s
%time feature_df['dayofweek'] = dayofweek(feature_df['timestamp'].values)
CPU times: user 765 ms, sys: 537 ms, total: 1.3 s Wall time: 1.3 s
%time feature_df['hour'] = hour(feature_df['timestamp'].values)
CPU times: user 1.01 s, sys: 476 ms, total: 1.49 s Wall time: 1.49 s
test_feature_df['month'] = month(test_feature_df['timestamp'].values)
test_feature_df['dayofweek'] = dayofweek(test_feature_df['timestamp'].values)
test_feature_df['hour'] = hour(test_feature_df['timestamp'].values)
feature_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | month | dayofweek | hour | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2014-04-07 10:51:09.277000 | 214536502 | 0 | 1 | 4 | 351 | 4 | 1 | 10 |
1 | 1 | 2014-04-07 10:54:09.868000 | 214536500 | 0 | 1 | 4 | 351 | 4 | 1 | 10 |
2 | 1 | 2014-04-07 10:54:46.998000 | 214536506 | 0 | 1 | 4 | 351 | 4 | 1 | 10 |
3 | 1 | 2014-04-07 10:57:00.306000 | 214577561 | 0 | 1 | 4 | 351 | 4 | 1 | 10 |
4 | 2 | 2014-04-07 13:56:37.614000 | 214662742 | 0 | 2 | 6 | 359 | 4 | 1 | 13 |
test_feature_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | month | dayofweek | hour | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 2014-04-07 17:13:46.713000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 |
1 | 5 | 2014-04-07 17:20:56.973000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 |
2 | 5 | 2014-04-07 17:21:19.602000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 |
3 | 10 | 2014-04-04 07:44:14.590000 | 214820942 | 0 | 1 | 2 | 66 | 4 | 5 | 7 |
4 | 10 | 2014-04-04 07:45:20.245000 | 214826810 | 0 | 1 | 2 | 66 | 4 | 5 | 7 |
Now for the list
Remeber, we need to combine training and test item list.
item_list = np.unique(np.append(click_df['iid'].unique(), test_click_df['iid'].unique()))
len(item_list)
54287
item_list.sort()
item_index_dict = {k: v for v, k in enumerate(item_list)} # Beware that index starts at 0. This may cause some problems.
%time item_list_df = click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()
item_list_df.columns = ['sid', 'item_list']
CPU times: user 5min 36s, sys: 5.02 s, total: 5min 41s Wall time: 5min 32s
%time test_item_list_df = test_click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()
test_item_list_df.columns = ['sid', 'item_list']
CPU times: user 1min 24s, sys: 1.06 s, total: 1min 25s Wall time: 1min 23s
feature_df = feature_df.merge(item_list_df, on='sid', how='inner')
test_feature_df = test_feature_df.merge(test_item_list_df, on='sid', how='inner')
feature_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | month | dayofweek | hour | item_list | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2014-04-07 10:51:09.277000 | 214536502 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] |
1 | 1 | 2014-04-07 10:54:09.868000 | 214536500 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] |
2 | 1 | 2014-04-07 10:54:46.998000 | 214536506 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] |
3 | 1 | 2014-04-07 10:57:00.306000 | 214577561 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] |
4 | 2 | 2014-04-07 13:56:37.614000 | 214662742 | 0 | 2 | 6 | 359 | 4 | 1 | 13 | [4943, 33274, 42167, 19891, 33280] |
test_feature_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | month | dayofweek | hour | item_list | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 2014-04-07 17:13:46.713000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 | [1007] |
1 | 5 | 2014-04-07 17:20:56.973000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 | [1007] |
2 | 5 | 2014-04-07 17:21:19.602000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 | [1007] |
3 | 10 | 2014-04-04 07:44:14.590000 | 214820942 | 0 | 1 | 2 | 66 | 4 | 5 | 7 | [42615, 41214] |
4 | 10 | 2014-04-04 07:45:20.245000 | 214826810 | 0 | 1 | 2 | 66 | 4 | 5 | 7 | [42615, 41214] |
feature_df['item_index'] = np.array([item_index_dict[k] for k in feature_df['iid'].values])
test_feature_df['item_index'] = np.array([item_index_dict[k] for k in test_feature_df['iid'].values])
feature_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | month | dayofweek | hour | item_list | item_index | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2014-04-07 10:51:09.277000 | 214536502 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] | 2096 |
1 | 1 | 2014-04-07 10:54:09.868000 | 214536500 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] | 2095 |
2 | 1 | 2014-04-07 10:54:46.998000 | 214536506 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] | 2097 |
3 | 1 | 2014-04-07 10:57:00.306000 | 214577561 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] | 10040 |
4 | 2 | 2014-04-07 13:56:37.614000 | 214662742 | 0 | 2 | 6 | 359 | 4 | 1 | 13 | [4943, 33274, 42167, 19891, 33280] | 19891 |
test_feature_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | month | dayofweek | hour | item_list | item_index | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 2014-04-07 17:13:46.713000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 | [1007] | 1007 |
1 | 5 | 2014-04-07 17:20:56.973000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 | [1007] | 1007 |
2 | 5 | 2014-04-07 17:21:19.602000 | 214530776 | 0 | 3 | 3 | 453 | 4 | 1 | 17 | [1007] | 1007 |
3 | 10 | 2014-04-04 07:44:14.590000 | 214820942 | 0 | 1 | 2 | 66 | 4 | 5 | 7 | [42615, 41214] | 41214 |
4 | 10 | 2014-04-04 07:45:20.245000 | 214826810 | 0 | 1 | 2 | 66 | 4 | 5 | 7 | [42615, 41214] | 42615 |
buy_df['target'] = 1
final_df = feature_df.merge(buy_df[['sid', 'iid', 'target']], on=['sid', 'iid'], how='left')
final_df['target'].fillna(-1, inplace=True)
print("sid: {}".format(final_df['sid'].nunique()))
sid: 9249729
print("item index: {}".format(final_df['item_index'].nunique()))
print("category: {}".format(final_df['category'].nunique()))
print("month: {}".format(final_df['month'].nunique()))
print("hour: {}".format(final_df['hour'].nunique()))
print("dayofweek: {}".format(final_df['dayofweek'].nunique()))
item index: 52739 category: 340 month: 6 hour: 24 dayofweek: 7
Create training/test data. Training will contain validation data
training_prob = 0.75
mask = np.random.binomial(1, training_prob, size=final_df['sid'].nunique())
mask_df = pd.DataFrame(mask, index=final_df['sid'].unique(), columns=['mask']).reset_index()
mask_df.columns = ['sid', 'mask']
mask_df.head()
sid | mask | |
---|---|---|
0 | 1 | 0 |
1 | 2 | 1 |
2 | 3 | 1 |
3 | 4 | 1 |
4 | 6 | 1 |
final_df = final_df.merge(mask_df, on='sid', how='inner')
final_df.head()
sid | timestamp | iid | category | count | session_count | session_duration | month | dayofweek | hour | item_list | item_index | target | mask | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2014-04-07 10:51:09.277000 | 214536502 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] | 2096 | -1 | 0 |
1 | 1 | 2014-04-07 10:54:09.868000 | 214536500 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] | 2095 | -1 | 0 |
2 | 1 | 2014-04-07 10:54:46.998000 | 214536506 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] | 2097 | -1 | 0 |
3 | 1 | 2014-04-07 10:57:00.306000 | 214577561 | 0 | 1 | 4 | 351 | 4 | 1 | 10 | [10040, 2097, 2095, 2096] | 10040 | -1 | 0 |
4 | 2 | 2014-04-07 13:56:37.614000 | 214662742 | 0 | 2 | 6 | 359 | 4 | 1 | 13 | [4943, 33274, 42167, 19891, 33280] | 19891 | -1 | 1 |
training_df = final_df[final_df['mask'] == 1]
cv_df = final_df[final_df['mask'] == 0]
print(len(training_df))
print(len(cv_df))
24932146 8302296
Buy probability per entry (not per session)
(final_df['target'].mean() + 1) / 2
0.061387520813498242
(training_df['target'].mean() + 1) / 2
0.061448781825679977
(cv_df['target'].mean() + 1) / 2
0.061203551403129941
Buy probability per session
(np.mean(final_df.groupby('sid')['target'].max()) + 1) / 2
0.055103884665161529
(np.mean(training_df.groupby('sid')['target'].max()) + 1) / 2
0.055144914996467675
(np.mean(cv_df.groupby('sid')['target'].max()) + 1) / 2
0.054980584441226776
training_df[['target', 'sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data
'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data
'item_list', # categorical data, variable length
]].to_csv('training_data.tsv', sep='\t', index=False)
cv_df[['target', 'sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data
'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data
'item_list', # categorical data, variable length
]].to_csv('cv_data.tsv', sep='\t', index=False)
test_feature_df[['sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data
'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data
'item_list', # categorical data, variable length
]].to_csv('test_data.tsv', sep='\t', index=False)