#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np

get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt
import matplotlib as mpl


# In[2]:

click_df = pd.read_csv('data/yoochoose-clicks.dat',
                       names=['sid', 'timestamp', 'iid', 'category'], parse_dates=[1])
buy_df = pd.read_csv('data/yoochoose-buys.dat',
                     names=['sid', 'timestamp', 'iid', 'price', 'quantity'], parse_dates=[1])
test_click_df = pd.read_csv('data/yoochoose-test.dat',
                            names=['sid', 'timestamp', 'iid', 'category'], parse_dates=[1])


# In[3]:

click_df.sort_values(['sid', 'timestamp'], ascending=[True, True], inplace=True)
buy_df.sort_values(['sid', 'timestamp'], ascending=[True, True], inplace=True)
test_click_df.sort_values(['sid', 'timestamp'], ascending=[True, True], inplace=True)


# In[4]:

# Number of clicks per (session, item) pair.
click_df['count'] = 1
click_count_df = click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()
click_count_df = click_count_df.reset_index()


# In[5]:

test_click_df['count'] = 1
test_click_count_df = test_click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()
test_click_count_df = test_click_count_df.reset_index()


# In[6]:

feature_df = click_df.merge(click_count_df, on=['sid', 'iid'], how='inner')


# In[7]:

test_feature_df = test_click_df.merge(test_click_count_df, on=['sid', 'iid'], how='inner')


# In[8]:

# The merge produces 'count_x' (the constant 1) and 'count_y' (the per-pair total);
# drop the former and rename the latter back to 'count'.
del feature_df['count_x']
feature_df.columns = ['sid', 'timestamp', 'iid', 'category', 'count']


# In[9]:

del test_feature_df['count_x']
test_feature_df.columns = ['sid', 'timestamp', 'iid', 'category', 'count']


# In[10]:

feature_df.head()


# In[11]:

test_feature_df.head()


# It'll take around 3 minutes to get here.

# In[12]:

# Number of clicks per session.
session_click_n = click_df[['sid', 'iid']].groupby(['sid']).count()
session_click_n = session_click_n.reset_index()
session_click_n.columns = ['sid', 'session_count']


# In[13]:

test_session_click_n = test_click_df[['sid', 'iid']].groupby(['sid']).count()
test_session_click_n = test_session_click_n.reset_index()
test_session_click_n.columns = ['sid', 'session_count']


# In[14]:

feature_df = feature_df.merge(session_click_n, on=['sid'], how='inner')


# In[15]:

test_feature_df = test_feature_df.merge(test_session_click_n, on=['sid'], how='inner')


# In[16]:

feature_df.head()


# In[17]:

test_feature_df.head()


# Avoid working with pandas Timestamp objects element by element; that is extremely
# slow. Instead, work with the raw numpy datetime64 arrays.
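# The cell below is a minimal, self-contained sketch (an addition, not part of the
# original pipeline) illustrating that claim on synthetic data. `_demo_ts`, the
# timing code, and the observed speed-up are hypothetical; the exact numbers will
# vary by machine and pandas version.

# In[ ]:

import time

_demo_ts = pd.Series(pd.date_range('2014-04-01', periods=100000, freq='s'))

_t0 = time.perf_counter()
# Per-element arithmetic: every value is boxed into a pandas Timestamp.
_slow = _demo_ts.apply(lambda t: (t - pd.Timestamp('2014-04-01')).total_seconds())
_t1 = time.perf_counter()
# Vectorised arithmetic on the underlying datetime64[ns] ndarray.
_fast = (_demo_ts.values - np.datetime64('2014-04-01')) / np.timedelta64(1, 's')
_t2 = time.perf_counter()

print('apply over Timestamps: {:.3f}s, raw numpy: {:.3f}s'.format(_t1 - _t0, _t2 - _t1))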
# In[18]:

def timestamp_seconds(date):
    "Return seconds elapsed since 2014-04-01 for an array of datetime64s."
    baseline = np.datetime64('2014-04-01')
    return (date - baseline) / np.timedelta64(1, 's')


# In[19]:

click_df['timestamp_seconds'] = timestamp_seconds(click_df['timestamp'].values)


# In[21]:

test_click_df['timestamp_seconds'] = timestamp_seconds(test_click_df['timestamp'].values)


# In[22]:

g = click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']


# In[23]:

test_g = test_click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']


# In[24]:

get_ipython().run_line_magic('time', 'session_duration = np.round(g.last() - g.first())')


# In[25]:

test_session_duration = np.round(test_g.last() - test_g.first())


# In[26]:

session_duration = pd.DataFrame(session_duration).reset_index()
session_duration.columns = ['sid', 'session_duration']


# In[27]:

test_session_duration = pd.DataFrame(test_session_duration).reset_index()
test_session_duration.columns = ['sid', 'session_duration']


# In[28]:

session_duration.head()


# In[29]:

test_session_duration.head()


# In[30]:

feature_df = feature_df.merge(session_duration, on=['sid'], how='inner')


# In[31]:

feature_df.head()


# In[32]:

test_feature_df = test_feature_df.merge(test_session_duration, on=['sid'], how='inner')
test_feature_df.head()


# In[33]:

def year(dates):
    "Return an array of the years given an array of datetime64s."
    return dates.astype('M8[Y]').astype('i8') + 1970


def month(dates):
    "Return an array of the months given an array of datetime64s."
    return dates.astype('M8[M]').astype('i8') % 12 + 1


def day(dates):
    "Return an array of the days of the month given an array of datetime64s."
    return (dates - dates.astype('M8[M]')) / np.timedelta64(1, 'D') + 1


def hour(dates):
    "Return an array of the hours of the day given an array of datetime64s."
    return np.floor((dates - dates.astype('M8[D]')) / np.timedelta64(1, 'h'))


def dayofweek(dates):
    "Return an array of the days of the week given an array of datetime64s. Sunday is 0, Saturday is 6."
    baseline = np.datetime64('2014-04-01')  # a Tuesday, i.e. day-of-week 2
    return np.floor(((dates - baseline) / np.timedelta64(1, 'D') + 2) % 7)


# In[34]:

get_ipython().run_line_magic('time', "feature_df['month'] = month(feature_df['timestamp'].values)")


# In[35]:

get_ipython().run_line_magic('time', "feature_df['dayofweek'] = dayofweek(feature_df['timestamp'].values)")


# In[36]:

get_ipython().run_line_magic('time', "feature_df['hour'] = hour(feature_df['timestamp'].values)")


# In[37]:

test_feature_df['month'] = month(test_feature_df['timestamp'].values)
test_feature_df['dayofweek'] = dayofweek(test_feature_df['timestamp'].values)
test_feature_df['hour'] = hour(test_feature_df['timestamp'].values)


# In[38]:

feature_df.head()


# In[39]:

test_feature_df.head()


# Now for the feature list:
#
# - label
# - things to change to one-hot encoding:
#   - iid
#   - category
#   - month
#   - dayofweek
#   - hour
# - list of items
#
# Remember, we need to combine the training and test item lists.

# In[40]:

item_list = np.unique(np.append(click_df['iid'].unique(), test_click_df['iid'].unique()))


# In[41]:

len(item_list)


# In[42]:

item_list.sort()


# In[43]:

item_index_dict = {k: v for v, k in enumerate(item_list)}


# Beware that the item index starts at 0. This may cause some problems; see the
# sketch below.
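# A hypothetical illustration of that concern (not used anywhere below): if a
# downstream consumer reserves index 0, e.g. as a padding value for the
# variable-length item lists, the mapping would need to be shifted to start at 1.
# `item_index_dict_1based` is an assumed name introduced only for this sketch.

# In[ ]:

item_index_dict_1based = {k: v + 1 for v, k in enumerate(item_list)}
assert min(item_index_dict_1based.values()) == 1  # index 0 stays free for padding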
# In[44]:

get_ipython().run_line_magic('time', "item_list_df = click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()")
item_list_df.columns = ['sid', 'item_list']


# In[45]:

get_ipython().run_line_magic('time', "test_item_list_df = test_click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()")
test_item_list_df.columns = ['sid', 'item_list']


# In[46]:

feature_df = feature_df.merge(item_list_df, on='sid', how='inner')


# In[47]:

test_feature_df = test_feature_df.merge(test_item_list_df, on='sid', how='inner')


# In[48]:

feature_df.head()


# In[49]:

test_feature_df.head()


# In[50]:

feature_df['item_index'] = np.array([item_index_dict[k] for k in feature_df['iid'].values])


# In[51]:

test_feature_df['item_index'] = np.array([item_index_dict[k] for k in test_feature_df['iid'].values])


# In[52]:

feature_df.head()


# In[54]:

test_feature_df.head()


# In[33]:

# Label: +1 if the (session, item) pair appears in the buy data, -1 otherwise.
buy_df['target'] = 1


# In[34]:

final_df = feature_df.merge(buy_df[['sid', 'iid', 'target']], on=['sid', 'iid'], how='left')


# In[35]:

final_df['target'].fillna(-1, inplace=True)


# In[36]:

print("sid: {}".format(final_df['sid'].nunique()))


# In[37]:

print("item index: {}".format(final_df['item_index'].nunique()))
print("category: {}".format(final_df['category'].nunique()))
print("month: {}".format(final_df['month'].nunique()))
print("hour: {}".format(final_df['hour'].nunique()))
print("dayofweek: {}".format(final_df['dayofweek'].nunique()))


# Create the training/validation split from the labelled data. The mask is drawn
# per session, so all entries of a session land on the same side.

# In[38]:

training_prob = 0.75
mask = np.random.binomial(1, training_prob, size=final_df['sid'].nunique())


# In[39]:

mask_df = pd.DataFrame(mask, index=final_df['sid'].unique(), columns=['mask']).reset_index()
mask_df.columns = ['sid', 'mask']


# In[40]:

mask_df.head()


# In[41]:

final_df = final_df.merge(mask_df, on='sid', how='inner')


# In[42]:

final_df.head()


# In[43]:

training_df = final_df[final_df['mask'] == 1]
cv_df = final_df[final_df['mask'] == 0]


# In[44]:

print(len(training_df))
print(len(cv_df))


# Buy probability per entry (not per session). Since the target is +/-1,
# (mean + 1) / 2 gives the fraction of positive entries.

# In[45]:

(final_df['target'].mean() + 1) / 2


# In[46]:

(training_df['target'].mean() + 1) / 2


# In[47]:

(cv_df['target'].mean() + 1) / 2


# Buy probability per session: a session counts as a buy if any of its entries is
# positive, hence the per-session max.

# In[48]:

(np.mean(final_df.groupby('sid')['target'].max()) + 1) / 2


# In[49]:

(np.mean(training_df.groupby('sid')['target'].max()) + 1) / 2


# In[50]:

(np.mean(cv_df.groupby('sid')['target'].max()) + 1) / 2


# In[51]:

training_df[['target',
             'sid', 'iid', 'count', 'session_count', 'session_duration',  # numerical data
             'item_index', 'category', 'month', 'hour', 'dayofweek',  # categorical data
             'item_list',  # categorical data, variable length
             ]].to_csv('training_data.tsv', sep='\t', index=False)


# In[52]:

cv_df[['target',
       'sid', 'iid', 'count', 'session_count', 'session_duration',  # numerical data
       'item_index', 'category', 'month', 'hour', 'dayofweek',  # categorical data
       'item_list',  # categorical data, variable length
       ]].to_csv('cv_data.tsv', sep='\t', index=False)


# In[57]:

test_feature_df[['sid', 'iid', 'count', 'session_count', 'session_duration',  # numerical data
                 'item_index', 'category', 'month', 'hour', 'dayofweek',  # categorical data
                 'item_list',  # categorical data, variable length
                 ]].to_csv('test_data.tsv', sep='\t', index=False)


# In[ ]:
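# A minimal sketch (an addition, not part of the original notebook) of reading the
# exported TSVs back. `to_csv` writes the `item_list` column as the string repr of
# a Python list, so it is parsed with `ast.literal_eval` here; whether a downstream
# consumer expects exactly this format is an assumption.

# In[ ]:

import ast

reloaded_df = pd.read_csv('training_data.tsv', sep='\t',
                          converters={'item_list': ast.literal_eval})
reloaded_df.head()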