#!/usr/bin/env python
# coding: utf-8
"""Feature-engineering pipeline (exported from a Jupyter notebook).

Reads training_data.tsv / test_data.tsv, splits rows into train /
validation sets by session id, hash-encodes categorical and item-list
features with sklearn's FeatureHasher, and writes:

  * training.sid.npy / validation.sid.npy / test.sid.npy / test.iid.npy
  * train.feature.npz / validation.feature.npz / test.feature.npz
  * train.buffer / validation.buffer / test.buffer  (xgboost DMatrix)
"""

from functools import reduce  # NOTE: reduce is not a builtin in Python 3

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import xgboost as xgb
from matplotlib import pyplot as plt  # notebook leftovers, kept for parity
import matplotlib as mpl
from sklearn.feature_extraction import FeatureHasher
# NOTE(review): the original `from pandas.io.parsers import read_csv` was
# unused and that private path was removed in pandas 2.0, so it is dropped.

SEED = 42
TRAINING_PROB = 0.75  # probability a session id lands in the training split
DENSE_COLUMNS = ['count', 'session_count', 'session_duration']

# (prefix, column) pairs rendered into hashed string tokens such as
# "I12 C3 M5 h14 dow2 ".
# NOTE(review): the original np.chararray code silently truncated the 'dow'
# prefix to 'd' (chararray's itemsize defaults to 1); 'dow' is the intended
# prefix and is applied consistently to train/validation/test, so the hashed
# spaces still agree with each other.
SPARSE_COLUMNS = (
    ('I', 'item_index'),
    ('C', 'category'),
    ('M', 'month'),
    ('h', 'hour'),
    ('dow', 'dayofweek'),
)


def sparse_feature(df):
    """Return one space-delimited token string per row of *df*.

    Each categorical column is rendered as "<prefix><value> " and the
    pieces are concatenated, e.g. "I12 C3 M5 h14 dow2 ".
    """
    # np.char.add replaces the deprecated np.chararray /
    # np.core.defchararray machinery of the original notebook.
    parts = [
        np.char.add(np.char.add(prefix, df[col].values.astype(str)), ' ')
        for prefix, col in SPARSE_COLUMNS
    ]
    return reduce(np.char.add, parts)


def category_token(row):
    """Yield the whitespace-separated tokens of one sparse_feature() string."""
    for token in row.strip().split():
        yield token


def tokenize_item(row):
    """Yield hashable tokens for one item_list cell.

    *row* looks like "[a, b, c]"; each element becomes an "item<x>" token
    and, finally, the sorted list's repr is yielded as a whole-set token
    capturing the exact item combination.
    """
    data = row[1:-1].split(', ')
    for token in data:
        yield 'item' + token
    data.sort()
    yield str(data)


def save_sparse_coo(filename, matrix):
    """Persist a scipy COO matrix as an .npz archive."""
    np.savez(filename, data=matrix.data, row=matrix.row, col=matrix.col,
             shape=matrix.shape)


def load_sparse_coo(filename):
    """Inverse of save_sparse_coo(): rebuild the COO matrix from disk."""
    loader = np.load(filename)
    return scipy.sparse.coo_matrix(
        (loader['data'], (loader['row'], loader['col'])),
        shape=loader['shape'])


def _split_sessions(df, seed=SEED, training_prob=TRAINING_PROB):
    """Split *df* into (training_df, validation_df) by session id.

    One Bernoulli draw per unique sid, so every row of a given session
    ends up on the same side of the split.
    """
    np.random.seed(seed)
    sids = df['sid'].unique()
    mask = np.random.binomial(1, training_prob, size=len(sids))
    mask_df = pd.DataFrame({'sid': sids, 'mask': mask})
    merged = df.merge(mask_df, on='sid', how='inner')
    return merged[merged['mask'] == 1], merged[merged['mask'] == 0]


def _clipped_target(df):
    """Return the target column with negative labels clipped to 0.

    Works on a copy: the original notebook wrote through ``.values``,
    which can silently mutate the source DataFrame in place.
    """
    y = df['target'].values.copy()
    y[y < 0] = 0
    return y


def _featurize(df, hasher):
    """Hash-encode *df* and hstack dense columns with the hashed features."""
    dense = df[DENSE_COLUMNS].values
    token_strings = sparse_feature(df)
    hashed_cats = hasher.transform(category_token(s) for s in token_strings)
    hashed_items = hasher.transform(
        tokenize_item(k) for k in df['item_list'].values)
    # Summing the two hashed matrices folds both token families into a
    # single fixed-width hashed space, exactly as in the original notebook.
    return scipy.sparse.hstack((dense, hashed_cats + hashed_items))


def main():
    """Run the full train / validation / test featurization pipeline."""
    hasher = FeatureHasher(input_type='string')

    df = pd.read_csv('training_data.tsv', sep='\t')
    training_df, validation_df = _split_sessions(df)
    np.save('training.sid', training_df['sid'].values)
    np.save('validation.sid', validation_df['sid'].values)

    # Training split: features, labels, DMatrix buffer.
    tr_X = _featurize(training_df, hasher)
    save_sparse_coo('train.feature', tr_X)
    dtrain = xgb.DMatrix(tr_X, label=_clipped_target(training_df),
                         missing=-999.0)
    dtrain.save_binary('train.buffer')

    # Validation split.
    va_X = _featurize(validation_df, hasher)
    save_sparse_coo('validation.feature', va_X)
    dvalidation = xgb.DMatrix(va_X, label=_clipped_target(validation_df),
                              missing=-999.0)
    dvalidation.save_binary('validation.buffer')

    # Test data (no labels available).
    test_df = pd.read_csv('test_data.tsv', sep='\t')
    test_X = _featurize(test_df, hasher)
    save_sparse_coo('test.feature', test_X)
    dtest = xgb.DMatrix(test_X, missing=-999.0)
    dtest.save_binary('test.buffer')
    np.save('test.sid', test_df['sid'].values)
    np.save('test.iid', test_df['iid'].values)


if __name__ == '__main__':
    main()