In [1]:
import pandas as pd
import numpy as np
from pandas.io.parsers import read_csv
%matplotlib inline

from matplotlib import pyplot as plt
import matplotlib as mpl

import scipy

import xgboost as xgb
In [2]:
df = pd.read_csv("training_data.tsv", sep='\t')
/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
In [3]:
df.head()
Out[3]:
target sid count session_count session_duration item_index category month hour dayofweek item_list
0 -1 1 1 4 351 2096 0 4 10 1 [10040, 2097, 2095, 2096]
1 -1 1 1 4 351 2095 0 4 10 1 [10040, 2097, 2095, 2096]
2 -1 1 1 4 351 2097 0 4 10 1 [10040, 2097, 2095, 2096]
3 -1 1 1 4 351 10040 0 4 10 1 [10040, 2097, 2095, 2096]
4 -1 2 2 6 359 19891 0 4 13 1 [4943, 33274, 42167, 19891, 33280]

Train / Validation

In [4]:
seed = 42
np.random.seed(seed)

training_prob = 0.75
mask = np.random.binomial(1, training_prob, size=df['sid'].nunique())
mask_df = pd.DataFrame(mask, index=df['sid'].unique(), columns=['mask']).reset_index()
mask_df.columns = ['sid', 'mask']
df = df.merge(mask_df, on='sid', how='inner')
In [5]:
training_df = df[df['mask'] == 1]
validation_df = df[df['mask'] == 0]
In [6]:
np.save("training.sid", training_df['sid'].values)
np.save("validation.sid", validation_df['sid'].values)

Feature to vector representation

In [2]:
from sklearn.feature_extraction import FeatureHasher
In [8]:
train_y = training_df['target'].values
In [9]:
train_y[train_y < 0] = 0
In [10]:
dense_tr_X = training_df[['count', 'session_count', 'session_duration']].values
In [3]:
def sparse_feature(df):
    blank_feat = np.chararray(df['category'].values.shape)
    blank_feat[:] = ' '
    
    item_arr = np.chararray(df['item_index'].values.shape)
    item_arr[:] = 'I'
    
    item_feat = np.core.defchararray.add(np.core.defchararray.add(item_arr, df['item_index'].values.astype('str')), blank_feat)
    del item_arr
    
    cat_arr = np.chararray(df['category'].values.shape)
    cat_arr[:] = 'C'

    cat_feat = np.core.defchararray.add(np.core.defchararray.add(cat_arr, df['category'].values.astype('str')), blank_feat)
    del cat_arr
    
    month_arr = np.chararray(df['month'].values.shape)
    month_arr[:] = 'M'

    month_feat = np.core.defchararray.add(np.core.defchararray.add(month_arr, df['month'].values.astype('str')), blank_feat)
    del month_arr
    
    hour_arr = np.chararray(df['hour'].values.shape)
    hour_arr[:] = 'h'

    hour_feat = np.core.defchararray.add(np.core.defchararray.add(hour_arr, df['hour'].values.astype('str')), blank_feat)
    del hour_arr

    dow_arr = np.chararray(df['dayofweek'].values.shape)
    dow_arr[:] = 'dow'

    dow_feat = np.core.defchararray.add(np.core.defchararray.add(dow_arr, df['dayofweek'].values.astype('str')), blank_feat)
    del dow_arr
    
    feats = [item_feat, cat_feat, month_feat, hour_feat, dow_feat]
    res = reduce(np.core.defchararray.add, feats)
    
    return res
In [4]:
def category_token(row):
    for k in row.strip().split():
        yield k
In [17]:
%time sparse_tr_X = sparse_feature(training_df)
CPU times: user 56.4 s, sys: 2.91 s, total: 59.3 s
Wall time: 59.4 s
In [18]:
sparse_tr_X
Out[18]:
array(['I2096 C0 M4 h10.0 d1.0 ', 'I2095 C0 M4 h10.0 d1.0 ',
       'I2097 C0 M4 h10.0 d1.0 ', ..., 'I744 C4 M9 h12.0 d4.0 ',
       'I50232 CS M9 h19.0 d5.0 ', 'I3971 C2 M9 h20.0 d5.0 '], 
      dtype='|S126')
In [5]:
def tokenize_item(row):
    row = row[1:-1]
    data = row.split(', ')
    for token in data:
        yield "item" + token
    
    # add whole set
    data.sort()
    yield str(data)
In [20]:
item_tr_X = (tokenize_item(k) for k in training_df['item_list'].values)
In [12]:
hasher = FeatureHasher(input_type='string')
In [22]:
%time tmp_tr_X = hasher.transform(category_token(k) for k in sparse_tr_X)
CPU times: user 47.7 s, sys: 621 ms, total: 48.3 s
Wall time: 47.3 s
In [23]:
%time tmp2_tr_X = hasher.transform(item_tr_X)
CPU times: user 1min 53s, sys: 1.96 s, total: 1min 55s
Wall time: 1min 52s
In [24]:
tmp_tr_X
Out[24]:
<18702331x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 93511655 stored elements in Compressed Sparse Row format>
In [25]:
tmp2_tr_X
Out[25]:
<18702331x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 127302377 stored elements in Compressed Sparse Row format>
In [26]:
%time tr_X = scipy.sparse.hstack((dense_tr_X, tmp_tr_X + tmp2_tr_X))
CPU times: user 8.12 s, sys: 19.4 s, total: 27.5 s
Wall time: 2min 5s
In [27]:
tr_X
Out[27]:
<18702331x1048579 sparse matrix of type '<type 'numpy.float64'>'
	with 276161564 stored elements in COOrdinate format>
In [17]:
def save_sparse_coo(filename, matrix):
    np.savez(filename, data=matrix.data, row=matrix.row,
             col=matrix.col, shape=matrix.shape)
    
def load_sparse_coo(filename):
    loader = np.load(filename)
    return scipy.sparse.coo_matrix((loader['data'], (loader['row'], loader['col'])),
                         shape=loader['shape'])
In [29]:
save_sparse_coo("train.feature", tr_X)
In [30]:
del dense_tr_X
del sparse_tr_X
del tmp_tr_X
del tmp2_tr_X
In [31]:
%time dtrain = xgb.DMatrix(tr_X, label=train_y, missing=-999.0)
CPU times: user 2min 29s, sys: 21.3 s, total: 2min 51s
Wall time: 4min 26s
In [32]:
dtrain.save_binary("train.buffer")

Validation Data

In [33]:
validation_y = validation_df['target'].values
validation_y[validation_y < 0] = 0

dense_va_X = validation_df[['count', 'session_count', 'session_duration']].values
In [34]:
%time sparse_va_X = sparse_feature(validation_df)
CPU times: user 18.9 s, sys: 1.2 s, total: 20.1 s
Wall time: 33 s
In [35]:
item_va_X = (tokenize_item(k) for k in validation_df['item_list'].values)
In [36]:
%time tmp_va_X = hasher.transform(category_token(k) for k in sparse_va_X)
CPU times: user 16.1 s, sys: 1.65 s, total: 17.7 s
Wall time: 36.1 s
In [37]:
%time tmp2_va_X = hasher.transform(item_va_X)
CPU times: user 35.8 s, sys: 2 s, total: 37.8 s
Wall time: 1min 12s
In [38]:
tmp_va_X
Out[38]:
<6227882x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 31139410 stored elements in Compressed Sparse Row format>
In [39]:
tmp2_va_X
Out[39]:
<6227882x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 41949158 stored elements in Compressed Sparse Row format>
In [40]:
va_X = scipy.sparse.hstack((dense_va_X, tmp_va_X + tmp2_va_X))
In [41]:
va_X
Out[41]:
<6227882x1048579 sparse matrix of type '<type 'numpy.float64'>'
	with 91518499 stored elements in COOrdinate format>
In [42]:
save_sparse_coo("validation.feature", va_X)
In [43]:
%time dvalidation = xgb.DMatrix(va_X, label=validation_y, missing=-999.0)
CPU times: user 40.5 s, sys: 2.71 s, total: 43.3 s
Wall time: 54.3 s
In [44]:
dvalidation.save_binary("validation.buffer")

Convert test data as well

In [6]:
test_df = pd.read_csv("test_data.tsv", sep='\t')
/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
In [7]:
test_df.head()
Out[7]:
sid iid count session_count session_duration item_index category month hour dayofweek item_list
0 5 214530776 3 3 453 1007 0 4 17 1 [1007]
1 5 214530776 3 3 453 1007 0 4 17 1 [1007]
2 5 214530776 3 3 453 1007 0 4 17 1 [1007]
3 10 214820942 1 2 66 41214 0 4 7 5 [42615, 41214]
4 10 214826810 1 2 66 42615 0 4 7 5 [42615, 41214]
In [8]:
dense_test_X = test_df[['count', 'session_count', 'session_duration']].values
In [9]:
sparse_test_X = sparse_feature(test_df)
In [10]:
item_test_X = (tokenize_item(k) for k in test_df['item_list'].values)
In [13]:
tmp_test_X = hasher.transform(category_token(k) for k in sparse_test_X)
In [14]:
tmp2_test_X = hasher.transform(item_test_X)
In [15]:
test_X = scipy.sparse.hstack((dense_test_X, tmp_test_X + tmp2_test_X))
In [18]:
save_sparse_coo("test.feature", test_X)
In [19]:
dtest = xgb.DMatrix(test_X, missing=-999.0)
In [20]:
dtest.save_binary("test.buffer")
In [21]:
np.save("test.sid", test_df['sid'].values)
In [22]:
np.save("test.iid", test_df['iid'].values)
In [23]:
len(test_df['sid'].values)
Out[23]:
8251791
In [24]:
len(test_df['iid'].values)
Out[24]:
8251791