In [1]:

import pandas as pd
import numpy as np
from pandas.io.parsers import read_csv
%matplotlib inline

from matplotlib import pyplot as plt
import matplotlib as mpl

import scipy

import xgboost as xgb

In [2]:

df = pd.read_csv("training_data.tsv", sep='\t')

/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [3]:

df.head()

Out[3]:

	target	sid	count	session_count	session_duration	item_index	month	hour	dayofweek	item_list
0	-1	1	1	4	351	2096	4	10	1	[10040, 2097, 2095, 2096]
1	-1	1	1	4	351	2095	4	10	1	[10040, 2097, 2095, 2096]
2	-1	1	1	4	351	2097	4	10	1	[10040, 2097, 2095, 2096]
3	-1	1	1	4	351	10040	4	10	1	[10040, 2097, 2095, 2096]
4	-1	2	2	6	359	19891	4	13	1	[4943, 33274, 42167, 19891, 33280]

Train / Validation

In [4]:

seed = 42
np.random.seed(seed)

training_prob = 0.75
mask = np.random.binomial(1, training_prob, size=df['sid'].nunique())
mask_df = pd.DataFrame(mask, index=df['sid'].unique(), columns=['mask']).reset_index()
mask_df.columns = ['sid', 'mask']
df = df.merge(mask_df, on='sid', how='inner')

In [5]:

training_df = df[df['mask'] == 1]
validation_df = df[df['mask'] == 0]

In [6]:

np.save("training.sid", training_df['sid'].values)
np.save("validation.sid", validation_df['sid'].values)

Feature to vector representation

In [2]:

from sklearn.feature_extraction import FeatureHasher

In [8]:

train_y = training_df['target'].values

In [9]:

train_y[train_y < 0] = 0

In [10]:

dense_tr_X = training_df[['count', 'session_count', 'session_duration']].values

In [3]:

def sparse_feature(df):
    blank_feat = np.chararray(df['category'].values.shape)
    blank_feat[:] = ' '
    
    item_arr = np.chararray(df['item_index'].values.shape)
    item_arr[:] = 'I'
    
    item_feat = np.core.defchararray.add(np.core.defchararray.add(item_arr, df['item_index'].values.astype('str')), blank_feat)
    del item_arr
    
    cat_arr = np.chararray(df['category'].values.shape)
    cat_arr[:] = 'C'

    cat_feat = np.core.defchararray.add(np.core.defchararray.add(cat_arr, df['category'].values.astype('str')), blank_feat)
    del cat_arr
    
    month_arr = np.chararray(df['month'].values.shape)
    month_arr[:] = 'M'

    month_feat = np.core.defchararray.add(np.core.defchararray.add(month_arr, df['month'].values.astype('str')), blank_feat)
    del month_arr
    
    hour_arr = np.chararray(df['hour'].values.shape)
    hour_arr[:] = 'h'

    hour_feat = np.core.defchararray.add(np.core.defchararray.add(hour_arr, df['hour'].values.astype('str')), blank_feat)
    del hour_arr

    dow_arr = np.chararray(df['dayofweek'].values.shape)
    dow_arr[:] = 'dow'

    dow_feat = np.core.defchararray.add(np.core.defchararray.add(dow_arr, df['dayofweek'].values.astype('str')), blank_feat)
    del dow_arr
    
    feats = [item_feat, cat_feat, month_feat, hour_feat, dow_feat]
    res = reduce(np.core.defchararray.add, feats)
    
    return res

In [4]:

def category_token(row):
    for k in row.strip().split():
        yield k

In [17]:

%time sparse_tr_X = sparse_feature(training_df)

CPU times: user 56.4 s, sys: 2.91 s, total: 59.3 s
Wall time: 59.4 s

In [18]:

sparse_tr_X

Out[18]:

array(['I2096 C0 M4 h10.0 d1.0 ', 'I2095 C0 M4 h10.0 d1.0 ',
       'I2097 C0 M4 h10.0 d1.0 ', ..., 'I744 C4 M9 h12.0 d4.0 ',
       'I50232 CS M9 h19.0 d5.0 ', 'I3971 C2 M9 h20.0 d5.0 '], 
      dtype='|S126')

In [5]:

def tokenize_item(row):
    row = row[1:-1]
    data = row.split(', ')
    for token in data:
        yield "item" + token
    
    # add whole set
    data.sort()
    yield str(data)

In [20]:

item_tr_X = (tokenize_item(k) for k in training_df['item_list'].values)

In [12]:

hasher = FeatureHasher(input_type='string')

In [22]:

%time tmp_tr_X = hasher.transform(category_token(k) for k in sparse_tr_X)

CPU times: user 47.7 s, sys: 621 ms, total: 48.3 s
Wall time: 47.3 s

In [23]:

%time tmp2_tr_X = hasher.transform(item_tr_X)

CPU times: user 1min 53s, sys: 1.96 s, total: 1min 55s
Wall time: 1min 52s

In [24]:

tmp_tr_X

Out[24]:

<18702331x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 93511655 stored elements in Compressed Sparse Row format>

In [25]:

tmp2_tr_X

Out[25]:

<18702331x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 127302377 stored elements in Compressed Sparse Row format>

In [26]:

%time tr_X = scipy.sparse.hstack((dense_tr_X, tmp_tr_X + tmp2_tr_X))

CPU times: user 8.12 s, sys: 19.4 s, total: 27.5 s
Wall time: 2min 5s

In [27]:

tr_X

Out[27]:

<18702331x1048579 sparse matrix of type '<type 'numpy.float64'>'
	with 276161564 stored elements in COOrdinate format>

In [17]:

def save_sparse_coo(filename, matrix):
    np.savez(filename, data=matrix.data, row=matrix.row,
             col=matrix.col, shape=matrix.shape)
    
def load_sparse_coo(filename):
    loader = np.load(filename)
    return scipy.sparse.coo_matrix((loader['data'], (loader['row'], loader['col'])),
                         shape=loader['shape'])

In [29]:

save_sparse_coo("train.feature", tr_X)

In [30]:

del dense_tr_X
del sparse_tr_X
del tmp_tr_X
del tmp2_tr_X

In [31]:

%time dtrain = xgb.DMatrix(tr_X, label=train_y, missing=-999.0)

CPU times: user 2min 29s, sys: 21.3 s, total: 2min 51s
Wall time: 4min 26s

In [32]:

dtrain.save_binary("train.buffer")

Validation Data

In [33]:

validation_y = validation_df['target'].values
validation_y[validation_y < 0] = 0

dense_va_X = validation_df[['count', 'session_count', 'session_duration']].values

In [34]:

%time sparse_va_X = sparse_feature(validation_df)

CPU times: user 18.9 s, sys: 1.2 s, total: 20.1 s
Wall time: 33 s

In [35]:

item_va_X = (tokenize_item(k) for k in validation_df['item_list'].values)

In [36]:

%time tmp_va_X = hasher.transform(category_token(k) for k in sparse_va_X)

CPU times: user 16.1 s, sys: 1.65 s, total: 17.7 s
Wall time: 36.1 s

In [37]:

%time tmp2_va_X = hasher.transform(item_va_X)

CPU times: user 35.8 s, sys: 2 s, total: 37.8 s
Wall time: 1min 12s

In [38]:

tmp_va_X

Out[38]:

<6227882x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 31139410 stored elements in Compressed Sparse Row format>

In [39]:

tmp2_va_X

Out[39]:

<6227882x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 41949158 stored elements in Compressed Sparse Row format>

In [40]:

va_X = scipy.sparse.hstack((dense_va_X, tmp_va_X + tmp2_va_X))

In [41]:

va_X

Out[41]:

<6227882x1048579 sparse matrix of type '<type 'numpy.float64'>'
	with 91518499 stored elements in COOrdinate format>

In [42]:

save_sparse_coo("validation.feature", va_X)

In [43]:

%time dvalidation = xgb.DMatrix(va_X, label=validation_y, missing=-999.0)

CPU times: user 40.5 s, sys: 2.71 s, total: 43.3 s
Wall time: 54.3 s

In [44]:

dvalidation.save_binary("validation.buffer")

Convert test data as well

In [6]:

test_df = pd.read_csv("test_data.tsv", sep='\t')

/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [7]:

test_df.head()

Out[7]:

	sid	iid	count	session_count	session_duration	item_index	month	hour	dayofweek	item_list
0	5	214530776	3	3	453	1007	4	17	1	[1007]
1	5	214530776	3	3	453	1007	4	17	1	[1007]
2	5	214530776	3	3	453	1007	4	17	1	[1007]
3	10	214820942	1	2	66	41214	4	7	5	[42615, 41214]
4	10	214826810	1	2	66	42615	4	7	5	[42615, 41214]

In [8]:

dense_test_X = test_df[['count', 'session_count', 'session_duration']].values

In [9]:

sparse_test_X = sparse_feature(test_df)

In [10]:

item_test_X = (tokenize_item(k) for k in test_df['item_list'].values)

In [13]:

tmp_test_X = hasher.transform(category_token(k) for k in sparse_test_X)

In [14]:

tmp2_test_X = hasher.transform(item_test_X)

In [15]:

test_X = scipy.sparse.hstack((dense_test_X, tmp_test_X + tmp2_test_X))

In [18]:

save_sparse_coo("test.feature", test_X)

In [19]:

dtest = xgb.DMatrix(test_X, missing=-999.0)

In [20]:

dtest.save_binary("test.buffer")

In [21]:

np.save("test.sid", test_df['sid'].values)

In [22]:

np.save("test.iid", test_df['iid'].values)

In [23]:

len(test_df['sid'].values)

Out[23]:

In [24]:

len(test_df['iid'].values)

Out[24]:

In [ ]: