import pandas as pd
import numpy as np
from pandas.io.parsers import read_csv
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl
import scipy
import xgboost as xgb
df = pd.read_csv("training_data.tsv", sep='\t')
/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
df.head()
target | sid | count | session_count | session_duration | item_index | category | month | hour | dayofweek | item_list | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1 | 1 | 1 | 4 | 351 | 2096 | 0 | 4 | 10 | 1 | [10040, 2097, 2095, 2096] |
1 | -1 | 1 | 1 | 4 | 351 | 2095 | 0 | 4 | 10 | 1 | [10040, 2097, 2095, 2096] |
2 | -1 | 1 | 1 | 4 | 351 | 2097 | 0 | 4 | 10 | 1 | [10040, 2097, 2095, 2096] |
3 | -1 | 1 | 1 | 4 | 351 | 10040 | 0 | 4 | 10 | 1 | [10040, 2097, 2095, 2096] |
4 | -1 | 2 | 2 | 6 | 359 | 19891 | 0 | 4 | 13 | 1 | [4943, 33274, 42167, 19891, 33280] |
Train / validation split
# Split rows into train/validation at the *session* level: every sid is
# assigned wholly to one side, so rows from the same session never leak
# across the boundary.
seed = 42
np.random.seed(seed)
training_prob = 0.75
# One Bernoulli draw per unique session id: 1 -> training, 0 -> validation.
mask = np.random.binomial(1, training_prob, size=df['sid'].nunique())
# Map each sid to its draw, then join the per-sid mask back onto the
# row-level frame. NOTE(review): relies on nunique() and unique() agreeing
# on the same pass over the column, which they do here.
mask_df = pd.DataFrame(mask, index=df['sid'].unique(), columns=['mask']).reset_index()
mask_df.columns = ['sid', 'mask']
df = df.merge(mask_df, on='sid', how='inner')
training_df = df[df['mask'] == 1]
validation_df = df[df['mask'] == 0]
# Persist the session ids of each split for later reuse.
np.save("training.sid", training_df['sid'].values)
np.save("validation.sid", validation_df['sid'].values)
Converting features to a vector representation
from sklearn.feature_extraction import FeatureHasher
# Binary labels: the raw target uses -1 for the negative class; clamp to 0/1.
train_y = training_df['target'].values
# NOTE(review): .values may be a view into training_df, so this in-place
# clamp can write through to the frame — confirm that is acceptable.
train_y[train_y < 0] = 0
# Dense numeric features, kept separate from the hashed sparse features.
dense_tr_X = training_df[['count', 'session_count', 'session_duration']].values
def sparse_feature(df):
    """Build one space-separated token string per row, for FeatureHasher.

    Each categorical column is rendered as a prefixed token ("I<item_index>",
    "C<category>", "M<month>", "h<hour>", "dow<dayofweek>") and the per-row
    tokens are concatenated with trailing spaces, e.g. "I2096 C0 M4 h10 dow1 ".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'item_index', 'category', 'month', 'hour', 'dayofweek'.

    Returns
    -------
    numpy.ndarray
        One token string per row of ``df``.

    Notes
    -----
    The previous implementation built its prefixes with ``np.chararray``,
    whose default itemsize of 1 silently truncated the 'dow' prefix to 'd'
    (tokens came out as "d1.0 "). Prefixes are now applied verbatim, and the
    five copy-pasted stanzas collapse into one vectorized helper.
    """
    # functools.reduce exists on both Python 2.6+ and Python 3 (where the
    # bare builtin `reduce` is gone).
    from functools import reduce

    def tagged(column, prefix):
        # "<prefix><value> " for every row, vectorized at C speed.
        values = df[column].values.astype('str')
        return np.char.add(np.char.add(prefix, values), ' ')

    feats = [
        tagged('item_index', 'I'),
        tagged('category', 'C'),
        tagged('month', 'M'),
        tagged('hour', 'h'),
        tagged('dayofweek', 'dow'),
    ]
    # Elementwise string concatenation across all five token arrays.
    return reduce(np.char.add, feats)
def category_token(row):
    """Yield the whitespace-delimited tokens of *row*, left to right."""
    for token in row.strip().split():
        yield token
%time sparse_tr_X = sparse_feature(training_df)
CPU times: user 56.4 s, sys: 2.91 s, total: 59.3 s Wall time: 59.4 s
sparse_tr_X
array(['I2096 C0 M4 h10.0 d1.0 ', 'I2095 C0 M4 h10.0 d1.0 ', 'I2097 C0 M4 h10.0 d1.0 ', ..., 'I744 C4 M9 h12.0 d4.0 ', 'I50232 CS M9 h19.0 d5.0 ', 'I3971 C2 M9 h20.0 d5.0 '], dtype='|S126')
def tokenize_item(row):
    """Yield an "item<N>" token for each entry of a "[a, b, ...]" string,
    then a single token naming the whole (sorted) set of entries.

    *row* is the string repr of a list, e.g. "[10040, 2097]"; the brackets
    are stripped and the remainder split on ", ".
    """
    entries = row[1:-1].split(', ')
    for entry in entries:
        yield "item" + entry
    # One combined, order-insensitive token for the full basket.
    entries.sort()
    yield str(entries)
# Lazy generator of item-list tokens for the training rows (consumed once).
item_tr_X = (tokenize_item(k) for k in training_df['item_list'].values)
# Stateless hashing-trick vectorizer (output has 1048576 = 2**20 columns,
# per the matrices printed below); reused for train / validation / test so
# feature indices line up across splits.
hasher = FeatureHasher(input_type='string')
%time tmp_tr_X = hasher.transform(category_token(k) for k in sparse_tr_X)
CPU times: user 47.7 s, sys: 621 ms, total: 48.3 s Wall time: 47.3 s
%time tmp2_tr_X = hasher.transform(item_tr_X)
CPU times: user 1min 53s, sys: 1.96 s, total: 1min 55s Wall time: 1min 52s
tmp_tr_X
<18702331x1048576 sparse matrix of type '<type 'numpy.float64'>' with 93511655 stored elements in Compressed Sparse Row format>
tmp2_tr_X
<18702331x1048576 sparse matrix of type '<type 'numpy.float64'>' with 127302377 stored elements in Compressed Sparse Row format>
%time tr_X = scipy.sparse.hstack((dense_tr_X, tmp_tr_X + tmp2_tr_X))
CPU times: user 8.12 s, sys: 19.4 s, total: 27.5 s Wall time: 2min 5s
tr_X
<18702331x1048579 sparse matrix of type '<type 'numpy.float64'>' with 276161564 stored elements in COOrdinate format>
def save_sparse_coo(filename, matrix):
    """Write the raw COO components of *matrix* to an .npz archive.

    Stores the data/row/col arrays plus the shape, which is everything
    needed to rebuild the matrix later.
    """
    components = {
        'data': matrix.data,
        'row': matrix.row,
        'col': matrix.col,
        'shape': matrix.shape,
    }
    np.savez(filename, **components)
def load_sparse_coo(filename):
    """Rebuild a scipy COO matrix from an .npz written by save_sparse_coo."""
    archive = np.load(filename)
    coords = (archive['row'], archive['col'])
    return scipy.sparse.coo_matrix((archive['data'], coords),
                                   shape=archive['shape'])
# Checkpoint the assembled training features before the expensive DMatrix
# conversion below.
save_sparse_coo("train.feature", tr_X)
# Free the intermediates — only tr_X is still needed, and the DMatrix build
# is memory-hungry at ~18.7M rows.
del dense_tr_X
del sparse_tr_X
del tmp_tr_X
del tmp2_tr_X
# missing=-999.0 tells xgboost which sentinel value to treat as missing.
%time dtrain = xgb.DMatrix(tr_X, label=train_y, missing=-999.0)
CPU times: user 2min 29s, sys: 21.3 s, total: 2min 51s Wall time: 4min 26s
dtrain.save_binary("train.buffer")
Validation data
# Same label/feature preparation as the training split: clamp the -1
# negative-class targets to 0 and pull out the dense numeric columns.
validation_y = validation_df['target'].values
# NOTE(review): in-place clamp through .values, as with train_y above.
validation_y[validation_y < 0] = 0
dense_va_X = validation_df[['count', 'session_count', 'session_duration']].values
%time sparse_va_X = sparse_feature(validation_df)
CPU times: user 18.9 s, sys: 1.2 s, total: 20.1 s Wall time: 33 s
item_va_X = (tokenize_item(k) for k in validation_df['item_list'].values)
%time tmp_va_X = hasher.transform(category_token(k) for k in sparse_va_X)
CPU times: user 16.1 s, sys: 1.65 s, total: 17.7 s Wall time: 36.1 s
%time tmp2_va_X = hasher.transform(item_va_X)
CPU times: user 35.8 s, sys: 2 s, total: 37.8 s Wall time: 1min 12s
tmp_va_X
<6227882x1048576 sparse matrix of type '<type 'numpy.float64'>' with 31139410 stored elements in Compressed Sparse Row format>
tmp2_va_X
<6227882x1048576 sparse matrix of type '<type 'numpy.float64'>' with 41949158 stored elements in Compressed Sparse Row format>
va_X = scipy.sparse.hstack((dense_va_X, tmp_va_X + tmp2_va_X))
va_X
<6227882x1048579 sparse matrix of type '<type 'numpy.float64'>' with 91518499 stored elements in COOrdinate format>
save_sparse_coo("validation.feature", va_X)
%time dvalidation = xgb.DMatrix(va_X, label=validation_y, missing=-999.0)
CPU times: user 40.5 s, sys: 2.71 s, total: 43.3 s Wall time: 54.3 s
dvalidation.save_binary("validation.buffer")
Convert test data as well
test_df = pd.read_csv("test_data.tsv", sep='\t')
/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
test_df.head()
sid | iid | count | session_count | session_duration | item_index | category | month | hour | dayofweek | item_list | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 214530776 | 3 | 3 | 453 | 1007 | 0 | 4 | 17 | 1 | [1007] |
1 | 5 | 214530776 | 3 | 3 | 453 | 1007 | 0 | 4 | 17 | 1 | [1007] |
2 | 5 | 214530776 | 3 | 3 | 453 | 1007 | 0 | 4 | 17 | 1 | [1007] |
3 | 10 | 214820942 | 1 | 2 | 66 | 41214 | 0 | 4 | 7 | 5 | [42615, 41214] |
4 | 10 | 214826810 | 1 | 2 | 66 | 42615 | 0 | 4 | 7 | 5 | [42615, 41214] |
# Test-set features: identical pipeline to train/validation, but no labels.
dense_test_X = test_df[['count', 'session_count', 'session_duration']].values
sparse_test_X = sparse_feature(test_df)
# Lazy generator of per-row item tokens; consumed once by the hasher below.
item_test_X = (tokenize_item(k) for k in test_df['item_list'].values)
tmp_test_X = hasher.transform(category_token(k) for k in sparse_test_X)
tmp2_test_X = hasher.transform(item_test_X)
# Dense columns prepended to the (summed) hashed feature matrices.
test_X = scipy.sparse.hstack((dense_test_X, tmp_test_X + tmp2_test_X))
save_sparse_coo("test.feature", test_X)
dtest = xgb.DMatrix(test_X, missing=-999.0)
dtest.save_binary("test.buffer")
# Keep session and item ids aligned row-for-row with the feature matrix,
# for assembling predictions later.
np.save("test.sid", test_df['sid'].values)
np.save("test.iid", test_df['iid'].values)
len(test_df['sid'].values)
8251791
len(test_df['iid'].values)
8251791