Глупый градиентный бенчмарк

Берём только весовые признаки и скармливаем их бустингу

In [ ]:

# Notebook setup: load the required packages and configure plotting/display.
import pandas as pd
import numpy as np


# render plots inline in the notebook
# NOTE(review): this IPython magic is also what brings bare names such as
# `figsize` into the namespace — confirm before removing it.
%pylab inline
# slightly nicer-looking plots:
# NOTE(review): the 'display.mpl_style' option appears to have been removed
# from newer pandas releases — confirm against the pandas version in use.
pd.set_option('display.mpl_style', 'default')
figsize(12, 9)

# silence every warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

#plt.rcParams['figure.figsize'] = 10, 7.5
#plt.rcParams['axes.grid'] = True
# show all DataFrame columns instead of truncating wide frames
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt

import matplotlib as mpl
# NOTE(review): the font family is set three times below ('Ubuntu', 'serif',
# 'Droid Sans'); each later call overrides the earlier one.
mpl.rcParams['font.family'] = 'Ubuntu'

plt.rc('text', usetex=False)
plt.rc('font', family='serif')
plt.rc('font', weight='bold')
plt.rc('xtick', labelsize=14) 
plt.rc('ytick', labelsize=14)

# so that Russian (Cyrillic) text renders in plots
from matplotlib import rc
 
font = {'family': 'Droid Sans',
        'weight': 'normal'}
rc('font', **font)

In [3]:

from scipy.sparse import csr_matrix

# Raw table read from disk; the 'id', 'date' and 'sum' columns are used below.
data = pd.read_csv('train2.csv')

# Sparse matrix with one row per id and one column per date, holding 'sum'.
# Both indices are shifted by one to become zero-based
# (presumably ids and dates start at 1 in the file — confirm).
ss = csr_matrix(
    (
        data['sum'].values,
        (data['id'].values - 1, data['date'].values - 1),
    )
)

In [4]:

def split_train_test(ss):
    """
    Split a sparse matrix into a target vector and the remaining history.

    The last 7 columns are the target window and every earlier column is
    returned untouched as the history matrix.  For each row the target is
    the first positive value inside the target window, or 0 when the
    window holds no positive value.

    Prints the shapes of both results and returns ``(y, s)``.
    """
    history = ss[:, :-7]
    last_week = ss[:, -7:].toarray()

    # Running count of positive entries per row; it equals 1 from the first
    # positive entry up to (not including) the second one.  Every cell in
    # that span other than the first positive entry is non-positive.
    positives_seen = np.cumsum(last_week > 0, axis=1)
    y = np.sum(last_week * (positives_seen == 1), axis=1)

    print (y.shape, history.shape)
    return y, history
    
# Hold-out split: y is the target from the last 7 columns of ss, s the rest.
y, s = split_train_test(ss)
# Same split applied once more to s, giving a window shifted 7 columns back.
y2, s2 = split_train_test(s)

((110000,), (110000, 431))
((110000,), (110000, 424))

In [5]:

def _weighted_histogram(values, deg, n_bins=17):
    """
    Return the weighted share of every value in ``values``.

    The element at position ``j`` gets weight ``j ** deg`` (``0 ** 0`` is 1),
    so higher degrees emphasise later positions.  Entry ``v`` of the result
    is the summed weight of the positions holding value ``v``, divided by
    the total weight.

    values -- 1-D array of non-negative integer-valued entries below n_bins
    deg    -- exponent of the positional weighting scheme
    n_bins -- number of distinct values, i.e. length of the result

    Returns a float array of length ``n_bins``.  When the total weight is
    zero (empty input, or a single element with ``deg > 0``) a zero vector
    is returned instead of the NaNs a 0/0 division would give.

    Raises ValueError if a value falls outside ``[0, n_bins)``.
    """
    values = np.asarray(values)
    # Float weights stay exact at these magnitudes and cannot overflow a
    # 32-bit integer the way integer powers could.
    weights = np.arange(values.size, dtype=float) ** deg
    total = weights.sum()
    if total == 0:
        return np.zeros(n_bins)

    counts = np.bincount(values.astype(np.intp, copy=False),
                         weights=weights, minlength=n_bins)
    if counts.size > n_bins:
        raise ValueError("value outside the expected range [0, %d)" % n_bins)
    return counts / total


def get_features(s):
    """
    Build weighting-scheme features from a sparse matrix, one row at a time.

    s -- sparse matrix supporting ``s[i, :].toarray()``; the stored values
         must be non-negative integers below 17.

    Two sequences are derived from every row:
      * the first positive value of each complete 7-column group, where
        groups are aligned to the end of the row and the leading
        ``n % 7`` columns are ignored;
      * the raw row itself.
    For each sequence and each degree in 0..3 a 17-bin weighted histogram
    is stored twice: as is, and divided by its largest entry.

    Column layout in blocks of 17 (24 blocks in total):
      0-3   grouped sequence, raw          6-9    grouped sequence, scaled
      12-15 raw row, raw                   18-21  raw row, scaled
    Blocks 4-5, 10-11, 16-17 and 22-23 are never written and stay zero;
    they are kept so the output width matches what callers already use.

    Prints the row index every 10000 rows as a progress indicator.

    Returns a dense float array of shape ``(m, 17 * 24)``.
    """
    n_values = 17
    group_len = 7
    degrees = (0, 1, 2, 3)

    m, n = s.shape
    k = n % group_len

    f = np.zeros((m, n_values * 24))  # feature matrix

    def block(index):
        # columns occupied by the index-th block of n_values features
        return slice(n_values * index, n_values * (index + 1))

    for i in range(m):  # each row is processed independently
        sh = s[i, :].toarray().ravel()
        h = sh[k:].reshape(-1, group_len)
        # first positive value of every group (0 when the group has none)
        g = (((h > 0).cumsum(axis=1) == 1) * h).sum(axis=1)

        # (first block index, sequence): grouped firsts, then the whole row
        for base, values in ((0, g), (12, sh)):
            for deg in degrees:
                hist = _weighted_histogram(values, deg, n_values)
                peak = hist.max()
                f[i, block(base + deg)] = hist
                # an all-zero histogram is left as zeros rather than 0/0
                f[i, block(base + 6 + deg)] = hist / peak if peak > 0 else hist

        # plain modulo: no dependency on names injected by %pylab
        if i % 10000 == 0:
            print (i)

    return f

In [6]:

# Features for the hold-out window (paired with y).
X  = get_features(s)
# Features for the window shifted 7 columns back (paired with y2).
X2  = get_features(s2)

In [7]:

import lightgbm as lgb
from time import time
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [31]:

# Gaussian noise (standard deviation 0.02) with the same shape as X; it is
# added to the training features only.
# NOTE(review): the generator is not seeded, so results vary between runs.
e = 0.02 * np.random.randn(*X.shape)

tm = time()
gbm = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=30, nthread=-1)
# Train on the earlier window (X2, y2) and predict the later one (X).
# X2 + e relies on X2 having the same shape as X.
gbm.fit(X2 + e, y2)
a = gbm.predict(X)
# elapsed wall-clock seconds for fit + predict
print (time() - tm)

39.3623058796

In [32]:

# Accuracy on the hold-out window: share of rows where the prediction equals y.
print (u'точность', np.mean(a==y))
# presumably accuracies observed on earlier runs:
# 0.39483636363636365 0.39496363636363635

(u'\u0442\u043e\u0447\u043d\u043e\u0441\u0442\u044c', 0.39496363636363635)

In [29]:

In [33]:

# Features computed from the full matrix ss, used as input for the final predictions.
X0  = get_features(ss)

In [34]:

# Benchmark 1: train on the hold-out split (X, y), predict from the full history.
gbm = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=30, nthread=-1)
gbm.fit(X, y)
a = gbm.predict(X0)
# NOTE(review): the id range 1..110000 is hard-coded; it must match len(a).
pd.DataFrame({'id': np.arange(1, 110001), 'sum':a}).to_csv('lgb_benchmark-1.csv', index=False)

In [36]:

# Benchmark 2: train on both windows stacked together, predict from the full history.
gbm = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=30, nthread=-1)
XX = np.concatenate([X, X2])
# optional noise on the stacked training features (disabled):
# XX = XX + 0.02 * np.random.randn(*XX.shape)
gbm.fit(XX, np.concatenate([y, y2]))
a = gbm.predict(X0)

# NOTE(review): the id range 1..110000 is hard-coded; it must match len(a).
pd.DataFrame({'id': np.arange(1, 110001), 'sum':a}).to_csv('lgb_benchmark-2.csv', index=False)

In [ ]: