# подгружаем все нужные пакеты
import pandas as pd
import numpy as np
import sklearn
# для встроенных картинок
%pylab inline
# %matplotlib inline
# чуть покрасивше картинки:
pd.set_option('display.mpl_style', 'default')
figsize(10, 3)
Populating the interactive namespace from numpy and matplotlib
# Load the competition tables.
# StateHoliday mixes the integer 0 with the letters 'a'/'b'/'c', so pandas
# infers a different type per parsing chunk (the DtypeWarning about column 7)
# and the column ends up holding both 0 and '0'.  Forcing the column to str
# at read time yields one consistent encoding in train and test.
datatrain = pd.read_csv('D:\\Competitions\\Rossman\\train.csv',
                        dtype={'StateHoliday': str})
datatest = pd.read_csv('D:\\Competitions\\Rossman\\test.csv',
                       dtype={'StateHoliday': str})
datastore = pd.read_csv('D:\\Competitions\\Rossman\\store.csv')
C:\Python27\lib\site-packages\pandas\io\parsers.py:1139: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
print 'обучение' + str(datatrain.shape)
print 'контроль' + str(datatest.shape)
print 'магазины' + str(datastore.shape)
print datatrain[-3:]
datatrain[:3]
#data = data.reindex(index=data.index[::-1]) # почему - будет видно потом
#data = data.iloc[::-1]
#data.index = range(data.shape[0])
обучение(1017209, 9) контроль(41088, 8) магазины(1115, 10) Store DayOfWeek Date Sales Customers Open Promo \ 1017206 1113 2 2013-01-01 0 0 0 0 1017207 1114 2 2013-01-01 0 0 0 0 1017208 1115 2 2013-01-01 0 0 0 0 StateHoliday SchoolHoliday 1017206 a 1 1017207 a 1 1017208 a 1
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2015-07-31 | 5263 | 555 | 1 | 1 | 0 | 1 |
1 | 2 | 5 | 2015-07-31 | 6064 | 625 | 1 | 1 | 0 | 1 |
2 | 3 | 5 | 2015-07-31 | 8314 | 821 | 1 | 1 | 0 | 1 |
print datatest[-3:]
datatest[:3]
Id Store DayOfWeek Date Open Promo StateHoliday \ 41085 41086 1113 6 2015-08-01 1 0 0 41086 41087 1114 6 2015-08-01 1 0 0 41087 41088 1115 6 2015-08-01 1 0 0 SchoolHoliday 41085 0 41086 0 41087 1
Id | Store | DayOfWeek | Date | Open | Promo | StateHoliday | SchoolHoliday | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 4 | 2015-09-17 | 1 | 1 | 0 | 0 |
1 | 2 | 3 | 4 | 2015-09-17 | 1 | 1 | 0 | 0 |
2 | 3 | 7 | 4 | 2015-09-17 | 1 | 1 | 0 | 0 |
print datastore[-3:]
datastore[:3]
Store StoreType Assortment CompetitionDistance \ 1112 1113 a c 9260 1113 1114 a c 870 1114 1115 d c 5350 CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 \ 1112 NaN NaN 0 1113 NaN NaN 0 1114 NaN NaN 1 Promo2SinceWeek Promo2SinceYear PromoInterval 1112 NaN NaN NaN 1113 NaN NaN NaN 1114 22 2012 Mar,Jun,Sept,Dec
Store | StoreType | Assortment | CompetitionDistance | CompetitionOpenSinceMonth | CompetitionOpenSinceYear | Promo2 | Promo2SinceWeek | Promo2SinceYear | PromoInterval | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | c | a | 1270 | 9 | 2008 | 0 | NaN | NaN | NaN |
1 | 2 | a | a | 570 | 11 | 2007 | 1 | 13 | 2010 | Jan,Apr,Jul,Oct |
2 | 3 | a | a | 14130 | 12 | 2006 | 1 | 14 | 2011 | Jan,Apr,Jul,Oct |
Статистика по признакам
# смотрим уникальные значения
for i in datatrain.columns:
#addtext = + ' '+ )
# addtext = ((' intest=' + (str(datatest[i].nunique()))) + ' '+ (str(datatrain[i].unique()) if (datatrain[i].nunique()<10) else ''))
print (str(i) + \
(' c ' if (type(datatrain[i][0]) is str) else ' n ') + \
'intrain=' + str(datatrain[i].nunique()) + \
(' intest=' + (str(datatest[i].nunique()) if (i in datatest.columns) else ' not in test')) ) + \
(' uniques=' + str(datatrain[i].unique()) + (' / ' + str(datatest[i].unique()) if (i in datatest.columns) else ' ') if (datatrain[i].nunique()<10) else '')
print datatrain.isnull().sum() # нет ли NULL
print datatest.isnull().sum() # нет ли NULL
print datatrain.dtypes
# print data['Store'].unique().shape
# print data['DayOfWeek'].unique()
# print data['Date'].unique().shape
# print data['Sales'].unique().shape
# print data['Customers'].unique().shape
# print data['Open'].unique()
# print data['Promo'].unique()
# print data['StateHoliday'].unique()
# print data['SchoolHoliday'].unique()
Store n intrain=1115 intest=856 DayOfWeek n intrain=7 intest=7 uniques=[5 4 3 2 1 7 6] / [4 3 2 1 7 6 5] Date c intrain=942 intest=48 Sales n intrain=21734 intest= not in test Customers n intrain=4086 intest= not in test Open n intrain=2 intest=2 uniques=[1 0] / [ 1. nan 0.] Promo n intrain=2 intest=2 uniques=[1 0] / [1 0] StateHoliday c intrain=5 intest=2 uniques=['0' 'a' 'b' 'c' 0L] / ['0' 'a'] SchoolHoliday n intrain=2 intest=2 uniques=[1 0] / [0 1] Store 0 DayOfWeek 0 Date 0 Sales 0 Customers 0 Open 0 Promo 0 StateHoliday 0 SchoolHoliday 0 dtype: int64 Id 0 Store 0 DayOfWeek 0 Date 0 Open 11 Promo 0 StateHoliday 0 SchoolHoliday 0 dtype: int64 Store int64 DayOfWeek int64 Date object Sales int64 Customers int64 Open int64 Promo int64 StateHoliday object SchoolHoliday int64 dtype: object
11 пропусков (NaN) в test/Open
Проблема с типом train/StateHoliday
На контроле StateHoliday принимает только значения '0' и 'a'
# Parse the Date column and derive calendar features, identically for the
# training and the test frame.
for frame in (datatrain, datatest):
    frame['Date'] = pd.to_datetime(frame['Date'])
    # Element-wise map over the Timestamps (original author's note: other
    # forms of assignment did not work for them).
    frame['Month'] = frame['Date'].map(lambda ts: ts.month)
    frame['DayOfYear'] = frame['Date'].map(lambda ts: ts.dayofyear)
    frame['Year'] = frame['Date'].map(lambda ts: ts.year)
    frame['Week'] = frame['Date'].map(lambda ts: ts.week)
datatrain.DayOfYear.plot(color='blue')
datatest.DayOfYear.plot(color='red')
<matplotlib.axes.AxesSubplot at 0xb7171d0>
pd.crosstab(datatest.Date.map(lambda x: x.dayofweek), datatest.DayOfWeek)
DayOfWeek | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|---|---|---|---|---|---|---|
Date | |||||||
0 | 5992 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 5992 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 5992 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 5992 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 5136 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 0 | 5992 | 0 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 5992 |
pd.DataFrame({'DayOfYear': datatest.DayOfYear, 'DayOfWeek': datatest.DayOfWeek}).plot(kind='scatter', x='DayOfYear', y='DayOfWeek')
<matplotlib.axes.AxesSubplot at 0x12bdbf98>
datatrain.Date[0]
Timestamp('2015-07-31 00:00:00')
data1 = datatrain[datatrain.Date>=pd.Timestamp('2015-01-01 00:00:00')]
data2 = datatrain[(datatrain.Date<pd.Timestamp('2015-01-01 00:00:00'))&(datatrain.Date>=pd.Timestamp('2014-01-01 00:00:00'))]
data3 = datatrain[(datatrain.Date<pd.Timestamp('2014-01-01 00:00:00'))]
# можно train.Date.isin(pd.date_range('2013-01-01', '2015-06-13'))
figsize(15, 5)
data1.Sales.plot(color='g')
data2.Sales.plot(color='b')
data3.Sales.plot(color='r')
<matplotlib.axes.AxesSubplot at 0x238310f0>
figsize(15, 5)
data1.groupby('Month')['Sales'].mean().plot(color='g')
data2.groupby('Month')['Sales'].mean().plot(color='b')
data3.groupby('Month')['Sales'].mean().plot(color='r')
data1[data1.Sales>0].groupby('Month')['Sales'].mean().plot(color='g', style='--')
data2[data2.Sales>0].groupby('Month')['Sales'].mean().plot(color='b', style='--')
data3[data3.Sales>0].groupby('Month')['Sales'].mean().plot(color='r', style='--')
<matplotlib.axes.AxesSubplot at 0x11788630>
data1.groupby('Week')['Sales'].mean().plot(color='g')
data2.groupby('Week')['Sales'].mean().plot(color='b')
data3.groupby('Week')['Sales'].mean().plot(color='r')
data1[data1.Sales>0].groupby('Week')['Sales'].mean().plot(color='g', style='--')
data2[data2.Sales>0].groupby('Week')['Sales'].mean().plot(color='b', style='--')
data3[data3.Sales>0].groupby('Week')['Sales'].mean().plot(color='r', style='--')
<matplotlib.axes.AxesSubplot at 0x15957b38>
data1.groupby('Month')['Store'].nunique().plot(color='g')
data2.groupby('Month')['Store'].nunique().plot(color='b')
data3.groupby('Month')['Store'].nunique().plot(color='r')
datatest.groupby('Month')['Store'].nunique().plot(color='k')
<matplotlib.axes.AxesSubplot at 0xf49c780>
print np.intersect1d(datatest.Store.unique(), data1.Store.unique()).__len__()
print np.intersect1d(datatest.Store.unique(), data2.Store.unique()).__len__()
print np.intersect1d(datatest.Store.unique(), data3.Store.unique()).__len__()
print np.intersect1d(datatest.Store.unique(), data2[data2.Month==12].Store.unique()).__len__()
856 856 856 676
data1.groupby('DayOfYear')['Sales'].mean().plot(color='g')
data2.groupby('DayOfYear')['Sales'].mean().plot(color='b')
data3.groupby('DayOfYear')['Sales'].mean().plot(color='r')
<matplotlib.axes.AxesSubplot at 0x16b71550>
figsize(15, 5)
ax = data1['Sales'].plot(kind='kde', color='g')
data2['Sales'].plot(kind='kde', color='b')
data3['Sales'].plot(kind='kde', color='r')
ax.set_xlim(-2000, 20000)
(-2000, 20000)
figsize(15, 5)
ax = data3['Sales'].hist(bins=500, color='r')
data2['Sales'].hist(bins=500, color='b')
data1['Sales'].hist(bins=500, color='g')
ax.set_ylim(0, 5000)
ax.set_xlim(-1, 25000)
(-1, 25000)
figsize(15, 5)
ax = data1.groupby('Week')['Promo'].mean().plot(color='g')
data2.groupby('Week')['Promo'].mean().plot(color='b')
data3.groupby('Week')['Promo'].mean().plot(color='r')
ax.legend(['2015', '2014', '2013'])
<matplotlib.legend.Legend at 0x15242470>
np.array([1,2,3,4,5,6,7])/7.0
array([ 0.14285714, 0.28571429, 0.42857143, 0.57142857, 0.71428571, 0.85714286, 1. ])
figsize(15, 5)
ax = data1[data1.Sales>0].groupby('Date')['Promo'].mean().plot(color='g')
data2[data2.Sales>0].groupby('Date')['Promo'].mean().plot(color='b')
data3[data3.Sales>0].groupby('Date')['Promo'].mean().plot(color='r')
ax.legend(['2015', '2014', '2013'])
<matplotlib.legend.Legend at 0xe41ec18>
Акция - это свойство дня!
cols = ['r','b','g','y','c','m','k']
ax = datatrain[datatrain['DayOfWeek']==7]['Sales'].plot(kind='kde', color=cols[6])
for d in range(6):
datatrain[datatrain['DayOfWeek']==(d+1)]['Sales'].plot(kind='kde', color=cols[d])
ax.set_xlim(-500, 15000)
ax.set_ylim(0, 0.0003)
ax.legend([u'воскресенье',u'понедельник',u'вторник',u'среда',u'четверг',u'пятница',u'суббота'])
<matplotlib.legend.Legend at 0x3b159710>
figsize(15, 5)
ax = datatrain[datatrain['DayOfWeek']==6]['Sales'].hist(bins=500, color='m')
datatrain[datatrain['DayOfWeek']==3]['Sales'].hist(bins=500, color='g')
datatrain[datatrain['DayOfWeek']==7]['Sales'].hist(bins=500, color='k')
ax.set_ylim(0, 2000)
ax.set_xlim(-1, 20000)
(-1, 20000)
ax = datatrain[datatrain.Sales>0].Sales.hist(bins=200)
ax.set_title(u'Все покупки')
<matplotlib.text.Text at 0x3e3b8eb8>
ax = datatrain[datatrain.Sales>0].Sales.apply(lambda x: np.log(x+1.0)).hist(bins=200)
ax.set_title(u'Логарифм всех покупок')
<matplotlib.text.Text at 0x398d01d0>
Очень похоже на нормальное распределение;)
cols = ['r','b','g','y','c','m','k']
for i, s in enumerate(datatrain['Store'].unique()[:7]):
data1[data1['Store']==s]['Sales'].plot(color=cols[i])
tmp = data1[data1['Store']==1]
tmp[:5]
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | Month | DayOfYear | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2015-07-31 | 5263 | 555 | 1 | 1 | 0 | 1 | 7 | 212 |
1115 | 1 | 4 | 2015-07-30 | 5020 | 546 | 1 | 1 | 0 | 1 | 7 | 211 |
2230 | 1 | 3 | 2015-07-29 | 4782 | 523 | 1 | 1 | 0 | 1 | 7 | 210 |
3345 | 1 | 2 | 2015-07-28 | 5011 | 560 | 1 | 1 | 0 | 1 | 7 | 209 |
4460 | 1 | 1 | 2015-07-27 | 6102 | 612 | 1 | 1 | 0 | 1 | 7 | 208 |
# Rescale the 0/1 indicator columns so they are visible next to Sales on a
# shared axis.  Work on an explicit copy: tmp was produced by boolean
# indexing, and writing into it directly triggers SettingWithCopyWarning.
tmp = tmp.copy()
tmp['Promo'] *= 1000
# StateHoliday holds string codes ('0', 'a', 'b', 'c'); multiplying a string
# repeats it instead of scaling it, leaving a non-numeric column.  Turn it
# into a 0/1 "is a state holiday" flag first, then scale.
tmp['StateHoliday'] = (tmp['StateHoliday'].astype(str) != '0') * 2000
tmp['SchoolHoliday'] *= 3000
-c:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead -c:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead -c:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead
tmp.plot()
<matplotlib.axes.AxesSubplot at 0x47923208>
data1.groupby('DayOfWeek')['Sales'].mean().plot(color='g')
data2.groupby('DayOfWeek')['Sales'].mean().plot(color='b')
data3.groupby('DayOfWeek')['Sales'].mean().plot(color='r')
<matplotlib.axes.AxesSubplot at 0x3accebe0>
Потрясающая стабильность
data1[data1['Month']==1].groupby('DayOfWeek')['Sales'].mean().plot(color='g')
data1[data1['Month']==2].groupby('DayOfWeek')['Sales'].mean().plot(color='r')
data1[data1['Month']==3].groupby('DayOfWeek')['Sales'].mean().plot(color='b')
data1[data1['Month']==4].groupby('DayOfWeek')['Sales'].mean().plot(color='m')
data1[data1['Month']==5].groupby('DayOfWeek')['Sales'].mean().plot(color='c')
data1[data1['Month']==6].groupby('DayOfWeek')['Sales'].mean().plot(color='y')
data1[data1['Month']==7].groupby('DayOfWeek')['Sales'].mean().plot(color='k')
<matplotlib.axes.AxesSubplot at 0x5532a5c0>
ax = data1[data1['Store']==2].plot(kind='scatter', x='Sales', y='Customers', color='b')
data2[data2['Store']==2].plot(kind='scatter', x='Sales', y='Customers', color='r', ax=ax)
data3[data3['Store']==2].plot(kind='scatter', x='Sales', y='Customers', color='g', ax=ax)
<matplotlib.axes.AxesSubplot at 0x5527e7b8>
ax = data2[data2['Store']==2].plot(kind='scatter', x='Sales', y='Customers', color='r')
data2[data2['Store']==3].plot(kind='scatter', x='Sales', y='Customers', color='k', ax=ax)
data2[data2['Store']==4].plot(kind='scatter', x='Sales', y='Customers', color='m', ax=ax)
data2[data2['Store']==5].plot(kind='scatter', x='Sales', y='Customers', color='y', ax=ax)
<matplotlib.axes.AxesSubplot at 0x39da64e0>
for m in range(7):
print data1[data1['Month']==m+1].groupby('Promo')['Sales'].mean()
Promo 0 3970.815415 1 7653.475815 Name: Sales, dtype: float64 Promo 0 4620.047085 1 7672.745561 Name: Sales, dtype: float64 Promo 0 4373.435450 1 8443.980045 Name: Sales, dtype: float64 Promo 0 4436.538914 1 8137.335575 Name: Sales, dtype: float64 Promo 0 4430.031570 1 7366.831879 Name: Sales, dtype: float64 Promo 0 4625.217289 1 8560.184006 Name: Sales, dtype: float64 Promo 0 4676.847085 1 8172.355640 Name: Sales, dtype: float64
for y in [2013, 2014, 2015]:
print datatrain[datatrain['Year']==y].groupby('Open')['Sales'].mean()
Open 0 0.000000 1 6814.392025 Name: Sales, dtype: float64 Open 0 0.000000 1 7025.404201 Name: Sales, dtype: float64 Open 0 0.000000 1 7088.126648 Name: Sales, dtype: float64
Закрыт - нет продаж
for y in [2013, 2014, 2015]:
print datatrain[datatrain['Year']==y].groupby('StateHoliday')['Sales'].mean()
StateHoliday 0 5824.885269 a 268.288627 b 204.034529 c 148.512108 Name: Sales, dtype: float64 StateHoliday 0 5733.530624 0 6159.600371 a 293.153706 b 207.781614 c 192.847059 Name: Sales, dtype: float64 StateHoliday 0 6067.023402 a 318.836334 b 231.118386 Name: Sales, dtype: float64
for y in [2013, 2014, 2015]:
print datatrain[datatrain['Year']==y].groupby('SchoolHoliday')['Sales'].mean()
SchoolHoliday 0 5523.034939 1 6240.818798 Name: Sales, dtype: float64 SchoolHoliday 0 5677.092502 1 6515.963368 Name: Sales, dtype: float64 SchoolHoliday 0 5696.882291 1 6911.932234 Name: Sales, dtype: float64
# Error metric: root mean squared percentage error (RMSPE).
def rmspe(y, a):
    """Return the root mean squared percentage error of forecast ``a``.

    Only entries with ``y > 0`` contribute; all other entries are ignored.

    y -- array-like of actual values.
    a -- forecast: a scalar or an array-like broadcastable against ``y``.

    Returns a float, or ``nan`` when ``y`` has no positive entry.
    """
    # Work in floating point so integer inputs are never integer-divided.
    y = np.asarray(y, dtype=float)
    diff = y - np.asarray(a, dtype=float)
    mask = y > 0
    if not mask.any():
        return np.nan
    # Mask *before* dividing so that y == 0 never reaches the denominator.
    rel = diff[mask] / y[mask]
    return np.mean(rel ** 2) ** 0.5
# Error metric: RMSE divided by the mean of the positive actual values.
def prmse(y, a):
    """Return the RMSE of forecast ``a`` relative to the mean actual value.

    Both the squared residuals and the normalising mean are taken only over
    the entries where ``y > 0``; all other entries are ignored.
    """
    positive = y > 0
    residuals = (y - a)[positive]
    root_mean_square = np.mean(residuals ** 2) ** 0.5
    scale = np.mean(y[positive])
    return root_mean_square / scale
# Error metric: symmetric mean absolute percentage error (SMAPE).
def smape(y, a):
    """Return the mean of ``2 * |y - a| / |y + a|`` over entries with ``y > 0``.

    y -- array-like of actual values.
    a -- forecast: a scalar or an array-like broadcastable against ``y``.

    Returns a float, or ``nan`` when ``y`` has no positive entry.
    """
    # Work in floating point so integer inputs are never integer-divided.
    y = np.asarray(y, dtype=float)
    a = np.asarray(a, dtype=float)
    mask = y > 0
    if not mask.any():
        return np.nan
    numerator = 2 * np.abs(y - a)
    denominator = np.abs(y + a)
    # Mask *before* dividing so ignored entries (e.g. y == a == 0) cannot
    # produce 0/0 warnings.
    return np.mean(numerator[mask] / denominator[mask])
# Time-based hold-out: everything after the cutoff date is the local test
# set, everything up to and including it is the local training set.
cutoff = pd.Timestamp('2015-06-13 00:00:00')
# Take explicit copies: columns are added to both frames later on, and
# assigning into a boolean-indexed slice raises SettingWithCopyWarning.
test = datatrain[datatrain.Date > cutoff].copy()
train = datatrain[datatrain.Date <= cutoff].copy()
print(str(train.shape) + ' ' + str(test.shape))
ytrain = train.Sales.values
ytest = test.Sales.values
(963689, 13) (53520, 13)
Константные модели
print rmspe(ytest, 0)
print rmspe(ytest, ytrain[ytrain>0].mean())
print rmspe(ytest, 1.1*ytrain[ytrain>0].mean())
print rmspe(ytest, 0.9*ytrain[ytrain>0].mean())
print rmspe(ytest, 0.7*ytrain[ytrain>0].mean())
print rmspe(ytest, np.median(ytrain[ytrain>0]))
print rmspe(ytest, 0.75*np.median(ytrain[ytrain>0]))
1.0 0.557527013858 0.648281661016 0.483560482379 0.420833467405 0.493581387838 0.421824754067
Медиана лучше среднего
Но если делать понижение прогноза, то среднее лучше
# Sweep a multiplier over the median of the positive training sales and
# record the three error metrics of the resulting constant forecast.
l = np.linspace(0.4, 1.2, 1001)
# Loop-invariant: for a positive factor x, median(x * v) equals
# x * median(v) up to floating-point rounding, so the median of the large
# array is computed once instead of once per multiplier.
base = np.median(ytrain[ytrain > 0])
e = []
e2 = []
e3 = []
for x in l:
    a = x * base
    e.append(rmspe(ytest, a))
    e2.append(prmse(ytest, a))
    e3.append(smape(ytest, a))
figsize(15, 5)
tmp = pd.DataFrame({'rmspe': e, 'prmse': e2, 'smape': e3})
tmp.index = l  # x axis of the plot is the multiplier
ax = tmp.plot()
ax.set_title(u'Качество константного решения')
ax.set_xlabel(u'множитель')
ax.set_ylabel(u'ошибка')
print('min rmspe = ' + str(min(e)))
print('min prmse = ' + str(min(e2)))
print('min smape = ' + str(min(e3)))
min rmspe = 0.4205310736 min prmse = 0.432856724607 min smape = 0.313709222153
Вывод: для нашего функционала хорош занижающий прогноз (в отличие от классических)
С обучением и контролем
train[:3]
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | Month | DayOfYear | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
53520 | 1 | 6 | 2015-06-13 | 4256 | 502 | 1 | 0 | 0 | 0 | 6 | 164 | 2015 |
53521 | 2 | 6 | 2015-06-13 | 2574 | 319 | 1 | 0 | 0 | 0 | 6 | 164 | 2015 |
53522 | 3 | 6 | 2015-06-13 | 4687 | 492 | 1 | 0 | 0 | 0 | 6 | 164 | 2015 |
test[:3]
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | Month | DayOfYear | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2015-07-31 | 5263 | 555 | 1 | 1 | 0 | 1 | 7 | 212 | 2015 |
1 | 2 | 5 | 2015-07-31 | 6064 | 625 | 1 | 1 | 0 | 1 | 7 | 212 | 2015 |
2 | 3 | 5 | 2015-07-31 | 8314 | 821 | 1 | 1 | 0 | 1 | 7 | 212 | 2015 |
st = train.groupby('DayOfWeek')['Sales'].mean()
a = test['DayOfWeek'].apply(lambda x: st[x]).values
print rmspe(ytest, a)
0.489942236354
def investlina(a, x1=0.6, x2=1.2):
    """Plot and print how rescaling the forecast ``a`` changes the errors.

    The forecast is multiplied by 1001 evenly spaced factors between ``x1``
    and ``x2``.  For each factor, rmspe, prmse and smape are evaluated
    against the module-level ``ytest``.  The three curves are plotted
    against the factor and the minimum of each metric is printed.

    a  -- forecast passed to the metrics together with ``ytest``.
    x1 -- smallest multiplier tried (default 0.6).
    x2 -- largest multiplier tried (default 1.2).
    """
    # Honour the x1/x2 arguments (they used to be ignored in favour of the
    # hard-coded 0.6 and 1.2, which are still the defaults).
    l = np.linspace(x1, x2, 1001)
    e = []
    e2 = []
    e3 = []
    for x in l:
        a2 = x * a
        e.append(rmspe(ytest, a2))
        e2.append(prmse(ytest, a2))
        e3.append(smape(ytest, a2))
    tmp = pd.DataFrame({'rmspe': e, 'prmse': e2, 'smape': e3})
    tmp.index = l  # x axis of the plot is the multiplier
    ax = tmp.plot()
    ax.set_title(u'Качество константного решения')
    ax.set_xlabel(u'множитель')
    ax.set_ylabel(u'ошибка')
    print('min rmspe = ' + str(min(e)))
    print('min prmse = ' + str(min(e2)))
    print('min smape = ' + str(min(e3)))
investlina(a)
min rmspe = 0.401827901185 min prmse = 0.426212287572 min smape = 0.309894063366
st = train.groupby(['DayOfWeek', 'Store'])['Sales'].mean() # группировка по двум признакам
a = test[['DayOfWeek', 'Store']].apply(lambda x: st[x[0], x[1]], axis=1).values # ТОЛЬКО ТАК...
print rmspe(ytest, a)
0.2229627585
investlina(a)
min rmspe = 0.221031220985 min prmse = 0.234138863758 min smape = 0.180176335548
test['Forecast'] = a
ax = test[test.Store==2][['Sales', 'Forecast']].plot()
test[test.Store==1000][['Sales', 'Forecast']].plot()
<matplotlib.axes.AxesSubplot at 0x111c1278>
Получили существенный выигрыш в качестве!
Но ещё далеко до оптимума... ~0.1
st = train[train.Year==2015].groupby(['DayOfWeek', 'Store'])['Sales'].mean() # группировка по двум признакам
a = test[['DayOfWeek', 'Store']].apply(lambda x: st[x[0], x[1]], axis=1).values # ТОЛЬКО ТАК...
print rmspe(ytest, a)
0.218436591399
investlina(a)
min rmspe = 0.215392672505 min prmse = 0.232004471326 min smape = 0.179174960539
Свежая информация ценнее!
А вот в следующем коде лучше брать статистику за всё время...
# Mean sales per (DayOfWeek, Store, Promo) group, i.e. grouping by three features.
st = train.groupby(['DayOfWeek', 'Store', 'Promo'])['Sales'].mean()
# Row-wise lookup of each test row's group mean in the MultiIndexed Series
# (original author's note: this was the only form that worked for them).
a = test[['DayOfWeek', 'Store', 'Promo']].apply(lambda x: st[x[0], x[1], x[2]], axis=1).values
print rmspe(ytest, a)
0.151292282637
investlina(a)
min rmspe = 0.150580685895 min prmse = 0.157648418518 min smape = 0.112999153503
figsize(15,5)
test['Forecast'] = a
ax = test[test.Store==2][['Sales', 'Forecast']].plot()
test[test.Store==1000][['Sales', 'Forecast']].plot()
<matplotlib.axes.AxesSubplot at 0x1580c240>
Визуально - всё почти идеально
st = train[train.Year==2015].groupby(['DayOfWeek', 'Store'])['Sales'].mean()
tmp = pd.DataFrame(st)
tmp.T.stack(0)
Store | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 1106 | 1107 | 1108 | 1109 | 1110 | 1111 | 1112 | 1113 | 1114 | 1115 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DayOfWeek | ||||||||||||||||||||||
Sales | 1 | 4458.652174 | 5498.086957 | 7495.608696 | 9767.000000 | 5596.652174 | 5363.565217 | 9384.565217 | 7209.086957 | 8511.000000 | 6127.826087 | ... | 5653.739130 | 7231.521739 | 7475.391304 | 5630.347826 | 5781.956522 | 6055.565217 | 10954.260870 | 6563.217391 | 20024.043478 | 7128.739130 |
2 | 4473.000000 | 5636.130435 | 7529.695652 | 9494.130435 | 5205.608696 | 5074.434783 | 9612.260870 | 6894.913043 | 7624.695652 | 5961.173913 | ... | 5566.130435 | 6326.565217 | 6764.434783 | 4966.043478 | 5397.782609 | 5394.521739 | 9126.521739 | 6516.391304 | 20717.434783 | 6467.304348 | |
3 | 4431.521739 | 5866.565217 | 7181.739130 | 9011.130435 | 5158.434783 | 5070.304348 | 8543.304348 | 6348.782609 | 7057.521739 | 5597.391304 | ... | 5115.956522 | 6052.043478 | 6176.739130 | 4860.434783 | 4876.782609 | 4919.000000 | 8951.478261 | 6594.217391 | 20369.956522 | 6607.173913 | |
4 | 3716.375000 | 4518.250000 | 5979.041667 | 8656.458333 | 4231.708333 | 4490.625000 | 7718.375000 | 6627.125000 | 6220.833333 | 5045.416667 | ... | 4847.041667 | 5007.541667 | 5639.375000 | 4171.333333 | 4532.916667 | 4264.958333 | 7656.750000 | 6385.250000 | 18780.458333 | 5631.291667 | |
5 | 4068.416667 | 4125.291667 | 6412.333333 | 8748.875000 | 4364.666667 | 4488.041667 | 8283.416667 | 5241.958333 | 6758.541667 | 5385.208333 | ... | 4644.375000 | 5544.041667 | 5239.458333 | 4403.625000 | 4570.625000 | 4562.583333 | 8211.250000 | 5879.458333 | 19689.375000 | 6758.041667 | |
6 | 4865.958333 | 2976.500000 | 4501.791667 | 10535.375000 | 2128.875000 | 3197.166667 | 6413.666667 | 3611.916667 | 6930.416667 | 5079.583333 | ... | 2992.041667 | 5554.041667 | 2407.166667 | 3787.833333 | 3608.458333 | 2921.125000 | 8139.416667 | 6015.166667 | 22747.875000 | 7796.875000 | |
7 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
7 rows × 1115 columns
Крутой вывод;)
тут большая проблема в том, что не все комбинации признаков могут быть в обучении
# ОЧЕНЬ ДОЛГО....
def takemean(st, x1, x2, x3, x4, x5, default=5742.0):
    """Look up the value stored in ``st`` under the key ``(x1, ..., x5)``.

    st      -- Series indexed by a five-level MultiIndex.
    x1..x5  -- the five key components, in index-level order.
    default -- value returned when the key is absent (default 5742.0).

    Returns ``st[(x1, x2, x3, x4, x5)]`` if that key exists, else ``default``.
    """
    key = (x1, x2, x3, x4, x5)
    # Membership on the index is a hashed lookup; the previous
    # ``st.index.isin([key]).any()`` scanned the whole index on every call.
    if key in st.index:
        return st[key]
    return default
# Mean sales per (DayOfWeek, Store, Open, Promo, SchoolHoliday) group, i.e. grouping by five features.
st = train.groupby(['DayOfWeek', 'Store', 'Open', 'Promo', 'SchoolHoliday'])['Sales'].mean()
# Row-wise lookup through takemean, which supplies a fallback value for
# feature combinations that are absent from the training groups.
a = test[['DayOfWeek', 'Store', 'Open', 'Promo', 'SchoolHoliday']].apply(lambda x: takemean(st, x[0], x[1], x[2], x[3], x[4]), axis=1).values
print rmspe(ytest, a)
st = train.groupby(['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday'])['Sales'].mean()
test['dummy'] = 5742.0
st2 = test.groupby(['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday'])['dummy'].mean()
np.sum(~st2.index.isin(st.index)) # есть новые индексы
45
st = st.append(st2[~st2.index.isin(st.index)])
np.sum(~st2.index.isin(st.index)) # ура! нет "новинок"
0
st[:5]
DayOfWeek Store Open Promo StateHoliday SchoolHoliday 1 1 0 0 a 0 0.000000 b 1 0.000000 1 0 0 0 3822.285714 1 3650.000000 0 4042.028571 dtype: float64
a = test[['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']].apply(lambda x: st[x[0], x[1], x[2], x[3], x[4], x[5]], axis=1).values # ТОЛЬКО ТАК...
print rmspe(ytest, a)
0.15154666421
investlina(a)
min rmspe = 0.148754855759 min prmse = 0.158953548121 min smape = 0.110712363584
test['Forecast'] = a
ax = test[test.Store==2][['Sales', 'Forecast']].plot()
test[test.Store==1000][['Sales', 'Forecast']].plot()
-c:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead
<matplotlib.axes.AxesSubplot at 0x1f4599e8>
train['Holiday'] = (train.StateHoliday!='0')|(train.SchoolHoliday!=0)
test['Holiday'] = (test.StateHoliday!='0')|(test.SchoolHoliday!=0)
feats = ['DayOfWeek', 'Store', 'Promo', 'Holiday'] # , 'SchoolHoliday', 'StateHoliday'
st = train[train.Open>0].groupby(feats)['Sales'].mean()
test['dummy'] = 6200.0
st2 = test.groupby(feats)['dummy'].mean()
t = (~st2.index.isin(st.index))
print 'не хватает индексов:' + str(np.sum(t)) + ' %:' + str(np.mean(t))
st = st.append(st2[~st2.index.isin(st.index)])
a = test[feats].apply(lambda x: st[x[0], x[1], x[2], x[3]], axis=1).values # КАК ЕЩЁ СДЕЛАТЬ?????
print 'ошибка = ' + str(rmspe(ytest, a))
investlina(a)
не хватает индексов:1103 %:0.0516289084441 ошибка = 0.146861059175 min rmspe = 0.145358057596 min prmse = 0.155383366755 min smape = 0.10815332693
Не получается красивого кода для перебора :(
def mymean(x):
    """Return a position-weighted mean of ``x``.

    The raw weights are ``linspace(1, 0.1, len(x)) ** 2``: they fall from
    1.0 for the first element to 0.01 for the last, and are then normalised
    to sum to one.

    NOTE(review): this favours the leading elements; presumably the rows are
    ordered newest-first so that recent sales count more. Confirm against
    the ordering of the grouped frame.
    """
    # Use np.linspace explicitly instead of the bare name that only exists
    # when the %pylab magic has populated the namespace.
    w = np.linspace(1, 0.1, len(x)) ** 2.0
    w = w / w.sum()
    return np.dot(x, w)
st = train[train.Open>0].groupby(feats)['Sales'].apply(mymean)
test['dummy'] = 6200.0
st2 = test.groupby(feats)['dummy'].apply(mymean)
t = (~st2.index.isin(st.index))
print 'не хватает индексов:' + str(np.sum(t)) + ' %:' + str(np.mean(t))
st = st.append(st2[~st2.index.isin(st.index)])
a = test[feats].apply(lambda x: st[x[0], x[1], x[2], x[3]], axis=1).values # КАК ЕЩЁ СДЕЛАТЬ?????
print 'ошибка = ' + str(rmspe(ytest, a))
investlina(a)
не хватает индексов:1103 %:0.0516289084441 ошибка = 0.144661886707 min rmspe = 0.13966759364 min prmse = 0.150114327132 min smape = 0.103691933713
С устареванием ещё лучше!
Но dummy всё портит - его можно сделать умнее: средние по этому магазину
# Reload the full tables for the real submission.
# StateHoliday must be read as str: with type inference the training column
# ends up containing both the integer 0 and the string '0', which become
# separate groupby keys and never match the string codes in the test set.
train = pd.read_csv('D:\\Competitions\\Rossman\\train.csv',
                    dtype={'StateHoliday': str})
test = pd.read_csv('D:\\Competitions\\Rossman\\test.csv',
                   dtype={'StateHoliday': str})
test[:3]
Id | Store | DayOfWeek | Date | Open | Promo | StateHoliday | SchoolHoliday | dummy | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 4 | 2015-09-17 | 1 | 1 | 0 | 0 | 5742 |
1 | 2 | 3 | 4 | 2015-09-17 | 1 | 1 | 0 | 0 | 5742 |
2 | 3 | 7 | 4 | 2015-09-17 | 1 | 1 | 0 | 0 | 5742 |
test.isnull().sum()
Id 0 Store 0 DayOfWeek 0 Date 0 Open 11 Promo 0 StateHoliday 0 SchoolHoliday 0 dummy 0 dtype: int64
test = test.fillna(1)
train[:3]
Store | DayOfWeek | Date | Sales | Customers | Open | Promo | StateHoliday | SchoolHoliday | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2015-07-31 | 5263 | 555 | 1 | 1 | 0 | 1 |
1 | 2 | 5 | 2015-07-31 | 6064 | 625 | 1 | 1 | 0 | 1 |
2 | 3 | 5 | 2015-07-31 | 8314 | 821 | 1 | 1 | 0 | 1 |
st = train.groupby(['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday'])['Sales'].mean()
test['dummy'] = 5742.0
st2 = test.groupby(['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday'])['dummy'].mean()
np.sum(~st2.index.isin(st.index)) # есть новые индексы
199
st = st.append(st2[~st2.index.isin(st.index)])
np.sum(~st2.index.isin(st.index)) # ура! нет "новинок"
0
a = test[['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']].apply(lambda x: st[x[0], x[1], x[2], x[3], x[4], x[5]], axis=1).values # ТОЛЬКО ТАК...
forout = pd.DataFrame({'Id': test['Id'], 'Sales': a})
forout[:3]
Id | Sales | |
---|---|---|
0 | 1 | 5030.391304 |
1 | 2 | 7997.255319 |
2 | 3 | 9090.000000 |
forout.to_csv('D:\\Competitions\\Rossman\\trivial_10.csv', index=False) # 0.16046
pd.DataFrame({'Id': test['Id'], 'Sales': 0.95*a}).to_csv('D:\\Competitions\\Rossman\\trivial_095.csv', index=False) # 0.1459
pd.DataFrame({'Id': test['Id'], 'Sales': 0.9*a}).to_csv('D:\\Competitions\\Rossman\\trivial_09.csv', index=False) # 0.14990
модель не очень хорошая, но простая...
st = train.groupby(['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday'])['Sales'].apply(mymean)
test['dummy'] = 5742.0
st2 = test.groupby(['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday'])['dummy'].apply(mymean)
st = st.append(st2[~st2.index.isin(st.index)])
a = test[['DayOfWeek', 'Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']].apply(lambda x: st[x[0], x[1], x[2], x[3], x[4], x[5]], axis=1).values # ТОЛЬКО ТАК...
pd.DataFrame({'Id': test['Id'], 'Sales': 0.95*a}).to_csv('D:\\Competitions\\Rossman\\trivialmymean_095.csv', index=False) # 0.14713
pd.DataFrame({'Id': test['Id'], 'Sales': 0.9*a}).to_csv('D:\\Competitions\\Rossman\\trivialmymean_09.csv', index=False) # 0.14120
pd.DataFrame({'Id': test['Id'], 'Sales': 0.85*a}).to_csv('D:\\Competitions\\Rossman\\trivialmymean_085.csv', index=False) # 0.15537