Исследование данных Хакатона Озон¶

2017, Александр Дьяконов

In [2]:

# подгружаем все нужные пакеты
import pandas as pd
import numpy as np


# для встроенных картинок
%pylab inline
# чуть покрасивше картинки:
pd.set_option('display.mpl_style', 'default')
figsize(12, 9)

import warnings
warnings.filterwarnings("ignore")

#plt.rcParams['figure.figsize'] = 10, 7.5
#plt.rcParams['axes.grid'] = True
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt

import matplotlib as mpl
mpl.rcParams['font.family'] = 'Ubuntu'

plt.rc('text', usetex=False)
plt.rc('font', family='serif')
plt.rc('font', weight='bold')
plt.rc('xtick', labelsize=14) 
plt.rc('ytick', labelsize=14)

# чтобы был русский шрифт
from matplotlib import rc
 
font = {'family': 'Droid Sans',
        'weight': 'normal'}
rc('font', **font)

Populating the interactive namespace from numpy and matplotlib

In [2]:

# чтобы был русский шрифт
# from matplotlib import rc
 
# font = {'family': 'Helvetica',
#         'weight': 'normal'}
# rc('font', **font)

загрузка данных¶

In [2]:

data = pd.read_excel('Training Part.xlsx')
day1 = pd.read_excel('day1.xlsx')
day2 = pd.read_excel('day2.xlsx')

In [3]:

print (data.shape, day1.shape, day2.shape)

((390000, 197), (130000, 197), (132016, 197))

In [3]:

# если созданы csv-файлы
data = pd.read_csv('data.csv')
day1 = pd.read_csv('day1.csv')
day2 = pd.read_csv('day2.csv')

In [4]:

print (data.shape, day1.shape, day2.shape)

(390000, 197) (130000, 197) (132016, 197)

In [4]:

print (data.columns)

Index([u'ID', u'NetSales', u'gender', u'ActivityIndex', u'Alcohol',
       u'AverageFriendsRegMonthDelta', u'CanSeeAllPosts',
       u'CanSeeAllPosts_CanPost', u'EducationType', u'EducationType_VK',
       ...
       u'TD_TV', u'TD_ukraine', u'TD_video', u'TD_way_of_life',
       u'TD_wedding_communities', u'TD_women_communities', u'TD_workout',
       u'TD_youth_communities', u'TD_sum', u'animal_owner'],
      dtype='object', length=197)

In [8]:

day1.columns

Out[8]:

Index(['ID', 'NetSales', 'gender', 'ActivityIndex', 'Alcohol',
       'AverageFriendsRegMonthDelta', 'CanSeeAllPosts',
       'CanSeeAllPosts_CanPost', 'EducationType', 'EducationType_VK',
       ...
       'TD_TV', 'TD_ukraine', 'TD_video', 'TD_way_of_life',
       'TD_wedding_communities', 'TD_women_communities', 'TD_workout',
       'TD_youth_communities', 'TD_sum', 'animal_owner'],
      dtype='object', length=197)

In [5]:

data[:2]

Out[5]:

	ID	NetSales	gender	ActivityIndex	Alcohol	AverageFriendsRegMonthDelta	CanSeeAllPosts	CanSeeAllPosts_CanPost	EducationType	EducationType_VK	FriendsAverageDeletedAccounts	FriendsAverageHasHighSchools	FriendsAverageHasJobs	FriendsPerDay	HasPhone	HasSkype	HasTwitter	LastActivity	LifeTime	MaxRegDate	MinRegDate	MobileUsageAll	MobileUsageAndroid	MobileUsageIPad	MobileUsageIphone	MobileUsageWinPhone	MonthsFromMaxRegDate	MonthsFromMinRegDate	NumberOfAccounts	NumberOfAccountsMOIMIR	NumberOfAccountsODKL	NumberOfAccountsVK	NumberOfAdvancedSchools	NumberOfChilds	NumberOfChilds_VK	NumberOfCompanies	NumberOfDeletedAccounts	NumberOfDeletedAccounts_VK	NumberOfEntrance	NumberOfFollowers	NumberOfFriendsMax	NumberOfFriendsMax_VK	NumberOfFriendsMin	NumberOfFriendsSum	NumberOfGroupsMax	NumberOfGroupsSum_OK	NumberOfHighSchools	NumberOfNotesMax	NumberOfPhotosMax	NumberOfPhotosMin_VK	NumberOfPrivateAccounts	NumberOfPrivateAccounts_OK	NumberOfRelatives	NumberOfRelatives_OK	NumberOfSchools	NumberOfSubscriptions	NumberOfVideos	Relation	SaScore1	SaScore2	SaScore5	SaScore6	SaScore7	SaScoreFraud	SaScoreSocial	Smoking	UseScreenName	Worldviews	YearsSinceMinRegDate	YearsSinceMinRegDate_OK	YearsSinceMinRegDate_VK	NumberOfAccountsFB	Interest_1	Interest_2	Interest_3	Interest_4	Interest_5	Interest_6	Interest_7	Interest_8	Interest_9	Interest_10	Interest_11	Interest_12	Interest_13	Interest_14	Interest_15	Interest_16	Interest_17	Interest_18	Interest_19	Interest_20	Interest_21	Interest_22	Interest_23	Interest_24	Interest_25	TD_acquaint_communic	TD_acquaintances	TD_active_rest	TD_ad	TD_animals	TD_anime_hentai	TD_architecture	TD_art_design	TD_beautiful_girls	TD_beauty	TD_books	TD_business	TD_cars	TD_cartoons	TD_caucasian	TD_celebrities	TD_children	TD_cognitive	TD_companies	TD_computers	TD_cookery	TD_design	TD_design_renovation	TD_diets	TD_do_yourself	TD_electronics_electrappliances	TD_entertainment	TD_family_home	TD_fashion	TD_finance	TD_fitness	TD_football	TD_foreign_lang	TD_gadgets	TD_games	TD_geopolitics_economy	TD_gif	TD_goods_services	TD_hobbies	TD_horoscope	TD_horror	TD_humour	TD_images	TD_insurance	TD_interior	TD_kazakhstan	TD_landscape_design	TD_literature_poetry	TD_mass_media	TD_mass_media_ad_PR	TD_men_communities	TD_mobile_internet	TD_motivation	TD_moto	TD_movies	TD_music	TD_names	TD_nature	TD_nature_and_travel	TD_nostalgia	TD_other_services	TD_parents_communities	TD_philosophy_esoterics	TD_photo	TD_poetry	TD_politics	TD_professions	TD_proposed_news	TD_real_estate	TD_regional_communities	TD_relations	TD_religion	TD_rest	TD_russia	TD_science	TD_science_education	TD_shops	TD_slimming	TD_society	TD_soft	TD_softporno_porno	TD_sport_and_health	TD_sport_diet	TD_sport_other	TD_start_ups	TD_tech_IT	TD_technologies	TD_thoughts_ideas	TD_tourism	TD_travel	TD_TV	TD_ukraine	TD_video	TD_way_of_life	TD_wedding_communities	TD_women_communities	TD_workout	TD_youth_communities	TD_sum	animal_owner
0	1	0	F	-	0	92	-	-	12	-	0.06667	0.33333	0.06667	0.01768	0	0	0	113	2623	2012-10-19 00:00:00	2008-01-20 00:00:00	-	-	-	-	-	51	108	2	0	2	0	0	0	-	0	0	-	-	-	14	-	1	15	-	-	1	0	1	-	0	0	4	4	1	-	-	0	-3.82756	-1.67380	-5.15525	-6.52538	113	-4.14635	-4.86583	0	0	0	9.0197	9.0197	-	NaN	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	7	0	F	-	0	-	-	-	-	-	0	0.33333	0.22222	-	0	0	0	79	-	2008-06-22 00:00:00	2008-06-22 00:00:00	-	-	-	-	-	103	103	1	1	0	0	0	0	-	0	0	-	-	-	8	-	8	8	-	-	0	-	4	-	1	-	0	-	0	-	-	0	-4.06040	-1.92671	-4.93723	-6.52939	110	-3.98769	-5.21042	0	0	0	8.59808	-	-	NaN	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

In [6]:

day1[:3]

Out[6]:

	ID	NetSales	gender	ActivityIndex	Alcohol	AverageFriendsRegMonthDelta	CanSeeAllPosts	CanSeeAllPosts_CanPost	EducationType	EducationType_VK	FriendsAverageDeletedAccounts	FriendsAverageHasHighSchools	FriendsAverageHasJobs	FriendsPerDay	HasPhone	HasSkype	HasTwitter	LastActivity	LifeTime	MaxRegDate	MinRegDate	MobileUsageAll	MobileUsageAndroid	MobileUsageIPad	MobileUsageIphone	MobileUsageWinPhone	MonthsFromMaxRegDate	MonthsFromMinRegDate	NumberOfAccounts	NumberOfAccountsMOIMIR	NumberOfAccountsODKL	NumberOfAccountsVK	NumberOfAdvancedSchools	NumberOfChilds	NumberOfChilds_VK	NumberOfCompanies	NumberOfDeletedAccounts	NumberOfDeletedAccounts_VK	NumberOfEntrance	NumberOfFollowers	NumberOfFriendsMax	NumberOfFriendsMax_VK	NumberOfFriendsMin	NumberOfFriendsSum	NumberOfGroupsMax	NumberOfGroupsSum_OK	NumberOfHighSchools	NumberOfNotesMax	NumberOfPhotosMax	NumberOfPhotosMin_VK	NumberOfPrivateAccounts	NumberOfPrivateAccounts_OK	NumberOfRelatives	NumberOfRelatives_OK	NumberOfSchools	NumberOfSubscriptions	NumberOfVideos	Relation	SaScore1	SaScore2	SaScore5	SaScore6	SaScore7	SaScoreFraud	SaScoreSocial	Smoking	UseScreenName	Worldviews	YearsSinceMinRegDate	YearsSinceMinRegDate_OK	YearsSinceMinRegDate_VK	NumberOfAccountsFB	Interest_1	Interest_2	Interest_3	Interest_4	Interest_5	Interest_6	Interest_7	Interest_8	Interest_9	Interest_10	Interest_11	Interest_12	Interest_13	Interest_14	Interest_15	Interest_16	Interest_17	Interest_18	Interest_19	Interest_20	Interest_21	Interest_22	Interest_23	Interest_24	Interest_25	TD_acquaint_communic	TD_acquaintances	TD_active_rest	TD_ad	TD_animals	TD_anime_hentai	TD_architecture	TD_art_design	TD_beautiful_girls	TD_beauty	TD_books	TD_business	TD_cars	TD_cartoons	TD_caucasian	TD_celebrities	TD_children	TD_cognitive	TD_companies	TD_computers	TD_cookery	TD_design	TD_design_renovation	TD_diets	TD_do_yourself	TD_electronics_electrappliances	TD_entertainment	TD_family_home	TD_fashion	TD_finance	TD_fitness	TD_football	TD_foreign_lang	TD_gadgets	TD_games	TD_geopolitics_economy	TD_gif	TD_goods_services	TD_hobbies	TD_horoscope	TD_horror	TD_humour	TD_images	TD_insurance	TD_interior	TD_kazakhstan	TD_landscape_design	TD_literature_poetry	TD_mass_media	TD_mass_media_ad_PR	TD_men_communities	TD_mobile_internet	TD_motivation	TD_moto	TD_movies	TD_music	TD_names	TD_nature	TD_nature_and_travel	TD_nostalgia	TD_other_services	TD_parents_communities	TD_philosophy_esoterics	TD_photo	TD_poetry	TD_politics	TD_professions	TD_proposed_news	TD_real_estate	TD_regional_communities	TD_relations	TD_religion	TD_rest	TD_russia	TD_science	TD_science_education	TD_shops	TD_slimming	TD_society	TD_soft	TD_softporno_porno	TD_sport_and_health	TD_sport_diet	TD_sport_other	TD_start_ups	TD_tech_IT	TD_technologies	TD_thoughts_ideas	TD_tourism	TD_travel	TD_TV	TD_ukraine	TD_video	TD_way_of_life	TD_wedding_communities	TD_women_communities	TD_workout	TD_youth_communities	TD_sum	animal_owner
0	390004	NaN	F	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	390010	NaN	NaN	4.5	0	60	1	1	-	-	0.04138	0.18506	0.07356	0.14374	1	0	0	1	2261	2014-03-12 00:00:00	2009-06-30 00:00:00	0.95402	0.91429	0	0.59036	0	34	91	7	1	3	3	0	0	0	0	0	0	151	2	270	111	0	826	12	12	0	79	20	1	1	1	16	28	2	6	-	6	-1.90297	-1.72063	-3.57124	-5.0395	155	-2.39785	-2.97663	0	1	0	7.5767	7.57379	7.5767	1	0	5	0	0	5	0	6	0	5	2	6	0	5	0	7	0	0	0	0	0	0	0	0	0	1	0.011765	0	0	0.011765	0	0.011765	0	0.047059	0	0	0	0.023529	0	0	0	0	0	0.023529	0	0	0.011765	0.011765	0	0	0	0	0.047059	0	0.011765	0.023529	0	0.011765	0.011765	0	0	0	0	0.105882	0	0	0	0.035294	0	0	0	0	0.023529	0	0	0.011765	0.011765	0.011765	0	0	0.035294	0	0	0.011765	0.011765	0	0.011765	0	0	0.011765	0	0	0.023529	0.011765	0.011765	0.082353	0	0	0	0.035294	0	0.058824	0.082353	0	0.023529	0.023529	0	0.023529	0	0	0	0.035294	0.023529	0	0	0	0	0	0	0	0	0.011765	0.011765	0	85	0
2	390016	NaN	F	-	0	87	-	-	12	-	0.03125	0.17188	0.03125	0.01774	0	0	0	21	1003	2012-07-18 00:00:00	2008-03-26 00:00:00	-	-	-	-	-	54	106	3	1	2	0	0	0	-	0	0	-	-	-	44	-	1	61	-	-	2	1	32	-	0	0	0	0	1	-	-	0	-3.57804	-1.89699	-4.78375	-6.3099	124	-3.51037	-4.46552	0	0	0	8.83984	4.80161	-	NaN	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

In [7]:

day2[:3]

Out[7]:

	ID	Net Sales	gender	ActivityIndex	Alcohol	AverageFriendsRegMonthDelta	CanSeeAllPosts	CanSeeAllPosts_CanPost	EducationType	EducationType_VK	FriendsAverageDeletedAccounts	FriendsAverageHasHighSchools	FriendsAverageHasJobs	FriendsPerDay	LastActivity	LifeTime	MaxRegDate	MinRegDate	MobileUsageAll	MobileUsageAndroid	MobileUsageIPad	MobileUsageIphone	MobileUsageWinPhone	MonthsFromMaxRegDate	MonthsFromMinRegDate	NumberOfAccounts	NumberOfAccountsMOIMIR	NumberOfAccountsODKL	NumberOfAccountsVK	NumberOfChilds_VK	NumberOfCompanies	NumberOfDeletedAccounts_VK	NumberOfEntrance	NumberOfFollowers	NumberOfFriendsMax	NumberOfFriendsMax_VK	NumberOfFriendsMin	NumberOfFriendsSum	NumberOfGroupsMax	NumberOfGroupsSum_OK	NumberOfHighSchools	NumberOfNotesMax	NumberOfPhotosMax	NumberOfPhotosMin_VK	NumberOfPrivateAccounts	NumberOfPrivateAccounts_OK	NumberOfRelatives	NumberOfRelatives_OK	NumberOfSchools	NumberOfSubscriptions	NumberOfVideos	Relation	SaScore1	SaScore2	SaScore5	SaScore6	SaScore7	SaScoreFraud	SaScoreSocial	Smoking	UseScreenName	Worldviews	YearsSinceMinRegDate	YearsSinceMinRegDate_OK	YearsSinceMinRegDate_VK	NumberOfAccountsFB	Interest_1	Interest_2	Interest_3	Interest_4	Interest_5	Interest_6	Interest_7	Interest_8	Interest_9	Interest_10	Interest_11	Interest_12	Interest_13	Interest_14	Interest_15	Interest_16	Interest_17	Interest_18	Interest_19	Interest_20	Interest_21	Interest_22	Interest_23	Interest_24	Interest_25	TD_acquaint_communic	TD_acquaintances	TD_active_rest	TD_ad	TD_animals	TD_anime_hentai	TD_architecture	TD_art_design	TD_beautiful_girls	TD_beauty	TD_books	TD_business	TD_cars	TD_cartoons	TD_caucasian	TD_celebrities	TD_children	TD_cognitive	TD_companies	TD_computers	TD_cookery	TD_design	TD_design_renovation	TD_diets	TD_do_yourself	TD_electronics_electrappliances	TD_entertainment	TD_family_home	TD_fashion	TD_finance	TD_fitness	TD_football	TD_foreign_lang	TD_gadgets	TD_games	TD_geopolitics_economy	TD_gif	TD_goods_services	TD_hobbies	TD_horoscope	TD_horror	TD_humour	TD_images	TD_insurance	TD_interior	TD_kazakhstan	TD_landscape_design	TD_literature_poetry	TD_mass_media	TD_mass_media_ad_PR	TD_men_communities	TD_mobile_internet	TD_motivation	TD_moto	TD_movies	TD_music	TD_names	TD_nature	TD_nature_and_travel	TD_nostalgia	TD_other_services	TD_parents_communities	TD_philosophy_esoterics	TD_photo	TD_poetry	TD_politics	TD_professions	TD_proposed_news	TD_real_estate	TD_regional_communities	TD_relations	TD_religion	TD_rest	TD_russia	TD_science	TD_science_education	TD_shops	TD_slimming	TD_society	TD_soft	TD_softporno_porno	TD_sport_and_health	TD_sport_diet	TD_sport_other	TD_start_ups	TD_tech_IT	TD_technologies	TD_thoughts_ideas	TD_tourism	TD_travel	TD_TV	TD_ukraine	TD_video	TD_way_of_life	TD_wedding_communities	TD_women_communities	TD_workout	TD_youth_communities	TD_sum	animal_owner
0	520001	NaN	M	0.9	0	78	1	2	0	0	0	0.24	0.12	0.00338	1	1478	2011-08-23 00:00:00	2008-12-25 00:00:00	0.75	1	0	0	0	65	97	2	1	0	1	0	0	0	3	1	18	5	5	23	9	-	1	-	4	1	0	-	0	-	1	6	8	7	-4.07950	-2.42172	-4.79584	-6.03038	121	-3.95212	-4.19340	0	1	0	8.08853	-	5.42952	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	10	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0.133333	0	0	0	0.133333	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0.133333	0	0	0	0	0	0	0	0	0	0.133333	0	0	0	0	0	0	0	0	0	0.066667	0	0	0	0	0	0	0	0	0	0	0	0	0.066667	0	0	0	0	0	0	0	0	0	0	0.066667	0	0	0	0	0	0	0.066667	0	0	0	0.2	15	0
1	520007	NaN	M	-	2	92	1	2	0	0	0.09053	0.39506	0.16049	0.33846	39	2288	2012-01-10 00:00:00	2009-03-02 00:00:00	0	0	0	0	0	60	94	3	1	1	1	0	1	0	-	6	209	22	3	234	19	16	1	10	14	4	0	0	3	3	2	-	6	7	-3.57355	-1.58798	-4.35452	-5.16231	132	-4.28645	-4.61878	2	0	1	7.90421	7.90421	5.04392	1	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	520013	NaN	F	-	0	80	-	-	-	-	0.03356	0.44295	0.15436	0.09244	0	2495	2014-05-14 00:00:00	2008-08-19 00:00:00	-	-	-	-	-	32	101	3	1	2	0	-	0	-	-	-	70	-	44	181	28	28	0	48	238	-	2	1	13	15	0	-	-	0	-3.98220	-1.75361	-5.49793	-6.42880	143	-4.40719	-4.61141	0	0	0	8.44049	8.36627	-	1	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	-	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

In [9]:

# чтобы создать csv-файлы
data.to_csv('data.csv', index=False)
day1.to_csv('day1.csv', index=False)
day2.to_csv('day2.csv', index=False)

целевой признак¶

In [11]:

figsize(12, 4)
plt.hist(data.NetSales.values, bins=100, color='#0088DD')
plt.xlim([0, 50000])
plt.xlabel('net sales')
plt.ylabel('count')

Out[11]:

<matplotlib.text.Text at 0x2a31cf60>

In [6]:

# число нулевых и больших значений
np.mean(data.NetSales.values==0), np.mean(data.NetSales.values > 10000)

Out[6]:

(0.62155897435897434, 0.048105128205128203)

In [20]:

# после логарифмирования целевого признака
figsize(12, 4)
plt.hist(np.log(data.NetSales.values + 1.0), bins=60, color='#0088DD')
plt.ylim([0, 10000])
plt.xlabel('log(net sales + 1)')
plt.ylabel('count')

Out[20]:

<matplotlib.text.Text at 0x6bb135c0>

вывод:¶

лучше логарифмировать

работа с пропусками + новый признак¶

In [5]:

from time import time

def make_feature_matrix(data):
    tm = time()
    
    data.fillna(-2, inplace=True)
    data.replace('-', -1, inplace=True)
    data.gender = data.gender.map({'F': -1, 'M': +1, -2: 0})
    data.MaxRegDate = pd.to_datetime(data.MaxRegDate.replace(-1, datetime.datetime(2010, 6, 6, 0, 0)).replace(-2, datetime.datetime(2009, 7, 7, 0, 0)))
    data.MinRegDate = pd.to_datetime(data.MinRegDate.replace(-1, datetime.datetime(2010, 6, 6, 0, 0)).replace(-2, datetime.datetime(2009, 7, 7, 0, 0)))
    # print (tmp.max()) # Timestamp('2015-09-08 00:00:00') 2015-08-17 00:00:00
    # print (tmp.min()) # Timestamp('2006-03-06 00:00:00') 2006-03-04 00:00:00
    data.deltatime = data.MaxRegDate - data.MinRegDate
    data.deltatime = data.deltatime.dt.total_seconds() /(60*60*24)
    data.MaxRegDate = (data.MaxRegDate - pd.to_datetime('2006-01-01T00:00:00.000000000')).dt.total_seconds() /(60*60*24)
    data.MinRegDate = (data.MinRegDate - pd.to_datetime('2006-01-01T00:00:00.000000000')).dt.total_seconds() /(60*60*24)
    
    data = data.astype(float)
    
    print ('time = ' + str(time() - tm))
    return (data)

In [6]:

data = make_feature_matrix(data)

time = 9.350315570831299

In [7]:

day1 = make_feature_matrix(day1)
day2 = make_feature_matrix(day2)

time = 2.025214433670044
time = 2.039580821990967

Визуализации¶

In [8]:

# СРАЗУ логарифмируем
data.NetSales = np.log(data.NetSales.values + 1)

признаки со своими значениями¶

In [10]:

for name in data.columns:
    tmp = data[name].unique()
    n = len(tmp)
    if n > 5:
        tmp = tmp[:5]
    print (name, n, tmp)

ID 390000 [  1.   7.  13.  19.  25.]
NetSales 101 [ 0.          5.19849703  8.49236757  8.14118979  8.65538869]
gender 3 [-1.  1.  0.]
ActivityIndex 890 [-1.    4.58  5.74  4.75  4.13]
Alcohol 7 [ 0. -2.  4.  3.  2.]
AverageFriendsRegMonthDelta 115 [ 92.  -1.  96.  89.  97.]
CanSeeAllPosts 4 [-1.  0.  1. -2.]
CanSeeAllPosts_CanPost 5 [-1.  1.  2. -2.  0.]
EducationType 15 [ 12.  -1.   0.  10.   7.]
EducationType_VK 14 [ -1.   0.  -2.  13.   7.]
FriendsAverageDeletedAccounts 12159 [ 0.06667  0.       0.11579  0.02691  0.01754]
FriendsAverageHasHighSchools 35096 [ 0.33333  0.45263  0.72197  0.625    0.08434]
FriendsAverageHasJobs 24591 [ 0.06667  0.22222  0.06316  0.10762  0.33114]
FriendsPerDay 32199 [ 0.01768 -1.       0.03699  0.07254  0.15117]
HasPhone 3 [ 0. -2.  1.]
HasSkype 3 [ 0.  1. -2.]
HasTwitter 3 [ 0. -2.  1.]
LastActivity 2324 [ 113.   79.  145.    3.    0.]
LifeTime 3396 [  2.62300000e+03  -1.00000000e+00   2.51400000e+03   2.57800000e+03
   2.78500000e+03]
MaxRegDate 3422 [ 2483.   903.   779.   907.   697.]
MinRegDate 3399 [ 749.  903.  779.  890.  697.]
MobileUsageAll 21353 [-1.       0.93583  0.51657  0.84138  0.27027]
MobileUsageAndroid 9731 [-1.       0.       0.05    -2.       0.84615]
MobileUsageIPad 5523 [-1.       0.65714  0.       0.95902  0.95   ]
MobileUsageIphone 9688 [-1.       0.10857  0.9893   0.04098  0.     ]
MobileUsageWinPhone 2712 [-1.       0.      -2.       0.01     0.21429]
MonthsFromMaxRegDate 117 [  51.  103.  107.  110.   61.]
MonthsFromMinRegDate 116 [ 108.  103.  107.  110.   61.]
NumberOfAccounts 41 [ 2.  1.  4.  3. -2.]
NumberOfAccountsMOIMIR 15 [ 0.  1. -2.  2.  3.]
NumberOfAccountsODKL 32 [ 2.  0.  1.  3. -2.]
NumberOfAccountsVK 27 [ 0.  1. -2.  2.  5.]
NumberOfAdvancedSchools 7 [ 0.  2. -2.  1.  3.]
NumberOfChilds 16 [ 0. -2.  2.  1.  3.]
NumberOfChilds_VK 19 [-1.  0. -2.  2.  1.]
NumberOfCompanies 13 [ 0.  1.  2. -2.  4.]
NumberOfDeletedAccounts 13 [ 0.  1. -2.  2.  3.]
NumberOfDeletedAccounts_VK 10 [-1.  0. -2.  1.  2.]
NumberOfEntrance 1030 [  -1.  123.  253.  106.   54.]
NumberOfFollowers 978 [  -1.  105.   96.   27.   10.]
NumberOfFriendsMax 2444 [  14.    8.   93.  187.  421.]
NumberOfFriendsMax_VK 2080 [  -1.  187.  421.  140.   59.]
NumberOfFriendsMin 1354 [   1.    8.   93.   16.  421.]
NumberOfFriendsSum 2790 [  15.    8.   93.  203.  421.]
NumberOfGroupsMax 568 [ -1.   3.  28.  35.  14.]
NumberOfGroupsSum_OK 557 [ -1.   3.  35.   5.  -2.]
NumberOfHighSchools 9 [ 1.  0.  2. -2.  3.]
NumberOfNotesMax 1361 [  0.  -1.   7.  38.   1.]
NumberOfPhotosMax 2946 [   1.    4.   11.   96.  128.]
NumberOfPhotosMin_VK 2448 [  -1.   96.  128.   23.   41.]
NumberOfPrivateAccounts 8 [ 0.  1. -2.  2.  3.]
NumberOfPrivateAccounts_OK 9 [ 0. -1.  1. -2.  2.]
NumberOfRelatives 106 [  4.   0.  27.  10.   2.]
NumberOfRelatives_OK 109 [  4.  -1.   0.  27.  10.]
NumberOfSchools 17 [ 1.  0.  2. -2.  4.]
NumberOfSubscriptions 896 [  -1.   31.   59.  112.   13.]
NumberOfVideos 2194 [  -1.  246.   22.  107.    2.]
Relation 9 [ 0.  7. -2.  6.  4.]
SaScore1 47081 [-3.82756 -4.0604  -4.00498 -3.97826 -4.45843]
SaScore2 26085 [-1.6738  -1.92671 -1.88295 -2.7878  -3.21992]
SaScore5 86330 [-5.15525 -4.93723 -5.82966 -5.45857 -5.08907]
SaScore6 36547 [-6.52538 -6.52939 -7.60862 -7.31933 -6.78496]
SaScore7 157 [ 113.  110.  108.   91.   99.]
SaScoreFraud 118106 [-4.14635 -3.98769 -4.59382 -4.87099 -4.95631]
SaScoreSocial 14808 [-4.86583 -5.21042 -4.90845 -5.04582 -3.19845]
Smoking 7 [ 0. -2.  2.  3.  1.]
UseScreenName 3 [ 0.  1. -2.]
Worldviews 8 [ 0.  1. -2.  6.  2.]
YearsSinceMinRegDate 185607 [ 9.0197   8.59808  8.93898  8.63466  9.16308]
YearsSinceMinRegDate_OK 165120 [ 9.0197  -1.       8.93898  5.14489  5.52771]
YearsSinceMinRegDate_VK 95940 [-1.       8.58786  9.16308  9.24801  6.67429]
NumberOfAccountsFB 6 [-2.  1.  2.  3.  4.]
Interest_1 13 [-1.  0. -2.  1.  2.]
Interest_2 13 [-1.  0.  1.  9. -2.]
Interest_3 13 [-1.  0.  1. -2.  7.]
Interest_4 13 [-1.  4.  0.  1. -2.]
Interest_5 13 [-1.  0.  2.  4.  7.]
Interest_6 13 [-1.  6.  0. -2.  5.]
Interest_7 13 [-1.  1.  4.  3.  7.]
Interest_8 13 [-1.  0.  7.  9. -2.]
Interest_9 13 [-1.  8.  2.  0. -2.]
Interest_10 13 [-1.  5.  0. -2.  8.]
Interest_11 13 [-1.  8.  2.  1.  0.]
Interest_12 13 [-1.  0.  8.  6. -2.]
Interest_13 13 [-1.  7.  0.  3.  8.]
Interest_14 13 [-1.  0.  5. -2.  6.]
Interest_15 13 [-1.  6.  0. -2.  8.]
Interest_16 13 [-1.  0. -2.  2.  5.]
Interest_17 13 [ -1.  10.   0.   7.  -2.]
Interest_18 13 [-1.  4.  1.  2.  0.]
Interest_19 32 [-1.  0. -2.  1.  2.]
Interest_20 44 [-1.  0. -2.  4.  2.]
Interest_21 41 [-1.  0. -2.  1.  2.]
Interest_22 9 [-1.  0. -2.  1.  2.]
Interest_23 27 [-1.  0. -2.  1.  2.]
Interest_24 8 [-1.  0. -2.  1.  2.]
Interest_25 43 [-1.  0. -2.  1.  2.]
TD_acquaint_communic 5996 [-2.          0.          0.00641026  0.00219298  0.03448276]
TD_acquaintances 2023 [-2.          0.          0.00892857  0.05128205  0.00341297]
TD_active_rest 3710 [-2.          0.02631579  0.          0.00438596  0.00571429]
TD_ad 5298 [-2.          0.          0.00641026  0.00657895  0.02631579]
TD_animals 3453 [-2.          0.00877193  0.          0.00438596  0.02631579]
TD_anime_hentai 2571 [-2.          0.          0.00657895  0.03296703  0.05517241]
TD_architecture 1196 [-2.          0.          0.25        0.00202429  0.00826446]
TD_art_design 10838 [-2.          0.03508772  0.12179487  0.0877193   0.05263158]
TD_beautiful_girls 4269 [-2.          0.          0.03448276  0.00990099  0.00892857]
TD_beauty 7653 [-2.          0.          0.00641026  0.01754386  0.02631579]
TD_books 4973 [-2.          0.03508772  0.          0.00438596  0.05882353]
TD_business 8589 [-2.          0.00877193  0.00641026  0.01535088  0.02631579]
TD_cars 5732 [-2.          0.00877193  0.          0.00219298  0.02631579]
TD_cartoons 3379 [-2.          0.          0.00219298  0.03571429  0.01408451]
TD_caucasian 1150 [-2.          0.          0.01219512  0.00241741  0.04545455]
TD_celebrities 6243 [-2.          0.          0.00641026  0.00657895  0.05882353]
TD_children 4044 [-2.          0.          0.01096491  0.07692308  0.00925926]
TD_cognitive 9558 [-2.          0.03508772  0.          0.02850877  0.02631579]
TD_companies 2527 [-2.          0.          0.00505051  0.01098901  0.01980198]
TD_computers 1922 [-2.          0.          0.00341297  0.00298507  0.00271739]
TD_cookery 7889 [-2.          0.          0.00641026  0.01315789  0.02631579]
TD_design 6564 [-2.          0.          0.03846154  0.03508772  0.02631579]
TD_design_renovation 3011 [-2.          0.          0.00219298  0.05882353  0.00892857]
TD_diets 4768 [-2.          0.          0.00219298  0.00571429  0.02197802]
TD_do_yourself 5609 [-2.          0.          0.01923077  0.04605263  0.02631579]
TD_electronics_electrappliances 1635 [-2.          0.          0.00892857  0.00925926  0.00298507]
TD_entertainment 14363 [-2.          0.12280702  0.02564103  0.03947368  0.02631579]
TD_family_home 9525 [-2.          0.          0.08333333  0.0372807   0.02631579]
TD_fashion 6833 [-2.          0.          0.00641026  0.00219298  0.00571429]
TD_finance 4269 [-2.          0.00877193  0.          0.00219298  0.03448276]
TD_fitness 4814 [-2.          0.          0.00641026  0.00219298  0.01098901]
TD_football 3589 [-2.          0.          0.00990099  0.00892857  0.01010101]
TD_foreign_lang 4964 [-2.          0.02631579  0.          0.00219298  0.03448276]
TD_gadgets 2448 [-2.          0.          0.00641026  0.00990099  0.00588235]
TD_games 6274 [-2.          0.          0.03571429  0.01408451  0.02970297]
TD_geopolitics_economy 1730 [-2.          0.          0.01869159  0.00341297  0.01886792]
TD_gif 7 [ -2.00000000e+00   0.00000000e+00   5.24934383e-04   1.82149362e-03
   5.88235294e-03]
TD_goods_services 13477 [-2.          0.04385965  0.12179487  0.05921053  0.02631579]
TD_hobbies 5797 [-2.          0.03508772  0.00641026  0.02412281  0.        ]
TD_horoscope 4800 [-2.          0.          0.00892857  0.01010101  0.02941176]
TD_horror 2543 [-2.          0.          0.01010101  0.00588235  0.00271739]
TD_humour 11736 [-2.          0.00877193  0.01282051  0.00657895  0.13157895]
TD_images 6570 [-2.          0.02631579  0.03846154  0.00877193  0.        ]
TD_insurance 753 [ -2.00000000e+00   0.00000000e+00   1.65837479e-03   4.75737393e-04
   1.05374078e-03]
TD_interior 5219 [-2.          0.          0.01923077  0.02631579  0.01142857]
TD_kazakhstan 452 [ -2.00000000e+00   0.00000000e+00   3.04878049e-03   2.18340611e-03
   4.75737393e-04]
TD_landscape_design 2146 [-2.          0.          0.01535088  0.00233645  0.00925926]
TD_literature_poetry 6164 [-2.          0.05263158  0.          0.00438596  0.05882353]
TD_mass_media 3712 [-2.          0.          0.00641026  0.00219298  0.05882353]
TD_mass_media_ad_PR 7873 [-2.          0.          0.01923077  0.01535088  0.02631579]
TD_men_communities 10506 [-2.          0.00877193  0.00641026  0.01535088  0.05263158]
TD_mobile_internet 1849 [-2.          0.          0.25        0.01666667  0.00341297]
TD_motivation 7531 [-2.          0.          0.00641026  0.01315789  0.02631579]
TD_moto 2056 [-2.          0.          0.01282051  0.00298507  0.00396825]
TD_movies 8353 [-2.          0.00877193  0.01282051  0.01096491  0.02631579]
TD_music 9498 [-2.          0.0877193   0.          0.00877193  0.07142857]
TD_names 2440 [-2.          0.          0.02631579  0.00429185  0.00233645]
TD_nature 2412 [-2.          0.          0.00505051  0.01666667  0.00429185]
TD_nature_and_travel 6178 [-2.          0.01754386  0.          0.00877193  0.02631579]
TD_nostalgia 2729 [-2.          0.          0.00877193  0.03571429  0.01408451]
TD_other_services 8127 [-2.          0.          0.03205128  0.02631579  0.03571429]
TD_parents_communities 6352 [-2.          0.          0.02412281  0.02631579  0.07142857]
TD_philosophy_esoterics 7188 [-2.          0.          0.01535088  0.05263158  0.05882353]
TD_photo 5985 [-2.          0.00877193  0.02564103  0.00219298  0.02631579]
TD_poetry 3771 [-2.          0.01754386  0.          0.01980198  0.00793651]
TD_politics 2668 [-2.          0.          0.05882353  0.10526316  0.01666667]
TD_professions 4612 [-2.          0.          0.01973684  0.1         0.00892857]
TD_proposed_news 5848 [-2.          0.          0.00641026  0.00219298  0.03448276]
TD_real_estate 3508 [-2.          0.          0.00641026  0.05263158  0.01714286]
TD_regional_communities 11665 [-2.          0.02631579  0.03205128  0.02192982  0.11764706]
TD_relations 5286 [-2.          0.          0.00641026  0.00990099  0.01785714]
TD_religion 2848 [-2.          0.          0.01096491  0.00990099  0.00793651]
TD_rest 8064 [-2.          0.06140351  0.02564103  0.0745614   0.02631579]
TD_russia 7429 [-2.          0.00877193  0.01282051  0.00438596  0.        ]
TD_science 3627 [-2.          0.00877193  0.          0.00505051  0.01666667]
TD_science_education 11078 [-2.          0.07017544  0.          0.03289474  0.02631579]
TD_shops 11056 [-2.          0.04385965  0.08333333  0.02850877  0.02631579]
TD_slimming 4396 [-2.          0.          0.00641026  0.01098901  0.01492537]
TD_society 5839 [-2.          0.          0.03070175  0.05882353  0.1       ]
TD_soft 3122 [-2.          0.          0.00641026  0.00990099  0.01098901]
TD_softporno_porno 5055 [-2.          0.          0.02970297  0.00892857  0.04040404]
TD_sport_and_health 9092 [-2.          0.04385965  0.01923077  0.01315789  0.        ]
TD_sport_diet 3467 [-2.          0.01754386  0.00641026  0.          0.01098901]
TD_sport_other 4566 [-2.          0.01754386  0.          0.01096491  0.00571429]
TD_start_ups 2127 [-2.          0.          0.01408451  0.00505051  0.00429185]
TD_tech_IT 4793 [-2.          0.          0.01282051  0.01980198  0.00892857]
TD_technologies 3714 [-2.          0.          0.00219298  0.00892857  0.01010101]
TD_thoughts_ideas 4560 [-2.          0.          0.01980198  0.00746269  0.00793651]
TD_tourism 3733 [-2.          0.          0.00438596  0.00990099  0.01980198]
TD_travel 5229 [-2.          0.00877193  0.          0.00438596  0.02816901]
TD_TV 6130 [-2.          0.02631579  0.          0.03448276  0.04347826]
TD_ukraine 2972 [-2.          0.          0.00571429  0.00934579  0.05681818]
TD_video 3747 [-2.          0.          0.01282051  0.00219298  0.02816901]
TD_way_of_life 5709 [-2.          0.          0.01535088  0.02631579  0.05882353]
TD_wedding_communities 4654 [-2.          0.          0.07692308  0.10714286  0.1       ]
TD_women_communities 11585 [-2.          0.          0.01282051  0.03289474  0.05263158]
TD_workout 3017 [-2.          0.00877193  0.          0.03448276  0.01098901]
TD_youth_communities 10711 [-2.          0.02631579  0.01923077  0.00877193  0.        ]
TD_sum 1782 [  -2.  114.  156.  456.   38.]
animal_owner 3 [-2.  0.  1.]

исследование отдельных признаков¶

In [12]:

name = 'Alcohol'
figsize(12, 4)
print (data.groupby(name)['NetSales'].count())
tmp = data.groupby(name)['NetSales'].mean()
plt.bar(tmp.index, tmp, color='#0088DD')
plt.xlabel(name)
plt.ylabel('NetSales')

Alcohol
-2.0     97362
 0.0    281298
 1.0      1584
 2.0      2485
 3.0      3695
 4.0      3025
 5.0       551
Name: NetSales, dtype: int64

Out[12]:

<matplotlib.text.Text at 0x7f4885952d68>

In [32]:

name = 'Alcohol'
tmp = data.groupby(name)['NetSales'].count()
plt.bar(tmp.index, tmp, color='#0088DD')
plt.xlabel(name)
plt.ylabel('count')

Out[32]:

<matplotlib.text.Text at 0x323469e8>

In [34]:

# как всегда много неизвестных значений
(data[name]<=0).mean()

Out[34]:

0.97092307692307689

In [37]:

name = 'gender'
tmp = data.groupby(name)['NetSales'].count()
plt.bar(tmp.index, tmp, color='#0088DD')
plt.xlabel(name)
plt.ylabel('count')

Out[37]:

<matplotlib.text.Text at 0x4b54d080>

In [15]:

name = 'gender'
plt.figure(figsize=(12, 5))
print (data.groupby(name)['NetSales'].count())
tmp = data.groupby(name)['NetSales'].mean()
plt.bar(tmp.index, tmp, color='#0088DD')
plt.xlabel(name)
plt.ylabel('NetSales')
plt.ylim([2.7, 3.1])
plt.xticks([-1.0, 0.0, 1.0], [u'Female', u'unknown', u'Male'])

gender
-1.0    231186
 0.0      9800
 1.0    149014
Name: NetSales, dtype: int64

Out[15]:

([<matplotlib.axis.XTick at 0x7f806e94e390>,
  <matplotlib.axis.XTick at 0x7f806e964d68>,
  <matplotlib.axis.XTick at 0x7f806fe8c1d0>],
 <a list of 3 Text xticklabel objects>)

In [55]:

name = 'CanSeeAllPosts'
print (data.groupby(name)['NetSales'].count())
tmp = data.groupby(name)['NetSales'].mean()
plt.bar(tmp.index, tmp, color='#0088DD')
plt.xlabel(name)
plt.ylabel('NetSales')
# plt.xticks([-0.6, 0.4, 1.4], [u'Female', u'unknown', u'Male'])

CanSeeAllPosts
-2     97362
-1    149089
 0     61687
 1     81862
Name: NetSales, dtype: int64

Out[55]:

<matplotlib.text.Text at 0x450caac8>

In [65]:

(data.CanSeeAllPosts<0).mean()

Out[65]:

0.63192564102564097

In [66]:

data.EducationType.unique()

Out[66]:

array([ 12.,  -1.,   0.,  10.,   7.,  13.,   6.,  -2.,   5.,   9.,   8.,
        11.,   4.,   3.,   2.])

In [67]:

plt.figure(figsize=(12, 4))
name = 'EducationType'
print (data.groupby(name)['NetSales'].count())
tmp = data.groupby(name)['NetSales'].mean()
plt.bar(tmp.index, tmp, color='#0088DD')
plt.xlabel(name)
plt.ylabel(u'среднее log(NetSales)') # NetSales
plt.xticks([-2,-1,0, 2,3,4,5,6,7,8,9,10,11,12,13], [u'none',u'--',u'Школа', u'Проф.училище', u'ПТУ', u'Лицей',
                                                   u'Колледж', u'Техникум', u'Филиал', u'Академия',
                                                   u'Институт', u'Институт', u'Училище', u'Университет',
                                                   u'Гос. инст-ты и унив-ты'], rotation=50)
plt.show()

EducationType
-2      97362
-1     129559
 0      38562
 2        331
 3       4023
 4       1194
 5       9283
 6       6405
 7      15624
 8       7223
 9      10893
 10     12541
 11      5246
 12     12093
 13     39661
Name: NetSales, dtype: int64

In [69]:

name = 'EducationType'

plt.figure(figsize=(12, 4))

tmp = data.groupby(name)['NetSales'].count()
plt.bar(tmp.index, tmp, color='#0088DD')
plt.xlabel(name)
plt.ylabel('count')

# plt.xticks([-0.6, 0.4, 1.4], [u'Female', u'unknown', u'Male'])
plt.xticks([-2,-1,0, 2,3,4,5,6,7,8,9,10,11,12,13], [u'none',u'--',u'Школа', u'Проф.училище', u'ПТУ', u'Лицей',
                                                   u'Колледж', u'Техникум', u'Филиал', u'Академия',
                                                   u'Институт', u'Институт', u'Училище', u'Университет',
                                                   u'Гос. инст-ты и унив-ты'], rotation=50)
plt.show()

In [71]:

e1 = 0.3*np.random.rand(data.shape[0])
e2 = 0.3*np.random.rand(data.shape[0])
plt.scatter(data.EducationType.values + e1, data.EducationType_VK.values + e2)

Out[71]:

<matplotlib.collections.PathCollection at 0x40484c50>

In [74]:

data.groupby(['EducationType', 'EducationType_VK'])['NetSales'].count().unstack()

Out[74]:

EducationType_VK	-2.0	-1.0	0.0	2.0	3.0	4.0	5.0	6.0	7.0	8.0	9.0	11.0	12.0	13.0
EducationType
-2	97362	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
-1	NaN	129559	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
0	NaN	3205	35357	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	NaN	301	23	7	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	NaN	3759	242	7	15	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	NaN	1077	115	NaN	NaN	2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	NaN	7801	1450	NaN	NaN	NaN	32	NaN	NaN	NaN	NaN	NaN	NaN	NaN
6	NaN	5659	704	1	1	NaN	2	38	NaN	NaN	NaN	NaN	NaN	NaN
7	NaN	7971	5028	NaN	1	NaN	NaN	7	2349	6	19	1	238	4
8	NaN	4367	2654	NaN	NaN	NaN	NaN	3	NaN	199	NaN	NaN	NaN	NaN
9	NaN	6715	4100	NaN	NaN	NaN	2	NaN	NaN	4	72	NaN	NaN	NaN
10	NaN	7508	5014	NaN	NaN	NaN	3	1	NaN	13	2	NaN	NaN	NaN
11	NaN	4078	1144	NaN	1	NaN	3	1	NaN	2	2	15	NaN	NaN
12	NaN	6792	3871	NaN	NaN	1	NaN	1	NaN	33	9	1	1385	NaN
13	NaN	22340	16990	NaN	NaN	NaN	5	1	NaN	20	12	4	221	68

In [87]:

data.groupby(['HasPhone', 'HasSkype', 'HasTwitter'])['NetSales'].agg({'mean': np.mean, 'count': lambda x: x.count()}) #.unstack()

Out[87]:

			count	mean
HasPhone	HasSkype	HasTwitter
-2	-2	-2	97362	2.937212
0	0	0	253128	2.994881
	0	1	1312	2.807146
	1	0	16811	2.872637
	1	1	1589	2.796710
1	0	0	11847	2.460538
	0	1	246	2.337781
	1	0	6959	2.276721
	1	1	746	2.445111

In [76]:

data.groupby(['HasPhone', 'HasSkype', 'HasTwitter'])['NetSales'].mean() #.unstack()

Out[76]:

HasPhone  HasSkype  HasTwitter
-2        -2        -2            2.937212
 0         0         0            2.994881
                     1            2.807146
           1         0            2.872637
                     1            2.796710
 1         0         0            2.460538
                     1            2.337781
           1         0            2.276721
                     1            2.445111
Name: NetSales, dtype: float64

In [91]:

plt.figure(figsize=(7,6))
plt.scatter(data.LastActivity, data.LifeTime, 5, data.NetSales > 0)
plt.xlim([-2.5, 3500])
plt.ylim([-2.5, 3500])
plt.xlabel('LastActivity')
plt.ylabel('LifeTime')

Out[91]:

<matplotlib.text.Text at 0x5b39a358>

In [ ]:

#data.groupby(['MobileUsageAndroid', 'MobileUsageIPad', 'MobileUsageIphone'])['NetSales'].agg({'mean': np.mean, 'count': lambda x: x.count()}) #.unstack()
plt.figure(figsize=(7,6))
plt.scatter(data.MobileUsageAndroid, data.MobileUsageIphone, 5, data.NetSales > 0)
plt.xlim([-0.001, 1.001])
plt.ylim([-0.001, 1.001])
plt.xlabel('MobileUsageAndroid')
plt.ylabel('MobileUsageIphone')

In [45]:

# самые важные признаки
plt.figure(figsize=(7,6))
plt.scatter(data.AverageFriendsRegMonthDelta.values, data.YearsSinceMinRegDate.values, 5, data.NetSales.values > 0)
# plt.xlim([-2.5, 3500])
# plt.ylim([-2.5, 3500])
plt.xlabel('AverageFriendsRegMonthDelta')
plt.ylabel('YearsSinceMinRegDate')

Out[45]:

<matplotlib.text.Text at 0x233a9860>

In [55]:

np.mean(data.NetSales.values[data.AverageFriendsRegMonthDelta.values > 65])

Out[55]:

3.113393404629742

In [58]:

np.mean(data.NetSales.values[data.AverageFriendsRegMonthDelta.values <= 65])

Out[58]:

2.7461372315894477

In [57]:

np.mean(data.NetSales.values[data.YearsSinceMinRegDate.values > 6])

Out[57]:

3.1016901085295796

In [60]:

np.mean(data.NetSales.values[data.YearsSinceMinRegDate.values <= 6])

Out[60]:

2.681165505975696

In [ ]:

# самые важные признаки
plt.figure(figsize=(7,6))
plt.scatter(data.AverageFriendsRegMonthDelta.values, data.YearsSinceMinRegDate.values, 5, data.NetSales.values > 0)
# plt.xlim([-2.5, 3500])
# plt.ylim([-2.5, 3500])
plt.xlabel('AverageFriendsRegMonthDelta')
plt.ylabel('YearsSinceMinRegDate')

In [ ]:

u'', u'MinRegDate', u'MonthsFromMinRegDate',
       u''

In [44]:

np.mean(data.MobileUsageAndroid < 0)

Out[44]:

0.6480948717948718

In [35]:

n_nans = ((data!=-1) & (data!=-2)).mean(0)
plt.figure(figsize=(12, 4))
plt.bar(np.arange(len(n_nans)), n_nans, color='#0088DD')
plt.xlabel(u'номер признака')
plt.ylabel(u'% известных значений')

Out[35]:

<matplotlib.text.Text at 0x1d4a8c88>

In [39]:

n_nans[n_nans < 0.3]

Out[39]:

NetSales                 0.000000
EducationType_VK         0.207285
NumberOfSubscriptions    0.288723
dtype: float64

In [40]:

n_nans = ((data!=-1) & (data!=-2)).mean(1)
plt.figure(figsize=(12, 4))
plt.bar(np.arange(len(n_nans)), n_nans, color='#0088DD')
plt.xlabel(u'номер признака')
plt.ylabel(u'% известных значений')

Out[40]:

<matplotlib.text.Text at 0x345d0550>

Исследование алгоритмов¶

In [16]:

X = data.drop(['ID', 'NetSales'], 1)
y = data.NetSales.values

In [22]:

print (data.shape, X.shape, y.shape)

(390000, 197) (390000, 195) (390000,)

In [23]:

from scipy.integrate import trapz, simps

def hackathone_metric(df, do_draw=True):
    '''На фход подается словарь с ответами алгоритма(поле pred) 
    и фактическими значениями(поле test):
    df = {'pred': [2,3,55,63,1,2,100,100,10],'test': [1,2,3,4,5,0,99,121,14]}
    '''

    # Сортируем фактические продажи по возрастанию и считаем кумулятивную сумму – это нижняя линия
    n = range(1, len(df['test'])+1)
    c1 = np.cumsum(sorted(df['test'], reverse=False))

    # Сортируем фактические продажи по убыванию и считаем кумулятивную сумму – это верхняя линия
    c2 = np.cumsum(sorted(df['test'], reverse=True))

    # Площадь от верхней линии к нижней линии
    s1 = simps(c2, n) - simps(c1, n)

    # Теперь берём пары предсказаний и фактических продаж
    # Сортируем по возрастанию прогнозного значения
    c3 = np.cumsum(list(zip(*sorted(zip(df['pred'],df['test']), key=lambda x: x[0])))[1])
    s2 = simps(c3, n) -  simps(c1, n)
    
    if do_draw:
        plt.plot(n,c1, label='Lower Line')
        plt.plot(n,c2, label='Upper Line')
        plt.plot(n,c3, label='Actual Line')
        plt.legend()

    return (1-s2/s1)

In [32]:

# случайные разбиения выборки на фолды
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=10)

In [24]:

import lightgbm as lgb
from time import time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict


tm = time()
# cv = 5

# оптимизированный
clf = lgb.LGBMRegressor(learning_rate=0.06, n_estimators=50, nthread=-1, colsample_bytree=0.7, max_depth=8, min_child_samples=76, num_leaves=46)

cvp = cross_val_predict(estimator=clf, X=X, y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

# learning_rate=0.06, n_estimators=50

figsize(6, 5)
df = {'pred': cvp,'test': y}
print ("Metric: ", hackathone_metric(df))

# 0.620036885673 - 0.620268307369

45.27469348907471
Metric:  0.620268307369

In [ ]:

In [50]:

tm = time()
# cv = 5

# параметры по умолчанию
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

a_lgb = cross_val_predict(estimator=clf, X=X, y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': a_lgb,'test': y}
print ("Metric: ", hackathone_metric(df))
# 0.612690125729
# 0.614893275397
# 0.618805261162
# 0.621053597456 learning_rate=0.1

# 0.620026516158 - 0.62094301361

47.005587577819824
Metric:  0.62094301361

In [27]:

tm = time()
# cv = 5

# имитация xgboost
clf = lgb.LGBMRegressor(learning_rate=0.3, n_estimators=50, max_depth=4, subsample=0.8, colsample_bytree=0.8, nthread=-1)

cvp = cross_val_predict(estimator=clf, X=X, y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': cvp,'test': y}
print ("Metric: ", hackathone_metric(df))
# 0.619490675267
# 0.619746131906 learning_rate=0.2

# 0.619224715858 - 0.618768309225

25.263100385665894
Metric:  0.618768309225

In [28]:

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100,
                            criterion='mse', max_depth=None,
                            min_samples_split=200, min_samples_leaf=100,
                            min_weight_fraction_leaf=0.0,
                            max_features=20, max_leaf_nodes=None,
                            min_impurity_split=1e-07, bootstrap=True,
                            oob_score=False, n_jobs=-1, random_state=10, # None
                            verbose=0, warm_start=False)


tm = time()
#cv = 5

a_rf = cross_val_predict(estimator=rf, X=X, y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': a_rf,'test': y}
print ("Metric: ", hackathone_metric(df))

# 0.61789219547 0.618003737937

233.68027639389038
Metric:  0.618003737937

In [31]:

cv

Out[31]:

In [33]:

from xgboost import XGBRegressor


gbm = XGBRegressor(max_depth=4, learning_rate=0.1,
                   n_estimators=10, silent=True,
                   objective='reg:linear', gamma=0.6,
                   min_child_weight=5, max_delta_step=0,
                   subsample=0.8, colsample_bytree=0.8,
                   colsample_bylevel=1, reg_alpha=0,
                   reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                   seed=1, missing=None, nthread=-1)


tm = time()
# cv = 5

a_gbm = cross_val_predict(estimator=gbm, X=X, y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': a_gbm,'test': y}
print ("Metric: ", hackathone_metric(df))
# 0.612381394838 0.611961390597

41.5311233997345
Metric:  0.611961390597

In [65]:

e_xgb = [0.612381394838, 0.611961390597]
e_rf = [0.61789219547, 0.618003737937]
e_lgb = [0.620026516158, 0.62094301361]
e_tunedlgb = [0.620036885673, 0.620268307369]


plt.figure(figsize=(12, 4))
plt.bar([1,2,3,4, 5], [e_rf[1], e_xgb[1], e_tunedlgb[1], e_lgb[1], 0.62105], color='#0088DD', width=0.3)
plt.bar(np.array([1,2,3,4])+0.1, [e_rf[0], e_xgb[0], e_tunedlgb[0], e_lgb[0]], width=0.3, color='#004499')
plt.xticks([1,2,3,4, 5], ['rf', 'xgboost', 'lgbm_tuned', 'lgb', 'rf+lgb'])
plt.ylim([0.61, 0.6225])

Out[65]:

(0.61, 0.6225)

In [55]:

# смесь алгоритмов

lspace = np.linspace(0, 1, 20)
e1 = []
e2 = []
e3 = []

for alpha in lspace:
    e1.append(hackathone_metric({'pred': alpha * a_lgb + (1 - alpha) * a_rf,'test': y}, do_draw=False))
    e2.append(hackathone_metric({'pred': alpha * a_lgb + (1 - alpha) * a_gbm,'test': y}, do_draw=False))
    e3.append(hackathone_metric({'pred': alpha * a_gbm + (1 - alpha) * a_rf,'test': y}, do_draw=False))

In [62]:

plt.figure(figsize=(12, 4))
plt.plot(lspace, e1, lw=2, color='#0088DD', label='rf + lgb')
plt.plot(lspace, e2, lw=2, color='#0044AA', label='gbm + lgb')
plt.plot(lspace, e3, lw=2, color='#000044', label='rf + gbm')
plt.legend()
plt.xlabel('коэффициент в комбинации')
plt.ylabel('метрика хакатона')

Out[62]:

<matplotlib.text.Text at 0x7f8078f52278>

In [63]:

max(e1)

Out[63]:

0.62105259128005375

In [39]:

tm = time()
cv = 5
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

cvp = cross_val_predict(estimator=clf, X=X[['AverageFriendsRegMonthDelta', 'MinRegDate']], y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': cvp,'test': y}
print ("Metric: ", hackathone_metric(df))

2.7375810146331787
Metric:  0.605694410257

In [43]:

clf.fit(X, y)

Out[43]:

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
       fair_c=1.0, gaussian_eta=1.0, huber_delta=1.0, learning_rate=0.1,
       max_bin=255, max_depth=-1, max_drop=50, min_child_samples=10,
       min_child_weight=5, min_split_gain=0, n_estimators=50, nthread=-1,
       num_leaves=31, objective='regression', poisson_max_delta_step=0.7,
       reg_alpha=0, reg_lambda=0, seed=0, silent=True, skip_drop=0.5,
       subsample=1, subsample_for_bin=50000, subsample_freq=1,
       uniform_drop=False, xgboost_dart_mode=False)

In [51]:

v = clf.feature_importances_
figure(figsize=(12, 40))
plt.barh(np.arange(len(v)), v, color='#0088DD')
plt.yticks(np.arange(len(v)), X.columns, rotation=0, fontsize=12)
plt.xlabel(u'важность признака')
plt.show()

In [ ]:

imp = sorted(imp.items(), key=lambda x: x[1])

v = [x[1] for x in imp]
k = [x[0] for x in imp]
#imp.values()
plt.barh(np.arange(len(v)), v, color='#000099')
plt.yticks(np.arange(len(v)), k, rotation=0, fontsize=12)
plt.xlabel(u'важность признака')
plt.show()

In [62]:

mean((X>0).mean(0) > 0.25)

Out[62]:

0.23076923076923078

In [64]:

tm = time()
cv = 5
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

cvp = cross_val_predict(estimator=clf, X=X[X.columns[(X>0).mean(0) > 0.25]], y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': cvp,'test': y}
print ("Metric: ", hackathone_metric(df))

12.539789199829102
Metric:  0.619544190149

In [65]:

tm = time()
cv = 5
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

cvp = cross_val_predict(estimator=clf, X=X[X.columns[(X>0).mean(0) <= 0.25]], y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': cvp,'test': y}
print ("Metric: ", hackathone_metric(df))

34.94797110557556
Metric:  0.59888099705

In [66]:

tm = time()
cv = 5
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

cvp = cross_val_predict(estimator=clf, X=X[X.columns[(X>0).mean(0) <= 0.5]], y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': cvp,'test': y}
print ("Metric: ", hackathone_metric(df))

43.57594633102417
Metric:  0.608520849963

In [ ]:

In [21]:

оптимизация параметров¶

In [73]:

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


random_state = 1

space = {
    'n_estimators': 50, # hp.quniform('n_estimators', 10, 10, 0),
    'learning_rate': hp.quniform('eta', 0.01, 0.5, 0.01),
    # A problem with max_depth casted to float instead of int with
    'max_depth':  hp.choice('max_depth', np.arange(2, 14, dtype=int)),
    'num_leaves': hp.choice('num_leaves', np.arange(2, 62, dtype=int)),
    'min_child_samples': hp.choice('min_child_samples', np.arange(1, 100, dtype=int)),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.1),
     }

def score(params):
    tm = time()
    #print("Training with params: ")
    #print(params)
    #num_round = int(params['n_estimators'])
    #del params['n_estimators']
    
    clf = lgb.LGBMRegressor(max_depth=params['max_depth'],
                            learning_rate=params['learning_rate'],
                            n_estimators=100, # n_estimators=params['n_estimators'] 
                            num_leaves=params['num_leaves'],
                            min_child_samples=params['min_child_samples'],
                            subsample=params['subsample'],
                            colsample_bytree=params['colsample_bytree'])

    
    a = cross_val_predict(estimator=clf, X=X, y=y, n_jobs=-1 ,cv=3, method='predict')
    
    df = {'pred': a,'test': y}
    score = hackathone_metric(df, do_draw=False)
    
    if (score > 0.621):
        print(params)
        print("\tScore =  {0} Time = {1}\n\n".format(score, time() - tm))
    
    return ( -score ) # т.к. минимум


trials = Trials()
    
best = fmin(score, space, algo=tpe.suggest, # trials=trials,
            max_evals=300)

{'num_leaves': 46, 'learning_rate': 0.07, 'max_depth': 13, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 0.7000000000000001, 'min_child_samples': 45}
	Score =  0.6212414668005066 Time = 32.76194643974304


{'num_leaves': 25, 'learning_rate': 0.08, 'max_depth': 13, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 0.6000000000000001, 'min_child_samples': 66}
	Score =  0.6210239326097324 Time = 26.475329160690308


{'num_leaves': 44, 'learning_rate': 0.08, 'max_depth': 13, 'colsample_bytree': 0.6000000000000001, 'n_estimators': 50, 'subsample': 0.8, 'min_child_samples': 37}
	Score =  0.6213500215401293 Time = 34.462475538253784


{'num_leaves': 44, 'learning_rate': 0.08, 'max_depth': 9, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.8, 'min_child_samples': 11}
	Score =  0.6213584635394642 Time = 30.77764868736267


{'num_leaves': 36, 'learning_rate': 0.06, 'max_depth': 13, 'colsample_bytree': 0.8, 'n_estimators': 50, 'subsample': 0.7000000000000001, 'min_child_samples': 83}
	Score =  0.6212749481204666 Time = 32.0214319229126


{'num_leaves': 34, 'learning_rate': 0.09, 'max_depth': 13, 'colsample_bytree': 0.9, 'n_estimators': 50, 'subsample': 0.7000000000000001, 'min_child_samples': 83}
	Score =  0.6212382276584048 Time = 34.73256254196167


{'num_leaves': 55, 'learning_rate': 0.06, 'max_depth': 7, 'colsample_bytree': 0.8, 'n_estimators': 50, 'subsample': 0.7000000000000001, 'min_child_samples': 92}
	Score =  0.6214233827868774 Time = 35.246747732162476


{'num_leaves': 28, 'learning_rate': 0.1, 'max_depth': 7, 'colsample_bytree': 0.9, 'n_estimators': 50, 'subsample': 0.7000000000000001, 'min_child_samples': 92}
	Score =  0.6211886795695883 Time = 33.15094208717346


{'num_leaves': 49, 'learning_rate': 0.06, 'max_depth': 7, 'colsample_bytree': 0.9, 'n_estimators': 50, 'subsample': 0.7000000000000001, 'min_child_samples': 44}
	Score =  0.6213240446706889 Time = 42.02438187599182


{'num_leaves': 59, 'learning_rate': 0.07, 'max_depth': 9, 'colsample_bytree': 0.6000000000000001, 'n_estimators': 50, 'subsample': 0.8, 'min_child_samples': 32}
	Score =  0.6211079795917778 Time = 37.589173793792725


{'num_leaves': 48, 'learning_rate': 0.06, 'max_depth': 10, 'colsample_bytree': 0.6000000000000001, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 77}
	Score =  0.621520947851093 Time = 36.987398862838745


{'num_leaves': 48, 'learning_rate': 0.06, 'max_depth': 10, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 77}
	Score =  0.6214290942033145 Time = 33.662314653396606


{'num_leaves': 48, 'learning_rate': 0.06, 'max_depth': 10, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 77}
	Score =  0.621734831865407 Time = 40.123149156570435


{'num_leaves': 48, 'learning_rate': 0.04, 'max_depth': 10, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 77}
	Score =  0.6210922443005261 Time = 47.36481070518494


{'num_leaves': 40, 'learning_rate': 0.09, 'max_depth': 10, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 12}
	Score =  0.6210996722080271 Time = 26.13451886177063


{'num_leaves': 21, 'learning_rate': 0.1, 'max_depth': 8, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 36}
	Score =  0.6210778669612143 Time = 27.143941640853882


{'num_leaves': 47, 'learning_rate': 0.09, 'max_depth': 10, 'colsample_bytree': 0.8, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 95}
	Score =  0.6211730275746561 Time = 40.83017897605896


{'num_leaves': 61, 'learning_rate': 0.09, 'max_depth': 7, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 60}
	Score =  0.6210455132616435 Time = 34.27331829071045


{'num_leaves': 29, 'learning_rate': 0.07, 'max_depth': 11, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 38}
	Score =  0.6211107434333178 Time = 31.876691579818726


{'num_leaves': 57, 'learning_rate': 0.04, 'max_depth': 10, 'colsample_bytree': 0.9, 'n_estimators': 50, 'subsample': 0.7000000000000001, 'min_child_samples': 77}
	Score =  0.6212445186242708 Time = 52.71968913078308


{'num_leaves': 48, 'learning_rate': 0.07, 'max_depth': 8, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 0.7000000000000001, 'min_child_samples': 53}
	Score =  0.6213578913359767 Time = 33.70841193199158


{'num_leaves': 28, 'learning_rate': 0.11, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 48}
	Score =  0.6210118562572771 Time = 22.50042700767517


{'num_leaves': 43, 'learning_rate': 0.08, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 8}
	Score =  0.621403467875624 Time = 27.520830392837524


{'num_leaves': 29, 'learning_rate': 0.09, 'max_depth': 10, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 21}
	Score =  0.6211294350932425 Time = 29.940505266189575


{'num_leaves': 49, 'learning_rate': 0.06, 'max_depth': 10, 'colsample_bytree': 0.6000000000000001, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 17}
	Score =  0.6213657259336232 Time = 35.28687286376953


{'num_leaves': 54, 'learning_rate': 0.05, 'max_depth': 10, 'colsample_bytree': 0.6000000000000001, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 56}
	Score =  0.6213689723200626 Time = 35.399444580078125


{'num_leaves': 17, 'learning_rate': 0.11, 'max_depth': 5, 'colsample_bytree': 0.9, 'n_estimators': 50, 'subsample': 0.8, 'min_child_samples': 92}
	Score =  0.6210196852565769 Time = 26.239187717437744


{'num_leaves': 22, 'learning_rate': 0.08, 'max_depth': 8, 'colsample_bytree': 0.8, 'n_estimators': 50, 'subsample': 0.8, 'min_child_samples': 78}
	Score =  0.6211725425274872 Time = 30.440789222717285


{'num_leaves': 53, 'learning_rate': 0.05, 'max_depth': 7, 'colsample_bytree': 0.7000000000000001, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 4}
	Score =  0.6210215799354012 Time = 40.133230209350586


{'num_leaves': 37, 'learning_rate': 0.06, 'max_depth': 10, 'colsample_bytree': 0.8, 'n_estimators': 50, 'subsample': 1.0, 'min_child_samples': 89}
	Score =  0.6212048362587979 Time = 44.37146306037903


{'num_leaves': 43, 'learning_rate': 0.07, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 41}
	Score =  0.6214265769701983 Time = 29.642651319503784


{'num_leaves': 43, 'learning_rate': 0.08, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 82}
	Score =  0.6215410830206874 Time = 28.054609537124634


{'num_leaves': 43, 'learning_rate': 0.07, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 41}
	Score =  0.6214265769701983 Time = 30.163920640945435


{'num_leaves': 30, 'learning_rate': 0.12, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 43}
	Score =  0.6212417865325484 Time = 23.70250964164734


{'num_leaves': 43, 'learning_rate': 0.08, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 82}
	Score =  0.6215410830206874 Time = 29.03331160545349


{'num_leaves': 40, 'learning_rate': 0.08, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 46}
	Score =  0.6215094488639252 Time = 30.560876846313477


{'num_leaves': 40, 'learning_rate': 0.09, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 46}
	Score =  0.6212960141382518 Time = 27.892940044403076


{'num_leaves': 43, 'learning_rate': 0.08, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 82}
	Score =  0.6215410830206874 Time = 29.10798931121826


{'num_leaves': 21, 'learning_rate': 0.11, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 82}
	Score =  0.6210907751705355 Time = 23.617905378341675


{'num_leaves': 43, 'learning_rate': 0.08, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 82}
	Score =  0.6215410830206874 Time = 26.932640075683594


{'num_leaves': 43, 'learning_rate': 0.11, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 82}
	Score =  0.6213750985969947 Time = 28.538962602615356


{'num_leaves': 56, 'learning_rate': 0.08, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 82}
	Score =  0.6212737508187609 Time = 30.078145503997803


{'num_leaves': 43, 'learning_rate': 0.05, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.8, 'min_child_samples': 45}
	Score =  0.6210253611505576 Time = 29.760560512542725


{'num_leaves': 61, 'learning_rate': 0.06, 'max_depth': 7, 'colsample_bytree': 0.5, 'n_estimators': 50, 'subsample': 0.9, 'min_child_samples': 25}
	Score =  0.6212395266137987 Time = 30.0674090385437

In [83]:

figsize(12, 30)
imp = clf.booster().get_fscore()
imp = sorted(imp.items(), key=lambda x: x[1])

v = [x[1] for x in imp]
k = [x[0] for x in imp]
#imp.values()
plt.barh(np.arange(len(v)), v, color='#000099')
plt.yticks(np.arange(len(v)), k, rotation=0, fontsize=12)
plt.xlabel(u'важность признака')
plt.show()

In [74]:

best



# 0.612690125729
# 0.614893275397
# 0.618805261162
# 0.621053597456 learning_rate=0.1

# 0.620026516158 - 0.62094301361

Out[74]:

{'colsample_bytree': 0.7000000000000001,
 'eta': 0.06,
 'max_depth': 8,
 'min_child_samples': 76,
 'num_leaves': 46,
 'subsample': 1.0}

In [96]:

print (mean((X>0).mean(1)>0.34))


i = ((X>0).mean(1)>0.34)
i = i.values

0.210748717949

In [89]:

tm = time()
# cv = 5

# параметры по умолчанию
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

a_lgb = cross_val_predict(estimator=clf, X=X.values[i,:], y=y[i], n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': a_lgb,'test': y[i]}
print ("Metric: ", hackathone_metric(df))

14.081033945083618
Metric:  0.658059162287

In [106]:

0.658059162287

0.609510921237

0.62094301361


plt.figure(figsize=(12, 4))
plt.bar([1,2,3], [0.62094301361, 0.658059162287, 0.609510921237], color='#0088DD', width=0.3)
#plt.bar(np.array([1,2,3,4])+0.1, [e_rf[0], e_xgb[0], e_tunedlgb[0], e_lgb[0]], width=0.3, color='#004499')
plt.xticks([1,2,3], ['все признаки', 'треть известна', 'мало известных'])
plt.xlabel('известные значения признаков')
plt.ylim([0.60, 0.66])
plt.ylabel('метрика хакатона')

Out[106]:

<matplotlib.text.Text at 0x7f8077510278>

In [102]:

print (mean((X>0).mean(0)>0.5))


i = ((X>0).mean(0)>0.5)
i = i.values

0.0974358974359

In [105]:

tm = time()
# cv = 5

# параметры по умолчанию
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

a_lgb = cross_val_predict(estimator=clf, X=X.values, y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': a_lgb,'test': y}
print ("Metric: ", hackathone_metric(df))

50.01448082923889
Metric:  0.62094301361

In [109]:

plt.figure(figsize=(12, 4))
plt.bar([1,2,3], [0.62094301361, 0.614487273913, 0.608630732257], color='#0088DD', width=0.3)
#plt.bar(np.array([1,2,3,4])+0.1, [e_rf[0], e_xgb[0], e_tunedlgb[0], e_lgb[0]], width=0.3, color='#004499')
plt.xticks([1,2,3], ['все признаки', 'половина известна', 'мало известных'])
plt.xlabel('известные значения признаков')
plt.ylim([0.60, 0.63])
plt.ylabel('метрика хакатона')

Out[109]:

<matplotlib.text.Text at 0x7f8070b9b438>

In [125]:

X.columns[-125:]

Out[125]:

Index(['Interest_1', 'Interest_2', 'Interest_3', 'Interest_4', 'Interest_5',
       'Interest_6', 'Interest_7', 'Interest_8', 'Interest_9', 'Interest_10',
       ...
       'TD_TV', 'TD_ukraine', 'TD_video', 'TD_way_of_life',
       'TD_wedding_communities', 'TD_women_communities', 'TD_workout',
       'TD_youth_communities', 'TD_sum', 'animal_owner'],
      dtype='object', length=125)

In [126]:

tm = time()
# cv = 5

# параметры по умолчанию
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

a_lgb = cross_val_predict(estimator=clf, X=X.values[:,:125], y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': a_lgb,'test': y}
print ("Metric: ", hackathone_metric(df))
# 0.620564728301

34.66258788108826
Metric:  0.620519642468

In [127]:

tm = time()
# cv = 5

# параметры по умолчанию
clf = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=50, nthread=-1)

a_lgb = cross_val_predict(estimator=clf, X=X.values[:,125:], y=y, n_jobs=-1 ,cv=cv, method='predict')
print (time() - tm)

figsize(6, 5)
df = {'pred': a_lgb,'test': y}
print ("Metric: ", hackathone_metric(df))

14.960119247436523
Metric:  0.535889606696

In [129]:

plt.figure(figsize=(12, 4))
plt.bar([1,2,3], [0.62094301361, 0.620519642468, 0.535889606696], color='#0088DD', width=0.3)
#plt.bar(np.array([1,2,3,4])+0.1, [e_rf[0], e_xgb[0], e_tunedlgb[0], e_lgb[0]], width=0.3, color='#004499')
plt.xticks([1,2,3], ['все признаки', 'без последних 125', 'последние 125'])
plt.ylim([0.52, 0.63])
plt.ylabel('метрика хакатона')

Out[129]:

<matplotlib.text.Text at 0x7f8073d25a20>

In [ ]: