#!/usr/bin/env python
# coding: utf-8

# # Ak Bars Online (mobile) churn rate prediction

# ### Data source
#
# The initial dataFrame (df_main) is built from the session data available in AppMetrica,
# plus the payment and transfer events that the developers send to AppMetrica.
#
# Users are counted by the unique device identifier assigned by AppMetrica (the DeviceID parameter).

# ### Library imports + helper functions for working with ClickHouse

# In[8]:

HOST = 'http://localhost:8123'

import requests
import pandas as pd
import numpy as np
import seaborn as sns

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

import matplotlib
import matplotlib.pyplot as plt
import datetime

get_ipython().run_line_magic('matplotlib', 'inline')

# turn off scientific notation
pd.options.display.float_format = '{:.2f}'.format

# silence warnings
import warnings
warnings.filterwarnings('ignore')

import itertools

from pylab import rcParams
rcParams['figure.figsize'] = 12, 8


# return the query result from the database
def get_clickhouse_data(query, host=HOST, connection_timeout=1500):
    r = requests.post(host, params={'query': query}, timeout=connection_timeout)
    if r.status_code == 200:
        return r.text
    else:
        raise ValueError(r.text)


# convert the received data into a pandas DataFrame
def get_clickhouse_df(query, host=HOST, connection_timeout=1500):
    data = get_clickhouse_data(query, host, connection_timeout)
    df = pd.read_csv(StringIO(data), sep='\t')
    return df


# constants for working with date ranges

# yesterday's date
yesterday = [(datetime.date.today() - datetime.timedelta(days=2)).strftime('%Y-%m-%d'),
             (datetime.date.today() - datetime.timedelta(days=2)).strftime('%Y-%m-%d')]

# last 7 days (excluding the current date)
week_dates = [(datetime.date.today() - datetime.timedelta(days=7)).strftime('%Y-%m-%d'),
              (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')]

# current month (excluding the current date)
month_dates = [datetime.date.today().strftime('%Y-%m-') + '01',
               (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')]

# dates for the report
query_dates = ['2018-06-01', '2018-07-31']

# dates for the churn target feature
churn_query_dates = ['2018-08-01', '2018-08-31']

# date format
fmt = '%Y-%m-%d'


# function for session-related features (e.g., avgdaylag - the average number of days between sessions);
# it relies on `datetime` being re-bound to the datetime class below before it is first called
def func(a):
    i1, i2 = itertools.tee(iter(a))
    next(i2)
    res = [(int(round((datetime.strptime(y, fmt) - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440
           for x, y in zip(i1, i2)]  # or just zip in Python 3
    return round(np.mean(res), 2)


from datetime import datetime

# the first day of the month for which churn is predicted
reportDate = datetime(2018, 8, 1)

sns.set(rc={'axes.facecolor': 'grey', 'figure.facecolor': 'grey'})
sns.set(palette='Blues')
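# For illustration (not part of the original pipeline): for a device with sessions on
# 2018-06-01, 2018-06-04 and 2018-06-11 the gaps are 3 and 7 days, so func() returns 5.0;
# for a single active day it returns NaN, which is filled later with the distance to reportDate.

# In[ ]:

print(func(['2018-06-01', '2018-06-04', '2018-06-11']))  # 5.0
print(func(['2018-06-01']))                               # nan, handled downstream via fillna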
# ## ALL SESSIONS

# Get the session data for the period - in our case, two months (June and July 2018)

# In[9]:

get_ipython().run_cell_magic('time', '', "\nq = '''\nSELECT\n    DeviceID,\n    AppVersionName,\n    OSName,\n    City,\n    SessionStartDate,\n    SessionStartDateTime,\n    SessionStartTimestamp\nFROM\n    mobile.sessions_starts_all\nWHERE\n    SessionStartDate BETWEEN '{0}' AND '{1}'\nFORMAT TabSeparatedWithNames'''.format(query_dates[0], query_dates[1])\n\nsessions_2months = get_clickhouse_df(q)\n")


# Add the data for May and April 2018

# In[10]:

sessions_may = pd.read_csv('../dmitriilin/Downloads/sessions_starts_may.csv', sep=',')
sessions_april = pd.read_csv('../dmitriilin/Downloads/sessions_starts_april.csv', sep=',')
sessions_may['SessionStartDate'] = sessions_may['SessionStartDateTime'].apply(lambda x: x.split(' ')[0])


# In[11]:

frames = [sessions_2months, sessions_may, sessions_april]
sessions = pd.concat(frames)


# In[12]:

sessions.head()


# Create the main per-user dataframe, to which the aggregated data will be added.
# Count the number of sessions per DeviceID, using SessionStartTimestamp as the session ID

# In[13]:

get_ipython().run_cell_magic('time', '', "\ndf_main = sessions.groupby('DeviceID')[['SessionStartTimestamp']]\\\n    .count().reset_index().sort_values(by='SessionStartTimestamp', ascending=False)\n")


# Store each user's session days, sorted ascending, as an array in the SessionStartDate column

# In[14]:

df_main['SessionStartDate'] = [sorted(list(set(sessions['SessionStartDate'].loc[sessions['DeviceID'] == x['DeviceID']])))
                               for _, x in df_main.iterrows()]


# Compute the average gap, in days, between the days on which the user had sessions

# In[15]:

df_main['sessions_avgdaylag'] = df_main.apply(lambda x: func(x['SessionStartDate']), axis=1)


# If the user had only one active day, avgdaylag is set to the number of days from that day to the report date

# In[16]:

tempSer = df_main[df_main['sessions_avgdaylag'].isnull()]['SessionStartDate'].apply(
    lambda x: round((int(round((reportDate - datetime.strptime(x[0], fmt)).total_seconds() / 60))) / 1440))


# In[17]:

df_main['sessions_avgdaylag'] = df_main['sessions_avgdaylag'].fillna(tempSer)


# Compute the number of days between the user's last session and the report date

# In[18]:

temp = sessions.groupby('DeviceID')[['SessionStartDate']].max().reset_index()
temp['sessions_daysSinceLastSession'] = temp['SessionStartDate'].apply(
    lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['SessionStartDate'], axis=1).set_index('DeviceID'), on='DeviceID')


# Compute the number of days on which the user was active in the app

# In[19]:

df_main['sessions_totaldaysactive'] = df_main['SessionStartDate'].apply(lambda x: len(x))


# Drop the array of unique session dates and rename the SessionStartTimestamp column

# In[20]:

df_main = df_main.drop(['SessionStartDate'], axis=1).rename(index=str, columns={
    "SessionStartTimestamp": "sessions_totalnumber"
})
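# The per-device date lists above are built with a Python loop over iterrows(), which rescans
# `sessions` once per device. A single groupby produces the same sorted lists in one pass -
# an optional sketch for comparison (session_days_alt is a new name, not used downstream):

# In[ ]:

session_days_alt = sessions.groupby('DeviceID')['SessionStartDate'].apply(lambda s: sorted(s.unique()))
session_days_alt.head()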
# ## ALL EVENTS

# Create a temporary dataframe for all events

# In[21]:

q = '''
SELECT
    AppVersionName,
    OSName,
    EventDate,
    ReceiveDate,
    ReceiveTimestamp,
    EventTimestamp,
    EventDateTime,
    EventName,
    DeviceID,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'crmId') AS VARCHAR) AS crmId,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'status') AS VARCHAR) AS authStatus,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'gender') AS VARCHAR) AS gender,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'тип операции') AS VARCHAR) AS operationType,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'сумма операции') AS VARCHAR) AS operationAmount,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'источник') AS VARCHAR) AS sender,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта отправителя') AS VARCHAR) AS senderCurrency,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'получатель') AS VARCHAR) AS recipient,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта получателя') AS VARCHAR) AS recipientCurrency,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'размер комиссии') AS VARCHAR) AS operationFee,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'способ операции') AS VARCHAR) AS operationMethod,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'срок') AS VARCHAR) AS depositTerm,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта') AS VARCHAR) AS depositCurrency,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'ставка') AS VARCHAR) AS depositRate,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'название') AS VARCHAR) AS depositName,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'тип чата') AS VARCHAR) AS chatType,
    CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'операция') AS VARCHAR) AS operation
FROM
    mobile.events_all
WHERE
    match(EventName, 'продажи|переводы|Переводы|^платежи$')
    AND EventDate BETWEEN '{0}' AND '{1}'
FORMAT TabSeparatedWithNames'''.format(query_dates[0], query_dates[1])

events = get_clickhouse_df(q)
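# operationAmount and operationFee come out of visitParamExtractRaw as strings; pandas usually
# infers a numeric dtype when parsing the TSV, but an explicit coercion protects the sums and
# medians below against stray non-numeric values (an optional safeguard, not in the original pipeline):

# In[ ]:

events['operationAmount'] = pd.to_numeric(events['operationAmount'], errors='coerce')
events['operationFee'] = pd.to_numeric(events['operationFee'], errors='coerce')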
# ## PAYMENTS

# Create a separate dataframe for payments, excluding payments where the operation amount is unknown (unavailable).
# We key on the operation amount because, when it is empty, all the other payment fields are empty as well.

# In[22]:

payments_2months = events[(events['EventName'] == 'платежи')]


# In[23]:

payments_2months = payments_2months.dropna(subset=['operationAmount'])


# Add the data for May and April 2018

# In[24]:

payments_may = pd.read_csv('../dmitriilin/Downloads/payments_may.csv', sep=',')


# In[25]:

payments_april = pd.read_csv('../dmitriilin/Downloads/payments_april.csv', sep=',')


# In[26]:

frames = [payments_2months, payments_may, payments_april]
payments = pd.concat(frames)


# In[27]:

payments.drop(['Unnamed: 0', 'chatType', 'depositCurrency', 'depositName', 'depositRate', 'depositTerm'],
              axis=1, inplace=True)


# Compute the payment aggregates

# In[28]:

f = {
    'operationAmount': ['count', 'sum', 'median', 'min', 'max'],  # count, sum, median, min and max
    'sender': ['nunique'],            # number of unique payment sources
    'recipient': ['nunique'],         # number of payment recipients
    'operationMethod': ['nunique']    # number of payment methods
}

# compute the aggregates for each DeviceID
payments_agg = payments.groupby(['DeviceID']).agg(f)
payments_agg.columns = payments_agg.columns.map('_'.join)
payments_agg = payments_agg.reset_index().rename(columns={
    'operationAmount_count': 'payments_operationAmount_count',
    'operationAmount_sum': 'payments_operationAmount_sum',
    'operationAmount_median': 'payments_operationAmount_median',
    'operationAmount_min': 'payments_operationAmount_min',
    'operationAmount_max': 'payments_operationAmount_max',
    'sender_nunique': 'payments_sender_nunique',
    'recipient_nunique': 'payments_recipient_nunique',
    'operationMethod_nunique': 'payments_operationMethod_nunique'
})


# Store each user's payment days, sorted ascending, as an array in the payments_date column

# In[29]:

payments_agg['payments_date'] = [sorted(list(set(payments['EventDate'].loc[payments['DeviceID'] == x['DeviceID']])))
                                 for _, x in payments_agg.iterrows()]


# Compute the average gap, in days, between the days on which the user made payments.
# Compute the number of days on which the user made payments in the app.

# In[30]:

payments_agg['payments_avgdaylag'] = payments_agg.apply(lambda x: func(x['payments_date']), axis=1)
payments_agg['payments_daysactive'] = payments_agg['payments_date'].apply(lambda x: len(x))
payments_agg = payments_agg.drop(['payments_date'], axis=1)


# Add the payment data to the main dataFrame

# In[31]:

df_main = df_main.join(payments_agg.set_index('DeviceID'), on='DeviceID')


# Compute the number of days between the user's last payment and the report date

# In[32]:

temp = payments.groupby('DeviceID')[['EventDate']].max().reset_index()
temp['payments_daysSinceLastPayment'] = temp['EventDate'].apply(
    lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['EventDate'], axis=1).set_index('DeviceID'), on='DeviceID')


# Fill in the missing values for users who made no payments during the selected period.
# For the avgdaylag & daysSinceLastPayment features use 122 days (the total number of days in the period).
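# Sanity check of the fill value used below (not in the original notebook): the observation window
# is 2018-04-01 .. 2018-07-31, i.e. 30 + 31 + 30 + 31 = 122 days.

# In[ ]:

print((datetime(2018, 8, 1) - datetime(2018, 4, 1)).days)  # 122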
# In[33]:

fillna0 = [
    'payments_operationAmount_count',
    'payments_operationAmount_sum',
    'payments_operationAmount_median',
    'payments_operationAmount_min',
    'payments_operationAmount_max',
    'payments_sender_nunique',
    'payments_recipient_nunique',
    'payments_operationMethod_nunique',
    'payments_daysactive'
]
fillna122 = ['payments_avgdaylag', 'payments_daysSinceLastPayment']

for i in fillna0:
    df_main[i].fillna(0, inplace=True)

for i in fillna122:
    df_main[i].fillna(122, inplace=True)


# ## TRANSFERS

# For transfers, compute the same aggregates as for payments

# In[34]:

fundTransfers_2months = events[(events['EventName'] == 'переводы')]
fundTransfers_2months = fundTransfers_2months.dropna(subset=['sender'])


# In[35]:

fundTransfers_may = pd.read_csv('../dmitriilin/Downloads/transfers_may.csv', sep=',')


# In[36]:

fundTransfers_april = pd.read_csv('../dmitriilin/Downloads/transfers_april.csv', sep=',')


# In[37]:

frames = [fundTransfers_2months, fundTransfers_may, fundTransfers_april]
fundTransfers = pd.concat(frames)


# In[38]:

f = {
    'operationAmount': ['count', 'sum', 'median', 'min', 'max'],  # count and amount of operations
    'operationFee': ['sum'],           # total fees
    'sender': ['nunique'],             # number of transfer sources
    'recipient': ['nunique'],          # number of transfer recipients
    'operationMethod': ['nunique'],    # number of transfer methods
    'senderCurrency': ['nunique'],     # number of sender currencies
    'recipientCurrency': ['nunique']   # number of recipient currencies
}

fundTransfers_agg = fundTransfers.groupby(['DeviceID']).agg(f)
fundTransfers_agg.columns = fundTransfers_agg.columns.map('_'.join)
fundTransfers_agg = fundTransfers_agg.reset_index().rename(columns={
    'operationAmount_count': 'transfers_operationAmount_count',
    'operationAmount_sum': 'transfers_operationAmount_sum',
    'operationAmount_median': 'transfers_operationAmount_median',
    'operationAmount_min': 'transfers_operationAmount_min',
    'operationAmount_max': 'transfers_operationAmount_max',
    'operationFee_sum': 'transfers_operationFee_sum',
    'sender_nunique': 'transfers_sender_nunique',
    'recipient_nunique': 'transfers_recipient_nunique',
    'operationMethod_nunique': 'transfers_operationMethod_nunique',
    'senderCurrency_nunique': 'transfers_senderCurrency_nunique',
    'recipientCurrency_nunique': 'transfers_recipientCurrency_nunique'
})


# In[39]:

fundTransfers_agg['transfers_date'] = [sorted(list(set(fundTransfers['EventDate'].loc[fundTransfers['DeviceID'] == x['DeviceID']])))
                                       for _, x in fundTransfers_agg.iterrows()]
fundTransfers_agg['transfers_avgdaylag'] = fundTransfers_agg.apply(lambda x: func(x['transfers_date']), axis=1)
fundTransfers_agg['transfers_daysactive'] = fundTransfers_agg['transfers_date'].apply(lambda x: len(x))
fundTransfers_agg = fundTransfers_agg.drop(['transfers_date'], axis=1)


# In[40]:

df_main = df_main.join(fundTransfers_agg.set_index('DeviceID'), on='DeviceID')


# In[41]:

temp = fundTransfers.groupby('DeviceID')[['EventDate']].max().reset_index()
temp['transfers_daysSinceLastTransfer'] = temp['EventDate'].apply(
    lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['EventDate'], axis=1).set_index('DeviceID'), on='DeviceID')
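# Payments and transfers get the same recency/activity features; a helper like this one avoids
# repeating the iterrows/strptime code (an optional refactoring sketch, not used by the cells
# below - recency_features and its column names are new names introduced here):

# In[ ]:

def recency_features(event_df, prefix, report_date=reportDate, date_col='EventDate'):
    per_device = event_df.groupby('DeviceID')[date_col].apply(lambda s: sorted(s.unique()))
    out = pd.DataFrame(index=per_device.index)
    out[prefix + '_avgdaylag'] = per_device.apply(func)
    out[prefix + '_daysactive'] = per_device.apply(len)
    out[prefix + '_daysSinceLast'] = per_device.apply(lambda days: (report_date - datetime.strptime(days[-1], fmt)).days)
    return out.reset_index()

# example: recency_features(fundTransfers, 'transfers').head()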
# In[42]:

fillna0 = [
    'transfers_operationAmount_count',
    'transfers_operationAmount_sum',
    'transfers_operationAmount_median',
    'transfers_operationAmount_min',
    'transfers_operationAmount_max',
    'transfers_operationFee_sum',
    'transfers_sender_nunique',
    'transfers_recipient_nunique',
    'transfers_operationMethod_nunique',
    'transfers_senderCurrency_nunique',
    'transfers_recipientCurrency_nunique',
]
fillna122 = ['transfers_avgdaylag', 'transfers_daysactive', 'transfers_daysSinceLastTransfer']

for i in fillna0:
    df_main[i].fillna(0, inplace=True)

for i in fillna122:
    df_main[i].fillna(122, inplace=True)


# ## COMPUTING CHURN AND SETTING THE TARGET FEATURE

# In[43]:

q = '''
SELECT
    DeviceID,
    AppVersionName,
    OSName,
    ReceiveDate,
    ReceiveTimestamp,
    SessionStartDate,
    SessionStartDateTime,
    SessionStartTimestamp
FROM
    mobile.sessions_starts_all
WHERE
    SessionStartDate BETWEEN '{0}' AND '{1}'
FORMAT TabSeparatedWithNames'''.format(churn_query_dates[0], churn_query_dates[1])

sessions = get_clickhouse_df(q)

# count the number of sessions per DeviceID, using SessionStartTimestamp as the session ID
df_sessions_churn = sessions.groupby('DeviceID')[['SessionStartTimestamp']]\
    .count().reset_index().sort_values(by='SessionStartTimestamp', ascending=False)


# Check which of the DeviceIDs present in the main dataFrame for the period came back in the
# forecast period. If a DeviceID is found there, set "0" (retained); otherwise set "1" (churned)

# In[44]:

df_main['churned'] = (~df_main['DeviceID'].isin(df_sessions_churn['DeviceID'])).astype(int)


# As we can see, the 4-month churn rate is about 30%.

# In[45]:

df_main['churned'].value_counts(normalize=True)


# Let's see how the features from the different categories correlate with the target

# In[46]:

sns.heatmap(df_main[[f for f in df_main.columns if 'sessions' in f or 'churned' in f]].corr(), annot=True);
# sns.heatmap(df_main[[f for f in df_main.columns if 'payments' in f or 'churned' in f]].corr(), annot=True);
# sns.heatmap(df_main[[f for f in df_main.columns if 'transfers' in f or 'churned' in f]].corr(), annot=True);


# Finally, let's look at the resulting dataFrame

# In[47]:

df_main.tail()


# In[48]:

df_main.columns


# In[49]:

df_main.shape


# In[50]:

df_main.info()


# __________________

# ## FEATURE ENGINEERING

# ### Adding features

# Let's add several new features, based primarily on the users' session data.
# In[51]:

# fewer than 2 sessions over the whole period
df_main['sessions_totalnumber_less_2'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x < 2 else 0)


# In[52]:

# fewer than 6 sessions over the whole period
df_main['sessions_totalnumber_less_6'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x < 6 else 0)


# In[53]:

# fewer than 2 active days over the whole period
df_main['sessions_totaldaysactive_less_2'] = df_main['sessions_totaldaysactive'].apply(lambda x: 1 if x < 2 else 0)


# In[54]:

df_main['sessions_avgdaylag_greater_20'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 20.5 else 0)
df_main['sessions_avgdaylag_greater_10'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 10 else 0)
df_main['sessions_avgdaylag_greater_15'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 15 else 0)
df_main['sessions_totaldaysactive_equalsless_2'] = df_main['sessions_totaldaysactive'].apply(lambda x: 1 if x <= 2 else 0)
df_main['sessions_daysSinceLastSession_equalsgreater_19'] = df_main['sessions_daysSinceLastSession'].apply(lambda x: 1 if x >= 19 else 0)
df_main['sessions_totalnumber_equalsless_5'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x <= 5 else 0)


# In[55]:

def label_race(row):
    if (row['sessions_avgdaylag_greater_10'] == 1
            and row['sessions_totaldaysactive_equalsless_2'] == 1
            and row['sessions_daysSinceLastSession_equalsgreater_19'] == 1
            and row['sessions_totalnumber_equalsless_5'] == 1):
        return 1
    else:
        return 0


# In[56]:

df_main['sessions_isPassive'] = df_main.apply(lambda row: label_race(row), axis=1)


# In[57]:

df_main['payments_isPassive'] = df_main['payments_operationAmount_count'].apply(lambda x: 1 if x == 0 else 0)
df_main['transfers_isPassive'] = df_main['transfers_operationAmount_count'].apply(lambda x: 1 if x == 0 else 0)
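# The same composite flag can be written as a single vectorized boolean expression - an equivalent
# sketch for comparison (sessions_isPassive_alt is a new name; the apply-based version above is what
# actually feeds df_main):

# In[ ]:

sessions_isPassive_alt = ((df_main['sessions_avgdaylag'] > 10)
                          & (df_main['sessions_totaldaysactive'] <= 2)
                          & (df_main['sessions_daysSinceLastSession'] >= 19)
                          & (df_main['sessions_totalnumber'] <= 5)).astype(int)
(sessions_isPassive_alt == df_main['sessions_isPassive']).all()  # expected: True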
# Let's build a correlation matrix for the session-based features

# In[58]:

fig, ax = plt.subplots(figsize=(15, 10))
# sns.heatmap(df_main.corr());
sns.heatmap(df_main[[f for f in df_main.columns if 'sessions' in f or 'churned' in f]].corr(), annot=True);


# _______

# ## TRAINING THE MODEL(S)

# In[52]:

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve, classification_report
from sklearn.metrics import f1_score
from collections import Counter


# In[53]:

y = df_main['churned']
X = df_main.drop('churned', axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.33, random_state=42)
# df_small = X_valid


# __________________

# ## RANDOM FOREST

# In[147]:

from sklearn.ensemble import RandomForestClassifier


# In[148]:

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, max_depth=6, max_features=10)
print(np.mean(cross_val_score(forest, X_train, y_train, cv=10)))


# In[149]:

forest.fit(X_train, y_train)


# In[150]:

print(classification_report(y_valid, forest.predict(X_valid), target_names=['non-churned', 'churned']))


# In[203]:

forest_params = {
    'max_depth': list(range(5, 40, 10)),
    'max_features': list(range(5, 41, 10)) + [40]
}

forest_grid = GridSearchCV(forest, forest_params, cv=10, n_jobs=-1, verbose=True, scoring='f1')  # try RandomizedSearchCV


# In[204]:

forest_grid.fit(X_train, y_train)


# In[205]:

forest_grid.best_params_, forest_grid.best_score_


# In[206]:

print(classification_report(y_valid, forest_grid.predict(X_valid), target_names=['non-churned', 'churned']))
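# Which features drive the tuned forest's predictions - an optional inspection step
# (not required by anything further down):

# In[ ]:

importances = pd.Series(forest_grid.best_estimator_.feature_importances_, index=X.columns)
importances.sort_values(ascending=False).head(15)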
# ## XGBOOST

# In[132]:

from IPython.display import display
import re
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
# cross_val_score and GridSearchCV are already imported from sklearn.model_selection above
# (the old sklearn.cross_validation / sklearn.grid_search modules are deprecated)


# In[133]:

X = np.array(df_main.drop('churned', axis=1))
training_features = np.array(df_main.drop('churned', axis=1).columns)
# X = preprocessing.scale(X) --- not needed for XGBoost?
y = np.array(df_main['churned'])


# In[134]:

xgb_clf = xgb.XGBClassifier()
scores = cross_val_score(xgb_clf, X, y, cv=5, n_jobs=-1, verbose=True, scoring='f1')
xgb_clf.fit(X_train, y_train)
print(scores)
print('F1: %.3f stdev: %.2f' % (np.mean(np.abs(scores)), np.std(scores)))


# In[135]:

report = classification_report(y_valid, xgb_clf.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)


# In[199]:

xgbgridparams = {'learning_rate': [0, 0.001, 0.002, 0.004, 0.006, 0.008, 0.010, 0.1],
                 'reg_lambda': [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]}

xgb_grid = GridSearchCV(
    estimator=xgb_clf,
    param_grid=xgbgridparams,
    scoring='f1',
    n_jobs=-1,
    refit=True,
    verbose=True,
    cv=5)

xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_params_)
print(xgb_grid.best_score_)


# In[200]:

report = classification_report(y_valid, xgb_grid.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)


# ## GradientBoosting (LIGHT)

# In[54]:

import lightgbm as lgb

# Set params
# Scores ~0.784 (without tuning and early stopping)
params = {
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'objective': 'binary',
    'nthread': 3,
    'num_leaves': 34,
    'learning_rate': 0.1,
    'max_bin': 1024,
    'subsample_for_bin': 200,
    'subsample': 1,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 5,
    'reg_lambda': 10,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 5,
    'scale_pos_weight': 1,
    'num_class': 1,
    'metric': 'binary_logloss',
    'num_iterations': 200
}

gridParams = {
    'learning_rate': [0.1],
    'n_estimators': [40],
    'num_leaves': [31],
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'random_state': [42],  # updated from 'seed'
    'colsample_bytree': [0.65, 0.66],
    'subsample': [0.7, 0.75],
    'reg_alpha': [1.2, 1.4],
    'reg_lambda': [1.2, 1.4]
}

lgbm = lgb.LGBMClassifier(
    boosting_type=params['boosting_type'],
    objective=params['objective'],
    n_jobs=-1,  # updated from 'nthread'
    silent=True,
    max_depth=params['max_depth'],
    max_bin=params['max_bin'],
    subsample_for_bin=params['subsample_for_bin'],
    subsample=params['subsample'],
    subsample_freq=params['subsample_freq'],
    min_split_gain=params['min_split_gain'],
    min_child_weight=params['min_child_weight'],
    min_child_samples=params['min_child_samples'],
    scale_pos_weight=params['scale_pos_weight'],
    learning_rate=params['learning_rate'],
    num_iterations=params['num_iterations'],
    num_leaves=params['num_leaves']
)


# In[55]:

lgbm.fit(X_train, y_train)


# In[56]:

# F1 score for LightGBM - parameters NOT optimized
report = classification_report(y_valid, lgbm.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)


# In[190]:

get_ipython().run_cell_magic('time', '', "lgbm_grid = GridSearchCV(lgbm, gridParams,\n                         verbose=True,\n                         cv=5,\n                         n_jobs=-1, scoring='f1')\nlgbm_grid.fit(X_train, y_train)\n")


# In[191]:

lgbm_grid.best_params_, lgbm_grid.best_score_


# In[192]:

# F1 score for LightGBM - parameters NOT optimized
report = classification_report(y_valid, lgbm.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)


# In[193]:

# F1 score for LightGBM - parameters optimized
report = classification_report(y_valid, lgbm_grid.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
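# The params comment above notes the score is obtained without early stopping. With the pre-4.0
# LightGBM sklearn API it can be added as below (a sketch only: in LightGBM >= 4.0 pass
# callbacks=[lgb.early_stopping(50)] instead of the keyword arguments; here the validation fold
# doubles as the early-stopping set, which is fine for a quick check but slightly optimistic):

# In[ ]:

lgbm_es = lgb.LGBMClassifier(objective='binary', num_leaves=34, learning_rate=0.1,
                             n_estimators=1000, n_jobs=-1, random_state=42)
lgbm_es.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=50,
            verbose=False)
print(classification_report(y_valid, lgbm_es.predict(X_valid), target_names=['Non-churned', 'Churned']))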
# ## CATBOOST

# In[219]:

from catboost import CatBoostClassifier, cv, Pool


# In[220]:

model = CatBoostClassifier(
    custom_loss=['F1'],
    random_seed=42,
    logging_level='Silent'
)


# In[221]:

# np.where(X.dtypes != np.float)[0]


# In[222]:

model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    plot=False
);


# In[223]:

cv_data = cv(
    Pool(X, y),
    model.get_params(),
    plot=False
)


# In[224]:

print('Best validation F1 score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-F1-mean']),
    cv_data['test-F1-std'][np.argmax(cv_data['test-F1-mean'])],
    np.argmax(cv_data['test-F1-mean'])
))


# In[225]:

print('Precise validation F1 score: {}'.format(np.max(cv_data['test-F1-mean'])))


# In[226]:

model_without_seed = CatBoostClassifier(iterations=500, logging_level='Silent')
model_without_seed.fit(X_train, y_train)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))


# In[227]:

params = {
    'iterations': 200,
    'learning_rate': 0.1,
    'eval_metric': 'F1',
    'random_seed': 42,
    'logging_level': 'Silent'
}

train_pool = Pool(X_train, y_train)
validate_pool = Pool(X_valid, y_valid)


# In[228]:

cb = CatBoostClassifier(**params)
cb.fit(train_pool, eval_set=validate_pool)

# best_model_params = params.copy()
# best_model_params.update({
#     'use_best_model': True
# })
# best_model = CatBoostClassifier(**best_model_params)
# best_model.fit(train_pool, eval_set=validate_pool);

print('Best model validation F1: {:.4}'.format(
    # accuracy_score(y_valid, best_model.predict(X_valid))
    f1_score(y_valid, cb.predict(X_valid))
))


# ### Trying early stopping

# In[229]:

get_ipython().run_cell_magic('time', '', "params.update({\n    'od_type': 'Iter',\n    'od_wait': 50\n})\ncb_early_stopping = CatBoostClassifier(**params)\ncb_early_stopping.fit(train_pool, eval_set=validate_pool);\n")


# In[230]:

print('Early-stopped best model tree count: {}'.format(cb_early_stopping.tree_count_))
print('Early-stopped best model validation F1: {:.4}'.format(
    f1_score(y_valid, cb_early_stopping.predict(X_valid))
))


# In[231]:

cb_early_stopping.get_params(), cb.get_params()


# ### Trying to optimize the parameters

# In[232]:

import hyperopt


def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=100,  # make 50
        eval_metric='F1',
        random_seed=42,
        logging_level='Silent'
    )

    cv_data = cv(
        Pool(X, y),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-F1-mean'])

    return 1 - best_accuracy  # as hyperopt minimises


# In[233]:

from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1)
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)


# In[234]:

model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=100,
    eval_metric='F1',
    random_seed=42,
    logging_level='Silent',
    use_best_model=True
)
cv_data = cv(Pool(X, y), model.get_params())


# In[235]:

print('Precise validation F1 score: {}'.format(np.max(cv_data['test-F1-mean'])))


# In[236]:

# F1 score for CatBoost - parameters NOT optimized
report = classification_report(y_valid, cb.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)


# In[237]:

model.fit(train_pool, eval_set=validate_pool);


# In[238]:

# F1 score for CatBoost - parameters optimized
report = classification_report(y_valid, model.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
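# The reports above compare F1 at the default 0.5 threshold; ranking quality can also be compared
# with ROC-AUC on the hold-out set (an optional side-by-side check of the already-fitted models):

# In[ ]:

from sklearn.metrics import roc_auc_score

for name, clf in [('random forest', forest), ('lightgbm', lgbm), ('catboost', cb)]:
    print(name, round(roc_auc_score(y_valid, clf.predict_proba(X_valid)[:, 1]), 4))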
# ### CONFUSION MATRIX FOR LGBM

# In[239]:

# Use the confusion-matrix plotting function from the sklearn documentation
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


font = {'size': 18}
plt.rc('font', **font)

cnf_matrix = confusion_matrix(y_valid, lgbm.predict(X_valid))
plt.figure(figsize=(10, 8))
plot_confusion_matrix(cnf_matrix, classes=['Non-churned', 'Churned'],
                      title='Confusion matrix')
# plt.savefig("conf_matrix.png")
plt.show()
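# Finally, a fitted model can be used to rank devices by churn risk, e.g. to hand the most at-risk
# users over to a retention campaign (an illustrative sketch only; DeviceID is available as a column
# of X_valid because it was never dropped from the feature matrix):

# In[ ]:

churn_scores = pd.DataFrame({
    'DeviceID': X_valid['DeviceID'],
    'churn_proba': lgbm.predict_proba(X_valid)[:, 1]
})
churn_scores.sort_values('churn_proba', ascending=False).head(10)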