HOST = 'http://localhost:8123'
import requests
import pandas as pd
import numpy as np
import seaborn as sns
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
# убираем scientific notation
pd.options.display.float_format = '{:.2f}'.format
# отключим предупреждения
import warnings
warnings.filterwarnings('ignore')
import itertools
from pylab import rcParams
rcParams['figure.figsize'] = 12, 8
# возвращаем результат из DataBase
def get_clickhouse_data(query, host = HOST, connection_timeout = 1500):
r = requests.post(host, params = {'query': query}, timeout = connection_timeout)
if r.status_code == 200:
return r.text
else:
raise ValueError(r.text)
# преобразуем полученные данные в pandas DataFrame
def get_clickhouse_df(query, host = HOST, connection_timeout = 1500):
data = get_clickhouse_data(query, host, connection_timeout)
df = pd.read_csv(StringIO(data), sep = '\t')
return df
# константы для работы с временными диапазонами
# вчерашняя дата
yesterday = [(datetime.date.today() - datetime.timedelta(days=2)).strftime('%Y-%m-%d'),
(datetime.date.today() - datetime.timedelta(days=2)).strftime('%Y-%m-%d')]
# последние 7 дней (без учета текущей даты)
week_dates = [(datetime.date.today() - datetime.timedelta(days=7)).strftime('%Y-%m-%d'), \
(datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')]
# текущий месяц (без учета текущей даты)
month_dates = [datetime.date.today().strftime('%Y-%m-') + '01', \
(datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')]
# выставляем даты для отчета
query_dates = ['2018-06-01', '2018-07-31'];
# выставляем даты для целевого признака по оттоку
churn_query_dates = ['2018-08-01', '2018-08-31'];
# формат даты
fmt = '%Y-%m-%d'
# функция для расчета признаков, связанных с сессиями пользователей (например, avgdaylag (средний кол-во дней между сессиями))
def func(a):
i1, i2 = itertools.tee(iter(a))
next(i2)
res = [(int(round((datetime.strptime(y, fmt) - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440
for x, y in zip(i1, i2)] # or just zip in Python 3
return round(np.mean(res),2)
from datetime import datetime
# здесь выставляем первое число месяца, на который прогнозируем отток
reportDate = datetime(2018, 8, 1)
sns.set(rc={'axes.facecolor':'grey', 'figure.facecolor':'grey'})
sns.set(palette='Blues')
Получим данные по сессиям за период. В нашем случае за два месяца (июнь и июль 2018)
%%time
q = '''
SELECT
DeviceID,
AppVersionName,
OSName,
City,
SessionStartDate,
SessionStartDateTime,
SessionStartTimestamp
FROM
mobile.sessions_starts_all
WHERE
SessionStartDate BETWEEN ''' + ''' \'''' + \
query_dates[0] + '''\' AND ''' + '''\'''' + \
query_dates[1] + '''\' ''''FORMAT TabSeparatedWithNames'''
sessions_2months = get_clickhouse_df(q)
CPU times: user 12.9 s, sys: 7.76 s, total: 20.7 s Wall time: 26.9 s
Добавим данные за май и апрель 2018
sessions_may = pd.read_csv('../dmitriilin/Downloads/sessions_starts_may.csv', sep=',')
sessions_april = pd.read_csv('../dmitriilin/Downloads/sessions_starts_april.csv', sep=',')
sessions_may['SessionStartDate'] = sessions_may['SessionStartDateTime'].apply(lambda x: x.split(' ')[0])
frames = [sessions_2months, sessions_may, sessions_april]
sessions = pd.concat(frames)
sessions.head()
AppVersionName | City | DeviceID | OSName | SessionStartDate | SessionStartDateTime | SessionStartTimestamp | Unnamed: 0 | |
---|---|---|---|---|---|---|---|---|
0 | 3.17.2 | Kazan | 7543012021622200292 | ios | 2018-06-01 | 2018-06-01 11:57:29 | 1527854249 | nan |
1 | 3.17.1 | Samara | 16883537386825900026 | android | 2018-06-01 | 2018-06-01 11:18:10 | 1527851890 | nan |
2 | 3.17.1 | Samara | 16883537386825900026 | android | 2018-06-01 | 2018-06-01 07:11:19 | 1527837079 | nan |
3 | 3.18.0 | Kazan | 170802125429386870 | android | 2018-06-01 | 2018-06-01 08:03:44 | 1527840224 | nan |
4 | 3.18.0 | Kazan | 170802125429386870 | android | 2018-06-01 | 2018-06-01 11:31:19 | 1527852679 | nan |
Создадим общий dataframe для пользователей, куда будем добавлять агрегированные данные. Посчитаем кол-во сессий на DeviceID, используя SessionStartTimestamp в качестве ID сессии
%%time
df_main = sessions.groupby('DeviceID')[['SessionStartTimestamp']]\
.count().reset_index().sort_values(by='SessionStartTimestamp', ascending=False)
CPU times: user 1.56 s, sys: 1.17 s, total: 2.73 s Wall time: 2.95 s
Запишем дни сессий в порядке возрастания массивом в колонку SessionStartDate
df_main['SessionStartDate'] = [sorted(list(set(sessions['SessionStartDate'].loc[sessions['DeviceID'] == x['DeviceID']])))
for _, x in df_main.iterrows()]
Посчитаем среднюю продолжительность перерыва между днями, в течение которых были сессии
df_main['sessions_avgdaylag'] = df_main.apply(lambda x: func(x['SessionStartDate']), axis=1)
Если у пользователя был один активный день, то avgdaylag будет равен кол-ву дней с этого дня и до даты отчета
tempSer = df_main[df_main['sessions_avgdaylag'].isnull()]['SessionStartDate'].apply(
lambda x: round((int(round((reportDate - datetime.strptime(x[0], fmt)).total_seconds() / 60))) / 1440))
df_main['sessions_avgdaylag'] = df_main['sessions_avgdaylag'].fillna(tempSer)
Посчитаем кол-во дней, прошешдишх с последней сессии пользователя от даты формирования отчета
temp = sessions.groupby('DeviceID')[['SessionStartDate']].max().reset_index()
temp['sessions_daysSinceLastSession'] = temp['SessionStartDate'].apply(
lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['SessionStartDate'], axis=1).set_index('DeviceID'), on='DeviceID')
Посчитаем количество дней, когда пользователь был активен в приложении
df_main['sessions_totaldaysactive'] = df_main['SessionStartDate'].apply(lambda x: len(x))
Удалим массив уникальных дат с сессиями и переименуем столбец SessionStartTimestamp
df_main = df_main.drop(['SessionStartDate'], axis=1).rename(index=str, columns={
"SessionStartTimestamp": "sessions_totalnumber"
})
Создадим временный dataframe для всех событий
q = '''
SELECT
AppVersionName,
OSName,
EventDate,
ReceiveDate,
ReceiveTimestamp,
EventTimestamp,
EventDateTime,
EventName,
DeviceID,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'crmId') AS VARCHAR) AS crmId,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'status') AS VARCHAR) AS authStatus,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'gender') AS VARCHAR) AS gender,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'тип операции') AS VARCHAR) AS operationType,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'сумма операции') AS VARCHAR) AS operationAmount,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'источник') AS VARCHAR) AS sender,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта отправителя') AS VARCHAR) AS senderCurrency,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'получатель') AS VARCHAR) AS recipient,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта получателя') AS VARCHAR) AS recipientCurrency,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'размер комиссии') AS VARCHAR) AS operationFee,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'способ операции') AS VARCHAR) AS operationMethod,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'срок') AS VARCHAR) AS depositTerm,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта') AS VARCHAR) AS depositCurrency,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'ставка') AS VARCHAR) AS depositRate,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'название') AS VARCHAR) AS depositName,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'тип чата') AS VARCHAR) AS chatType,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'операция') AS VARCHAR) AS operation
FROM
mobile.events_all
WHERE
match(EventName,'продажи|переводы|Переводы|^платежи$')
AND EventDate BETWEEN ''' + ''' \'''' + \
query_dates[0] + '''\' AND ''' + '''\'''' + \
query_dates[1] + '''\' ''''FORMAT TabSeparatedWithNames'''
events = get_clickhouse_df(q)
Создадим отдельный dataframe для платежей без добавления платежи, где сумма операция неизвестна (недоступна). Используем сумму операции, так как, если она пустая, то все остальные поля платежа тоже пустые.
payments_2months = events[(events['EventName'] == 'платежи')]
payments_2months = payments_2months.dropna(subset = ['operationAmount'])
Добавим данные за май и апрель 2018
payments_may = pd.read_csv('../dmitriilin/Downloads/payments_may.csv', sep=',')
payments_april = pd.read_csv('../dmitriilin/Downloads/payments_april.csv', sep=',')
frames = [payments_2months, payments_may, payments_april]
payments = pd.concat(frames)
payments.drop(['Unnamed: 0', 'chatType', 'depositCurrency', 'depositName', 'depositRate', 'depositTerm'],
axis=1,
inplace=True)
Рассчитаем агрегаты для платежей
f = {
'operationAmount':['count','sum', 'median', 'min', 'max'], #количество, сумма, медиана, минимум и максимум
'sender': ['nunique'], #кол-во уникальных источников платежей
'recipient': ['nunique'], #кол-во получателей платежей
'operationMethod': ['nunique'] #кол-во способов платежей
}
# расчитаем агрегаты по каждому DeviceID
payments_agg = payments.groupby(['DeviceID']).agg(f)
payments_agg.columns = payments_agg.columns.map('_'.join)
payments_agg = payments_agg.reset_index().rename(columns={
'operationAmount_count': 'payments_operationAmount_count',
'operationAmount_sum': 'payments_operationAmount_sum',
'operationAmount_median': 'payments_operationAmount_median',
'operationAmount_min': 'payments_operationAmount_min',
'operationAmount_max': 'payments_operationAmount_max',
'sender_nunique': 'payments_sender_nunique',
'recipient_nunique': 'payments_recipient_nunique',
'operationMethod_nunique': 'payments_operationMethod_nunique'
})
Запишем дни платежей в порядке возрастания массивом в колонку payments_date
payments_agg['payments_date'] = [sorted(list(set(payments['EventDate'].loc[payments['DeviceID'] == x['DeviceID']])))
for _, x in payments_agg.iterrows()]
Посчитаем среднюю продолжительность перерыва между днями, в течение которых были платежи. Посчитаем количество дней, когда пользователь совершал платежи в приложении.
payments_agg['payments_avgdaylag'] = payments_agg.apply(lambda x: func(x['payments_date']), axis=1)
payments_agg['payments_daysactive'] = payments_agg['payments_date'].apply(lambda x: len(x))
payments_agg = payments_agg.drop(['payments_date'], axis=1)
Запишем данные о платежах в основой dataFrame
df_main = df_main.join(payments_agg.set_index('DeviceID'), on='DeviceID')
Посчитаем кол-во дней, прошешдишх с последнего платежаа пользователя от даты формирования отчета
temp = payments.groupby('DeviceID')[['EventDate']].max().reset_index()
temp['payments_daysSinceLastPayment'] = temp['EventDate'].apply(
lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['EventDate'], axis=1).set_index('DeviceID'), on='DeviceID')
Заполним отсутствующие данные для тех, кто ни разу не совершал платеж в выбранный период. Для признаков avgdaylag & daysSinceLastPayemnt запишем 122 дней (общее количество дней в периоде).
fillna0 = [
'payments_operationAmount_count',
'payments_operationAmount_sum',
'payments_operationAmount_median',
'payments_operationAmount_min',
'payments_operationAmount_max',
'payments_sender_nunique',
'payments_recipient_nunique',
'payments_operationMethod_nunique',
'payments_daysactive'
]
fillna122 = ['payments_avgdaylag', 'payments_daysSinceLastPayment']
for i in fillna0:
df_main[i].fillna(0, inplace=True)
for i in fillna122:
df_main[i].fillna(122, inplace=True)
Для переводов посчитаем те же агрегаты, что и для платежей
fundTransfers_2months = events[(events['EventName'] == 'переводы')]
fundTransfers_2months = fundTransfers_2months.dropna(subset = ['sender'])
fundTransfers_may = pd.read_csv('../dmitriilin/Downloads/transfers_may.csv', sep=',')
fundTransfers_april = pd.read_csv('../dmitriilin/Downloads/transfers_april.csv', sep=',')
frames = [fundTransfers_2months, fundTransfers_may, fundTransfers_april]
fundTransfers = pd.concat(frames)
f = {
'operationAmount':['count','sum', 'median', 'min', 'max'], #количество и сумма операций
'operationFee': ['sum'], #сумма комиссий
'sender': ['nunique'], #кол-во источников перевода
'recipient': ['nunique'], #кол-во получателей перевода
'operationMethod': ['nunique'], #кол-во способов перевода
'senderCurrency': ['nunique'], #кол-во валют источника перевода
'recipientCurrency': ['nunique'] #кол-во валют получателя перевода
}
fundTransfers_agg = fundTransfers.groupby(['DeviceID']).agg(f)
fundTransfers_agg.columns = fundTransfers_agg.columns.map('_'.join)
fundTransfers_agg = fundTransfers_agg.reset_index().rename(columns={
'operationAmount_count': 'transfers_operationAmount_count',
'operationAmount_sum': 'transfers_operationAmount_sum',
'operationAmount_median': 'transfers_operationAmount_median',
'operationAmount_min': 'transfers_operationAmount_min',
'operationAmount_max': 'transfers_operationAmount_max',
'operationFee_sum': 'transfers_operationFee_sum',
'sender_nunique': 'transfers_sender_nunique',
'recipient_nunique': 'transfers_recipient_nunique',
'operationMethod_nunique': 'transfers_operationMethod_nunique',
'senderCurrency_nunique': 'transfers_senderCurrency_nunique',
'recipientCurrency_nunique': 'transfers_recipientCurrency_nunique'
})
fundTransfers_agg['transfers_date'] = [sorted(list(set(fundTransfers['EventDate'].loc[fundTransfers['DeviceID'] == x['DeviceID']])))
for _, x in fundTransfers_agg.iterrows()]
fundTransfers_agg['transfers_avgdaylag'] = fundTransfers_agg.apply(lambda x: func(x['transfers_date']), axis=1)
fundTransfers_agg['transfers_daysactive'] = fundTransfers_agg['transfers_date'].apply(lambda x: len(x))
fundTransfers_agg = fundTransfers_agg.drop(['transfers_date'], axis=1)
df_main = df_main.join(fundTransfers_agg.set_index('DeviceID'), on='DeviceID')
temp = fundTransfers.groupby('DeviceID')[['EventDate']].max().reset_index()
temp['transfers_daysSinceLastTransfer'] = temp['EventDate'].apply(
lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['EventDate'], axis=1).set_index('DeviceID'), on='DeviceID')
fillna0 = [
'transfers_operationAmount_count',
'transfers_operationAmount_sum',
'transfers_operationAmount_median',
'transfers_operationAmount_min',
'transfers_operationAmount_max',
'transfers_operationFee_sum',
'transfers_sender_nunique',
'transfers_recipient_nunique',
'transfers_operationMethod_nunique',
'transfers_senderCurrency_nunique',
'transfers_recipientCurrency_nunique',
]
fillna122 = ['transfers_avgdaylag', 'transfers_daysactive', 'transfers_daysSinceLastTransfer']
for i in fillna0:
df_main[i].fillna(0, inplace=True)
for i in fillna122:
df_main[i].fillna(122, inplace=True)
q = '''
SELECT
DeviceID,
AppVersionName,
OSName,
ReceiveDate,
ReceiveTimestamp,
SessionStartDate,
SessionStartDateTime,
SessionStartTimestamp
FROM
mobile.sessions_starts_all
WHERE
SessionStartDate BETWEEN ''' + ''' \'''' + \
churn_query_dates[0] + '''\' AND ''' + '''\'''' + \
churn_query_dates[1] + '''\' ''''FORMAT TabSeparatedWithNames'''
sessions = get_clickhouse_df(q)
# посчитаем кол-во сессий на DeviceID, используя SessionStartTimestamp в качестве ID сессии
df_sessions_churn = sessions.groupby('DeviceID')[['SessionStartTimestamp']]\
.count().reset_index().sort_values(by='SessionStartTimestamp', ascending=False)
Посмотрим, какие DeviceID, из тех, что присутствуют в общем dataFrame за период, вернулись в прогнозируем периоде. Если DeviceID найден, ставим "0" (неотток), в противном случае - "1" (отток)
df_main['churned'] = df_main['DeviceID'].isin(df_sessions_churn['DeviceID']).apply(lambda x: 0 if x is True else 1)
Как видим, 4-месячный отток составляет порядка 30% процентов.
df_main['churned'].value_counts(normalize=True)
0 0.67 1 0.33 Name: churned, dtype: float64
Посмотрим, как признаки из разных категория коррелируют с целевым
sns.heatmap(df_main[[f for f in df_main.columns if 'sessions' in f or 'churned' in f]].corr(), annot=True);
# sns.heatmap(df_main[[f for f in df_main.columns if 'payments' in f or 'churned' in f]].corr(), annot=True);
# sns.heatmap(df_main[[f for f in df_main.columns if 'transfers' in f or 'churned' in f]].corr(), annot=True);
example = df_main[[f for f in df_main.columns if 'sessions' in f or 'churned' in f]][
((df_main['sessions_daysSinceLastSession'] > df_main['sessions_daysSinceLastSession'].quantile(.1)) &
(df_main['sessions_daysSinceLastSession'] < df_main['sessions_daysSinceLastSession'].quantile(.75)))
&
((df_main['sessions_avgdaylag'] > df_main['sessions_avgdaylag'].quantile(.1)) &
(df_main['sessions_avgdaylag'] < df_main['sessions_avgdaylag'].quantile(.75)))
&
((df_main['sessions_totaldaysactive'] > df_main['sessions_totaldaysactive'].quantile(.10)) &
(df_main['sessions_totaldaysactive'] < df_main['sessions_totaldaysactive'].quantile(.75)))
&
((df_main['sessions_totalnumber'] > df_main['sessions_totalnumber'].quantile(.10)) &
(df_main['sessions_totalnumber'] < df_main['sessions_totalnumber'].quantile(.75)))
]
_, axes = plt.subplots(1, 4, sharey=True, figsize=(16,4))
sns.boxplot(y='sessions_daysSinceLastSession', x='churned', data=example, palette='Blues', ax=axes[0]);
sns.boxplot(y='sessions_avgdaylag', x='churned', data=example, palette='Blues', ax=axes[1]);
sns.boxplot(y='sessions_totaldaysactive', x='churned', data=example, palette='Blues', ax=axes[2]);
sns.boxplot(y='sessions_totalnumber', x='churned', data=example, palette='Blues', ax=axes[3]);
sns.set(rc={"figure.figsize": (18, 8)})
sns.countplot(
x='sessions_daysSinceLastSession',
hue='churned',
data=df_main[(df_main['sessions_daysSinceLastSession'] > 10) &
(df_main['sessions_daysSinceLastSession'] < 35)],
palette='Blues'
);
Добавим несколько новых признаков, основывающихся, прежде всего, на сессионых данных пользователей.
# общее количество сессий за 3 месяца меньше 2
df_main['sessions_totalnumber_less_2'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x < 2 else 0)
# общее количество сессий за 3 месяца меньше 6
df_main['sessions_totalnumber_less_6'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x < 6 else 0)
# общее количество активных дней за 1 месяц меньше 2
df_main['sessions_totaldaysactive_less_2'] = df_main['sessions_totaldaysactive'].apply(lambda x: 1 if x < 2 else 0)
df_main['sessions_avgdaylag_greater_20'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 20.5 else 0)
df_main['sessions_avgdaylag_greater_10'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 10 else 0)
df_main['sessions_avgdaylag_greater_15'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 15 else 0)
df_main['sessions_totaldaysactive_equalsless_2'] = df_main['sessions_totaldaysactive'].apply(lambda x: 1 if x <= 2 else 0)
df_main['sessions_daysSinceLastSession_equalsgreater_19'] = df_main['sessions_daysSinceLastSession'].apply(lambda x: 1 if x >= 19 else 0)
df_main['sessions_totalnumber_equalsless_5'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x <= 5 else 0)
def label_race(row):
if (row['sessions_avgdaylag_greater_10'] == 1 and row['sessions_totaldaysactive_equalsless_2'] == 1 and
row['sessions_daysSinceLastSession_equalsgreater_19'] == 1 and row['sessions_totalnumber_equalsless_5'] == 1):
return 1
else:
return 0
df_main['sessions_isPassive'] = df_main.apply(lambda row: label_race (row),axis=1)
df_main['payments_isPassive'] = df_main['payments_operationAmount_count'].apply(lambda x: 1 if x == 0 else 0)
df_main['transfers_isPassive'] = df_main['transfers_operationAmount_count'].apply(lambda x: 1 if x == 0 else 0)
Наконец, посмотрим на получившийся dataFrame
df_main.head(10)
DeviceID | sessions_totalnumber | sessions_avgdaylag | sessions_daysSinceLastSession | sessions_totaldaysactive | payments_operationAmount_count | payments_operationAmount_sum | payments_operationAmount_median | payments_operationAmount_min | payments_operationAmount_max | ... | sessions_totaldaysactive_less_2 | sessions_avgdaylag_greater_20 | sessions_avgdaylag_greater_10 | sessions_avgdaylag_greater_15 | sessions_totaldaysactive_equalsless_2 | sessions_daysSinceLastSession_equalsgreater_19 | sessions_totalnumber_equalsless_5 | sessions_isPassive | payments_isPassive | transfers_isPassive | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
185272 | 17711056090712135034 | 2429 | 1.00 | 1 | 122 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
113760 | 10870103398657921835 | 2376 | 1.02 | 1 | 120 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
89049 | 8503699400535621652 | 2121 | 2.02 | 1 | 61 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
17655 | 1684057822033854533 | 2031 | 1.46 | 1 | 84 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
99239 | 9476564823644259242 | 1767 | 1.00 | 1 | 122 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
21926 | 2096057656045679170 | 1759 | 1.00 | 1 | 122 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
161609 | 15463201339127358568 | 1752 | 1.38 | 3 | 35 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
6791 | 650285170630914109 | 1716 | 21.25 | 17 | 5 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
16249 | 1545944349431951680 | 1602 | 1.23 | 1 | 85 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
73273 | 7006810564216597347 | 1575 | 2.09 | 1 | 59 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
10 rows × 43 columns
df_main.columns
Index(['DeviceID', 'sessions_totalnumber', 'sessions_avgdaylag', 'sessions_daysSinceLastSession', 'sessions_totaldaysactive', 'payments_operationAmount_count', 'payments_operationAmount_sum', 'payments_operationAmount_median', 'payments_operationAmount_min', 'payments_operationAmount_max', 'payments_sender_nunique', 'payments_recipient_nunique', 'payments_operationMethod_nunique', 'payments_avgdaylag', 'payments_daysactive', 'payments_daysSinceLastPayment', 'transfers_operationAmount_count', 'transfers_operationAmount_sum', 'transfers_operationAmount_median', 'transfers_operationAmount_min', 'transfers_operationAmount_max', 'transfers_operationFee_sum', 'transfers_sender_nunique', 'transfers_recipient_nunique', 'transfers_operationMethod_nunique', 'transfers_senderCurrency_nunique', 'transfers_recipientCurrency_nunique', 'transfers_avgdaylag', 'transfers_daysactive', 'transfers_daysSinceLastTransfer', 'churned', 'sessions_totalnumber_less_2', 'sessions_totalnumber_less_6', 'sessions_totaldaysactive_less_2', 'sessions_avgdaylag_greater_20', 'sessions_avgdaylag_greater_10', 'sessions_avgdaylag_greater_15', 'sessions_totaldaysactive_equalsless_2', 'sessions_daysSinceLastSession_equalsgreater_19', 'sessions_totalnumber_equalsless_5', 'sessions_isPassive', 'payments_isPassive', 'transfers_isPassive'], dtype='object')
df_main.shape
(193054, 43)
df_main.info()
<class 'pandas.core.frame.DataFrame'> Index: 193054 entries, 185272 to 45952 Data columns (total 43 columns): DeviceID 193054 non-null uint64 sessions_totalnumber 193054 non-null int64 sessions_avgdaylag 193054 non-null float64 sessions_daysSinceLastSession 193054 non-null int64 sessions_totaldaysactive 193054 non-null int64 payments_operationAmount_count 193054 non-null float64 payments_operationAmount_sum 193054 non-null float64 payments_operationAmount_median 193054 non-null float64 payments_operationAmount_min 193054 non-null float64 payments_operationAmount_max 193054 non-null float64 payments_sender_nunique 193054 non-null float64 payments_recipient_nunique 193054 non-null float64 payments_operationMethod_nunique 193054 non-null float64 payments_avgdaylag 193054 non-null float64 payments_daysactive 193054 non-null float64 payments_daysSinceLastPayment 193054 non-null float64 transfers_operationAmount_count 193054 non-null float64 transfers_operationAmount_sum 193054 non-null float64 transfers_operationAmount_median 193054 non-null float64 transfers_operationAmount_min 193054 non-null float64 transfers_operationAmount_max 193054 non-null float64 transfers_operationFee_sum 193054 non-null float64 transfers_sender_nunique 193054 non-null float64 transfers_recipient_nunique 193054 non-null float64 transfers_operationMethod_nunique 193054 non-null float64 transfers_senderCurrency_nunique 193054 non-null float64 transfers_recipientCurrency_nunique 193054 non-null float64 transfers_avgdaylag 193054 non-null float64 transfers_daysactive 193054 non-null float64 transfers_daysSinceLastTransfer 193054 non-null float64 churned 193054 non-null int64 sessions_totalnumber_less_2 193054 non-null int64 sessions_totalnumber_less_6 193054 non-null int64 sessions_totaldaysactive_less_2 193054 non-null int64 sessions_avgdaylag_greater_20 193054 non-null int64 sessions_avgdaylag_greater_10 193054 non-null int64 sessions_avgdaylag_greater_15 193054 non-null int64 sessions_totaldaysactive_equalsless_2 193054 non-null int64 sessions_daysSinceLastSession_equalsgreater_19 193054 non-null int64 sessions_totalnumber_equalsless_5 193054 non-null int64 sessions_isPassive 193054 non-null int64 payments_isPassive 193054 non-null int64 transfers_isPassive 193054 non-null int64 dtypes: float64(26), int64(16), uint64(1) memory usage: 64.8+ MB
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve, classification_report
from sklearn.metrics import f1_score
from collections import Counter
df_main.drop('DeviceID', axis=1, inplace=True)
y = df_main['churned']
X = df_main.drop('churned', axis=1)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.33, random_state=42)
import lightgbm as lgb
# Set params
# Scores ~0.784 (without tuning and early stopping)
params = {
'boosting_type': 'gbdt',
'max_depth' : -1,
'objective': 'binary',
'nthread': 3, # Updated from nthread
'num_leaves': 34,
'learning_rate': 0.1,
'max_bin': 1024,
'subsample_for_bin': 200,
'subsample': 1,
'subsample_freq': 1,
'colsample_bytree': 0.8,
'reg_alpha': 5,
'reg_lambda': 10,
'min_split_gain': 0.5,
'min_child_weight': 1,
'min_child_samples': 5,
'scale_pos_weight': 1,
'num_class' : 1,
'metric' : 'binary_logloss',
'num_iterations' : 200
}
gridParams = {
'learning_rate': [0.1],
'n_estimators': [40],
'num_leaves': [31],
'boosting_type' : ['gbdt'],
'objective' : ['binary'],
'random_state' : [42], # Updated from 'seed'
'colsample_bytree' : [0.65, 0.66],
'subsample' : [0.7,0.75],
'reg_alpha' : [1.2, 1.4],
'reg_lambda' : [1.2, 1.4]
}
lgbm = lgb.LGBMClassifier(
boosting_type= params['boosting_type'],
objective = params['objective'],
n_jobs = -1, # Updated from 'nthread'
silent = True,
max_depth = params['max_depth'],
max_bin = params['max_bin'],
subsample_for_bin = params['subsample_for_bin'],
subsample = params['subsample'],
subsample_freq = params['subsample_freq'],
min_split_gain = params['min_split_gain'],
min_child_weight = params['min_child_weight'],
min_child_samples = params['min_child_samples'],
scale_pos_weight = params['scale_pos_weight'],
learning_rate = params['learning_rate'],
num_iterations=params['num_iterations'],
num_leaves=params['num_leaves']
)
lgbm.fit(X_train, y_train)
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, learning_rate=0.1, max_bin=1024, max_depth=-1, min_child_samples=5, min_child_weight=1, min_split_gain=0.5, n_estimators=100, n_jobs=-1, num_iterations=200, num_leaves=34, objective='binary', random_state=None, reg_alpha=0.0, reg_lambda=0.0, scale_pos_weight=1, silent=True, subsample=1, subsample_for_bin=200, subsample_freq=1)
# F1 score для litegb - параметры НЕоптимизированы
report = classification_report(y_valid, lgbm.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
precision recall f1-score support Non-churned 0.89 0.91 0.90 42202 Churned 0.82 0.78 0.80 21506 avg / total 0.87 0.87 0.87 63708
%%time
lgbm_grid = GridSearchCV(lgbm, gridParams,
verbose=True,
cv=5,
n_jobs=-1, scoring='f1')
lgbm_grid.fit(X_train, y_train)
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 4.0min [Parallel(n_jobs=-1)]: Done 80 out of 80 | elapsed: 7.0min finished
CPU times: user 21.5 s, sys: 2.13 s, total: 23.6 s Wall time: 7min
lgbm_grid.best_params_, lgbm_grid.best_score_
({'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 31, 'objective': 'binary', 'random_state': 42, 'reg_alpha': 1.4, 'reg_lambda': 1.4, 'subsample': 0.75}, 0.8006453281114974)
# F1 score для litegb - параметры НЕоптимизированы
report = classification_report(y_valid, lgbm.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
precision recall f1-score support Non-churned 0.89 0.91 0.90 42202 Churned 0.82 0.78 0.80 21506 avg / total 0.87 0.87 0.87 63708
# F1 score для litegb - параметры оптимизированы
report = classification_report(y_valid, lgbm_grid.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
precision recall f1-score support Non-churned 0.89 0.91 0.90 42202 Churned 0.82 0.78 0.80 21506 avg / total 0.87 0.87 0.87 63708
# Воспользуемся функцией построения матрицы ошибок из документации sklearn
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j],
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
font = {'size' : 18}
plt.rc('font', **font)
cnf_matrix = confusion_matrix(y_valid, lgbm.predict(X_valid))
plt.figure(figsize=(10, 8))
plot_confusion_matrix(cnf_matrix, classes=['Non-churned', 'Churned'],
title='Confusion matrix')
# plt.savefig("conf_matrix.png")
plt.show();
/Users/dmitriilin/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty. if diff:
Confusion matrix, without normalization [[38568 3634] [ 4667 16839]]