HOST = 'http://localhost:8123'
import requests
import pandas as pd
import numpy as np
import seaborn as sns
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
import matplotlib
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
# get rid of scientific notation
pd.options.display.float_format = '{:.2f}'.format
# disable warnings
import warnings
warnings.filterwarnings('ignore')
import itertools
from pylab import rcParams
rcParams['figure.figsize'] = 12, 8
# get query results from the database
def get_clickhouse_data(query, host=HOST, connection_timeout=1500):
    r = requests.post(host, params={'query': query}, timeout=connection_timeout)
    if r.status_code == 200:
        return r.text
    else:
        raise ValueError(r.text)
# convert query results to a pandas DataFrame
def get_clickhouse_df(query, host=HOST, connection_timeout=1500):
    data = get_clickhouse_data(query, host, connection_timeout)
    df = pd.read_csv(StringIO(data), sep='\t')
    return df
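A quick way to sanity-check the connection is a trivial query. This is just an illustration, not part of the analysis; it assumes the ClickHouse server at HOST is reachable.
# Hypothetical sanity check: FORMAT TabSeparatedWithNames makes the HTTP response
# directly parseable by pd.read_csv with sep='\t'
test_df = get_clickhouse_df('SELECT 1 AS ping FORMAT TabSeparatedWithNames')
print(test_df)  # a one-row DataFrame with a single column named 'ping'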
query_dates = ['2018-06-01', '2018-07-31']
churn_query_dates = ['2018-08-01', '2018-08-31']
# date format
fmt = '%Y-%m-%d'
# a function for calculating the average lag (in days) between a user's consecutive active days;
# reused below for sessions, payments and fund transfers
def func(a):
    # pairwise iteration over consecutive dates
    i1, i2 = itertools.tee(iter(a))
    next(i2)
    res = [int(round((datetime.strptime(y, fmt) - datetime.strptime(x, fmt)).total_seconds() / 60)) / 1440
           for x, y in zip(i1, i2)]
    # NB: returns NaN for users with a single active day (handled later via fillna)
    return round(np.mean(res), 2)
from datetime import datetime
reportDate = datetime(2018, 8, 1)
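To make the intent of func concrete, here is a tiny worked example with made-up dates: for active days 2018-06-01, 2018-06-03 and 2018-06-07 the gaps are 2 and 4 days, so the average lag is 3.0.
# Toy example (made-up dates): gaps of 2 and 4 days give an average lag of 3.0
print(func(['2018-06-01', '2018-06-03', '2018-06-07']))  # 3.0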
sns.set(rc={'axes.facecolor':'grey', 'figure.facecolor':'grey'})
sns.set(palette='Blues')
Let's get users' session data. In my case, I need data for four months (April through July 2018).
%%time
q = '''
SELECT
DeviceID,
AppVersionName,
OSName,
City,
SessionStartDate,
SessionStartDateTime,
SessionStartTimestamp
FROM
mobile.sessions_starts_all
WHERE
SessionStartDate BETWEEN '{start}' AND '{end}'
FORMAT TabSeparatedWithNames
'''.format(start=query_dates[0], end=query_dates[1])
sessions = get_clickhouse_df(q)
CPU times: user 11.8 s, sys: 5.01 s, total: 16.8 s Wall time: 19.2 s
Add session data for May and April 2018
sessions_may = pd.read_csv('../dmitriilin/Downloads/sessions_starts_may.csv', sep=',')
sessions_april = pd.read_csv('../dmitriilin/Downloads/sessions_starts_april.csv', sep=',')
sessions_may['SessionStartDate'] = sessions_may['SessionStartDateTime'].apply(lambda x: x.split(' ')[0])
frames = [sessions, sessions_may, sessions_april]
sessions = pd.concat(frames)
This is what the DataFrame looks like
sessions.head()
| | AppVersionName | City | DeviceID | OSName | SessionStartDate | SessionStartDateTime | SessionStartTimestamp | Unnamed: 0 |
|---|---|---|---|---|---|---|---|---|
| 0 | 3.17.2 | Kazan | 7543012021622200292 | ios | 2018-06-01 | 2018-06-01 11:57:29 | 1527854249 | nan |
| 1 | 3.17.1 | Samara | 16883537386825900026 | android | 2018-06-01 | 2018-06-01 11:18:10 | 1527851890 | nan |
| 2 | 3.17.1 | Samara | 16883537386825900026 | android | 2018-06-01 | 2018-06-01 07:11:19 | 1527837079 | nan |
| 3 | 3.18.0 | Kazan | 170802125429386870 | android | 2018-06-01 | 2018-06-01 08:03:44 | 1527840224 | nan |
| 4 | 3.18.0 | Kazan | 170802125429386870 | android | 2018-06-01 | 2018-06-01 11:31:19 | 1527852679 | nan |
Let's create another DataFrame for aggregated data. Next, let's count the total number of sessions for each user (DeviceID), using SessionStartTimestamp as a session ID.
%%time
df_main = sessions.groupby('DeviceID')[['SessionStartTimestamp']]\
.count().reset_index().sort_values(by='SessionStartTimestamp', ascending=False)
CPU times: user 1.5 s, sys: 813 ms, total: 2.32 s Wall time: 3.82 s
Creating a list of all session dates in ascending order and writing it to the SessionStartDate column
df_main['SessionStartDate'] = [sorted(list(set(sessions['SessionStartDate'].loc[sessions['DeviceID'] == x['DeviceID']])))
for _, x in df_main.iterrows()]
Counting the average day lag between days on which at least one session occurred
df_main['sessions_avgdaylag'] = df_main.apply(lambda x: func(x['SessionStartDate']), axis=1)
If a user has been active for only one day, sessions_avgdaylag will equal the number of days from that day to the report date (1 August 2018, in my case).
tempSer = df_main[df_main['sessions_avgdaylag'].isnull()]['SessionStartDate'].apply(
lambda x: round((int(round((reportDate - datetime.strptime(x[0], fmt)).total_seconds() / 60))) / 1440))
df_main['sessions_avgdaylag'] = df_main['sessions_avgdaylag'].fillna(tempSer)
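Note that fillna with a Series fills by index alignment, so only the rows present in tempSer (the single-day users) are replaced. A tiny illustration with made-up values:
# Toy illustration (made-up numbers): only positions present in the filler Series are replaced
s = pd.Series([1.0, np.nan, np.nan])
filler = pd.Series({1: 99.0})
print(s.fillna(filler).tolist())  # [1.0, 99.0, nan]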
Let's count the number of days between each user's last session day and the report date.
temp = sessions.groupby('DeviceID')[['SessionStartDate']].max().reset_index()
temp['sessions_daysSinceLastSession'] = temp['SessionStartDate'].apply(
lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['SessionStartDate'], axis=1).set_index('DeviceID'), on='DeviceID')
Counting the total number of days on which a user has been active (at least one session)
df_main['sessions_totaldaysactive'] = df_main['SessionStartDate'].apply(lambda x: len(x))
Deleting the list of session dates and renaming the SessionStartTimestamp column
df_main = df_main.drop(['SessionStartDate'], axis=1).rename(index=str, columns={
"SessionStartTimestamp": "sessions_totalnumber"
})
Let's create a temporary DataFrame for all the events we recognize as "useful"
q = '''
SELECT
AppVersionName,
OSName,
EventDate,
ReceiveDate,
ReceiveTimestamp,
EventTimestamp,
EventDateTime,
EventName,
DeviceID,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'crmId') AS VARCHAR) AS crmId,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'status') AS VARCHAR) AS authStatus,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'gender') AS VARCHAR) AS gender,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'тип операции') AS VARCHAR) AS operationType,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'сумма операции') AS VARCHAR) AS operationAmount,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'источник') AS VARCHAR) AS sender,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта отправителя') AS VARCHAR) AS senderCurrency,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'получатель') AS VARCHAR) AS recipient,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта получателя') AS VARCHAR) AS recipientCurrency,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'размер комиссии') AS VARCHAR) AS operationFee,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'способ операции') AS VARCHAR) AS operationMethod,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'срок') AS VARCHAR) AS depositTerm,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'валюта') AS VARCHAR) AS depositCurrency,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'ставка') AS VARCHAR) AS depositRate,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'название') AS VARCHAR) AS depositName,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'тип чата') AS VARCHAR) AS chatType,
CAST(visitParamExtractRaw(replaceAll(EventParameters, '""', '"'), 'операция') AS VARCHAR) AS operation
FROM
mobile.events_all
WHERE
match(EventName,'продажи|переводы|Переводы|^платежи$')
AND EventDate BETWEEN '{start}' AND '{end}'
FORMAT TabSeparatedWithNames
'''.format(start=query_dates[0], end=query_dates[1])
events = get_clickhouse_df(q)
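For context, EventParameters is a JSON string, and the visitParamExtractRaw calls above simply pull out individual keys on the ClickHouse side (the replaceAll fixes the doubled quotes first). A hypothetical payload with made-up values, and the Python equivalent of that extraction:
import json

# Hypothetical EventParameters value (made-up) to illustrate what the SQL extracts
example_params = '{"тип операции": "оплата", "сумма операции": "1500", "способ операции": "карта"}'
parsed = json.loads(example_params)
print(parsed['сумма операции'])  # '1500' -> ends up in the operationAmount column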
Let's create a separate dataframe for payment events only.
payments = events[(events['EventName'] == 'платежи')]
payments = payments.dropna(subset = ['operationAmount'])
payments_may = pd.read_csv('../dmitriilin/Downloads/payments_may.csv', sep=',')
payments_april = pd.read_csv('../dmitriilin/Downloads/payments_april.csv', sep=',')
frames = [payments, payments_may, payments_april]
payments = pd.concat(frames)
payments.drop(['Unnamed: 0', 'chatType', 'depositCurrency', 'depositName', 'depositRate', 'depositTerm'],
axis=1,
inplace=True)
Calculating aggregate data for payment events
f = {
'operationAmount':['count','sum', 'median', 'min', 'max'],
'sender': ['nunique'],
'recipient': ['nunique'],
'operationMethod': ['nunique']
}
payments_agg = payments.groupby(['DeviceID']).agg(f)
payments_agg.columns = payments_agg.columns.map('_'.join)
payments_agg = payments_agg.reset_index().rename(columns={
'operationAmount_count': 'payments_operationAmount_count',
'operationAmount_sum': 'payments_operationAmount_sum',
'operationAmount_median': 'payments_operationAmount_median',
'operationAmount_min': 'payments_operationAmount_min',
'operationAmount_max': 'payments_operationAmount_max',
'sender_nunique': 'payments_sender_nunique',
'recipient_nunique': 'payments_recipient_nunique',
'operationMethod_nunique': 'payments_operationMethod_nunique'
})
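The map('_'.join) step above flattens the two-level columns produced by agg — for example ('operationAmount', 'sum') becomes 'operationAmount_sum' — before the rename adds the payments_ prefix. A tiny standalone illustration with made-up data:
# Toy illustration of the column flattening used above (made-up data)
toy = pd.DataFrame({'DeviceID': [1, 1, 2], 'operationAmount': [10, 20, 5]})
toy_agg = toy.groupby('DeviceID').agg({'operationAmount': ['count', 'sum']})
toy_agg.columns = toy_agg.columns.map('_'.join)
print(toy_agg.columns.tolist())  # ['operationAmount_count', 'operationAmount_sum']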
As we did with sessions, let's create a list containing all payment dates in ascending order
payments_agg['payments_date'] = [sorted(list(set(payments['EventDate'].loc[payments['DeviceID'] == x['DeviceID']])))
for _, x in payments_agg.iterrows()]
Counting the average day lag between payment events and the total number of payment days for each user
payments_agg['payments_avgdaylag'] = payments_agg.apply(lambda x: func(x['payments_date']), axis=1)
payments_agg['payments_daysactive'] = payments_agg['payments_date'].apply(lambda x: len(x))
payments_agg = payments_agg.drop(['payments_date'], axis=1)
Add payment event data to the main DataFrame
df_main = df_main.join(payments_agg.set_index('DeviceID'), on='DeviceID')
Counting the number of days since each user's last payment event.
temp = payments.groupby('DeviceID')[['EventDate']].max().reset_index()
temp['payments_daysSinceLastPayment'] = temp['EventDate'].apply(
lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['EventDate'], axis=1).set_index('DeviceID'), on='DeviceID')
Let's fill NA values for users who never made a payment within the chosen time period. In this case, the avgdaylag and daysSinceLastPayment features are both set to 122 (the total number of days in the period).
fillna0 = [
'payments_operationAmount_count',
'payments_operationAmount_sum',
'payments_operationAmount_median',
'payments_operationAmount_min',
'payments_operationAmount_max',
'payments_sender_nunique',
'payments_recipient_nunique',
'payments_operationMethod_nunique',
'payments_daysactive'
]
fillna122 = ['payments_avgdaylag', 'payments_daysSinceLastPayment']
for i in fillna0:
    df_main[i].fillna(0, inplace=True)
for i in fillna122:
    df_main[i].fillna(122, inplace=True)
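The fill value of 122 is simply the length of the observation window: April, May, June and July 2018 add up to 30 + 31 + 30 + 31 = 122 days, which is easy to double-check:
# Sanity check of the fill value: the window is 2018-04-01 .. 2018-07-31
print((datetime(2018, 8, 1) - datetime(2018, 4, 1)).days)  # 122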
Let's create the same features for fund transfer events as we just did for payments.
fundTransfers = events[(events['EventName'] == 'переводы')]
fundTransfers = fundTransfers.dropna(subset = ['sender'])
fundTransfers_may = pd.read_csv('../dmitriilin/Downloads/transfers_may.csv', sep=',')
fundTransfers_april = pd.read_csv('../dmitriilin/Downloads/transfers_april.csv', sep=',')
frames = [fundTransfers, fundTransfers_may, fundTransfers_april]
fundTransfers = pd.concat(frames)
f = {
'operationAmount':['count','sum', 'median', 'min', 'max'],
'operationFee': ['sum'],
'sender': ['nunique'],
'recipient': ['nunique'],
'operationMethod': ['nunique'],
'senderCurrency': ['nunique'],
'recipientCurrency': ['nunique']
}
fundTransfers_agg = fundTransfers.groupby(['DeviceID']).agg(f)
fundTransfers_agg.columns = fundTransfers_agg.columns.map('_'.join)
fundTransfers_agg = fundTransfers_agg.reset_index().rename(columns={
'operationAmount_count': 'transfers_operationAmount_count',
'operationAmount_sum': 'transfers_operationAmount_sum',
'operationAmount_median': 'transfers_operationAmount_median',
'operationAmount_min': 'transfers_operationAmount_min',
'operationAmount_max': 'transfers_operationAmount_max',
'operationFee_sum': 'transfers_operationFee_sum',
'sender_nunique': 'transfers_sender_nunique',
'recipient_nunique': 'transfers_recipient_nunique',
'operationMethod_nunique': 'transfers_operationMethod_nunique',
'senderCurrency_nunique': 'transfers_senderCurrency_nunique',
'recipientCurrency_nunique': 'transfers_recipientCurrency_nunique'
})
fundTransfers_agg['transfers_date'] = [sorted(list(set(fundTransfers['EventDate'].loc[fundTransfers['DeviceID'] == x['DeviceID']])))
for _, x in fundTransfers_agg.iterrows()]
fundTransfers_agg['transfers_avgdaylag'] = fundTransfers_agg.apply(lambda x: func(x['transfers_date']), axis=1)
fundTransfers_agg['transfers_daysactive'] = fundTransfers_agg['transfers_date'].apply(lambda x: len(x))
fundTransfers_agg = fundTransfers_agg.drop(['transfers_date'], axis=1)
df_main = df_main.join(fundTransfers_agg.set_index('DeviceID'), on='DeviceID')
temp = fundTransfers.groupby('DeviceID')[['EventDate']].max().reset_index()
temp['transfers_daysSinceLastTransfer'] = temp['EventDate'].apply(
lambda x: round((int(round((reportDate - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440))
df_main = df_main.join(temp.drop(['EventDate'], axis=1).set_index('DeviceID'), on='DeviceID')
fillna0 = [
'transfers_operationAmount_count',
'transfers_operationAmount_sum',
'transfers_operationAmount_median',
'transfers_operationAmount_min',
'transfers_operationAmount_max',
'transfers_operationFee_sum',
'transfers_sender_nunique',
'transfers_recipient_nunique',
'transfers_operationMethod_nunique',
'transfers_senderCurrency_nunique',
'transfers_recipientCurrency_nunique',
]
fillna122 = ['transfers_avgdaylag', 'transfers_daysactive', 'transfers_daysSinceLastTransfer']
for i in fillna0:
    df_main[i].fillna(0, inplace=True)
for i in fillna122:
    df_main[i].fillna(122, inplace=True)
q = '''
SELECT
DeviceID,
AppVersionName,
OSName,
ReceiveDate,
ReceiveTimestamp,
SessionStartDate,
SessionStartDateTime,
SessionStartTimestamp
FROM
mobile.sessions_starts_all
WHERE
SessionStartDate BETWEEN '{start}' AND '{end}'
FORMAT TabSeparatedWithNames
'''.format(start=churn_query_dates[0], end=churn_query_dates[1])
sessions = get_clickhouse_df(q)
df_sessions_churn = sessions.groupby('DeviceID')[['SessionStartTimestamp']]\
.count().reset_index().sort_values(by='SessionStartTimestamp', ascending=False)
Let's see which DeviceIDs from our main DataFrame have churned in August 2018. If a DeviceID is found in the df_sessions_churn frame, it is a non-churned client (0); otherwise it has churned (1).
df_main['churned'] = (~df_main['DeviceID'].isin(df_sessions_churn['DeviceID'])).astype(int)
As we can see, the 4-month churn rate is about 33%
df_main['churned'].value_counts(normalize=True)
0   0.67
1   0.33
Name: churned, dtype: float64
Let's look at the correlation matrix
sns.heatmap(df_main[[f for f in df_main.columns if 'sessions' in f or 'churned' in f]].corr(), annot=True);
# sns.heatmap(df_main[[f for f in df_main.columns if 'payments' in f or 'churned' in f]].corr(), annot=True);
# sns.heatmap(df_main[[f for f in df_main.columns if 'transfers' in f or 'churned' in f]].corr(), annot=True);
Finally, let's have a look at our ready-to-use DataFrame
df_main.tail()
| | DeviceID | sessions_totalnumber | sessions_avgdaylag | sessions_daysSinceLastSession | sessions_totaldaysactive | payments_operationAmount_count | payments_operationAmount_sum | payments_operationAmount_median | payments_operationAmount_min | payments_operationAmount_max | ... | transfers_operationFee_sum | transfers_sender_nunique | transfers_recipient_nunique | transfers_operationMethod_nunique | transfers_senderCurrency_nunique | transfers_recipientCurrency_nunique | transfers_avgdaylag | transfers_daysactive | transfers_daysSinceLastTransfer | churned |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
107876 | 10313336111367727706 | 1 | 53.00 | 53 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 122.00 | 122.00 | 122.00 | 1 |
68108 | 6514306888048145041 | 1 | 114.00 | 114 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 122.00 | 122.00 | 122.00 | 1 |
38900 | 3725470542799209790 | 1 | 59.00 | 59 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 122.00 | 122.00 | 122.00 | 0 |
107882 | 10313953945953922630 | 1 | 122.00 | 122 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 122.00 | 122.00 | 122.00 | 1 |
45952 | 4396644337809361834 | 1 | 10.00 | 10 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 122.00 | 122.00 | 122.00 | 1 |
5 rows × 31 columns
df_main.columns
Index(['DeviceID', 'sessions_totalnumber', 'sessions_avgdaylag', 'sessions_daysSinceLastSession', 'sessions_totaldaysactive', 'payments_operationAmount_count', 'payments_operationAmount_sum', 'payments_operationAmount_median', 'payments_operationAmount_min', 'payments_operationAmount_max', 'payments_sender_nunique', 'payments_recipient_nunique', 'payments_operationMethod_nunique', 'payments_avgdaylag', 'payments_daysactive', 'payments_daysSinceLastPayment', 'transfers_operationAmount_count', 'transfers_operationAmount_sum', 'transfers_operationAmount_median', 'transfers_operationAmount_min', 'transfers_operationAmount_max', 'transfers_operationFee_sum', 'transfers_sender_nunique', 'transfers_recipient_nunique', 'transfers_operationMethod_nunique', 'transfers_senderCurrency_nunique', 'transfers_recipientCurrency_nunique', 'transfers_avgdaylag', 'transfers_daysactive', 'transfers_daysSinceLastTransfer', 'churned'], dtype='object')
df_main.shape
(193054, 31)
df_main.info()
<class 'pandas.core.frame.DataFrame'>
Index: 193054 entries, 185272 to 45952
Data columns (total 31 columns):
DeviceID                               193054 non-null uint64
sessions_totalnumber                   193054 non-null int64
sessions_avgdaylag                     193054 non-null float64
sessions_daysSinceLastSession          193054 non-null int64
sessions_totaldaysactive               193054 non-null int64
payments_operationAmount_count         193054 non-null float64
payments_operationAmount_sum           193054 non-null float64
payments_operationAmount_median        193054 non-null float64
payments_operationAmount_min           193054 non-null float64
payments_operationAmount_max           193054 non-null float64
payments_sender_nunique                193054 non-null float64
payments_recipient_nunique             193054 non-null float64
payments_operationMethod_nunique       193054 non-null float64
payments_avgdaylag                     193054 non-null float64
payments_daysactive                    193054 non-null float64
payments_daysSinceLastPayment          193054 non-null float64
transfers_operationAmount_count        193054 non-null float64
transfers_operationAmount_sum          193054 non-null float64
transfers_operationAmount_median       193054 non-null float64
transfers_operationAmount_min          193054 non-null float64
transfers_operationAmount_max          193054 non-null float64
transfers_operationFee_sum             193054 non-null float64
transfers_sender_nunique               193054 non-null float64
transfers_recipient_nunique            193054 non-null float64
transfers_operationMethod_nunique      193054 non-null float64
transfers_senderCurrency_nunique       193054 non-null float64
transfers_recipientCurrency_nunique    193054 non-null float64
transfers_avgdaylag                    193054 non-null float64
transfers_daysactive                   193054 non-null float64
transfers_daysSinceLastTransfer        193054 non-null float64
churned                                193054 non-null int64
dtypes: float64(26), int64(4), uint64(1)
memory usage: 47.1+ MB
Let's add some new features, based primarily on users' session data.
# total number of sessions for 4 months is less than 2
df_main['sessions_totalnumber_less_2'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x < 2 else 0)
# total number of sessions for 4 months is less than 6
df_main['sessions_totalnumber_less_6'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x < 6 else 0)
# total number of active days is less than 2
df_main['sessions_totaldaysactive_less_2'] = df_main['sessions_totaldaysactive'].apply(lambda x: 1 if x < 2 else 0)
# average day lag between session days is greater than 20 (10, 15)
# number of total active days is less or equals 2
# number of days passed since last user's session is greater or equals 19
# total number of sessions is less or equals 5
df_main['sessions_avgdaylag_greater_20'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 20.5 else 0)
df_main['sessions_avgdaylag_greater_10'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 10 else 0)
df_main['sessions_avgdaylag_greater_15'] = df_main['sessions_avgdaylag'].apply(lambda x: 1 if x > 15 else 0)
df_main['sessions_totaldaysactive_equalsless_2'] = df_main['sessions_totaldaysactive'].apply(lambda x: 1 if x <= 2 else 0)
df_main['sessions_daysSinceLastSession_equalsgreater_19'] = df_main['sessions_daysSinceLastSession'].apply(lambda x: 1 if x >= 19 else 0)
df_main['sessions_totalnumber_equalsless_5'] = df_main['sessions_totalnumber'].apply(lambda x: 1 if x <= 5 else 0)
With the help of this function, we combine some of the most useful (as we'll see later) features into a single feature named "sessions_isPassive"
def label_race(row):
    if (row['sessions_avgdaylag_greater_10'] == 1 and row['sessions_totaldaysactive_equalsless_2'] == 1 and
            row['sessions_daysSinceLastSession_equalsgreater_19'] == 1 and row['sessions_totalnumber_equalsless_5'] == 1):
        return 1
    else:
        return 0
df_main['sessions_isPassive'] = df_main.apply(label_race, axis=1)
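The same flag can also be computed without a row-wise apply; on a frame of this size a vectorized version of the four conditions is typically much faster and gives identical values:
# Vectorized equivalent of label_race: all four binary flags must be 1
df_main['sessions_isPassive'] = (
    (df_main['sessions_avgdaylag_greater_10'] == 1)
    & (df_main['sessions_totaldaysactive_equalsless_2'] == 1)
    & (df_main['sessions_daysSinceLastSession_equalsgreater_19'] == 1)
    & (df_main['sessions_totalnumber_equalsless_5'] == 1)
).astype(int)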
df_main['payments_isPassive'] = df_main['payments_operationAmount_count'].apply(lambda x: 1 if x == 0 else 0)
df_main['transfers_isPassive'] = df_main['transfers_operationAmount_count'].apply(lambda x: 1 if x == 0 else 0)
Let's look at the corr() matrix again
fig, ax = plt.subplots(figsize=(15,10))
# sns.heatmap(df_main.corr());
sns.heatmap(df_main[[f for f in df_main.columns if 'sessions' in f or 'churned' in f]].corr(), annot=True);
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve, classification_report
from sklearn.metrics import f1_score
from collections import Counter
y = df_main['churned']
X = df_main.drop('churned', axis=1)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.33, random_state=42)
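The classes are imbalanced (roughly 2:1), so an optional refinement is a stratified split that keeps the churn share identical in both parts. I keep the plain split above to match the outputs below; a stratified variant would look like this (different variable names, so nothing downstream changes):
# Optional variant (not used below): stratify on y to keep the ~33% churn share in both parts
X_tr_s, X_val_s, y_tr_s, y_val_s = train_test_split(
    X, y, test_size=.33, random_state=42, stratify=y)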
# df_small = X_valid
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, max_depth=6, max_features=10)
print(np.mean(cross_val_score(forest, X_train, y_train, cv=10)))
0.8655234983543763
forest.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=6, max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False)
print(classification_report(y_valid, forest.predict(X_valid), target_names=['non-churned', 'churned']))
             precision    recall  f1-score   support

non-churned       0.90      0.90      0.90     42202
    churned       0.80      0.79      0.80     21506

avg / total       0.86      0.86      0.86     63708
forest_params = {
'max_depth': list(range(5,40,10)),
'max_features': list(range(5,41,10)) + [40]
}
forest_grid = GridSearchCV(forest, forest_params, cv=10, n_jobs=-1, verbose=True, scoring='f1')  # alternative: try RandomizedSearchCV
forest_grid.fit(X_train, y_train)
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 4.7min
[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 50.1min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 53.8min finished
GridSearchCV(cv=10, error_score='raise', estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=6, max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False), fit_params=None, iid=True, n_jobs=-1, param_grid={'max_depth': [5, 15, 25, 35], 'max_features': [5, 15, 25, 35, 40]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=True, scoring='f1', verbose=True)
forest_grid.best_params_, forest_grid.best_score_
({'max_depth': 5, 'max_features': 25}, 0.7975545453175075)
print(classification_report(y_valid, forest_grid.predict(X_valid), target_names=['non-churned', 'churned']))
             precision    recall  f1-score   support

non-churned       0.89      0.91      0.90     42202
    churned       0.82      0.78      0.80     21506

avg / total       0.87      0.87      0.87     63708
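As the comment on the grid-search cell hints, a randomized search over the same space is a cheaper option when the grid grows; a sketch with the same estimator and scoring (not run here, forest_rand is just an illustrative name):
# Sketch: randomized search over the same parameter space with far fewer fits
from sklearn.model_selection import RandomizedSearchCV
forest_rand = RandomizedSearchCV(forest, forest_params, n_iter=10, cv=10,
                                 scoring='f1', n_jobs=-1, random_state=42, verbose=True)
# forest_rand.fit(X_train, y_train)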
from IPython.display import display
import re
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFECV
X = np.array(df_main.drop('churned', axis=1))
training_features = np.array(df_main.drop('churned', axis=1).columns)
# X = preprocessing.scale(X)  # scaling is not needed for tree-based models such as XGBoost
y = np.array(df_main['churned'])
xgb_clf = xgb.XGBClassifier()  # named so it does not shadow the xgboost module
scores = cross_val_score(xgb_clf, X, y, cv=5, n_jobs=-1, verbose=True, scoring='f1')
xgb_clf.fit(X_train, y_train)
print(scores)
print('F1: %.3f stdev: %.2f' % (np.mean(np.abs(scores)), np.std(scores)))
[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 1.3min finished
[0.01780643 0.01222984 0.02732627 0.15069699 0.50539171]
F1: 0.143 stdev: 0.19
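The wildly unstable fold scores above are most likely an artifact of unshuffled splits: df_main is sorted by session count, so each fold sees a very different slice of users. A sketch of a shuffled, stratified cross-validation on the same X and y, which should give far more stable folds:
# Sketch: shuffled stratified folds remove the ordering artifact of the run above
from sklearn.model_selection import StratifiedKFold, cross_val_score
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_shuffled = cross_val_score(xgb_clf, X, y, cv=skf, scoring='f1', n_jobs=-1)
print(scores_shuffled, scores_shuffled.mean())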
report = classification_report(y_valid, xgb_clf.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
             precision    recall  f1-score   support

Non-churned       0.89      0.92      0.90     42202
    Churned       0.83      0.77      0.80     21506

avg / total       0.87      0.87      0.87     63708
xgbgridparams = {'learning_rate':[0, 0.001, 0.002, 0.004, 0.006, 0.008, 0.010, 0.1],
'reg_lambda':[0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]}
xgb_grid = GridSearchCV(
estimator=xgb_clf,
param_grid=xgbgridparams,
scoring='f1',
n_jobs=-1,
refit=True,
verbose=True,
cv=5)
xgb_grid.fit(X_train,y_train)
print(xgb_grid.best_params_)
print(xgb_grid.best_score_)
Fitting 5 folds for each of 56 candidates, totalling 280 fits
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 5.4min
[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 23.0min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 33.0min finished
{'learning_rate': 0.1, 'reg_lambda': 0}
0.7991178080485845
report = classification_report(y_valid, xgb_grid.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
             precision    recall  f1-score   support

Non-churned       0.89      0.92      0.90     42202
    Churned       0.83      0.77      0.80     21506

avg / total       0.87      0.87      0.87     63708
import lightgbm as lgb
# Set params
# Scores ~0.784 (without tuning and early stopping)
params = {
'boosting_type': 'gbdt',
'max_depth' : -1,
'objective': 'binary',
'nthread': 3,
'num_leaves': 34,
'learning_rate': 0.1,
'max_bin': 1024,
'subsample_for_bin': 200,
'subsample': 1,
'subsample_freq': 1,
'colsample_bytree': 0.8,
'reg_alpha': 5,
'reg_lambda': 10,
'min_split_gain': 0.5,
'min_child_weight': 1,
'min_child_samples': 5,
'scale_pos_weight': 1,
'num_class' : 1,
'metric' : 'binary_logloss',
'num_iterations' : 200
}
gridParams = {
'learning_rate': [0.1],
'n_estimators': [40],
'num_leaves': [31],
'boosting_type' : ['gbdt'],
'objective' : ['binary'],
'random_state' : [42], # Updated from 'seed'
'colsample_bytree' : [0.65, 0.66],
'subsample' : [0.7,0.75],
'reg_alpha' : [1.2, 1.4],
'reg_lambda' : [1.2, 1.4]
}
lgbm = lgb.LGBMClassifier(
boosting_type= params['boosting_type'],
objective = params['objective'],
n_jobs = -1, # Updated from 'nthread'
silent = True,
max_depth = params['max_depth'],
max_bin = params['max_bin'],
subsample_for_bin = params['subsample_for_bin'],
subsample = params['subsample'],
subsample_freq = params['subsample_freq'],
min_split_gain = params['min_split_gain'],
min_child_weight = params['min_child_weight'],
min_child_samples = params['min_child_samples'],
scale_pos_weight = params['scale_pos_weight'],
learning_rate = params['learning_rate'],
num_iterations=params['num_iterations'],
num_leaves=params['num_leaves']
)
lgbm.fit(X_train, y_train)
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, learning_rate=0.1, max_bin=1024, max_depth=-1, min_child_samples=5, min_child_weight=1, min_split_gain=0.5, n_estimators=100, n_jobs=-1, num_iterations=200, num_leaves=34, objective='binary', random_state=None, reg_alpha=0.0, reg_lambda=0.0, scale_pos_weight=1, silent=True, subsample=1, subsample_for_bin=200, subsample_freq=1)
# F1 score for LightGBM - parameters NOT optimized
report = classification_report(y_valid, lgbm.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
             precision    recall  f1-score   support

Non-churned       0.89      0.91      0.90     42202
    Churned       0.82      0.78      0.80     21506

avg / total       0.87      0.87      0.87     63708
%%time
lgbm_grid = GridSearchCV(lgbm, gridParams,
verbose=True,
cv=5,
n_jobs=-1, scoring='f1')
lgbm_grid.fit(X_train, y_train)
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 3.7min
[Parallel(n_jobs=-1)]: Done 80 out of 80 | elapsed: 7.0min finished
CPU times: user 18.9 s, sys: 1.92 s, total: 20.8 s Wall time: 7min 3s
lgbm_grid.best_params_, lgbm_grid.best_score_
({'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 31, 'objective': 'binary', 'random_state': 42, 'reg_alpha': 1.2, 'reg_lambda': 1.4, 'subsample': 0.75}, 0.8000277816005431)
# F1 score for LightGBM - parameters NOT optimized
report = classification_report(y_valid, lgbm.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
             precision    recall  f1-score   support

Non-churned       0.89      0.91      0.90     42202
    Churned       0.82      0.78      0.80     21506

avg / total       0.87      0.87      0.87     63708
# F1 score for LightGBM - parameters optimized
report = classification_report(y_valid, lgbm_grid.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
             precision    recall  f1-score   support

Non-churned       0.89      0.91      0.90     42202
    Churned       0.82      0.78      0.80     21506

avg / total       0.87      0.87      0.87     63708
from catboost import CatBoostClassifier, cv, Pool
model = CatBoostClassifier(
custom_loss=['F1'],
random_seed=42,
logging_level='Silent'
)
# np.where(X.dtypes != np.float)[0]
model.fit(
X_train, y_train,
eval_set=(X_valid, y_valid),
plot=False
);
Learning rate set to 0.130013
cv_data = cv(
Pool(X, y),
model.get_params(),
plot=False
)
Learning rate set to 0.066243
print('Best validation F1 score: {:.2f}±{:.2f} on step {}'.format(
np.max(cv_data['test-F1-mean']),
cv_data['test-F1-std'][np.argmax(cv_data['test-F1-mean'])],
np.argmax(cv_data['test-F1-mean'])
))
Best validation F1 score: 0.80±0.00 on step 197
print('Precise validation F1 score: {}'.format(np.max(cv_data['test-F1-mean'])))
Precise validation F1 score: 0.8021786765326228
model_without_seed = CatBoostClassifier(iterations=500, logging_level='Silent')
model_without_seed.fit(X_train, y_train)
print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))
Learning rate set to 0.123018
Random seed assigned for this model: 21553925559374
params = {
'iterations': 200,
'learning_rate': 0.1,
'eval_metric': 'F1',
'random_seed': 42,
'logging_level': 'Silent'
}
train_pool = Pool(X_train, y_train)
validate_pool = Pool(X_valid, y_valid)
cb = CatBoostClassifier(**params)
cb.fit(train_pool, eval_set=validate_pool)
# best_model_params = params.copy()
# best_model_params.update({
# 'use_best_model': True
# })
# best_model = CatBoostClassifier(**best_model_params)
# best_model.fit(train_pool, eval_set=validate_pool);
print('Best model validation F1: {:.4}'.format(
# accuracy_score(y_valid, best_model.predict(X_valid))
f1_score(y_valid, cb.predict(X_valid))
))
Best model validation F1: 0.8022
%%time
params.update({
'od_type': 'Iter',
'od_wait': 50
})
cb_early_stopping = CatBoostClassifier(**params)
cb_early_stopping.fit(train_pool, eval_set=validate_pool);
CPU times: user 30.1 s, sys: 965 ms, total: 31.1 s Wall time: 9.21 s
print('Early-stopped best model tree count: {}'.format(cb.tree_count_))
print('Early-stopped best model validation F1: {:.4}'.format(
f1_score(y_valid, cb_early_stopping.predict(X_valid))
))
Early-stopped best model tree count: 200
Early-stopped best model validation F1: 0.802
cb_early_stopping.get_params(), cb.get_params()
({'eval_metric': 'F1', 'iterations': 200, 'learning_rate': 0.1, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'od_type': 'Iter', 'od_wait': 50, 'random_seed': 42}, {'eval_metric': 'F1', 'iterations': 200, 'learning_rate': 0.1, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'random_seed': 42})
import hyperopt
def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=100,  # make 50
        eval_metric='F1',
        random_seed=42,
        logging_level='Silent'
    )
    cv_data = cv(
        Pool(X, y),
        model.get_params()
    )
    best_f1 = np.max(cv_data['test-F1-mean'])
    return 1 - best_f1  # hyperopt minimises the objective
from numpy.random import RandomState
params_space = {
'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1)
}
trials = hyperopt.Trials()
best = hyperopt.fmin(
hyperopt_objective,
space=params_space,
algo=hyperopt.tpe.suggest,
max_evals=50,
trials=trials,
rstate=RandomState(123)
)
print(best)
{'l2_leaf_reg': 1.0, 'learning_rate': 0.33181404533347925}
model = CatBoostClassifier(
l2_leaf_reg=int(best['l2_leaf_reg']),
learning_rate=best['learning_rate'],
iterations=100,
eval_metric='F1',
random_seed=42,
logging_level='Silent',
use_best_model=True
)
cv_data = cv(Pool(X, y), model.get_params())
print('Precise validation F1 score: {}'.format(np.max(cv_data['test-F1-mean'])))
Precise validation F1 score: 0.8022381788027445
# F1 score for CatBoost - parameters NOT optimized
report = classification_report(y_valid, cb.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
             precision    recall  f1-score   support

Non-churned       0.89      0.91      0.90     42202
    Churned       0.82      0.78      0.80     21506

avg / total       0.87      0.87      0.87     63708
model.fit(train_pool, eval_set=validate_pool);
# F1 score for CatBoost - parameters optimized
report = classification_report(y_valid, model.predict(X_valid), target_names=['Non-churned', 'Churned'])
print(report)
             precision    recall  f1-score   support

Non-churned       0.89      0.91      0.90     42202
    Churned       0.82      0.78      0.80     21506

avg / total       0.87      0.87      0.87     63708
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # normalize first, so that the plotted image and the printed values agree
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
font = {'size' : 18}
plt.rc('font', **font)
cnf_matrix = confusion_matrix(y_valid, lgbm.predict(X_valid))
plt.figure(figsize=(10, 8))
plot_confusion_matrix(cnf_matrix, classes=['Non-churned', 'Churned'],
title='Confusion matrix')
# plt.savefig("conf_matrix.png")
plt.show()
Confusion matrix, without normalization
[[38551  3651]
 [ 4662 16844]]
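To read the matrix as shares of each true class rather than raw counts, the same helper can be called with normalize=True (not shown above):
# Same matrix, row-normalized to true-label shares
plt.figure(figsize=(10, 8))
plot_confusion_matrix(cnf_matrix, classes=['Non-churned', 'Churned'],
                      normalize=True, title='Normalized confusion matrix')
plt.show()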