Для работы нам понадобится:
ClickHouse можно установить на Linux (подробно про установку написано в документации https://clickhouse.yandex/).
На Windows или Mac можно запустить ClickHouse под Docker'ом: первая команда поднимает clickhouse-server на порту 8123, а вторая - позволяет подключиться к консольному ClickHouse клиенту.
docker run -d --name clickhouse-server --publish=8123:8123 --publish=9000:9000 yandex/clickhouse-server
docker run -it --rm --link clickhouse-server:clickhouse-server yandex/clickhouse-client --host clickhouse-server
Python можно скачать на официальном сайте. Также в скрипте будут использованы некоторые дополнительные библиотеки, их проще всего поставить с помощью менеджера пакетов pip (инструкция по установке pip)
pip install pandas jupyter requests plotly
Для того, чтобы запустить jupyter notebook, достаточно выполнить в консоли команду (после этого веб-интерфейс будет доступен по адресу http://localhost:8888):
jupyter notebook
Проще всего загрузить данные из LogsAPI в ClickHouse с помощью скрипта на GitHub'e. Прежде всего нужно указать всю информацию в конфиге.
В конфиге собраны основные параметры и будет достаточно задать их один раз и затем пользоваться. Для данной задачи я использовала вот такой конфиг:
{
"token" : "<token>",
"counter_id": "29761725",
"visits_fields": [
"ym:s:counterID",
"ym:s:dateTime",
"ym:s:date",
"ym:s:visitDuration",
"ym:s:bounce",
"ym:s:pageViews",
"ym:s:goalsID",
"ym:s:clientID",
"ym:s:lastTrafficSource",
"ym:s:lastAdvEngine",
"ym:s:lastSearchEngineRoot",
"ym:s:visitID",
"ym:s:startURL",
"ym:s:browser"
],
"log_level": "INFO",
"retries": 1,
"retries_delay": 60,
"clickhouse": {
"host": "http://localhost:8123",
"user": "",
"password": "",
"visits_table": "visits_all",
"hits_table": "hits_all",
"database": "demo"
}
}
Часть параметров задается непосредственно при вызове функции в CLI options. Например, источник (хиты или визиты) и период выгрузки.
У скрипта есть несколько режимов работы.
Во-первых, можно задать конкретный период (start_date
и end_date
):
python metrica_logs_api.py -source visits -start_date 2016-01-01 -end_date 2016-12-31
Другая опция это использовать один из режимов:
python metrica_logs_api.py -source visits -mode regular
Напишем функции для интеграции с ClickHouse: первая функция просто возвращает результат из DataBase, вторая же преобразует его в pandas DataFrame.
HOST = 'http://localhost:8123'
import requests
import pandas as pd
import StringIO
import datetime
def get_clickhouse_data(query, host = HOST, connection_timeout = 1500):
    """Execute a query via the ClickHouse HTTP interface and return the raw text body.

    Parameters:
        query: SQL query string to run.
        host: ClickHouse HTTP endpoint, e.g. 'http://localhost:8123'.
        connection_timeout: request timeout in seconds.

    Raises:
        ValueError: on any non-200 response; the exception message is the
            response body (ClickHouse's own error text).
    """
    r = requests.post(host, params = {'query': query}, timeout = connection_timeout)
    if r.status_code == 200:
        return r.text
    # The call form of raise works in both Python 2 and 3, unlike the
    # legacy statement form `raise ValueError, r.text` used originally.
    raise ValueError(r.text)
def get_clickhouse_df(query, host = HOST, connection_timeout = 1500):
    """Run a query against ClickHouse and parse its TSV answer into a pandas DataFrame."""
    raw_response = get_clickhouse_data(query, host, connection_timeout)
    return pd.read_csv(StringIO.StringIO(raw_response), sep = '\t')
Для начала посмотрим, какие данные есть в выгруженной с помощью Logs API таблице visits_all
q = '''DESCRIBE demo.visits_all'''
print get_clickhouse_data(q)
Bounce UInt8 Browser String ClientID UInt64 CounterID UInt32 Date Date DateTime DateTime GoalsID Array(UInt32) LastAdvEngine String LastSearchEngineRoot String LastTrafficSource String PageViews Int32 StartURL String VisitDuration UInt32 VisitID UInt64
Описание и полный список полей можно найти в документации Logs API: https://tech.yandex.ru/metrika/doc/api2/logs/intro-docpage/
q = '''
SELECT * FROM demo.visits_all
LIMIT 5
FORMAT TabSeparatedWithNames
'''
get_clickhouse_df(q)
Bounce | Browser | ClientID | CounterID | Date | DateTime | GoalsID | LastAdvEngine | LastSearchEngineRoot | LastTrafficSource | PageViews | StartURL | VisitDuration | VisitID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | operamini | 1449647949482143849 | 29761725 | 2016-01-01 | 2016-01-01 19:45:49 | [] | ya_undefined | NaN | internal | 1 | https://yandex.ru/support/metrika/troubleshoot... | 0 | 5946027005326809537 |
1 | 0 | opera | 1450991301857057590 | 29761725 | 2016-01-01 | 2016-01-01 17:58:51 | [] | ya_undefined | NaN | internal | 1 | https://yandex.ru/support/metrika/reports/fals... | 16 | 5946000737738979198 |
2 | 0 | chrome | 1451627947145006808 | 29761725 | 2016-01-01 | 2016-01-01 13:44:09 | [] | ya_undefined | NaN | referral | 5 | https://events.yandex.ru/surveys/1705/?iframe=... | 1530 | 5945938122840090407 |
3 | 0 | chrome | 1451627947145006808 | 29761725 | 2016-01-01 | 2016-01-01 16:14:39 | [] | ya_undefined | NaN | referral | 1 | https://yandex.ru/support/metrika/behavior/web... | 15 | 5945975107921245974 |
4 | 1 | chrome | 1451673786489426257 | 29761725 | 2016-01-01 | 2016-01-01 21:44:45 | [] | ya_undefined | NaN | referral | 1 | https://yandex.ru/support/metrika/troubleshoot... | 0 | 5946056234508984012 |
q = '''
SELECT min(Date), max(Date)
FROM demo.visits_all
FORMAT TabSeparatedWithNames
'''
get_clickhouse_df(q)
min(Date) | max(Date) | |
---|---|---|
0 | 2016-01-01 | 2016-12-31 |
Посчитаем общее число визитов и пользователей.
q = '''
SELECT
count() as TotalVisits,
uniq(ClientID) as TotalUsers
FROM demo.visits_all
FORMAT TabSeparatedWithNames
'''
get_clickhouse_df(q)
TotalVisits | TotalUsers | |
---|---|---|
0 | 1240021 | 851044 |
Будем считать retention по неделям, рассматривая пользователей, которые впервые пришли на сервис с 24 октября по 25 ноября.
Для удобства обозначим временной период переменными.
# Cohort window boundaries for the retention calculation.
start_date = datetime.datetime(2016, 10, 24)
end_date = datetime.datetime(2016, 11, 25)
# ISO-formatted strings are substituted into the SQL templates below.
start_date_str = start_date.strftime('%Y-%m-%d')
end_date_str = end_date.strftime('%Y-%m-%d')
Для начала получим всех пользователей, которые впервые появились на сервисе в указанный период.
q = '''
SELECT
ClientID as client_id,
min(toMonday(Date)) as min_date
FROM demo.visits_all
GROUP BY
client_id
HAVING (min_date >= '{start_date}') AND (min_date <= '{end_date}')
LIMIT 5
FORMAT TabSeparatedWithNames
'''.format(
start_date = start_date_str,
end_date = end_date_str
)
get_clickhouse_df(q)
client_id | min_date | |
---|---|---|
0 | 1463659610347692840 | 2016-11-21 |
1 | 1471853493532765800 | 2016-10-24 |
2 | 1470150814903327151 | 2016-10-24 |
3 | 1479994646578665295 | 2016-11-21 |
4 | 1476856735518320455 | 2016-11-21 |
Мы получили соответствие пользователей когортам (неделе, когда они впервые появились на сайте). Теперь, чтобы посчитать retention нам нужно сделать JOIN таблицы с самой собой по ClientID
, чтобы понять, когда возвращались на сайт пользователи каждой из групп.
q = '''
SELECT
client_id,
min_date,
date,
(date - min_date) as days,
days/7 as week_num
FROM
(SELECT
ClientID as client_id,
min(toMonday(Date)) as min_date
FROM demo.visits_all
GROUP BY
client_id
HAVING (min_date >= '{start_date}') AND (min_date <= '{end_date}'))
ALL INNER JOIN
(SELECT DISTINCT
ClientID as client_id,
toMonday(Date) as date
FROM demo.visits_all)
USING client_id
LIMIT 5
FORMAT TabSeparatedWithNames
'''.format(
start_date = start_date_str,
end_date = end_date_str
)
get_clickhouse_df(q)
client_id | min_date | date | days | week_num | |
---|---|---|---|---|---|
0 | 1463659610347692840 | 2016-11-21 | 2016-11-21 | 0 | 0 |
1 | 1463659610347692840 | 2016-11-21 | 2016-11-28 | 7 | 1 |
2 | 1471853493532765800 | 2016-10-24 | 2016-10-24 | 0 | 0 |
3 | 1470150814903327151 | 2016-10-24 | 2016-10-24 | 0 | 0 |
4 | 1479994646578665295 | 2016-11-21 | 2016-11-21 | 0 | 0 |
Осталось только посчитать по когортам, сколько пользователей появлялись в каждую из недель weeks
после первого появления.
q = '''
SELECT
uniq(client_id) as clients,
(date - min_date)/7 as week_num,
min_date
FROM
(SELECT
ClientID as client_id,
min(toMonday(Date)) as min_date
FROM demo.visits_all
GROUP BY
client_id
HAVING (min_date >= '{start_date}') AND (min_date <= '{end_date}'))
ALL INNER JOIN
(SELECT DISTINCT
ClientID as client_id,
toMonday(Date) as date
FROM demo.visits_all)
USING client_id
GROUP BY min_date, week_num
FORMAT TabSeparatedWithNames
'''.format(
start_date = start_date_str,
end_date = end_date_str
)
raw_ret_df = get_clickhouse_df(q)
ret_df = raw_ret_df.pivot_table(index = 'min_date', values = 'clients', columns = 'week_num').fillna(0).T
ret_df_norm = ret_df.apply(lambda x: 100*x/ret_df.loc[0], axis = 1).applymap(lambda x: x if x!=0 else None)
ret_df
min_date | 2016-10-24 | 2016-10-31 | 2016-11-07 | 2016-11-14 | 2016-11-21 |
---|---|---|---|---|---|
week_num | |||||
0 | 13667.0 | 12919.0 | 13951.0 | 14097.0 | 15788.0 |
1 | 956.0 | 1007.0 | 1097.0 | 1068.0 | 1070.0 |
2 | 637.0 | 576.0 | 602.0 | 589.0 | 620.0 |
3 | 436.0 | 414.0 | 492.0 | 481.0 | 451.0 |
4 | 393.0 | 374.0 | 411.0 | 351.0 | 345.0 |
5 | 353.0 | 343.0 | 334.0 | 267.0 | 224.0 |
6 | 276.0 | 249.0 | 245.0 | 155.0 | 0.0 |
7 | 258.0 | 199.0 | 170.0 | 0.0 | 0.0 |
8 | 191.0 | 141.0 | 0.0 | 0.0 | 0.0 |
9 | 139.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Напишем пару функций, которые будем использовать для визуализации результатов. Мы будем использовать open-source библиотеку plot.ly для построения интерактивных графиков.
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
import requests
import StringIO
import pandas as pd
print __version__ # need 1.9.0 or greater
init_notebook_mode(connected = True)
def plotly_df(df, title = ''):
    """Render every column of `df` as a separate line on one interactive plotly chart."""
    traces = [
        go.Scatter(x = df.index, y = df[col], mode = 'lines', name = col)
        for col in df.columns
    ]
    figure = dict(data = traces, layout = dict(title = title))
    # Offline mode: the chart is embedded directly into the notebook output.
    iplot(figure, show_link = False)
# (threshold, RGB) pairs, darkest to lightest: the first threshold the value
# exceeds picks the cell colour of the retention heatmap.
_HEATMAP_COLORS = [
    (90, (229, 0, 20)),
    (80, (231, 25, 43)),
    (70, (234, 51, 67)),
    (60, (236, 76, 90)),
    (50, (239, 102, 114)),
    (40, (242, 137, 127)),
    (30, (244, 153, 161)),
    (20, (247, 178, 184)),
    (10, (249, 204, 208)),
]

def highlight_vals(val, alpha = 0.5):
    """Return a CSS background-color string for a retention-percentage cell.

    Used with DataFrame.style.applymap. Empty cells (None or '') get no
    styling; otherwise the colour darkens with higher retention.

    Parameters:
        val: retention percentage, or None/'' for an empty cell.
        alpha: fill opacity (generalizes the previously hard-coded 0.5).
    """
    if (val is None) or (val == ''):
        return ''
    for threshold, (r, g, b) in _HEATMAP_COLORS:
        if val > threshold:
            return 'background-color: rgba(%d, %d, %d, %f)' % (r, g, b, alpha)
    # Values of 10% and below get the lightest shade.
    return 'background-color: rgba(252, 229, 231, %f)' % alpha
2.0.0
plotly_df(ret_df_norm, title = 'Retention')
ret_df_norm.T.fillna('').style.applymap(highlight_vals)
week_num | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
---|---|---|---|---|---|---|---|---|---|---|
min_date | ||||||||||
2016-10-24 | 100 | 6.99495 | 4.66086 | 3.19017 | 2.87554 | 2.58286 | 2.01946 | 1.88776 | 1.39753 | 1.01705 |
2016-10-31 | 100 | 7.79472 | 4.45855 | 3.20458 | 2.89496 | 2.655 | 1.92739 | 1.54037 | 1.09142 | |
2016-11-07 | 100 | 7.86324 | 4.3151 | 3.52663 | 2.94603 | 2.39409 | 1.75615 | 1.21855 | ||
2016-11-14 | 100 | 7.57608 | 4.17819 | 3.41207 | 2.48989 | 1.89402 | 1.09952 | |||
2016-11-21 | 100 | 6.7773 | 3.92703 | 2.8566 | 2.1852 | 1.4188 |
Основное отличие rolling retention в том, что мы считаем, что пользователь "жив" до момента последнего посещения сайта. Предположим, пользователь пришел на 1й неделе, а затем на 5й. При расчете retention'a мы не будем учитывать этого клиента на второй неделе, поскольку он не появлялся на сайте. При расчете же метрики rolling retention мы будем считать пользователя активным вплоть до 5й недели.
В этом случае нам важно всего лишь посчитать для каждого пользователя его первое и последнее появление на сайте.
q = '''
SELECT
ClientID as client_id,
min(toMonday(Date)) as min_date,
max(toMonday(Date)) as max_date,
(max_date - min_date)/7 as week_num,
range(toUInt64((max_date - min_date)/7) + 1) as weeks_visited
FROM demo.visits_all
GROUP BY
client_id
HAVING (min_date >= '{start_date}') AND (min_date <= '{end_date}') AND length(weeks_visited) > 1
ORDER BY client_id
LIMIT 5
FORMAT TabSeparatedWithNames
'''.format(
start_date = start_date_str,
end_date = end_date_str
)
get_clickhouse_df(q)
client_id | min_date | max_date | week_num | weeks_visited | |
---|---|---|---|---|---|
0 | 1467898659188080 | 2016-11-21 | 2016-12-26 | 5 | [0,1,2,3,4,5] |
1 | 1471841770370806 | 2016-11-07 | 2016-12-19 | 6 | [0,1,2,3,4,5,6] |
2 | 1472213988195344 | 2016-11-14 | 2016-11-21 | 1 | [0,1] |
3 | 1472405582143366 | 2016-11-14 | 2016-12-19 | 5 | [0,1,2,3,4,5] |
4 | 1472533641195152 | 2016-10-24 | 2016-10-31 | 1 | [0,1] |
В clickhouse есть функция arrayJoin()
, которая позволяет развернуть массив, оставив все остальные колонки без изменений. Воспользуемся ей.
Условие length(weeks_visited) > 1
добавлено в запросы исключительно для целей иллюстрации работы arrayJoin
.
q = '''
SELECT
ClientID as client_id,
min(toMonday(Date)) as min_date,
max(toMonday(Date)) as max_date,
range(toUInt64((max_date - min_date)/7) + 1) as weeks_visited,
arrayJoin(weeks_visited) as week_num
FROM demo.visits_all
GROUP BY
client_id
HAVING (min_date >= '{start_date}') AND (min_date <= '{end_date}') AND length(weeks_visited) > 1
ORDER BY client_id, week_num
LIMIT 10
FORMAT TabSeparatedWithNames
'''.format(
start_date = start_date_str,
end_date = end_date_str
)
get_clickhouse_df(q)
client_id | min_date | max_date | weeks_visited | week_num | |
---|---|---|---|---|---|
0 | 1467898659188080 | 2016-11-21 | 2016-12-26 | [0,1,2,3,4,5] | 0 |
1 | 1467898659188080 | 2016-11-21 | 2016-12-26 | [0,1,2,3,4,5] | 1 |
2 | 1467898659188080 | 2016-11-21 | 2016-12-26 | [0,1,2,3,4,5] | 2 |
3 | 1467898659188080 | 2016-11-21 | 2016-12-26 | [0,1,2,3,4,5] | 3 |
4 | 1467898659188080 | 2016-11-21 | 2016-12-26 | [0,1,2,3,4,5] | 4 |
5 | 1467898659188080 | 2016-11-21 | 2016-12-26 | [0,1,2,3,4,5] | 5 |
6 | 1471841770370806 | 2016-11-07 | 2016-12-19 | [0,1,2,3,4,5,6] | 0 |
7 | 1471841770370806 | 2016-11-07 | 2016-12-19 | [0,1,2,3,4,5,6] | 1 |
8 | 1471841770370806 | 2016-11-07 | 2016-12-19 | [0,1,2,3,4,5,6] | 2 |
9 | 1471841770370806 | 2016-11-07 | 2016-12-19 | [0,1,2,3,4,5,6] | 3 |
Осталось только посчитать число пользователей в каждую неделю по когортам.
q = '''
SELECT
uniq(client_id) as clients,
week_num,
min_date
FROM
(SELECT
ClientID as client_id,
min(toMonday(Date)) as min_date,
max(toMonday(Date)) as max_date,
range(toUInt64((max_date - min_date)/7) + 1) as weeks_visited,
arrayJoin(weeks_visited) as week_num
FROM demo.visits_all
GROUP BY
client_id
HAVING (min_date >= '{start_date}') AND (min_date <= '{end_date}')
ORDER BY client_id, week_num)
GROUP BY
week_num,
min_date
FORMAT TabSeparatedWithNames
'''.format(
start_date = start_date_str,
end_date = end_date_str
)
raw_roll_ret_df = get_clickhouse_df(q)
roll_ret_df = raw_roll_ret_df.pivot_table(index = 'min_date',
values = 'clients',
columns = 'week_num').fillna(0).T
roll_ret_df_norm = roll_ret_df.apply(lambda x: 100*x/roll_ret_df.loc[0], axis = 1).applymap(lambda x: x if x!=0 else None)
plotly_df(roll_ret_df_norm, title = 'Rolling retention')
roll_ret_df_norm.T.fillna('').style.applymap(highlight_vals)
week_num | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
---|---|---|---|---|---|---|---|---|---|---|
min_date | ||||||||||
2016-10-24 | 100 | 16.9679 | 12.768 | 10.0461 | 8.18029 | 6.54862 | 4.8072 | 3.35845 | 2.07068 | 1.01705 |
2016-10-31 | 100 | 16.9982 | 11.9359 | 9.26542 | 7.33029 | 5.45708 | 3.73094 | 2.41505 | 1.09142 | |
2016-11-07 | 100 | 16.0419 | 10.9813 | 8.48685 | 6.3723 | 4.34377 | 2.6378 | 1.21855 | ||
2016-11-14 | 100 | 14.4073 | 9.42044 | 6.83833 | 4.40519 | 2.5892 | 1.09952 | |||
2016-11-21 | 100 | 12.6425 | 7.8414 | 5.18115 | 3.15429 | 1.4188 |
Для начала выгрузим визиты, в которых были совершены покупки
q = '''
SELECT
LastTraficSource,
ClientID,
VisitID
FROM visits_all
WHERE (ClientID != 0) AND (length(PurchaseID) != 0)
LIMIT 5
FORMAT TabSeparatedWithNames
'''
get_clickhouse_df(q)
LastTraficSource | ClientID | VisitID | |
---|---|---|---|
0 | direct | 1450095022347120805 | 5940262123258949328 |
1 | organic | 1446488424961943503 | 5940419290323804278 |
2 | referral | 145038088022989772 | 5940420766409453257 |
3 | organic | 1450293631537114559 | 5940402531744356186 |
4 | ad | 1450262460605071255 | 5940275038225886929 |
Для того, чтобы построить пути, т.е. для каждой покупки определить, с какими источниками пользователь появлялся на сайте ранее, нужно сделать JOIN таблицы с самой собой по идентификатору пользователя ClientID
.
q = '''
SELECT
ClientID,
VisitID,
PurchaseTime,
DateTime,
LastTraficSource
FROM
(SELECT
ClientID,
DateTime as PurchaseTime,
VisitID
FROM visits_all
WHERE (ClientID != 0) AND (length(PurchaseID) != 0))
ALL LEFT JOIN
(SELECT
ClientID,
LastTraficSource,
DateTime
FROM visits_all
ORDER BY DateTime)
USING ClientID
WHERE PurchaseTime >= DateTime
LIMIT 5
FORMAT TabSeparatedWithNames
'''
get_clickhouse_df(q)
ClientID | VisitID | PurchaseTime | DateTime | LastTraficSource | |
---|---|---|---|---|---|
0 | 1446494115200148125 | 5935388931880519536 | 2015-12-02 18:19:23 | 2015-12-02 18:19:23 | direct |
1 | 1449082733165609622 | 5935444959877079158 | 2015-12-02 22:07:21 | 2015-12-02 22:07:21 | ad |
2 | 1450095022347120805 | 5940262123258949328 | 2015-12-16 12:46:48 | 2015-12-14 15:19:26 | internal |
3 | 1450095022347120805 | 5940262123258949328 | 2015-12-16 12:46:48 | 2015-12-16 12:46:48 | direct |
4 | 1446488424961943503 | 5940419290323804278 | 2015-12-16 23:27:58 | 2015-12-03 20:53:36 | organic |
Объединим все по VisitID
и с помощью функции groupArray
соберем все источники трафика в массивы
q = '''
SELECT
VisitID,
groupArray(LastTraficSource) as Sources
FROM
(SELECT
ClientID,
DateTime as PurchaseTime,
VisitID
FROM visits_all
WHERE (ClientID != 0) AND (length(PurchaseID) != 0))
ALL LEFT JOIN
(SELECT
ClientID,
LastTraficSource,
DateTime
FROM visits_all
ORDER BY DateTime)
USING ClientID
WHERE PurchaseTime >= DateTime
GROUP BY VisitID
LIMIT 5
FORMAT TabSeparatedWithNames
'''
get_clickhouse_df(q)
VisitID | Sources | |
---|---|---|
0 | 5942036913160910687 | ['ad','ad','ad','ad','ad'] |
1 | 5944887285864662889 | ['ad','internal','ad'] |
2 | 5936327790758690505 | ['ad'] |
3 | 5935674958795425569 | ['social'] |
4 | 5937596701532043135 | ['organic','organic','referral','ad'] |
И, в конце концов, посчитаем для каждого пути количество покупок и построим топ.
q = '''
SELECT
Sources,
sum(NumPurchases) as TotalNumPurchases
FROM
(SELECT
any(NumPurchases) as NumPurchases,
VisitID,
groupArray(LastTraficSource) as Sources
FROM
(SELECT
length(PurchaseID) as NumPurchases,
ClientID,
DateTime as PurchaseTime,
VisitID
FROM visits_all
WHERE (ClientID != 0) AND (length(PurchaseID) != 0))
ALL LEFT JOIN
(SELECT
ClientID,
LastTraficSource,
DateTime
FROM visits_all
ORDER BY DateTime)
USING ClientID
WHERE PurchaseTime >= DateTime
GROUP BY VisitID)
WHERE length(Sources) >= 2
GROUP BY Sources
ORDER BY TotalNumPurchases DESC
LIMIT 5
FORMAT TabSeparatedWithNames
'''
%time get_clickhouse_df(q)
CPU times: user 5.86 ms, sys: 4.4 ms, total: 10.3 ms Wall time: 5.45 s
Sources | TotalNumPurchases | |
---|---|---|
0 | ['ad','ad'] | 2200 |
1 | ['organic','organic'] | 2159 |
2 | ['organic','ad'] | 1701 |
3 | ['direct','direct'] | 989 |
4 | ['ad','ad','ad'] | 903 |
Получившиеся результаты уже достаточно интересны. Но, на самом деле, мы хотим разделять источники на более мелкие группы, а не только на те, которые уже есть в Метрике:
- CPA (UTMMedium равно CPA или cpa)
- CPC (UTMMedium равно cpc)
- Marketplace (UTMMedium равно marketplace или это рекламный трафик с Яндекс.Маркета)
- Retargeting (определяется по DirectConditionType или по наличию параметра ref=ga_ref в URL)
q = '''
SELECT
LastTraficSource,
UTMMedium,
StartURL,
LastAdvEngine,
LastSearchEngine,
if(
lower(UTMMedium) IN ('cpa', 'cpc', 'marketplace'),
lower(UTMMedium),
if(
LastTraficSource = 'ad',
if(
LastAdvEngine = 'market',
'marketplace',
if(
(DirectConditionType = 2) OR (extractURLParameter(StartURL, 'ref') = 'ga_ref'),
'retargeting',
'media_ad')
),
if(
LastTraficSource IN ('direct', 'saved', 'undefined', 'internal'),
'direct',
LastTraficSource
)
)
) as Source,
if(Source = 'organic',
if(LastSearchEngine IN ('yandex', 'google'), concat('organic ', LastSearchEngine), 'organic other'),
if(Source IN ('media_ad', 'retargeting', 'cpa', 'cpc'),
if(
LastAdvEngine IN ('market', 'google_adwords', 'yandex'),
concat(Source, concat(' ', LastAdvEngine)),
concat(Source, ' other')
),
Source
)) as SourceDetailed
FROM visits_all
LIMIT 10
FORMAT TabSeparatedWithNames
'''
get_clickhouse_df(q)
LastTraficSource | UTMMedium | StartURL | LastAdvEngine | LastSearchEngine | Source | SourceDetailed | |
---|---|---|---|---|---|---|---|
0 | ad | cpc | http://220-volt.ru/catalog-156860/?ref=yamar&y... | market | yandex | cpc | cpc market |
1 | organic | NaN | http://m.220-volt.ru/catalog/svetodiodnyj-zana... | NaN | yandex | organic | organic yandex |
2 | ad | cpc | http://220-volt.ru/catalog-210053/?ref=yamar&y... | market | yandex | cpc | cpc market |
3 | direct | NaN | http://ulyanovsk.220-volt.ru/catalog-59739/ | NaN | NaN | direct | direct |
4 | direct | NaN | http://m.220-volt.ru/catalog/2-55/ | NaN | NaN | direct | direct |
5 | organic | NaN | http://omsk.220-volt.ru/catalog/ | NaN | yandex | organic | organic yandex |
6 | ad | NaN | http://m.220-volt.ru/catalog-53213/?ref=ga_ret | google_adwords | NaN | media_ad | media_ad google_adwords |
7 | ad | NaN | http://m.220-volt.ru/catalog-220325/?ref=ga_ret | google_adwords | NaN | media_ad | media_ad google_adwords |
8 | organic | NaN | http://kemerovo.220-volt.ru/catalog/svarochie-... | NaN | yandex | organic | organic yandex |
9 | organic | NaN | http://220-volt.ru/catalog-136605/ | NaN | organic | organic google |
Добавляем в скрипт для расчета путей конверсии определение категорий и получаем пути по новым переопределенным каналам.
q = '''
SELECT
SourcesDetailed,
sum(VisitPurchases) as Purchases
FROM
(SELECT
groupArray(Source) as Sources,
groupArray(SourceDetailed) as SourcesDetailed,
VisitID,
any(ClientID) as ClientID,
any(DateTime) as StartTime,
any(VisitRevenue) as VisitRevenue,
any(VisitPurchases) as VisitPurchases
FROM
(SELECT
ClientID,
DateTime,
VisitRevenue,
VisitPurchases,
Source,
SourceDetailed,
VisitID,
LastSearchEngine,
LastAdvEngine
FROM
(SELECT
ClientID,
DateTime as PurchaseTime,
arraySum(PurchaseRevenue) as VisitRevenue,
length(PurchaseID) as VisitPurchases,
LastTraficSource,
VisitID,
LastSearchEngine,
LastAdvEngine
FROM visits_all
WHERE ClientID != 0 AND VisitPurchases != 0)
ALL LEFT JOIN
(SELECT
ClientID,
DateTime,
if(
lower(UTMMedium) IN ('cpa', 'cpc', 'marketplace'),
lower(UTMMedium),
if(
LastTraficSource = 'ad',
if(
LastAdvEngine = 'market',
'marketplace',
if(
(DirectConditionType = 2) OR (extractURLParameter(StartURL, 'ref') = 'ga_ref'),
'retargeting',
'media_ad')
),
if(
LastTraficSource IN ('direct', 'saved', 'undefined', 'internal'),
'direct',
LastTraficSource
)
)
) as Source,
if(Source = 'organic',
if(LastSearchEngine IN ('yandex', 'google'), concat('organic ', LastSearchEngine), 'organic other'),
if(Source IN ('media_ad', 'retargeting', 'cpa', 'cpc'),
if(
LastAdvEngine IN ('market', 'google_adwords', 'yandex'),
concat(Source, concat(' ', LastAdvEngine)),
concat(Source, ' other')
),
Source
)) as SourceDetailed
FROM visits_all
ORDER BY DateTime)
USING ClientID
WHERE PurchaseTime >= DateTime)
GROUP BY VisitID)
GROUP BY SourcesDetailed
ORDER BY Purchases DESC
LIMIT 10
FORMAT TabSeparatedWithNames
'''
%time get_clickhouse_df(q)
CPU times: user 12.8 ms, sys: 24.6 ms, total: 37.4 ms Wall time: 13.9 s
SourcesDetailed | Purchases | |
---|---|---|
0 | ['organic yandex'] | 8532 |
1 | ['direct'] | 4707 |
2 | ['organic google'] | 3989 |
3 | ['cpc market'] | 3690 |
4 | ['cpa other'] | 2249 |
5 | ['cpc yandex'] | 1966 |
6 | ['cpc other'] | 1531 |
7 | ['organic yandex','organic yandex'] | 1321 |
8 | ['direct','direct'] | 1164 |
9 | ['email'] | 1044 |
q = '''
SELECT
count() as Visits,
sum(arraySum(PurchaseRevenue)) as Revenue,
sum(length(PurchaseID)) as Purchases,
if(
lower(UTMMedium) IN ('cpa', 'cpc', 'marketplace'),
lower(UTMMedium),
if(
LastTraficSource = 'ad',
if(
LastAdvEngine = 'market',
'marketplace',
if(
(DirectConditionType = 2) OR (extractURLParameter(StartURL, 'ref') = 'ga_ref'),
'retargeting',
'media_ad')
),
if(
LastTraficSource IN ('direct', 'saved', 'undefined', 'internal'),
'direct',
LastTraficSource
)
)
) as Source,
if(Source = 'organic',
if(LastSearchEngine IN ('yandex', 'google'), concat('organic ', LastSearchEngine), 'organic other'),
if(Source IN ('media_ad', 'retargeting', 'cpa', 'cpc'),
if(
LastAdvEngine IN ('market', 'google_adwords', 'yandex'),
concat(Source, concat(' ', LastAdvEngine)),
concat(Source, ' other')
),
Source
)) as SourceDetailed
FROM visits_all
WHERE ClientID != 0
GROUP BY Source, SourceDetailed
FORMAT TabSeparatedWithNames
'''
%time last_raw_df = get_clickhouse_df(q).groupby(['Source', 'SourceDetailed']).sum()
CPU times: user 14.8 ms, sys: 8.91 ms, total: 23.7 ms Wall time: 4.93 s
Note: Значение Revenue
были изменены
last_raw_df
Visits | Revenue | Purchases | ||
---|---|---|---|---|
Source | SourceDetailed | |||
cpa | cpa google_adwords | 8144 | 19420163 | 179 |
cpa market | 299 | 1636920 | 9 | |
cpa other | 324588 | 533080271 | 5445 | |
cpa yandex | 4565 | 6916381 | 66 | |
cpc | cpc google_adwords | 2152 | 13963992 | 81 |
cpc market | 343667 | 987963079 | 7947 | |
cpc other | 190393 | 400087849 | 3279 | |
cpc yandex | 670374 | 812646786 | 5528 | |
direct | direct | 882305 | 2005777498 | 15712 |
130686 | 252044411 | 2526 | ||
marketplace | marketplace | 5033 | 13124252 | 192 |
media_ad | media_ad google_adwords | 360733 | 301655890 | 2685 |
media_ad other | 5880 | 4600901 | 60 | |
media_ad yandex | 71485 | 40893931 | 461 | |
organic | organic google | 601434 | 569959890 | 5657 |
organic other | 56712 | 97028874 | 728 | |
organic yandex | 1214740 | 1466552889 | 13453 | |
referral | referral | 68278 | 118097982 | 938 |
retargeting | retargeting yandex | 431 | 870687 | 5 |
social | social | 73483 | 34991340 | 334 |
last_raw_df_norm = last_raw_df.apply(lambda x: 100.*x/last_raw_df.sum(), axis = 1)
last_raw_df_norm
Visits | Revenue | Purchases | ||
---|---|---|---|---|
Source | SourceDetailed | |||
cpa | cpa google_adwords | 0.162380 | 0.252823 | 0.274182 |
cpa market | 0.005962 | 0.021310 | 0.013786 | |
cpa other | 6.471850 | 6.939962 | 8.340354 | |
cpa yandex | 0.091020 | 0.090042 | 0.101095 | |
cpc | cpc google_adwords | 0.042908 | 0.181792 | 0.124071 |
cpc market | 6.852260 | 12.861902 | 12.172781 | |
cpc other | 3.796181 | 5.208586 | 5.022593 | |
cpc yandex | 13.366360 | 10.579528 | 8.467489 | |
direct | direct | 17.591980 | 26.112427 | 24.066784 |
2.605704 | 3.281267 | 3.869189 | ||
marketplace | marketplace | 0.100351 | 0.170859 | 0.294095 |
media_ad | media_ad google_adwords | 7.192533 | 3.927139 | 4.112736 |
media_ad other | 0.117239 | 0.059897 | 0.091905 | |
media_ad yandex | 1.425315 | 0.532382 | 0.706135 | |
organic | organic google | 11.991788 | 7.420083 | 8.665084 |
organic other | 1.130761 | 1.263181 | 1.115111 | |
organic yandex | 24.220289 | 19.092474 | 20.606571 | |
referral | referral | 1.361372 | 1.537471 | 1.436777 |
retargeting | retargeting yandex | 0.008594 | 0.011335 | 0.007659 |
social | social | 1.465153 | 0.455538 | 0.511603 |
# Base attribution weight per (redefined) traffic source: paid channels get
# the largest credit, direct visits the smallest. Consumed by transform()
# inside the positional attribution query.
traffic_sources_coeffs = {
    'direct': 0.1,
    'referral': 0.2,
    'social': 0.3,
    'cpa': 0.5,
    'cpc': 0.5,
    'retargeting': 0.3,
    'organic': 0.4,
    'email': 0.4,
    'media_ad': 0.5,
    'marketplace': 0.5
}
def get_attribution_data_pos_sample(i, N, traffic_sources_coeffs, where_condition = ''):
    """Weighted, position-aware (time-decay) attribution over one sample slice.

    Processes the i-th of N disjoint SAMPLE/OFFSET slices of visits_all so the
    heavy self-JOIN fits in memory; the caller (get_attribution_data_pos) sums
    the per-slice results.

    The query builds, for every purchase visit, the list of the same ClientID's
    prior visits (the conversion path), then:
      * maps each path source to a base weight from `traffic_sources_coeffs`
        via transform() (unknown sources get weight 0);
      * multiplies it by a positional coefficient 1/(path_length - position),
        so touches closer to the purchase weigh more;
      * normalizes the coefficients within each path to sum to 1 and uses them
        to split the visit's revenue/purchases across sources via ARRAY JOIN.

    Parameters:
        i: zero-based index of the sample slice.
        N: total number of slices.
        traffic_sources_coeffs: dict mapping source name -> base weight.
        where_condition: extra WHERE clause for the sources subquery,
            e.g. 'WHERE IsBounce = 0' (empty string for no filter).

    Returns a DataFrame with Revenue, Purchases and (weighted) Visits per
    (Source, SourceDetailed).
    """
    q = '''
SELECT
sum(VisitRevenue*SourceCoefNorm) as Revenue,
sum(VisitPurchases*SourceCoefNorm) as Purchases,
sum(SourceCoefNorm) as Visits,
Source,
SourceDetailed
FROM
(SELECT
groupArray(Source) as Sources,
groupArray(SourceDetailed) as SourcesDetailed,
VisitID,
any(ClientID) as ClientID,
any(DateTime) as StartTime,
any(VisitRevenue) as VisitRevenue,
any(VisitPurchases) as VisitPurchases,
arrayMap(x -> transform(x, [{traffic_sources}], [{traffic_sources_coeffs}], 0), Sources) as SourcesRawCoefs,
arrayMap(x -> 1/(length(SourcesRawCoefs) - x), range(length(SourcesRawCoefs))) as PositionCoefs,
arrayMap(x, y -> x * y, SourcesRawCoefs, PositionCoefs) as SourcesCoefs,
arraySum(SourcesCoefs) as SourcesCoefsSum,
arrayMap(x -> x/SourcesCoefsSum, SourcesCoefs) as SourcesCoefsNorm
FROM
(SELECT
ClientID,
DateTime,
VisitRevenue,
VisitPurchases,
SourceTime,
Source,
SourceDetailed,
VisitID,
LastSearchEngine,
LastAdvEngine
FROM
(SELECT
ClientID,
DateTime,
arraySum(PurchaseRevenue) as VisitRevenue,
length(PurchaseID) as VisitPurchases,
LastTraficSource,
VisitID,
LastSearchEngine,
LastAdvEngine
FROM visits_all SAMPLE 1/{N} OFFSET {i}/{N}
WHERE ClientID != 0)
ALL LEFT JOIN
(SELECT
ClientID,
DateTime as SourceTime,
if(
lower(UTMMedium) IN ('cpa', 'cpc', 'marketplace'),
lower(UTMMedium),
if(
LastTraficSource = 'ad',
if(
LastAdvEngine = 'market',
'marketplace',
if(
(DirectConditionType = 2) OR (extractURLParameter(StartURL, 'ref') = 'ga_ref'),
'retargeting',
'media_ad')
),
if(
LastTraficSource IN ('direct', 'saved', 'undefined', 'internal'),
'direct',
LastTraficSource
)
)
) as Source,
if(Source = 'organic',
if(LastSearchEngine IN ('yandex', 'google'), concat('organic ', LastSearchEngine), 'organic other'),
if(Source IN ('media_ad', 'retargeting', 'cpa', 'cpc'),
if(
LastAdvEngine IN ('market', 'google_adwords', 'yandex'),
concat(Source, concat(' ', LastAdvEngine)),
concat(Source, ' other')
),
Source
)) as SourceDetailed
FROM visits_all SAMPLE 1/{N} OFFSET {i}/{N} {where_condition})
USING ClientID
WHERE SourceTime <= DateTime
ORDER BY SourceTime)
GROUP BY VisitID)
ARRAY JOIN
SourcesCoefsNorm as SourceCoefNorm,
Sources as Source,
SourcesDetailed as SourceDetailed
GROUP BY Source, SourceDetailed
ORDER BY Visits DESC
FORMAT TabSeparatedWithNames
'''.format(
        # NOTE(review): .keys() and .values() are relied on to iterate in
        # matching order - true for a single dict instance, but fragile if
        # the dict is ever rebuilt between the two calls; verify.
        traffic_sources = ', '.join(map(lambda x: "'%s'" % x, traffic_sources_coeffs.keys())),
        traffic_sources_coeffs = ', '.join(map(str, traffic_sources_coeffs.values())),
        i = i, N = N,
        where_condition = where_condition
    )
    # print q
    return get_clickhouse_df(q)
def get_attribution_data_pos(traffic_sources_coeffs, where_condition=''):
    """Run the positional attribution over all 10 sample slices and sum the results."""
    slice_frames = [
        get_attribution_data_pos_sample(slice_idx, 10, traffic_sources_coeffs, where_condition)
        for slice_idx in range(10)
    ]
    combined = pd.concat(slice_frames)
    return combined.groupby(['Source', 'SourceDetailed']).sum()
%time not_bounce_pos_df = get_attribution_data_pos(traffic_sources_coeffs, where_condition = 'WHERE IsBounce = 0')
CPU times: user 104 ms, sys: 58.8 ms, total: 163 ms Wall time: 45.7 s
not_bounce_pos_df_norm = not_bounce_pos_df.apply(lambda x: 100.*x/not_bounce_pos_df.sum(), axis = 1)
Напишем базовую функцию для расчета модели атрибуции:
def get_attribution_data_sample(i, N, where_condition = ''):
    """Behaviour-based attribution over one sample slice of visits_all.

    Like get_attribution_data_pos_sample, but instead of source/position
    weights each visit on a conversion path gets the coefficient
    1 + 0.5*HasCart + 0.1*HasImpression, i.e. extra credit when the visit
    reached the cart goal (GoalsID contains 552829) or showed product
    impressions (non-empty ImpressionsProductID). Coefficients are normalized
    within the path and distribute the purchase's revenue/purchases across
    sources via ARRAY JOIN.

    Parameters:
        i: zero-based index of the sample slice.
        N: total number of slices.
        where_condition: extra WHERE clause for the sources subquery,
            e.g. 'WHERE IsBounce = 0' (empty string for no filter).

    Returns a DataFrame with Revenue, Purchases and (weighted) Visits per
    (Source, SourceDetailed).
    """
    q = '''
SELECT
sum(VisitRevenue*SourceCoefNorm) as Revenue,
sum(VisitPurchases*SourceCoefNorm) as Purchases,
sum(SourceCoefNorm) as Visits,
Source,
SourceDetailed
FROM
(SELECT
groupArray(Source) as Sources,
groupArray(SourceDetailed) as SourcesDetailed,
groupArray(HasCart) as HasCarts,
groupArray(HasImpression) as HasImpressions,
VisitID,
any(ClientID) as ClientID,
any(DateTime) as StartTime,
any(VisitRevenue) as VisitRevenue,
any(VisitPurchases) as VisitPurchases,
arrayMap(x, y -> 1 + 0.5*x + 0.1*y, HasCarts, HasImpressions) as SourcesCoefs,
arraySum(SourcesCoefs) as SourcesCoefsSum,
arrayMap(x -> x/SourcesCoefsSum, SourcesCoefs) as SourcesCoefsNorm
FROM
(SELECT
ClientID,
DateTime,
VisitRevenue,
VisitPurchases,
SourceTime,
Source,
HasCart,
HasImpression,
SourceDetailed,
VisitID,
LastSearchEngine,
LastAdvEngine
FROM
(SELECT
ClientID,
DateTime,
arraySum(PurchaseRevenue) as VisitRevenue,
length(PurchaseID) as VisitPurchases,
LastTraficSource,
VisitID,
LastSearchEngine,
LastAdvEngine
FROM visits_all SAMPLE 1/{N} OFFSET {i}/{N}
WHERE ClientID != 0)
ALL LEFT JOIN
(SELECT
ClientID,
DateTime as SourceTime,
if(
lower(UTMMedium) IN ('cpa', 'cpc', 'marketplace'),
lower(UTMMedium),
if(
LastTraficSource = 'ad',
if(
LastAdvEngine = 'market',
'marketplace',
if(
(DirectConditionType = 2) OR (extractURLParameter(StartURL, 'ref') = 'ga_ref'),
'retargeting',
'media_ad')
),
if(
LastTraficSource IN ('direct', 'saved', 'undefined', 'internal'),
'direct',
LastTraficSource
)
)
) as Source,
if(Source = 'organic',
if(LastSearchEngine IN ('yandex', 'google'), concat('organic ', LastSearchEngine), 'organic other'),
if(Source IN ('media_ad', 'retargeting', 'cpa', 'cpc'),
if(
LastAdvEngine IN ('market', 'google_adwords', 'yandex'),
concat(Source, concat(' ', LastAdvEngine)),
concat(Source, ' other')
),
Source
)) as SourceDetailed,
has(GoalsID, 552829) as HasCart,
length(ImpressionsProductID) > 0 as HasImpression
FROM visits_all SAMPLE 1/{N} OFFSET {i}/{N} {where_condition}
)
USING ClientID
WHERE SourceTime <= DateTime
ORDER BY SourceTime)
GROUP BY VisitID)
ARRAY JOIN
SourcesCoefsNorm as SourceCoefNorm,
Sources as Source,
SourcesDetailed as SourceDetailed
GROUP BY Source, SourceDetailed
ORDER BY Visits DESC
FORMAT TabSeparatedWithNames
'''.format(
        i = i, N = N,
        where_condition = where_condition
    )
    # print q
    return get_clickhouse_df(q)
def get_attribution_data(where_condition=''):
    """Run the behaviour-based attribution over all 10 sample slices and sum the results."""
    slice_frames = [
        get_attribution_data_sample(slice_idx, 10, where_condition)
        for slice_idx in range(10)
    ]
    combined = pd.concat(slice_frames)
    return combined.groupby(['Source', 'SourceDetailed']).sum()
%time not_bounce_df = get_attribution_data(where_condition = 'WHERE IsBounce = 0')
CPU times: user 105 ms, sys: 46.1 ms, total: 151 ms Wall time: 1min 1s
not_bounce_df_norm = not_bounce_df.apply(lambda x: 100.*x/not_bounce_df.sum(), axis = 1)
Напишем еще одну функцию для построения bar-chart'ов.
def show_bar_plot(cmp_df):
    """Draw one bar series per column of `cmp_df`, grouped along the index."""
    bars = [
        go.Bar(x = cmp_df.index.values, y = cmp_df[col].values, name = col)
        for col in cmp_df.columns
    ]
    # Tilt the x labels so long source names stay readable.
    layout = go.Layout({'xaxis': {'tickangle': 45}})
    figure = go.Figure(data = bars, layout = layout)
    iplot(figure, show_link=False)
# Attribution-model results to compare, keyed by display name.
dct = {
    'last click': last_raw_df_norm,
    'behaviour': not_bounce_df_norm,
    'time decay & weights': not_bounce_pos_df_norm,
}
def get_comparison(parameter, detalization, show_delta = False):
    """Build a side-by-side comparison of the attribution models for one metric.

    Parameters:
        parameter: metric column to aggregate ('Revenue', 'Purchases', 'Visits').
        detalization: grouping level, e.g. 'Source' or 'SourceDetailed'.
        show_delta: when True, return per-model differences relative to the
            'last click' baseline instead of absolute values.
    """
    comparison = pd.DataFrame()
    for model_name, model_df in dct.items():
        comparison[model_name] = model_df.reset_index().groupby(detalization)[parameter].sum()
    comparison = comparison.sort_values('last click', ascending = False)
    if show_delta:
        delta = comparison.apply(lambda col: col - comparison['last click'])
        return delta.drop('last click', axis = 1).sort_values('behaviour')
    return comparison
show_bar_plot(get_comparison('Revenue', 'Source', show_delta=False))
show_bar_plot(get_comparison('Revenue', 'Source', show_delta=True))