# Evaluate `expr` with messages, warnings and package start-up messages muted;
# the value of `expr` is returned unchanged.
shhh <- function(expr) {
  suppressPackageStartupMessages(
    suppressWarnings(
      suppressMessages(expr)
    )
  )
}
shhh({
library(tidyverse);
library(lubridate);
library(scales);
library(magrittr);
library(dplyr)
})
Instrumentation Notes:
event.context = 'languages-list'
# Hive query over event.universallanguageselector for events whose
# event.context is 'languages-list', restricted to year 2021 and to
# 27-30 April (month 4, day > 26) or any day in May.
# Events are counted (n_events) per date, wiki, session id, edit bucket,
# time to change language, anonymity flag and the three language fields.
query <-
"
SELECT
TO_DATE(dt) AS `date`,
wiki,
event.web_session_id,
event.usereditbucket,
event.timetochangelanguage,
event.isanon,
event.interfacelanguage,
event.contentlanguage,
event.selectedinterfacelanguage,
Count(*) AS n_events
FROM event.universallanguageselector
WHERE
year = 2021
AND ((Month = 04 AND DAY > 26) OR (MONTH = 05))
AND event.context = 'languages-list'
GROUP BY
TO_DATE(dt),
wiki,
event.web_session_id,
event.usereditbucket,
event.timetochangelanguage,
event.isanon,
event.interfacelanguage,
event.contentlanguage,
event.selectedinterfacelanguage
"
# Run the query through wmfdata and keep the result for the
# language-link analyses below.
lang_link_events <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Convert the date column returned by the query to Date.
lang_link_events[["date"]] <- as.Date(lang_link_events[["date"]])
# Daily totals: number of events and number of distinct sessions per date.
lang_link_events_daily <- lang_link_events %>%
  group_by(date) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
lang_link_events_daily
`summarise()` ungrouping output (override with `.groups` argument)
date | n_events | n_sessions |
---|---|---|
<date> | <int> | <int> |
2021-04-28 | 791 | 636 |
2021-04-29 | 46934 | 33173 |
2021-04-30 | 274519 | 177001 |
2021-05-01 | 243709 | 151370 |
2021-05-02 | 258630 | 161650 |
2021-05-03 | 303583 | 195465 |
2021-05-04 | 300642 | 194931 |
2021-05-05 | 295520 | 191876 |
2021-05-06 | 291788 | 190268 |
2021-05-07 | 265159 | 171239 |
2021-05-08 | 225456 | 141143 |
2021-05-09 | 244182 | 152870 |
2021-05-10 | 301699 | 196353 |
2021-05-11 | 296753 | 193206 |
2021-05-12 | 278392 | 181382 |
2021-05-13 | 54123 | 37258 |
We started recording events on 28 April 2021. There is an average of 176,827 sessions per day, including sessions by both logged-in and logged-out users. No unexpected spikes or drops so far.
Check whether session IDs are duplicated across rows. Some sessions should have more than one click event.
# TRUE only when every row carries a different session id.
n_distinct(lang_link_events$web_session_id) == nrow(lang_link_events)
# Click statistics per session, split by logged-in status.
# The query returns one row per combination of date, wiki, session, edit
# bucket, time to change language and languages, so one session's clicks can
# be spread over several rows. Clicks are summed per session first so that
# the statistics describe sessions rather than query rows.
lang_link_events_persession <- lang_link_events %>%
  group_by(isanon, web_session_id) %>%
  summarise(session_clicks = sum(n_events), .groups = "drop") %>%
  group_by(isanon) %>%
  summarise(
    avg_clicks = mean(session_clicks),
    max_clicks = max(session_clicks),
    min_clicks = min(session_clicks),
    .groups = "drop"
  )
lang_link_events_persession
`summarise()` ungrouping output (override with `.groups` argument)
isanon | avg_clicks | max_clicks | min_clciks |
---|---|---|---|
<chr> | <dbl> | <int> | <int> |
false | 1.000327 | 2 | 1 |
true | 1.000169 | 4 | 1 |
# Events and distinct sessions, split by the anonymity flag.
lang_link_events_isanon <- lang_link_events %>%
  group_by(isanon) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
lang_link_events_isanon
`summarise()` ungrouping output (override with `.groups` argument)
isanon | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
false | 113161 | 46278 |
true | 3568719 | 2058676 |
97.8% of all sessions with clicks to the language links are by logged out users. That's high but expected because instrumentation was limited to legacy sidebar in modern Vector (not from legacy or other skins such as timeless). The new language switching functionality was made available to all logged-in users opted into the latest version of the Vector skin.
Legacy sidebar in modern Vector would mostly appear to logged-out users on test wikis where Vector is deployed as default.
# Language-link events and distinct sessions per wiki, restricted to the
# test wikis listed below.
lang_link_events_testwiki <- lang_link_events %>%
  filter(
    wiki %in% c(
      'frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
      'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki',
      'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki'
    )
  ) %>%
  group_by(wiki) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
lang_link_events_testwiki
`summarise()` ungrouping output (override with `.groups` argument)
wiki | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
bnwiki | 8451 | 5726 |
dewikivoyage | 755 | 604 |
euwiki | 31609 | 22351 |
fawiki | 200781 | 110092 |
frwiki | 1933743 | 1091442 |
frwiktionary | 21307 | 13350 |
hewiki | 173526 | 100165 |
kowiki | 146461 | 81522 |
ptwiki | 555228 | 328675 |
ptwikiversity | 15 | 13 |
srwiki | 98359 | 56521 |
trwiki | 256347 | 151107 |
vecwiki | 1348 | 1133 |
# Language-link events and distinct sessions by test-wiki category
# (test_wiki / non_test_wiki) and anonymity flag.
lang_link_events_testwiki_isanon <- lang_link_events %>%
  mutate(
    istestwiki = if_else(
      wiki %in% c(
        'frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
        'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki',
        'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki'
      ),
      'test_wiki',
      'non_test_wiki'
    )
  ) %>%
  group_by(istestwiki, isanon) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
lang_link_events_testwiki_isanon
`summarise()` regrouping output by 'istestwiki' (override with `.groups` argument)
istestwiki | isanon | n_events | n_sessions |
---|---|---|---|
<chr> | <chr> | <int> | <int> |
non_test_wiki | false | 3540 | 1422 |
non_test_wiki | true | 147 | 80 |
test_wiki | false | 102371 | 41836 |
test_wiki | true | 3325559 | 1922334 |
Almost all of the events recorded to date (99%) have been on test wikis. This is expected as the new language switcher button was deployed to all users opt'd in to the modern vector on all non test wikis. Users with modern vector on the test wikis have not been shown the new language switcher and still shown the language links in the sidebar.
On non test wikis, the majority (94.67%) of sessions with clicks to the language list on modern vector come from logged-in users. Need to confirm if it's possible to have language link in sidebar if you are logged-in, on modern vector and on a non test wiki.
# Edit-bucket breakdown for rows flagged isanon == 'false'.
logged_in_editcount <- lang_link_events %>%
  filter(isanon == 'false') %>%
  group_by(usereditbucket) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
logged_in_editcount
`summarise()` ungrouping output (override with `.groups` argument)
usereditbucket | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
0 edits | 20092 | 10961 |
1-4 edits | 11510 | 5643 |
100-999 edits | 17264 | 6533 |
1000+ edits | 31482 | 8932 |
5-99 edits | 25556 | 11332 |
NULL | 7 | 5 |
# Edit-bucket breakdown for rows flagged isanon == 'true'.
logged_out_editcount <- lang_link_events %>%
  filter(isanon == 'true') %>%
  group_by(usereditbucket) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
logged_out_editcount
`summarise()` ungrouping output (override with `.groups` argument)
usereditbucket | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
5-99 edits | 1 | 1 |
NULL | 3325705 | 1922412 |
There are just a few instances (under 0.01%) of the event.usereditbucket field being populated for logged out users and recorded as NULL for logged-in users. Further investigation might be needed; however, the numbers of these events is not high enough to skew the data.
# test that you can switch from one language to the next.
# Distinct sessions per selected interface language, and their share of all
# distinct sessions in the data, most frequent first.
top_final_languages <- lang_link_events %>%
  mutate(all_sessions = n_distinct(web_session_id)) %>%
  group_by(selectedinterfacelanguage) %>%
  summarise(
    n_sessions = n_distinct(web_session_id),
    # all_sessions is constant across rows; taking one value keeps the
    # summary at a single row per group, so no distinct() is needed.
    pct_sessions = n_sessions / first(all_sessions),
    .groups = "drop"
  ) %>%
  arrange(desc(n_sessions))
head(top_final_languages)
`summarise()` regrouping output by 'selectedinterfacelanguage' (override with `.groups` argument)
selectedinterfacelanguage | n_sessions | pct_sessions |
---|---|---|
<chr> | <int> | <dbl> |
en | 1443105 | 0.73470332 |
es | 128358 | 0.06534871 |
de | 125061 | 0.06367016 |
it | 65010 | 0.03309743 |
ar | 64408 | 0.03279094 |
ru | 52428 | 0.02669177 |
The most frequent language switches are to english (73% of sessions) followed by spanish (6.5%), and german (6.3%).
# Distinct sessions per (interface language, content language) pair, and
# their share of all distinct sessions in the data, most frequent first.
top_initial_languages <- lang_link_events %>%
  mutate(all_sessions = n_distinct(web_session_id)) %>%
  group_by(interfacelanguage, contentlanguage) %>%
  summarise(
    n_sessions = n_distinct(web_session_id),
    # all_sessions is constant across rows; taking one value keeps the
    # summary at a single row per group, so no distinct() is needed.
    pct_sessions = n_sessions / first(all_sessions),
    .groups = "drop"
  ) %>%
  arrange(desc(n_sessions))
head(top_initial_languages)
`summarise()` regrouping output by 'interfacelanguage', 'contentlanguage' (override with `.groups` argument)
interfacelanguage | contentlanguage | n_sessions | pct_sessions |
---|---|---|---|
<chr> | <chr> | <int> | <dbl> |
fr | fr | 1103742 | 0.56192925 |
pt | pt | 328012 | 0.16699513 |
tr | tr | 150977 | 0.07686433 |
fa | fa | 109989 | 0.05599681 |
he | he | 100125 | 0.05097493 |
ko | ko | 81429 | 0.04145655 |
The interfacelanguage and contentlanguage will usually be the same and should match for most instances, which is confirmed here.
The top initial languages are all from test wikis, which is expected since the language links are still shown to all logged-in and logged-out users on modern vector. The new language switcher, which replaces the lang links with a button, is shown to all logged-in users opt'd into modern vector on non-test wikis.
## Most Frequent Switch Types
# Distinct sessions per (initial interface language, selected language)
# pair, and their share of all distinct sessions, most frequent first.
# NOTE: this reassigns top_final_languages.
top_final_languages <- lang_link_events %>%
  mutate(all_sessions = n_distinct(web_session_id)) %>%
  group_by(interfacelanguage, selectedinterfacelanguage) %>%
  summarise(
    n_sessions = n_distinct(web_session_id),
    # all_sessions is constant across rows; taking one value keeps the
    # summary at a single row per group, so no distinct() is needed.
    pct_sessions = n_sessions / first(all_sessions),
    .groups = "drop"
  ) %>%
  arrange(desc(n_sessions))
head(top_final_languages)
`summarise()` regrouping output by 'interfacelanguage', 'selectedinterfacelanguage' (override with `.groups` argument)
interfacelanguage | selectedinterfacelanguage | n_sessions | pct_sessions |
---|---|---|---|
<chr> | <chr> | <int> | <dbl> |
fr | en | 775769 | 0.39495398 |
pt | en | 264373 | 0.13459570 |
tr | en | 118824 | 0.06049483 |
fa | en | 95622 | 0.04868239 |
fr | de | 94915 | 0.04832245 |
he | en | 84668 | 0.04310557 |
39% of all sessions are switches from French to English.
event.action="compact-language-links-open"
# Hive query over event.universallanguageselector for events whose
# event.action is 'compact-language-links-open', restricted to year 2021,
# month >= 5 and non-bot user agents.
# Events are counted (n_events) per date, wiki, session id, edit bucket,
# skin, skin version, time to change language, anonymity flag, context and
# the three language fields.
query <-
"
SELECT
TO_DATE(dt) AS `date`,
wiki,
event.web_session_id,
event.usereditbucket,
event.skin,
event.skinVersion,
event.timetochangelanguage,
event.isanon,
event.context,
event.interfacelanguage,
event.contentlanguage,
event.selectedinterfacelanguage,
Count(*) AS n_events
FROM event.universallanguageselector
WHERE
year = 2021
AND month >= 05
AND event.action = 'compact-language-links-open'
AND useragent.is_bot = false
GROUP BY
TO_DATE(dt),
wiki,
event.web_session_id,
event.usereditbucket,
event.skin,
event.skinVersion,
event.timetochangelanguage,
event.isanon,
event.context,
event.interfacelanguage,
event.contentlanguage,
event.selectedinterfacelanguage
"
# Run the query through wmfdata and keep the result for the
# language-button analyses below.
lang_button_events <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Daily totals for button-open events: events and distinct sessions per date.
lang_button_events_daily <- lang_button_events %>%
  group_by(date) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
lang_button_events_daily
`summarise()` ungrouping output (override with `.groups` argument)
date | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
2021-05-12 | 12 | 10 |
2021-05-13 | 194 | 159 |
2021-05-14 | 1317 | 1147 |
2021-05-15 | 1159 | 928 |
2021-05-16 | 1358 | 1042 |
2021-05-17 | 32668 | 28541 |
2021-05-18 | 90811 | 78945 |
2021-05-19 | 89299 | 77108 |
2021-05-20 | 87644 | 75820 |
2021-05-21 | 81498 | 67973 |
2021-05-22 | 67460 | 55270 |
2021-05-23 | 76222 | 62416 |
2021-05-24 | 92724 | 76384 |
2021-05-25 | 92351 | 77546 |
2021-05-26 | 89947 | 75196 |
2021-05-27 | 87440 | 73193 |
2021-05-28 | 78177 | 65185 |
2021-05-29 | 65494 | 53538 |
2021-05-30 | 72238 | 59545 |
2021-05-31 | 87384 | 72570 |
2021-06-01 | 86754 | 72234 |
2021-06-02 | 85835 | 71774 |
2021-06-03 | 81864 | 68127 |
2021-06-04 | 74580 | 61948 |
2021-06-05 | 63824 | 52160 |
2021-06-06 | 71082 | 58131 |
2021-06-07 | 85287 | 71501 |
2021-06-08 | 84667 | 70399 |
2021-06-09 | 83472 | 69111 |
2021-06-10 | 80794 | 67411 |
2021-06-11 | 72647 | 60353 |
2021-06-12 | 60176 | 49825 |
2021-06-13 | 66600 | 54534 |
2021-06-14 | 80895 | 67718 |
2021-06-15 | 79705 | 67001 |
2021-06-16 | 78153 | 64678 |
2021-06-17 | 76308 | 63395 |
2021-06-18 | 70039 | 57090 |
2021-06-19 | 57473 | 46714 |
2021-06-20 | 62298 | 51153 |
2021-06-21 | 76513 | 63711 |
2021-06-22 | 103507 | 80471 |
2021-06-23 | 145211 | 103694 |
2021-06-24 | 151335 | 106808 |
2021-06-25 | 142435 | 98600 |
2021-06-26 | 128018 | 85548 |
2021-06-27 | 145148 | 96486 |
2021-06-28 | 180394 | 123458 |
2021-06-29 | 63085 | 45605 |
# Events and distinct sessions by skin version and context, from the date
# the new instrumentation was added (2021-06-08) onwards.
lang_button_events_bycontext <- lang_button_events %>%
  filter(date >= '2021-06-08') %>%
  group_by(skinversion, context) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
lang_button_events_bycontext
`summarise()` regrouping output by 'skinversion' (override with `.groups` argument)
skinversion | context | n_events | n_sessions |
---|---|---|---|
<chr> | <chr> | <int> | <int> |
latest | header | 595196 | 326328 |
latest | other | 94351 | 77145 |
legacy | other | 1362565 | 1093551 |
NULL | NULL | 36761 | 31132 |
Confirmed we are only recording new button clicks on the latest vector.
# TRUE only when every row carries a different session id.
n_distinct(lang_button_events$web_session_id) == nrow(lang_button_events)
# Events and distinct sessions per wiki, largest session count first.
lang_button_events_bywiki <- lang_button_events %>%
  group_by(wiki) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  ) %>%
  arrange(desc(n_sessions))
head(lang_button_events_bywiki)
`summarise()` ungrouping output (override with `.groups` argument)
wiki | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
enwiki | 1719595 | 1341346 |
frwiki | 455616 | 280201 |
ruwiki | 230135 | 177510 |
dewiki | 208276 | 173340 |
eswiki | 154449 | 126942 |
ptwiki | 145856 | 91869 |
Almost half (49.7%) of sessions with a click to the language button were recorded on English Wikipedia, followed by Russian, German and Spanish Wikipedia. Note: The language switcher button was not available to logged-in users on test wikis until 22 June 2021. AB Test deployment on Fawiki was delayed until 28 June 2021.
# Button-open events and distinct sessions per wiki, restricted to the test
# wikis listed below, largest session count first.
lang_button_events_bytestwiki <- lang_button_events %>%
  filter(
    wiki %in% c(
      'frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
      'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki',
      'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki'
    )
  ) %>%
  group_by(wiki) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  ) %>%
  arrange(desc(n_sessions))
lang_button_events_bytestwiki
`summarise()` ungrouping output (override with `.groups` argument)
wiki | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
frwiki | 455616 | 280201 |
ptwiki | 145856 | 91869 |
trwiki | 67646 | 46391 |
hewiki | 54807 | 33156 |
kowiki | 39555 | 24618 |
srwiki | 30836 | 21933 |
fawiki | 15068 | 11986 |
frwiktionary | 6245 | 4000 |
bnwiki | 4205 | 2979 |
euwiki | 3972 | 2944 |
vecwiki | 887 | 794 |
dewikivoyage | 259 | 177 |
ptwikiversity | 4 | 3 |
About 8.4% of all sessions were recorded on the early adopter wikis.
I reviewed the number of sessions on test wikis by logged-in status to determine if these were mostly logged-out or logged-in users.
# Button-open events and distinct sessions by test-wiki category
# (test_wiki / non_test_wiki) and anonymity flag.
lang_button_events_testwiki_isanon <- lang_button_events %>%
  mutate(
    istestwiki = if_else(
      wiki %in% c(
        'frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
        'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki',
        'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki'
      ),
      'test_wiki',
      'non_test_wiki'
    )
  ) %>%
  group_by(istestwiki, isanon) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
lang_button_events_testwiki_isanon
`summarise()` regrouping output by 'istestwiki' (override with `.groups` argument)
istestwiki | isanon | n_events | n_sessions |
---|---|---|---|
<chr> | <chr> | <int> | <int> |
non_test_wiki | false | 116409 | 68458 |
non_test_wiki | true | 2892131 | 2324483 |
test_wiki | false | 27679 | 13284 |
test_wiki | true | 797277 | 508146 |
0.29% of sessions with a click to the lang switcher button came from logged-in users on the early adopter wikis and 8.1% of sessions by logged-out users.
# Button-open events and distinct sessions, split by the anonymity flag.
lang_button_events_byanon <- lang_button_events %>%
  group_by(isanon) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
head(lang_button_events_byanon)
`summarise()` ungrouping output (override with `.groups` argument)
isanon | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
false | 144088 | 81742 |
true | 3689408 | 2832629 |
The majority of sessions with clicks to the language button are by logged out users (97.2%)
# Button-click statistics per session, split by logged-in status.
# The query returns one row per combination of date, wiki, session, edit
# bucket, skin, skin version, time to change language, context and languages,
# so one session's clicks can be spread over several rows. Clicks are summed
# per session first so that the statistics describe sessions rather than
# query rows.
lang_button_events_persession <- lang_button_events %>%
  group_by(isanon, web_session_id) %>%
  summarise(session_clicks = sum(n_events), .groups = "drop") %>%
  group_by(isanon) %>%
  summarise(
    avg_clicks = mean(session_clicks),
    max_clicks = max(session_clicks),
    min_clicks = min(session_clicks),
    .groups = "drop"
  )
lang_button_events_persession
`summarise()` ungrouping output (override with `.groups` argument)
isanon | avg_clicks | max_clicks | min_clciks |
---|---|---|---|
<chr> | <dbl> | <int> | <int> |
false | 1.583506 | 285 | 1 |
true | 1.244324 | 798 | 1 |
Most sessions include between 1 to 2 clicks to the language switcher button.
Note: There are some sessions by anon users with over 600 clicks, which is likely automated traffic from bots and can be filtered out.
# Edit-bucket breakdown of button-open events for rows flagged
# isanon == 'false'.
logged_in_editcount <- lang_button_events %>%
  filter(isanon == 'false') %>%
  group_by(usereditbucket) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
logged_in_editcount
`summarise()` ungrouping output (override with `.groups` argument)
usereditbucket | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
0 edits | 36100 | 26263 |
1-4 edits | 15370 | 10060 |
100-999 edits | 21124 | 11002 |
1000+ edits | 37444 | 15816 |
5-99 edits | 34048 | 18811 |
NULL | 2 | 2 |
# Edit-bucket breakdown of button-open events for rows flagged
# isanon == 'true'.
logged_out_editcount <- lang_button_events %>%
  filter(isanon == 'true') %>%
  group_by(usereditbucket) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
logged_out_editcount
`summarise()` ungrouping output (override with `.groups` argument)
usereditbucket | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
NULL | 3689408 | 2832629 |
Confirmed that we are not recording an edit count for logged out users as expected.
# Distinct sessions per (interface language, content language) pair among
# button-open events, and their share of all distinct sessions, most
# frequent first.
top_content_languages <- lang_button_events %>%
  mutate(all_sessions = n_distinct(web_session_id)) %>%
  group_by(interfacelanguage, contentlanguage) %>%
  summarise(
    n_sessions = n_distinct(web_session_id),
    # all_sessions is constant across rows; taking one value keeps the
    # summary at a single row per group, so no distinct() is needed.
    pct_sessions = n_sessions / first(all_sessions),
    .groups = "drop"
  ) %>%
  arrange(desc(n_sessions))
head(top_content_languages)
`summarise()` regrouping output by 'interfacelanguage', 'contentlanguage' (override with `.groups` argument)
interfacelanguage | contentlanguage | n_sessions | pct_sessions |
---|---|---|---|
<chr> | <chr> | <int> | <dbl> |
en | en | 1361586 | 0.46746010 |
fr | fr | 284033 | 0.09751429 |
ru | ru | 178766 | 0.06137400 |
de | de | 174512 | 0.05991351 |
es | es | 127334 | 0.04371635 |
pt | pt | 91875 | 0.03154255 |
The top current content and interface lang settings are english, russian, german and spanish, which fits with the top wikis we see where the language button switch clicks occur.
Update: French is now on the list as of 22 June 2021 as AB test was deployed then
# Button-open events and distinct sessions per context value.
lang_button_events_context <- lang_button_events %>%
  group_by(context) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
lang_button_events_context
`summarise()` ungrouping output (override with `.groups` argument)
context | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
header | 595196 | 326328 |
NULL | 1781384 | 1439296 |
other | 1456916 | 1170691 |
We are not recording event.context
for these events.
Note: This was changed with a fix deployed on 6 June 2021. We are now recording new button clicks as event.context = 'header'. event.context = 'other' is recorded for people in control group (or using legacy skin) that click N more button in the sidebar.
# Summary statistics of the time to change language for button-open events.
# The column arrives as character, with the literal string "NULL" standing in
# for missing values, so it is converted to numeric before summarising;
# otherwise mean()/median() return NA with a warning and max()/min() compare
# strings. Rows without a recorded time are dropped. When no row has a value,
# max/min are reported as NA instead of -Inf/Inf.
lang_button_events_time <- lang_button_events %>%
  mutate(time_value = as.numeric(na_if(timetochangelanguage, "NULL"))) %>%
  filter(!is.na(time_value)) %>%
  summarise(
    avg_time = mean(time_value),
    median_time = median(time_value),
    max_time = if (n() > 0) max(time_value) else NA_real_,
    min_time = if (n() > 0) min(time_value) else NA_real_
  )
lang_button_events_time
Warning message in mean.default(timetochangelanguage): “argument is not numeric or logical: returning NA” Warning message in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): “argument is not numeric or logical: returning NA”
avg_time | median_time | max_time | min_time |
---|---|---|---|
<dbl> | <dbl> | <chr> | <chr> |
NA | NA | NULL | NULL |
Note: We don't record time to change language for the initial click to the button but we record when the user actually changes languages.
# Rough query to confirm the approach; it will be refined in the analysis.
# `button` holds, per session / context / wiki, the earliest date of a
# 'compact-language-links-open' event; `lang_switches` holds every
# 'language-change' event. Both are limited to 2021, month >= 5, non-bot.
# The two are left-joined on session id and wiki, and language_switch is 1
# when a matching switch exists whose date is on or after the button date.
query <-
"
-- sessions where lang button was selected
WITH button AS (
SELECT
MIN(TO_DATE(dt)) as button_date,
event.web_session_id as session_id,
event.context as open_context,
wiki as wiki
FROM event.universallanguageselector
WHERE
year = 2021
AND month >= 05
AND useragent.is_bot = false
AND event.action = 'compact-language-links-open'
GROUP BY
event.web_session_id,
event.context,
wiki
),
lang_switches AS (
SELECT
TO_DATE(dt) as switch_date,
event.web_session_id as session_id,
event.context as switch_context,
wiki as wiki
FROM event.universallanguageselector
WHERE
year = 2021
AND month >= 05
AND useragent.is_bot = false
AND event.action = 'language-change'
)
SELECT
button.button_date,
lang_switches.switch_date,
button.session_id,
button.wiki,
button.open_context,
-- sessions with lang switch that occured after button clicks
IF(lang_switches.session_id IS NOT NULL AND switch_date >= button_date, 1, 0) AS language_switch,
lang_switches.switch_context
FROM button
LEFT JOIN lang_switches ON
button.session_id = lang_switches.session_id AND
button.wiki = lang_switches.wiki
"
# Run the query through wmfdata and keep the joined button/switch rows.
lang_button_switch_events <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Parse both date columns (formatted as %Y-%m-%d) into Date values.
lang_button_switch_events[["button_date"]] <- as.Date(
  lang_button_switch_events[["button_date"]],
  format = "%Y-%m-%d"
)
lang_button_switch_events[["switch_date"]] <- as.Date(
  lang_button_switch_events[["switch_date"]],
  format = "%Y-%m-%d"
)
# Distinct sessions with and without a language switch; the 0/1 flag is
# relabelled as "no_switch" / "switch".
lang_button_switch_events_nsessions <- lang_button_switch_events %>%
  mutate(
    language_switch = if_else(language_switch == 0, "no_switch", "switch")
  ) %>%
  group_by(language_switch) %>%
  summarise(n_sessions = n_distinct(session_id))
lang_button_switch_events_nsessions
`summarise()` ungrouping output (override with `.groups` argument)
language_switch | n_sessions |
---|---|
<chr> | <int> |
no_switch | 2106249 |
switch | 842170 |
About 3.4% of sessions where the language button was clicked was followed by an event to switch the language.
# Distinct sessions with a language switch, by the context of the button
# click and the context of the switch, for button dates from 2021-06-27.
# NOTE: this reassigns lang_button_switch_events_nsessions.
lang_button_switch_events_nsessions <- lang_button_switch_events %>%
  filter(
    language_switch == 1,
    button_date >= '2021-06-27'
  ) %>%
  group_by(open_context, switch_context) %>%
  summarise(n_sessions = n_distinct(session_id))
lang_button_switch_events_nsessions
`summarise()` regrouping output by 'open_context' (override with `.groups` argument)
open_context | switch_context | n_sessions |
---|---|---|
<chr> | <chr> | <int> |
header | content-language-switcher | 112876 |
header | interface | 3 |
header | languages-list | 748 |
other | content-language-switcher | 58041 |
other | interface | 38 |
other | languages-list | 446 |
Following 22 June 2021 fix, we are recording three types of language switch events after the new button is clicked: (1) 'interface': (2) 'languages-list' (3) 'content-language-switcher'
TODO: Need to clarify differences.
Change deployed on 8 June 2021 Rechecked data to confirm that events are recording as expected and that were able to distinguish between the following three event types:
Update: Fix deployed on 10 June 2021 to address bug identified in post deployment QA, where links to switch languages after clicking the language switcher button were not instrumented. Data updated below to reflect changes following this fix.
Note on instrumentation: If the user has opened the language switcher in the header and switched language, we should see two events with the following properties:
22 June 2021 Update: Fix deployed to add additional context to differentiate between the following events (both of these were previously marked as event.context = 'interface'):
# Distinct sessions per date, skin and skin version, from 2021-06-11
# (date the fix was deployed) onwards.
lang_button_switch_events_byskin <- lang_button_events %>%
  filter(date >= '2021-06-11') %>%
  group_by(date, skin, skinversion) %>%
  summarise(n_sessions = n_distinct(web_session_id))
lang_button_switch_events_byskin
`summarise()` regrouping output by 'date', 'skin' (override with `.groups` argument)
date | skin | skinversion | n_sessions |
---|---|---|---|
<chr> | <chr> | <chr> | <int> |
2021-06-11 | NULL | NULL | 18 |
2021-06-11 | vector | latest | 5467 |
2021-06-11 | vector | legacy | 54874 |
2021-06-12 | NULL | NULL | 14 |
2021-06-12 | vector | latest | 4363 |
2021-06-12 | vector | legacy | 45448 |
2021-06-13 | NULL | NULL | 5 |
2021-06-13 | vector | latest | 5053 |
2021-06-13 | vector | legacy | 49477 |
2021-06-14 | NULL | NULL | 9 |
2021-06-14 | vector | latest | 5926 |
2021-06-14 | vector | legacy | 61784 |
2021-06-15 | NULL | NULL | 6 |
2021-06-15 | vector | latest | 5952 |
2021-06-15 | vector | legacy | 61043 |
2021-06-16 | NULL | NULL | 2 |
2021-06-16 | vector | latest | 5725 |
2021-06-16 | vector | legacy | 58951 |
2021-06-17 | NULL | NULL | 1 |
2021-06-17 | vector | latest | 5622 |
2021-06-17 | vector | legacy | 57772 |
2021-06-18 | NULL | NULL | 2 |
2021-06-18 | vector | latest | 4948 |
2021-06-18 | vector | legacy | 52140 |
2021-06-19 | NULL | NULL | 4 |
2021-06-19 | vector | latest | 4288 |
2021-06-19 | vector | legacy | 42424 |
2021-06-20 | NULL | NULL | 2 |
2021-06-20 | vector | latest | 4618 |
2021-06-20 | vector | legacy | 46535 |
2021-06-21 | vector | latest | 5528 |
2021-06-21 | vector | legacy | 58187 |
2021-06-22 | vector | latest | 23023 |
2021-06-22 | vector | legacy | 57450 |
2021-06-23 | vector | latest | 47755 |
2021-06-23 | vector | legacy | 55941 |
2021-06-24 | vector | latest | 51067 |
2021-06-24 | vector | legacy | 55742 |
2021-06-25 | NULL | NULL | 1 |
2021-06-25 | vector | latest | 48562 |
2021-06-25 | vector | legacy | 50041 |
2021-06-26 | vector | latest | 44291 |
2021-06-26 | vector | legacy | 41258 |
2021-06-27 | vector | latest | 51986 |
2021-06-27 | vector | legacy | 44501 |
2021-06-28 | NULL | NULL | 1 |
2021-06-28 | vector | latest | 68627 |
2021-06-28 | vector | legacy | 54833 |
2021-06-29 | vector | latest | 23540 |
2021-06-29 | vector | legacy | 22065 |
We are recording two skin types: 'NULL' and 'vector' and recording both skin version types for vector: 'latest' and 'legacy'. I'm assuming NULL counts as all skin types not as vector. It's not needed for the AB test but I recommend revising to clarify the specific non-vector skin types at some point.
# By Context Field
# Distinct sessions per context, skin and skin version, from 2021-06-11
# (date the fix was deployed) onwards.
lang_button_switch_events_bycontext <- lang_button_events %>%
  filter(date >= '2021-06-11') %>%
  group_by(context, skin, skinversion) %>%
  summarise(n_sessions = n_distinct(web_session_id))
lang_button_switch_events_bycontext
`summarise()` regrouping output by 'context', 'skin' (override with `.groups` argument)
context | skin | skinversion | n_sessions |
---|---|---|---|
<chr> | <chr> | <chr> | <int> |
header | vector | latest | 325736 |
NULL | NULL | NULL | 65 |
other | vector | latest | 62458 |
other | vector | legacy | 939722 |
event.action = 'compact-language-links-open'
; event.context = 'header'
; event.skinVersion = 'latest'
event.action = 'compact-language-links-open'
and event.context = 'other'
, event.skinVersion = 'latest'
event.context = 'NULL'
; event.skinVersion = 'NULL'
We are only recording clicks to the new button in the header on the latest vector as expected.
There are clicks to the N More button by both legacy and latest as expected. The new language button was not deployed to logged-in users on test wikis so they would still have clicks to the N more button on the latest vector. When the AB test run, only users in the control group will be able to access the N more button on the latest vector.
Note context value events are only set for user on vector; however, we can identify click from other skins to the N More button by the lack of values (event.context = NULL
)
# Distinct sessions per context, skin, skin version and anonymity flag,
# from 2021-06-11 (date the fix was deployed) onwards.
lang_button_switch_events_byusertype <- lang_button_events %>%
  filter(date >= '2021-06-11') %>%
  group_by(context, skin, skinversion, isanon) %>%
  summarise(n_sessions = n_distinct(web_session_id))
lang_button_switch_events_byusertype
`summarise()` regrouping output by 'context', 'skin', 'skinversion' (override with `.groups` argument)
context | skin | skinversion | isanon | n_sessions |
---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <int> |
header | vector | latest | false | 10298 |
header | vector | latest | true | 315691 |
NULL | NULL | NULL | false | 1 |
NULL | NULL | NULL | true | 64 |
other | vector | latest | false | 2226 |
other | vector | latest | true | 60257 |
other | vector | legacy | false | 24755 |
other | vector | legacy | true | 915463 |
# Header-context button events dated before 2021-06-22: events and distinct
# sessions by anonymity flag.
lang_button_events_byusertype_all <- lang_button_events %>%
  filter(
    date < '2021-06-22',
    context == 'header'
  ) %>%
  group_by(isanon) %>%
  summarise(
    n_events = sum(n_events),
    n_sessions = n_distinct(web_session_id)
  )
head(lang_button_events_byusertype_all)
`summarise()` ungrouping output (override with `.groups` argument)
isanon | n_events | n_sessions |
---|---|---|
<chr> | <int> | <int> |
false | 11648 | 3378 |
true | 9 | 5 |
Confirmed we are only recording clicks to the new language button by logged-in users. All new button clicks recorded so far have been on latest vector.
Update: AB Test deployed on 22 June 2021. We start seeing a lot more logged-out users with clicks to the new button after that date.
# Raw rows for header-context button events flagged isanon == "true",
# from 2021-06-11 onwards.
lang_button_events_isanon <- lang_button_events %>%
  filter(
    date >= '2021-06-11',
    context == 'header',
    isanon == "true"
  )
lang_button_events_isanon
date | wiki | web_session_id | usereditbucket | skin | skinversion | timetochangelanguage | isanon | context | interfacelanguage | contentlanguage | selectedinterfacelanguage | n_events |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <int> |
2021-06-11 | testwiki | bc29f4fc89d59f4bc99c | NULL | vector | latest | NULL | true | header | en | en | NULL | 1 |
2021-06-11 | bnwiki | 4fa77beb78338661d19c | NULL | vector | latest | NULL | true | header | bn | bn | NULL | 1 |
2021-06-13 | testwiki | 1c87469b4eb16ead0427 | NULL | vector | latest | NULL | true | header | en | en | NULL | 3 |
Two of these events occurred on testwiki and one on bnwiki, which is one of the early adopter wikis.
# Distinct sessions per context, skin, skin version and anonymity flag on the
# listed test wikis, from 2021-06-24 (following the AB test) onwards.
lang_button_switch_events_byisanon_testwiki <- lang_button_events %>%
  filter(
    wiki %in% c(
      'frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
      'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki',
      'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki'
    ),
    date >= '2021-06-24'
  ) %>%
  group_by(context, skin, skinversion, isanon) %>%
  summarise(n_sessions = n_distinct(web_session_id))
lang_button_switch_events_byisanon_testwiki
`summarise()` regrouping output by 'context', 'skin', 'skinversion' (override with `.groups` argument)
context | skin | skinversion | isanon | n_sessions |
---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <int> |
header | vector | latest | false | 4743 |
header | vector | latest | true | 258299 |
other | vector | latest | false | 450 |
other | vector | latest | true | 3781 |
other | vector | legacy | false | 165 |
Numbers appear as expected. There are a limited number of events recorded on legacy vector since the new skin is deployed as opt-out.
We're still seeing some clicks to the N More button on the latest vector. Not sure where these are coming from.
# Distinct sessions and events per context, skin, skin version and anonymity
# flag for latest-Vector events on wikis outside the test-wiki list.
# The original filter carried two lower bounds on date ('2021-06-11' and
# '2021-06-08'); only the later one has any effect, so the redundant
# '2021-06-08' condition is removed.
lang_button_switch_events_bynontest <- lang_button_events %>%
  filter(
    skinversion == 'latest',
    skin == 'vector',
    date >= '2021-06-11', # date fix deployed
    !wiki %in% c(
      'frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
      'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki',
      'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki'
    )
  ) %>%
  group_by(context, skin, skinversion, isanon) %>%
  summarise(
    n_sessions = n_distinct(web_session_id),
    n_events = sum(n_events)
  )
lang_button_switch_events_bynontest
`summarise()` regrouping output by 'context', 'skin', 'skinversion' (override with `.groups` argument)
context | skin | skinversion | isanon | n_sessions | n_events |
---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <int> | <int> |
header | vector | latest | false | 4252 | 15217 |
header | vector | latest | true | 62 | 107 |
other | vector | latest | false | 3 | 3 |
other | vector | latest | true | 19 | 24 |
On non-test wikis, we should only be recording clicks to the new button by logged-in users. There are two events by logged-out users which occurs on test wiki, see above.
# Distinct sessions per (interface language, content language) pair among
# header-context button events from 2021-06-11 onwards, and their share of
# all distinct sessions in that filtered set, most frequent first.
# NOTE: this reassigns top_content_languages.
top_content_languages <- lang_button_events %>%
  filter(
    date >= '2021-06-11',
    context == 'header'
  ) %>%
  mutate(all_sessions = n_distinct(web_session_id)) %>%
  group_by(interfacelanguage, contentlanguage) %>%
  summarise(
    n_sessions = n_distinct(web_session_id),
    # all_sessions is constant across rows; taking one value keeps the
    # summary at a single row per group, so no distinct() is needed.
    pct_sessions = n_sessions / first(all_sessions),
    .groups = "drop"
  ) %>%
  arrange(desc(n_sessions))
head(top_content_languages)
`summarise()` regrouping output by 'interfacelanguage', 'contentlanguage' (override with `.groups` argument)
interfacelanguage | contentlanguage | n_sessions | pct_sessions |
---|---|---|---|
<chr> | <chr> | <int> | <dbl> |
en | en | 85 | 0.29513889 |
ja | ja | 20 | 0.06944444 |
zh-tw | zh | 20 | 0.06944444 |
de | de | 14 | 0.04861111 |
ru | ru | 12 | 0.04166667 |
ca | ca | 11 | 0.03819444 |
No test wikis and larger size non-test wikis are listed as the top content languages where a new search button was clicked.
# Distinct sessions per edit bucket and anonymity flag among header-context
# button events from 2021-06-11 onwards.
lang_button_byeditcount <- lang_button_events %>%
  filter(
    date >= '2021-06-11',
    context == 'header'
  ) %>%
  group_by(usereditbucket, isanon) %>%
  summarise(n_sessions = n_distinct(web_session_id))
lang_button_byeditcount
`summarise()` regrouping output by 'usereditbucket' (override with `.groups` argument)
usereditbucket | isanon | n_sessions |
---|---|---|
<chr> | <chr> | <int> |
0 edits | false | 93 |
1-4 edits | false | 64 |
100-999 edits | false | 135 |
1000+ edits | false | 360 |
5-99 edits | false | 181 |
NULL | true | 3 |
There are three NULL events recorded. These are all for the logged-out user events with clicks to the header identified above.
There appears to be some type of bug with these instances. Need to identify more info associated with these events to see if we can isolate what's happening here
# Rough query to confirm the approach; it will be refined in the analysis.
# `button` CTE: non-bot sessions with a 'compact-language-links-open' event on
#   23-30 June 2021, keeping the earliest event date per
#   session / context / isAnon / skinVersion / wiki.
# `lang_switches` CTE: non-bot 'language-change' events from 23 June 2021
#   (June only), with their context and time-to-change.
# The LEFT JOIN keeps every button session; language_switch is 1 when the same
# session id also appears in `lang_switches`, otherwise 0.
# NOTE(review): the join matches on session id only, so it does not itself
# enforce that the switch happened after the button click or on the same wiki.
# Confirm whether that is intended.
query <-
"
-- sessions where lang button was selected
WITH button AS (
SELECT
MIN(TO_DATE(dt)) as button_date,
event.web_session_id as session_id,
event.skinVersion as skinversion,
event.isAnon As isAnon,
event.context as button_type,
wiki as wiki
FROM event.universallanguageselector
WHERE
year = 2021
AND month = 06
AND Day >= 23
AND Day <= 30
AND useragent.is_bot = false
AND event.action = 'compact-language-links-open'
GROUP BY
event.web_session_id,
event.context,
event.isAnon,
event.skinversion,
wiki
),
lang_switches AS (
SELECT
TO_DATE(dt) as switch_date,
event.web_session_id as session_id,
event.context as switch_context,
event.timetochangelanguage as switch_time,
wiki as wiki
FROM event.universallanguageselector
WHERE
year = 2021
AND month = 06
AND day >= 23
AND useragent.is_bot = false
AND event.action = 'language-change'
)
SELECT
button.button_date,
button.button_type,
button.skinversion,
button.isAnon,
lang_switches.switch_time,
lang_switches.switch_date,
button.session_id,
button.wiki,
-- sessions with lang switch that occured after button clicks
IF(lang_switches.session_id IS NOT NULL, 1, 0) AS language_switch,
lang_switches.switch_context
FROM button
LEFT JOIN lang_switches ON
button.session_id = lang_switches.session_id
"
lang_button_switch_events <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# For sessions on the listed wikis that switched language (latest skin version,
# isanon == 'false', switch context 'content-language-switcher' or 'interface'),
# count distinct sessions per button type and wiki, with one column per switch
# context.
lang_button_switch_bybuttontype <- lang_button_switch_events %>%
  filter(
    language_switch == 1, # only sessions with lang switch
    wiki %in% c('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
                'euwiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki',
                'dewikivoyage', 'vecwiki'),
    skinversion == 'latest',
    isanon == 'false',
    switch_context %in% c('content-language-switcher', 'interface')) %>%
  group_by(button_type, wiki, switch_context) %>%
  summarise(n_sessions = n_distinct(session_id)) %>%
  # pivot_wider() is the maintained replacement for the superseded spread();
  # button_type/wiki combinations missing a context are filled with NA.
  pivot_wider(names_from = switch_context, values_from = n_sessions)
lang_button_switch_bybuttontype
`summarise()` regrouping output by 'button_type', 'wiki' (override with `.groups` argument)
button_type | wiki | content-language-switcher | interface |
---|---|---|---|
<chr> | <chr> | <int> | <int> |
header | bnwiki | 49 | NA |
header | euwiki | 105 | 1 |
header | frwiki | 2903 | 1 |
header | frwiktionary | 67 | NA |
header | hewiki | 660 | NA |
header | kowiki | 341 | 4 |
header | ptwiki | 928 | 1 |
header | srwiki | 135 | NA |
header | trwiki | 358 | 1 |
header | vecwiki | 1 | NA |
other | bnwiki | 3 | NA |
other | euwiki | 8 | NA |
other | frwiki | 149 | 1 |
other | frwiktionary | 4 | NA |
other | hewiki | 15 | NA |
other | kowiki | 11 | NA |
other | ptwiki | 50 | NA |
other | srwiki | 35 | NA |
other | trwiki | 30 | NA |
other | vecwiki | 1 | NA |
# Distinct (session id, wiki) pairs on the listed wikis for non-bot events with
# event.isAnon = false, action 'language-change', context 'languages-list' and
# skin version 'latest', restricted to 23-30 June 2021.
query <-
"
SELECT DISTINCT
event.web_session_id as session_id,
wiki as wiki
FROM event.universallanguageselector
WHERE
year = 2021
AND month = 06
-- first full day of events
AND Day >= 23
AND DAy <= 30
AND useragent.is_bot = false
AND event.isAnon = false
AND wiki IN ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
'euwiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
AND event.action = 'language-change'
AND event.context = 'languages-list'
AND event.skinversion = 'latest'
"
lang_link_sidebar <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Per-wiki count of distinct sessions in lang_link_sidebar.
lang_link_sidebar_bywiki <- lang_link_sidebar %>%
  group_by(wiki) %>%
  summarize(
    sidebar = n_distinct(session_id),
    # Constant label marking these rows as the control group
    # (their button type is 'other').
    button_type = 'other'
  )
lang_link_sidebar_bywiki
`summarise()` ungrouping output (override with `.groups` argument)
wiki | sidebar | button_type |
---|---|---|
<chr> | <int> | <chr> |
bnwiki | 99 | other |
dewikivoyage | 5 | other |
euwiki | 108 | other |
frwiki | 6069 | other |
frwiktionary | 176 | other |
hewiki | 986 | other |
kowiki | 627 | other |
ptwiki | 1886 | other |
srwiki | 388 | other |
trwiki | 806 | other |
vecwiki | 39 | other |
Now we'll merge to have all lang switches together in the same location.
# Combine the button-based and sidebar-based switch counts into one table,
# keeping rows that appear in either input (matched on wiki and button type).
lang_switched_AB <- lang_button_switch_bybuttontype %>%
  full_join(lang_link_sidebar_bywiki, by = c('wiki', 'button_type'))
lang_switched_AB
button_type | wiki | content-language-switcher | interface | sidebar |
---|---|---|---|---|
<chr> | <chr> | <int> | <int> | <int> |
header | bnwiki | 49 | NA | NA |
header | euwiki | 105 | 1 | NA |
header | frwiki | 2903 | 1 | NA |
header | frwiktionary | 67 | NA | NA |
header | hewiki | 660 | NA | NA |
header | kowiki | 341 | 4 | NA |
header | ptwiki | 928 | 1 | NA |
header | srwiki | 135 | NA | NA |
header | trwiki | 358 | 1 | NA |
header | vecwiki | 1 | NA | NA |
other | bnwiki | 3 | NA | 99 |
other | euwiki | 8 | NA | 108 |
other | frwiki | 149 | 1 | 6069 |
other | frwiktionary | 4 | NA | 176 |
other | hewiki | 15 | NA | 986 |
other | kowiki | 11 | NA | 627 |
other | ptwiki | 50 | NA | 1886 |
other | srwiki | 35 | NA | 388 |
other | trwiki | 30 | NA | 806 |
other | vecwiki | 1 | NA | 39 |
other | dewikivoyage | NA | NA | 5 |
# Distinct sessions for every combination of button type, switch context,
# language-switch flag and skin version.
lang_button_switch_byskintype <- summarize(
  group_by(lang_button_switch_events,
           button_type, switch_context, language_switch, skinversion),
  n_sessions = n_distinct(session_id)
)
lang_button_switch_byskintype
`summarise()` regrouping output by 'button_type', 'switch_context', 'language_switch' (override with `.groups` argument)
button_type | switch_context | language_switch | skinversion | n_sessions |
---|---|---|---|---|
<chr> | <chr> | <int> | <chr> | <int> |
header | interface | 1 | latest | 614 |
header | NULL | 0 | latest | 220 |
NULL | interface | 1 | NULL | 3 |
NULL | NULL | 0 | NULL | 36 |
other | interface | 1 | latest | 8197 |
other | interface | 1 | legacy | 71475 |
other | languages-list | 1 | latest | 5604 |
other | languages-list | 1 | legacy | 6 |
other | NULL | 0 | latest | 3076 |
other | NULL | 0 | legacy | 77735 |
# Sanity check that a time-to-change value is recorded: distinct sessions per
# button type, switch context and recorded switch time, for latest-skin
# sessions that switched language.
lang_button_switch_bytime <- lang_button_switch_events %>%
  filter(language_switch == 1 & skinversion == "latest") %>%
  group_by(button_type, switch_context, switch_time) %>%
  summarize(n_sessions = n_distinct(session_id))
lang_button_switch_bytime
`summarise()` regrouping output by 'button_type', 'switch_context' (override with `.groups` argument)
button_type | switch_context | switch_time | n_sessions |
---|---|---|---|
<chr> | <chr> | <chr> | <int> |
header | interface | 10017.0 | 1 |
header | interface | 10019.0 | 1 |
header | interface | 10033.900146484375 | 1 |
header | interface | 100522.89990234375 | 1 |
header | interface | 10064.39990234375 | 1 |
header | interface | 10088.89990234375 | 1 |
header | interface | 10102.900146484375 | 1 |
header | interface | 101213.60009765625 | 1 |
header | interface | 10155.900146484375 | 1 |
header | interface | 10161.5 | 1 |
header | interface | 10190.39990234375 | 1 |
header | interface | 10196.400146484375 | 1 |
header | interface | 10205.800048828125 | 1 |
header | interface | 10209.300048828125 | 1 |
header | interface | 10210.0 | 1 |
header | interface | 10220.400146484375 | 1 |
header | interface | 10240.39990234375 | 1 |
header | interface | 10260.2451171875 | 1 |
header | interface | 10282.699951171875 | 1 |
header | interface | 102956.0 | 1 |
header | interface | 10310.7001953125 | 1 |
header | interface | 10314.89990234375 | 1 |
header | interface | 10350.10009765625 | 1 |
header | interface | 10404.0 | 1 |
header | interface | 10410.0 | 1 |
header | interface | 10436.0 | 1 |
header | interface | 10448.2998046875 | 1 |
header | interface | 10474.0 | 1 |
header | interface | 10489.30029296875 | 1 |
header | interface | 10501.0 | 1 |
⋮ | ⋮ | ⋮ | ⋮ |
other | languages-list | 9946.900146484375 | 1 |
other | languages-list | 9948.900146484375 | 1 |
other | languages-list | 9949.5 | 1 |
other | languages-list | 9952.300048828125 | 1 |
other | languages-list | 9955.0 | 1 |
other | languages-list | 9956.2998046875 | 1 |
other | languages-list | 9958.199951171875 | 1 |
other | languages-list | 99629.5 | 1 |
other | languages-list | 9964.800048828125 | 1 |
other | languages-list | 9965.0 | 1 |
other | languages-list | 99685.19995117188 | 1 |
other | languages-list | 9969.0 | 1 |
other | languages-list | 9969.10009765625 | 1 |
other | languages-list | 9970.0 | 1 |
other | languages-list | 9970.7998046875 | 1 |
other | languages-list | 9972.0 | 1 |
other | languages-list | 997265.0 | 1 |
other | languages-list | 99734.0 | 1 |
other | languages-list | 9977.119873046875 | 1 |
other | languages-list | 9979.800048828125 | 1 |
other | languages-list | 998.0 | 1 |
other | languages-list | 9981.800048828125 | 1 |
other | languages-list | 998152.0 | 1 |
other | languages-list | 9982.39990234375 | 1 |
other | languages-list | 9988.0 | 1 |
other | languages-list | 9989.0 | 1 |
other | languages-list | 9992.400146484375 | 1 |
other | languages-list | 99969.30053710938 | 1 |
other | languages-list | 9997.0 | 1 |
other | languages-list | 9999.0 | 1 |
Confirmed time to change is recorded.
# Non-bot events with event.context = 'languages-list' from 11 June 2021
# (June only). Returns one row per distinct combination of the GROUP BY columns
# (date, wiki, session, edit bucket, skin, skin version, time to change,
# isanon, action and the three language fields) with its event count n_events.
query <-
"
SELECT
TO_DATE(dt) AS `date`,
wiki,
event.web_session_id,
event.usereditbucket,
event.skin,
event.skinVersion,
event.timetochangelanguage,
event.isanon,
event.action,
event.interfacelanguage,
event.contentlanguage,
event.selectedinterfacelanguage,
Count(*) AS n_events
FROM event.universallanguageselector
WHERE
year = 2021
AND month = 06
AND day >= 11
AND event.context = 'languages-list'
AND useragent.is_bot = false
GROUP BY
TO_DATE(dt),
wiki,
event.web_session_id,
event.usereditbucket,
event.skin,
event.skinVersion,
event.timetochangelanguage,
event.isanon,
event.action,
event.interfacelanguage,
event.contentlanguage,
event.selectedinterfacelanguage
"
lang_sidebar_events <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Distinct sessions per skin, skin version and isanon flag.
lang_sidebar_events_byskintype <- summarise(
  group_by(lang_sidebar_events, skin, skinversion, isanon),
  n_sessions = n_distinct(web_session_id)
)
lang_sidebar_events_byskintype
`summarise()` regrouping output by 'skin', 'skinversion' (override with `.groups` argument)
skin | skinversion | isanon | n_sessions |
---|---|---|---|
<chr> | <chr> | <chr> | <int> |
NULL | NULL | false | 5 |
NULL | NULL | true | 162 |
vector | latest | false | 44738 |
vector | latest | true | 1658646 |
We are recording clicks to the language list in the sidebar on either the latest version of vector or with a NULL skin. This is expected, as instrumentation for sidebar clicks was limited to the language list. Further instrumentation to clarify the NULL values would be helpful, but it is assumed that any events identified with a NULL skin type came from a skin other than latest vector.
This instrumentation will be good for the AB test since we will only be looking at users on the latest vector skin; however, further clarification of these other skin types will be useful in the future in case we want to know the percentage of users clicking on these links from legacy vector vs other skin types.
# Same breakdown (skin, skin version, isanon), restricted to wikis that are
# NOT in the list below.
lang_sidebar_events_byskintype_nontestwiki <- lang_sidebar_events %>%
  filter(
    !(wiki %in% c('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
                  'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki',
                  'bnwiki', 'dewikivoyage', 'vecwiki'))
  ) %>%
  group_by(skin, skinversion, isanon) %>%
  summarise(n_sessions = n_distinct(web_session_id))
lang_sidebar_events_byskintype_nontestwiki
`summarise()` regrouping output by 'skin', 'skinversion' (override with `.groups` argument)
skin | skinversion | isanon | n_sessions |
---|---|---|---|
<chr> | <chr> | <chr> | <int> |
vector | latest | false | 5 |
vector | latest | true | 4 |
## By Test Wiki
# Same breakdown (skin, skin version, isanon), restricted to wikis that ARE in
# the list below.
lang_sidebar_events_byskintype_testwiki <- lang_sidebar_events %>%
  filter(
    wiki %in% c('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
                'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki',
                'bnwiki', 'dewikivoyage', 'vecwiki')
  ) %>%
  group_by(skin, skinversion, isanon) %>%
  summarise(n_sessions = n_distinct(web_session_id))
lang_sidebar_events_byskintype_testwiki
`summarise()` regrouping output by 'skin', 'skinversion' (override with `.groups` argument)
skin | skinversion | isanon | n_sessions |
---|---|---|---|
<chr> | <chr> | <chr> | <int> |
NULL | NULL | false | 5 |
NULL | NULL | true | 162 |
vector | latest | false | 44680 |
vector | latest | true | 1658560 |
Since the AB test has not started, we are not logging a lot of clicks by logged-in or logged-out users to the lang links on the sidebar on non-test wikis (the new language button was deployed to all logged-in users on latest vector).
Note: There are some sessions by both logged-in and logged-out users on the latest vector recorded as having clicked a lang list link. Not sure where these are coming from, but it's such a small percentage that it should not impact the AB test.
The majority of clicks occur on test wikis by logged-out as expected.
Once the AB test runs, any clicks to the lang links in the sidebar should only be recorded by users in the control group.
# Distinct sessions per recorded action, to see which actions occur with the
# 'languages-list' context.
lang_sidebar_events_byaction <- summarise(
  group_by(lang_sidebar_events, action),
  n_sessions = n_distinct(web_session_id)
)
lang_sidebar_events_byaction
`summarise()` ungrouping output (override with `.groups` argument)
action | n_sessions |
---|---|
<chr> | <int> |
language-change | 1702128 |
Only associated with event.action = 'language-change'
as expected.
# Rough query to confirm the approach; it will be refined in the analysis.
# `button` CTE: non-bot sessions with a 'compact-language-links-open' event
#   from 11 June 2021 (June only), keeping the earliest event date per
#   session / context / skinVersion / wiki.
# `follow_actions` CTE: every non-bot event in the same period whose action is
#   anything other than 'compact-language-links-open'.
# The LEFT JOIN keeps all button sessions; follow_action is 1 when the session
# id also appears in `follow_actions`, otherwise 0.
# NOTE(review): the join is on session id only and does not compare dates, so
# the joined actions are not guaranteed to come after the button click.
# Confirm whether that is intended.
query <-
"
-- sessions where lang button was selected
WITH button AS (
SELECT
MIN(TO_DATE(dt)) as button_date,
event.web_session_id as session_id,
event.skinVersion as skinversion,
event.context as button_type,
wiki as wiki
FROM event.universallanguageselector
WHERE
year = 2021
AND month = 06
AND Day >= 11
AND useragent.is_bot = false
AND event.action = 'compact-language-links-open'
GROUP BY
event.web_session_id,
event.context,
event.skinversion,
wiki
),
follow_actions AS (
SELECT
TO_DATE(dt) as action_date,
event.action as action_type,
event.web_session_id as session_id,
event.context as action_context,
wiki as wiki
FROM event.universallanguageselector
WHERE
year = 2021
AND month = 06
AND Day >= 11
AND useragent.is_bot = false
AND event.action != 'compact-language-links-open'
)
SELECT
button.button_date,
button.button_type,
button.skinversion,
follow_actions.action_date,
follow_actions.action_type,
button.session_id,
button.wiki,
-- sessions with lang switch that occured after button clicks
IF(follow_actions.session_id IS NOT NULL, 1, 0) AS follow_action,
follow_actions.action_context
FROM button
LEFT JOIN follow_actions ON
button.session_id = follow_actions.session_id
"
more_lang_actions <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit Warning message in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : “embedded nul(s) found in input”
# Which action types are recorded in sessions that clicked the new (header)
# button, and in how many distinct sessions each.
new_button_actions <- more_lang_actions %>%
  filter(button_type == 'header') %>%
  group_by(action_type) %>%
  summarize(n_sessions = n_distinct(session_id))
new_button_actions
`summarise()` ungrouping output (override with `.groups` argument)
action_type | n_sessions |
---|---|
<chr> | <int> |
font-change | 13 |
ime-change | 263 |
ime-disable | 52 |
ime-enable | 143 |
language-change | 284961 |
more-languages-access | 94 |
no-search-results | 4078 |
NULL | 45822 |
settings-open | 198 |
webfonts-disable | 3 |
webfonts-enable | 29 |
So far, we have recorded 'ime-change' (user changed the input method), 'no-search-results' (User searched for a language with no results) and 'webfonts-enable' (webfonts-enable: User enabled the webfonts functionality via ULS settings) actions in sessions where clicks to the new button were logged.
With the fix, we are now also recording language-change events.
# Which action types are recorded alongside clicks of any button type:
# distinct sessions per (action type, button type), ordered by button type.
all_button_actions <- more_lang_actions %>%
  group_by(action_type, button_type) %>%
  summarize(n_sessions = n_distinct(session_id)) %>%
  arrange(button_type)
all_button_actions
`summarise()` regrouping output by 'action_type' (override with `.groups` argument)
action_type | button_type | n_sessions |
---|---|---|
<chr> | <chr> | <int> |
ime-change | header | 3 |
language-change | header | 219 |
more-languages-access | header | 1 |
no-search-results | header | 8 |
NULL | header | 79 |
webfonts-enable | header | 1 |
language-change | NULL | 2 |
no-search-results | NULL | 2 |
NULL | NULL | 13 |
font-change | other | 1 |
ime-change | other | 71 |
ime-disable | other | 24 |
ime-enable | other | 47 |
language-change | other | 26266 |
more-languages-access | other | 32 |
no-search-results | other | 4449 |
NULL | other | 22555 |
settings-open | other | 1471 |
ui-lang-revert | other | 1 |
webfonts-enable | other | 7 |
Some of the actions not recorded for the new clicks button make sense. For example, we shouldn't be recording settings-open events in sessions with clicks to the new language button but I would anticipate there being clicks to switch languages from the button.
# Non-bot 'language-change' events with context 'content-language-switcher'
# from 22 June 2021 (June only): one row per distinct
# date / session / skin version / isanon / action / wiki combination.
#
# Two fixes relative to the previous version of this query:
#   * removed the unmatched ')' after the GROUP BY clause, which made the Hive
#     statement invalid (query_hive() then failed with "no lines available in
#     input");
#   * added the date, isanon and action columns, which the summaries of
#     lang_content_switches in this notebook group and filter on.
query <-
"
SELECT
  TO_DATE(dt) AS `date`,
  event.web_session_id AS session_id,
  event.skinVersion AS skinversion,
  event.isanon AS isanon,
  event.action AS action,
  wiki AS wiki
FROM event.universallanguageselector
WHERE
  year = 2021
  AND month = 06
  AND Day >= 22
  AND useragent.is_bot = false
  AND event.context = 'content-language-switcher'
  AND event.action = 'language-change'
GROUP BY
  TO_DATE(dt),
  event.web_session_id,
  event.skinVersion,
  event.isanon,
  event.action,
  wiki
"
lang_content_switches <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit Warning message in system(cmd, intern = TRUE): “running command 'export HADOOP_HEAPSIZE=1024 && ionice nice hive -S -f ./temp_query11634119c207.hql 2>&1 > ./temp_results116316d08b51.tsv' had status 20”
Error in read.table(file = file, header = header, sep = sep, quote = quote, : no lines available in input Traceback: 1. wmfdata::query_hive(query) 2. utils::read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE, . header = TRUE) 3. read.table(file = file, header = header, sep = sep, quote = quote, . dec = dec, fill = fill, comment.char = comment.char, ...) 4. stop("no lines available in input")
# Preview the first six rows of the content-switch events.
head(lang_content_switches, n = 6L)
date | session_id | skinversion | isanon | action | wiki | |
---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | 2021-06-23 | 053a2ed57541904671e1 | latest | true | language-change | trwiki |
2 | 2021-06-23 | 28f7400c785a050e1466 | latest | true | language-change | frwiki |
3 | 2021-06-23 | c2ebfd1bede689af758c | legacy | true | language-change | enwiki |
4 | 2021-06-23 | 38d8c82b0d93a8db81b0 | latest | true | language-change | hewiki |
5 | 2021-06-23 | 11e1d5c96a48c08365b2 | legacy | true | language-change | eswiki |
6 | 2021-06-23 | 8f1fb84ee0abe3ea118b | legacy | true | language-change | enwiki |
# Distinct sessions per day, to see when events start being recorded.
summarise(
  group_by(lang_content_switches, date),
  num_sessions = n_distinct(session_id)
)
`summarise()` ungrouping output (override with `.groups` argument)
date | num_sessions |
---|---|
<chr> | <int> |
2021-06-22 | 32193 |
2021-06-23 | 66203 |
2021-06-24 | 70480 |
2021-06-25 | 66043 |
2021-06-26 | 58343 |
2021-06-27 | 66835 |
2021-06-28 | 86242 |
2021-06-29 | 46804 |
Confirmed we start recording events on 22 June 2021 with the first full day of events recorded on 23 June 2021.
# Distinct sessions per action, to check which actions are recorded.
summarise(
  group_by(lang_content_switches, action),
  num_sessions = n_distinct(session_id)
)
`summarise()` ungrouping output (override with `.groups` argument)
action | num_sessions |
---|---|
<chr> | <int> |
language-change | 464766 |
Confirmed we are only recording for language-change actions as expected.
## By Test Wiki Status
# Distinct sessions on listed ('test') vs all other ('non_test') wikis, for
# dates after 2021-06-23 and excluding fawiki.
lang_content_switches %>%
  filter(date > '2021-06-23' & wiki != 'fawiki') %>%
  mutate(
    istest = if_else(
      wiki %in% c('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
                  'euwiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki',
                  'dewikivoyage', 'vecwiki'),
      'test',
      'non_test'
    )
  ) %>%
  group_by(istest) %>%
  summarise(num_sessions = n_distinct(session_id))
`summarise()` ungrouping output (override with `.groups` argument)
istest | num_sessions |
---|---|
<chr> | <int> |
non_test | 134361 |
test | 237847 |
The majority (64%) of content-language-switcher events recorded after 23 June 2021 have occurred on the test wikis. This is expected, as these events should only fire with the new language button, which on non-test wikis is only available to logged-in users that opt in.
# Same test / non-test split as above, further broken down by skin version.
lang_content_switches %>%
  filter(date > '2021-06-23' & wiki != 'fawiki') %>%
  mutate(
    istest = if_else(
      wiki %in% c('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
                  'euwiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki',
                  'dewikivoyage', 'vecwiki'),
      'test',
      'non_test'
    )
  ) %>%
  group_by(istest, skinversion) %>%
  summarise(num_sessions = n_distinct(session_id))
`summarise()` regrouping output by 'istest' (override with `.groups` argument)
istest | skinversion | num_sessions |
---|---|---|
<chr> | <chr> | <int> |
non_test | latest | 984 |
non_test | legacy | 143948 |
test | latest | 263198 |
test | legacy | 88 |
Potential issue: We're seeing events recorded for legacy. Need to confirm if this is possible.
Looked at query documented in https://phabricator.wikimedia.org/T280825
Adjusted to restrict to wikis in the AB test and also account for clicks to the language list in the sidebar for the control group.
Unfortunately, it is not possible to accurately determine buckets, as we do not have instrumentation to track distinct users that visit the site during the AB test. Further checks on the client will be done to confirm.
# Per (session, wiki) event counts on the listed wikis for non-bot events with
# event.isAnon = false and skin version 'latest', from 23 June 2021 (June only).
# n_header counts 'compact-language-links-open' events with context 'header';
# n_sidebar_link, n_other and n_sidebar_settings count the action/context pairs
# named in their IF() conditions.
# NOTE(review): num_sessions is SUM(1) within each (session, wiki) group, i.e.
# a count of event rows rather than of sessions. Confirm the name is intended.
query <-
"
SELECT
event.web_session_id as session_id,
wiki as wiki,
SUM(1) AS num_sessions,
SUM(if(event.action = 'compact-language-links-open' AND event.context = 'header', 1, 0)) as n_header,
SUM(if(event.action = 'language-change' AND event.context = 'languages-list', 1, 0)) as n_sidebar_link,
SUM(if(event.action = 'compact-language-links-open' AND event.context = 'other', 1, 0)) as n_other,
SUM(if(event.action = 'settings-open' AND event.context = 'interlanguage', 1, 0)) as n_sidebar_settings
FROM event.universallanguageselector
WHERE
year = 2021
AND month = 06
-- first full day of events
AND Day >= 23
AND useragent.is_bot = false
AND event.skinVersion = 'latest'
AND event.isAnon = false
AND wiki IN ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki',
'euwiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
GROUP BY
event.web_session_id,
wiki
"
ab_test_data <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Per-wiki totals: header-button events (test), the three control-side event
# counts combined (control), and the sum of num_sessions (all).
summarize(
  group_by(ab_test_data, wiki),
  test = sum(n_header),
  control = sum(n_other + n_sidebar_link + n_sidebar_settings),
  all = sum(num_sessions)
)
`summarise()` ungrouping output (override with `.groups` argument)
wiki | test | control | all |
---|---|---|---|
<chr> | <int> | <int> | <int> |
bnwiki | 205 | 220 | 550 |
dewikivoyage | 0 | 10 | 10 |
euwiki | 392 | 176 | 883 |
frwiki | 10095 | 15302 | 32761 |
frwiktionary | 270 | 338 | 807 |
hewiki | 2131 | 3465 | 7106 |
kowiki | 1641 | 1711 | 4351 |
ptwiki | 3301 | 4436 | 10159 |
srwiki | 451 | 967 | 1818 |
trwiki | 1303 | 1554 | 3718 |
vecwiki | 3 | 108 | 113 |
Confirmed that we are logging both control and test events and the instrumentation allows us to distinguish these events.
The splits are not perfectly balanced, but there are no significant differences indicating a regression or a difference in sampling rate. Differences appear as expected based on a 50/50 split.
Vec
# Overall totals across all wikis, using the same three measures as the
# per-wiki summary.
summarize(
  ab_test_data,
  test = sum(n_header),
  control = sum(n_other + n_sidebar_link + n_sidebar_settings),
  all = sum(num_sessions)
)
test | control | all |
---|---|---|
<int> | <int> | <int> |
18425 | 26055 | 57645 |