from time import perf_counter
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import numpy as np
import pandas as pd
from tabulate import tabulate
from wmfdata import charting, hive
from wmfdata.charting import comma_fmt, pct_fmt
from wmfdata.utils import df_to_remarkup, pct_str
We use data starting on 14 July 2019, after all of our significant data quality issues were fixed.
Let's do the grouping into sessions and users in Hive, since it takes a very long time in Pandas.
# One row per edit attempt (editing session), aggregated in Hive from the
# event.editattemptstep table. The query groups by more columns than the
# session ID alone (wiki, platform, user fields, bucket); the uniqueness
# assert in the next cell guards against that splitting sessions into
# multiple rows.
sessions = hive.run("""
select
event.editing_session_id as edit_attempt_id,
wiki,
event.platform,
collect_list(event.editor_interface) as interface,
-- User IDs are not unique across wikis
if(event.user_id != 0, concat(wiki, "-", event.user_id), event.anonymous_user_token) as user_id,
event.bucket,
event.user_id = 0 as user_is_anonymous,
event.user_editcount as user_edit_count,
max(event.revision_id) as max_rev_id,
sum(cast(event.action = "ready" as int)) >= 1 as ready,
sum(cast(event.action = "saveSuccess" as int)) >= 1 as completed,
min(dt) as start_dt,
unix_timestamp(max(dt), "yyyy-MM-dd'T'HH:mm:ss'Z'") - unix_timestamp(min(dt), "yyyy-MM-dd'T'HH:mm:ss'Z'") as duration
from event.editattemptstep
where
event.bucket in ('default-visual', 'default-source') and
year = 2019 and (
month = 7 and day >= 14 or
month >= 8
)
group by
event.editing_session_id,
wiki,
event.platform,
event.user_id,
event.user_editcount,
event.anonymous_user_token,
event.bucket
""").assign(
# Parse the ISO-8601 timestamp strings into pandas datetimes
start_dt=lambda df: pd.to_datetime(df["start_dt"]),
# Low-cardinality string columns -> category dtype to save memory
wiki=lambda df: df["wiki"].astype("category"),
platform=lambda df: df["platform"].astype("category"),
bucket=lambda df: df["bucket"].astype("category")
)
# Guard: grouping by the extra columns in the Hive query must not have
# split any edit attempt across multiple rows.
assert not sessions["edit_attempt_id"].duplicated().any()

sessions.info()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) /srv/home/neilpquinn-wmf/pkg/wmfdata/wmfdata/hive.py in run(cmds, fmt) 24 try: ---> 25 hive_conn = impala_conn(host='an-coord1001.eqiad.wmnet', port=10000, auth_mechanism='PLAIN') 26 hive_cursor = hive_conn.cursor() ~/venv/lib/python3.5/site-packages/impala/dbapi.py in connect(host, port, database, timeout, use_ssl, ca_cert, auth_mechanism, user, password, kerberos_service_name, use_ldap, ldap_user, ldap_password, use_kerberos, protocol) 146 kerberos_service_name=kerberos_service_name, --> 147 auth_mechanism=auth_mechanism) 148 return hs2.HiveServer2Connection(service, default_db=database) ~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in connect(host, port, timeout, use_ssl, ca_cert, user, password, kerberos_service_name, auth_mechanism) 757 auth_mechanism, user, password) --> 758 transport.open() 759 protocol = TBinaryProtocol(transport) ~/venv/lib/python3.5/site-packages/thrift_sasl/__init__.py in open(self) 66 def open(self): ---> 67 if not self._trans.isOpen(): 68 self._trans.open() AttributeError: 'TSocket' object has no attribute 'isOpen' During handling of the above exception, another exception occurred: UnboundLocalError Traceback (most recent call last) <ipython-input-2-97c7075565c5> in <module>() 30 event.anonymous_user_token, 31 event.bucket ---> 32 """).assign( 33 start_dt=lambda df: pd.to_datetime(df["start_dt"]), 34 wiki=lambda df: df["wiki"].astype("category"), /srv/home/neilpquinn-wmf/pkg/wmfdata/wmfdata/hive.py in run(cmds, fmt) 37 result = hive_cursor.fetchall() 38 finally: ---> 39 hive_conn.close() 40 41 return result UnboundLocalError: local variable 'hive_conn' referenced before assignment
There seems to be a pretty big imbalance in the number of bucketed users and attempts between the two buckets. That doesn't seem right, but it's unclear what we can actually do about it.
# Distinct users and total edit attempts per experiment bucket,
# rendered with thousands separators.
sessions.groupby("bucket").agg(
    users=("user_id", "nunique"),
    attempts=("edit_attempt_id", "size")
).style.format("{:,.0f}")
attempts | users | |
---|---|---|
bucket | ||
default-source | 1,822,026 | 1,302,187 |
default-visual | 1,704,482 | 1,214,917 |
t0 = perf_counter()

# Collapse the session-level frame to one row per user.
by_user = sessions.assign(
    bucket=lambda df: df["bucket"].astype(object)  # Work around Pandas bug aggregating categorical fields
).groupby("user_id")

users = by_user.agg(
    attempts=("edit_attempt_id", "nunique"),
    ready_attempts=("ready", "sum"),
    completed_attempts=("completed", "sum"),
    is_anonymous=("user_is_anonymous", "first"),
    bucket=("bucket", "first"),
    avg_duration=("duration", "mean")
)

# Per-user funnel rates
users["ready_rate"] = users["ready_attempts"] / users["attempts"]
users["completion_rate"] = users["completed_attempts"] / users["ready_attempts"]

elapsed = perf_counter() - t0
print("Completed in {:.1f} min.".format(elapsed / 60))
Completed in 0.3 min.
# Some users have saves but no "ready" events, which makes completion_rate
# infinite; treat those values as missing instead.
users = users.replace({np.inf: np.nan, -np.inf: np.nan})
# Spot-check the per-user anonymous flag on the sample.
# NOTE(review): sessions_sample is defined further down in the notebook,
# so this cell only runs after that one — confirm intended execution order.
sessions_sample.groupby("user_id").agg(user_is_anonymous=("user_is_anonymous", "first")).head()
user_is_anonymous | |
---|---|
user_id | |
00005eececf789cffb2a9b7216fa4115 | True |
0005c9f5898dbaf5380fa365aa702bff | True |
000a0a1dbee10136ba77b6401f9f71be | True |
0013f3a8d69c67390f8f143691e8387f | True |
0014ee6476a15e409d26f590704353aa | True |
# Spot-check the per-user bucket assignment on the sample.
sessions_sample.groupby("user_id").agg(bucket=("bucket", "first")).head()
bucket | |
---|---|
0 | default-source |
1 | default-visual |
2 | default-visual |
3 | default-source |
4 | default-source |
We apply a 14-day horizon from each user's first recorded attempt to deal with censoring:
# Work with a 10,000-session sample so the per-user groupby/apply below
# finishes quickly. Pin the random seed so the sample — and every number
# derived from it — is reproducible across notebook runs.
sessions_sample = sessions.sample(10000, random_state=2019)

sessions_sample.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 10000 entries, 2108210 to 29624 Data columns (total 13 columns): edit_attempt_id 10000 non-null object wiki 10000 non-null category platform 10000 non-null category interface 10000 non-null object user_id 10000 non-null object bucket 10000 non-null category user_is_anonymous 10000 non-null bool user_edit_count 10000 non-null int64 max_rev_id 10000 non-null int64 ready 10000 non-null bool completed 10000 non-null bool start_dt 10000 non-null datetime64[ns, UTC] duration 10000 non-null int64 dtypes: bool(3), category(3), datetime64[ns, UTC](1), int64(3), object(3) memory usage: 684.6+ KB
# Last timestamp covered by the dataset; any horizon extending past this
# point would be right-censored.
data_end = pd.Timestamp("2019-08-27 00:00:00Z")

def uncensor(group, horizon=pd.Timedelta(days=14)):
    """Limit one user's sessions to a fixed horizon after their first attempt.

    Parameters:
        group: DataFrame of one user's sessions; must have a tz-aware
            "start_dt" column and a "user_id" column (dropped from the
            result, since groupby already carries it in the index).
        horizon: observation window measured from the user's first
            recorded attempt (default 14 days, as in the original analysis).

    Returns:
        The rows with start_dt strictly inside the horizon, reindexed from
        0, or None when the horizon runs past data_end — groupby().apply
        drops None groups, excluding censored users entirely.
    """
    user_entrance = group["start_dt"].min()
    user_exit = user_entrance + horizon
    if user_exit > data_end:
        # Censored: we can't observe the user's full window, so drop them.
        return None
    return group[group["start_dt"] < user_exit].reset_index(drop=True).drop("user_id", axis=1)
# Apply the horizon per user. Groups where uncensor returns None (users
# whose window runs past the end of the data) are dropped entirely by
# apply, and the result carries a (user_id, row) MultiIndex.
uncensored_sessions = sessions_sample.groupby("user_id").apply(uncensor)

uncensored_sessions.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 6619 entries, (00005eececf789cffb2a9b7216fa4115, 0) to (zh_yuewiki-179809, 0) Data columns (total 12 columns): edit_attempt_id 6619 non-null object wiki 6619 non-null category platform 6619 non-null category interface 6619 non-null object bucket 6619 non-null category user_is_anonymous 6619 non-null bool user_edit_count 6619 non-null int64 max_rev_id 6619 non-null int64 ready 6619 non-null bool completed 6619 non-null bool start_dt 6619 non-null datetime64[ns, UTC] duration 6619 non-null int64 dtypes: bool(3), category(3), datetime64[ns, UTC](1), int64(3), object(2) memory usage: 446.9+ KB
# Peek at the first few uncensored sessions
uncensored_sessions.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-fd02fa451f27> in <module> ----> 1 uncensored_sessions.head() NameError: name 'uncensored_sessions' is not defined
# Initial completion rate: completed attempts over *all* attempts
# (completion_rate above divides by ready attempts instead).
users["init_completion_rate"] = users["completed_attempts"] / users["attempts"]

per_bucket = users.groupby("bucket").agg(
    avg_attempts=("attempts", "mean"),
    avg_completion_rate=("completion_rate", "mean"),
    avg_init_completion_rate=("init_completion_rate", "mean"),
    avg_completed_edits=("completed_attempts", "mean"),
)

formats = {
    "avg_attempts": "{:.3f}",
    "avg_completed_edits": "{:.3f}",
    "avg_completion_rate": "{:.2%}",
    "avg_init_completion_rate": "{:.2%}"
}
per_bucket.style.format(formats)
avg_attempts | avg_completed_edits | avg_completion_rate | avg_init_completion_rate | |
---|---|---|---|---|
bucket | ||||
default-source | 1.399 | 0.040 | 0.88% | 0.87% |
default-visual | 1.403 | 0.037 | 1.23% | 0.79% |
# Mean duration (seconds, per the Hive unix_timestamp difference) of
# attempts that ended in a successful save, by bucket.
completed_sessions = sessions[sessions["completed"]]
completed_sessions.groupby("bucket")["duration"].mean()
bucket default-source 131.509963 default-visual 165.769435 Name: duration, dtype: float64
events