from collections import OrderedDict
from datetime import timedelta
from functools import partial
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, FixedLocator
import numpy as np
import pandas as pd
import requests
from wmfdata import hive, mariadb, charting
from wmfdata.utils import pct_str
charting.set_mpl_style()
null_prop = lambda s: len(s[s.isnull()]) / len(s)
pos_prop = lambda s: len(s[s > 0]) / len(s)
editors_r = hive.run("""
select
coalesce(event_user_text, event_user_text_historical) as user_name,
not event_user_is_anonymous as registered,
count(*) as yearly_edits,
sum(cast(
coalesce(mve.editor, "desktop wikitext editor") in ("mobile visual editor", "mobile wikitext editor")
as int)) / count(*) as mobile_edit_proportion
from wmf.mediawiki_history mh
left join neilpquinn.mob_or_ve_edits mve
on
mve.wiki = mh.wiki_db and
mve.rev_id = mh.revision_id
where
mh.event_timestamp >= "2017-10" and
event_entity = "revision" and
event_type = "create" and
snapshot = "2018-09"
group by coalesce(event_user_text, event_user_text_historical), event_user_is_anonymous
""")
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-19-fae70611ab4b> in <module>() 18 snapshot = "2018-09" 19 group by coalesce(event_user_text, event_user_text_historical), event_user_is_anonymous ---> 20 """) /srv/home/neilpquinn-wmf/pkg/wmfdata/wmfdata/hive.py in run(cmds, fmt) 27 28 for cmd in cmds: ---> 29 hive_cursor.execute(cmd) 30 if fmt == "pandas": 31 try: ~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in execute(self, operation, parameters, configuration) 302 configuration=configuration) 303 log.debug('Waiting for query to finish') --> 304 self._wait_to_finish() # make execute synchronous 305 log.debug('Query finished') 306 ~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _wait_to_finish(self) 371 while True: 372 req = TGetOperationStatusReq(operationHandle=self._last_operation.handle) --> 373 resp = self._last_operation._rpc('GetOperationStatus', req) 374 operation_state = TOperationState._VALUES_TO_NAMES[resp.operationState] 375 ~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _rpc(self, func_name, request) 921 def _rpc(self, func_name, request): 922 self._log_request(func_name, request) --> 923 response = self._execute(func_name, request) 924 self._log_response(func_name, response) 925 err_if_rpc_not_ok(response) ~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _execute(self, func_name, request) 938 log.debug('Transport opened') 939 func = getattr(self.client, func_name) --> 940 return func(request) 941 except socket.error: 942 log.exception('Failed to open transport (tries_left=%s)', ~/venv/lib/python3.5/site-packages/thriftpy/thrift.py in _req(self, _api, *args, **kwargs) 196 # wait result only if non-oneway 197 if not getattr(result_cls, "oneway"): --> 198 return self._recv(_api) 199 200 def _send(self, _api, **kwargs): ~/venv/lib/python3.5/site-packages/thriftpy/thrift.py in _recv(self, _api) 208 209 def _recv(self, _api): --> 210 fname, mtype, rseqid = self._iprot.read_message_begin() 211 if mtype == TMessageType.EXCEPTION: 212 x = TApplicationException() ~/venv/lib/python3.5/site-packages/thriftpy/protocol/binary.py in read_message_begin(self) 370 def read_message_begin(self): 371 api, ttype, seqid = read_message_begin( --> 372 self.trans, strict=self.strict_read) 373 return api, ttype, seqid 374 ~/venv/lib/python3.5/site-packages/thriftpy/protocol/binary.py in read_message_begin(inbuf, strict) 162 163 def read_message_begin(inbuf, strict=True): --> 164 sz = unpack_i32(inbuf.read(4)) 165 if sz < 0: 166 version = sz & VERSION_MASK ~/venv/lib/python3.5/site-packages/thrift_sasl/__init__.py in read(self, sz) 157 return ret 158 --> 159 self._read_frame() 160 return ret + self.__rbuf.read(sz - len(ret)) 161 ~/venv/lib/python3.5/site-packages/thrift_sasl/__init__.py in _read_frame(self) 161 162 def _read_frame(self): --> 163 header = read_all_compat(self._trans, 4) 164 (length,) = struct.unpack(">I", header) 165 if self.encode: ~/venv/lib/python3.5/site-packages/thrift_sasl/six.py in <lambda>(trans, sz) 52 return is_open 53 ---> 54 read_all_compat = lambda trans, sz: readall(trans.read, sz) ~/venv/lib/python3.5/site-packages/thriftpy/transport/__init__.py in readall(read_fn, sz) 12 have = 0 13 while have < sz: ---> 14 chunk = read_fn(sz - have) 15 have += len(chunk) 16 buff += chunk ~/venv/lib/python3.5/site-packages/thriftpy/transport/socket.py in read(self, sz) 106 def read(self, sz): 107 try: --> 108 buff = self.sock.recv(sz) 109 except socket.error as e: 110 if (e.args[0] == errno.ECONNRESET and KeyboardInterrupt:
editors = editors_r.copy()
editors.head()
user_name | registered | yearly_edits | mobile_edit_proportion | |
---|---|---|---|---|
0 | ! Bikkit ! | True | 583 | 0.0 |
1 | !Alla17 | True | 45 | 0.0 |
2 | !Ann07 | True | 1 | 0.0 |
3 | !Maciek!Wawa | True | 2 | 0.0 |
4 | !Puce456 | True | 1 | 1.0 |
def bin_platform(x):
if x == 0:
return "desktop only"
elif x == 1:
return "mobile only"
else:
return "mixed"
editors["platform"] = pd.Categorical(editors["mobile_edit_proportion"].apply(bin_platform))
editors["activity_level"] = pd.cut(
editors["yearly_edits"],
bins=ec_bins[1:],
labels=ec_labels[1:],
right=False
)
reg_editors = editors.query("registered == True")
ip_editors = editors.query("registered == False")
platform_prop = reg_editors.groupby("platform")["user_name"].agg(len) / len(reg_editors)
pd.DataFrame(platform_prop).applymap(pct_str).T
platform | desktop only | mixed | mobile only |
---|---|---|---|
user_name | 74.9% | 6.2% | 18.9% |
by_act = reg_editors.groupby(["activity_level", "platform"])["user_name"].agg(len).unstack()
by_act.apply(lambda row: row / row.sum(), axis=1).applymap(pct_str)
platform | desktop only | mixed | mobile only |
---|---|---|---|
activity_level | |||
1-9 edits | 74.5% | 3.4% | 22.1% |
10-99 edits | 78.5% | 15.5% | 6.0% |
100-999 edits | 69.6% | 28.6% | 1.9% |
1000+ edits | 66.7% | 33.1% | 0.1% |
fig, ax = plt.subplots()
ax.hist(reg_editors["mobile_edit_proportion"])
ax.margins(y=0.1, tight=True)
ax.yaxis.set_major_formatter(charting.comma_fmt)
ax.xaxis.set_major_formatter(charting.pct_fmt)
ax.set_ylabel("editors")
ax.set_xlabel("mobile edit proportion");
fig, ax = plt.subplots()
ax.hist(reg_editors.query("0 < mobile_edit_proportion < 1.0")["mobile_edit_proportion"])
ax.margins(y=0.1, tight=True)
ax.yaxis.set_major_formatter(charting.comma_fmt)
ax.xaxis.set_major_formatter(charting.pct_fmt)
ax.set_ylabel("editors")
ax.set_xlabel("mobile edit proportion");
# For all the people who edited Oct 2017–March 2018, find their first edit with each interface
# and join it to all their edits from Oct 2017-Sep 2018 with the same interface
# WARNING: Initial edit count is unreliable because the database has a separate edit count for each wiki
edits_r = hive.run("""
with first_edits as (
select
event_user_text,
coalesce(editor, "desktop wikitext editor") as editor,
min(event_timestamp) as first_edit_time,
min(event_user_revision_count) as initial_edit_count
from wmf.mediawiki_history mh
left join neilpquinn.mob_or_ve_edits mve
on
mve.wiki = mh.wiki_db and
mve.rev_id = mh.revision_id
where
event_timestamp >= "2017-10" and
event_timestamp < "2018-04" and
not event_user_is_anonymous and
event_entity = "revision" and
event_type = "create" and
mh.snapshot = "2018-09"
group by event_user_text, editor
)
select
user_name,
interface,
week_number,
min(initial_edit_count) as initial_edit_count,
count(*) as edits
from (
select
fe.event_user_text as user_name,
fe.editor as interface,
fe.initial_edit_count as initial_edit_count,
event_timestamp,
(fe.first_edit_time = mh.event_timestamp) as first_edit,
-- If datediff is 0-7, week 1; if 8-14, week 2; etc.
if(
ceil(datediff(event_timestamp, first_edit_time) / 7) != 0,
ceil(datediff(event_timestamp, first_edit_time) / 7),
1
) as week_number
from first_edits fe
left join wmf.mediawiki_history mh on
fe.event_user_text = mh.event_user_text and
mh.snapshot = "2018-09"
left join neilpquinn.mob_or_ve_edits mve on
mve.wiki = mh.wiki_db and
mve.rev_id = mh.revision_id
where
fe.editor = coalesce(mve.editor, "desktop wikitext editor") and
-- Limit to events with 182 days (26 full weeks) after starting to avoid censoring
datediff(event_timestamp, first_edit_time) <= 182 and
event_timestamp >= "2017-10" and
event_timestamp < "2018-10" and
event_entity = "revision" and
event_type = "create"
) edits
group by user_name, interface, week_number
""")
weekly_edits = edits_r.drop(["initial_edit_count"], axis=1)
weekly_edits.head()
user_name | interface | week_number | edits | |
---|---|---|---|---|
0 | ! Bikkit ! | desktop wikitext editor | 14 | 24 |
1 | ! Bikkit ! | desktop wikitext editor | 26 | 6 |
2 | !!Ethereal!! | desktop wikitext editor | 2 | 1 |
3 | !!Priyanshu Patel!! | desktop wikitext editor | 1 | 2 |
4 | !0Clarke | desktop wikitext editor | 1 | 4 |
weekly_edits = weekly_edits.set_index(["user_name", "interface", "week_number"]).unstack()
weekly_edits = weekly_edits.fillna(0).apply(np.uint32)
weekly_edits.columns = weekly_edits.columns.droplevel(0)
weekly_edits.columns.name = None
weekly_edits = weekly_edits.rename(columns = lambda x: "week {!s}".format(x))
weekly_edits.head()
week 1 | week 2 | week 3 | week 4 | week 5 | week 6 | week 7 | week 8 | week 9 | week 10 | ... | week 17 | week 18 | week 19 | week 20 | week 21 | week 22 | week 23 | week 24 | week 25 | week 26 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_name | interface | |||||||||||||||||||||
! Bikkit ! | desktop wikitext editor | 4 | 0 | 15 | 0 | 2 | 14 | 31 | 6 | 0 | 0 | ... | 9 | 0 | 0 | 0 | 0 | 46 | 47 | 16 | 0 | 6 |
!! !!ПОМОГИТЕ СЕСТРЕ НА ОПЕРАЦИЮ!! !! | desktop wikitext editor | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
!!7597JOBV++! | desktop wikitext editor | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
!!Ethereal!! | desktop wikitext editor | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
!!Priyanshu Patel!! | desktop wikitext editor | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 26 columns
weekly_retention = weekly_edits.groupby("interface").agg(pos_prop)
weekly_retention.applymap(pct_str)
week 1 | week 2 | week 3 | week 4 | week 5 | week 6 | week 7 | week 8 | week 9 | week 10 | ... | week 17 | week 18 | week 19 | week 20 | week 21 | week 22 | week 23 | week 24 | week 25 | week 26 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
interface | |||||||||||||||||||||
desktop visual editor | 100.0% | 8.6% | 7.0% | 6.1% | 5.5% | 5.1% | 4.7% | 4.4% | 4.1% | 3.9% | ... | 3.1% | 3.0% | 3.0% | 2.9% | 2.8% | 2.8% | 2.7% | 2.7% | 2.7% | 2.7% |
desktop wikitext editor | 100.0% | 12.4% | 10.6% | 9.7% | 9.2% | 8.7% | 8.3% | 8.0% | 7.8% | 7.5% | ... | 6.8% | 6.8% | 6.7% | 6.6% | 6.6% | 6.5% | 6.5% | 6.4% | 6.3% | 6.3% |
mobile visual editor | 100.0% | 4.4% | 3.4% | 3.0% | 2.6% | 2.3% | 2.1% | 2.0% | 1.9% | 1.8% | ... | 1.3% | 1.3% | 1.3% | 1.1% | 1.0% | 1.1% | 1.1% | 1.1% | 1.1% | 1.1% |
mobile wikitext editor | 100.0% | 5.9% | 4.9% | 4.4% | 4.1% | 3.8% | 3.6% | 3.5% | 3.4% | 3.3% | ... | 2.8% | 2.7% | 2.7% | 2.7% | 2.6% | 2.6% | 2.6% | 2.5% | 2.5% | 2.4% |
4 rows × 26 columns
retention_for_graph = weekly_retention.iloc[:, 1:].T
retention_for_graph.index = range(2, 27)
retention_for_graph.plot()
plt.gca().yaxis.set_major_formatter(charting.pct_fmt)
plt.ylim(ymin=0)
plt.ylabel("percent retained as editors in the given interface")
plt.xlabel("week after first edit with the interface in the past year")
plt.legend()
<matplotlib.legend.Legend at 0x7f47c4df5f98>
_229.columns
Index(['week 1', 'week 2', 'week 3', 'week 4', 'week 5', 'week 6', 'week 7', 'week 8', 'week 9', 'week 10', 'week 11', 'week 12', 'week 13', 'week 14', 'week 15', 'week 16', 'week 17', 'week 18', 'week 19', 'week 20', 'week 21', 'week 22', 'week 23', 'week 24', 'week 25', 'week 26'], dtype='object')
second_week = _229["week 2"]
second_week.name = "second week"
second_month_edits = weekly_edits.loc[:, "week 5":"week 8"].sum(axis=1)
second_month = second_month_edits.groupby("interface").agg(pos_prop).map(pct_str)
second_month.name = "second month"
sixth_month_edits = weekly_edits.loc[:, "week 23":"week 26"].sum(axis=1)
sixth_month = sixth_month_edits.groupby("interface").agg(pos_prop).map(pct_str)
sixth_month.name = "sixth month"
pd.concat([second_week, second_month, sixth_month], axis=1)
second week | second month | sixth month | |
---|---|---|---|
interface | |||
desktop visual editor | 8.6% | 11.9% | 6.4% |
desktop wikitext editor | 12.4% | 16.7% | 11.6% |
mobile visual editor | 4.4% | 6.0% | 2.8% |
mobile wikitext editor | 5.9% | 8.7% | 5.5% |
experience = edits_r[["user_name", "interface", "initial_edit_count"]].copy()
experience = experience.groupby(["user_name", "interface"]).first()
experience["initial_experience"] = pd.cut(
experience["initial_edit_count"],
bins=ec_bins[1:],
labels=ec_labels[1:],
right=False
)
experience = experience.drop(["initial_edit_count"], axis=1)
experience.head()
initial_experience | ||
---|---|---|
user_name | interface | |
! Bikkit ! | desktop wikitext editor | 1-9 edits |
!! !!ПОМОГИТЕ СЕСТРЕ НА ОПЕРАЦИЮ!! !! | desktop wikitext editor | 1-9 edits |
!!7597JOBV++! | desktop wikitext editor | 1-9 edits |
!!Ethereal!! | desktop wikitext editor | 1-9 edits |
!!Priyanshu Patel!! | desktop wikitext editor | 1-9 edits |
retention_by_experience = weekly_edits.join(experience).groupby(
["interface", "initial_experience"]
).agg(pos_prop)
retention_by_experience.applymap(pct_str)
week 1 | week 2 | week 3 | week 4 | week 5 | week 6 | week 7 | week 8 | week 9 | week 10 | ... | week 17 | week 18 | week 19 | week 20 | week 21 | week 22 | week 23 | week 24 | week 25 | week 26 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
interface | initial_experience | |||||||||||||||||||||
desktop visual editor | 1-9 edits | 100.0% | 6.9% | 5.3% | 4.5% | 4.0% | 3.5% | 3.1% | 2.9% | 2.7% | 2.5% | ... | 1.6% | 1.6% | 1.6% | 1.5% | 1.5% | 1.4% | 1.4% | 1.4% | 1.3% | 1.4% |
10-99 edits | 100.0% | 11.1% | 9.2% | 8.3% | 7.6% | 7.1% | 6.5% | 6.1% | 5.9% | 5.5% | ... | 4.6% | 4.5% | 4.5% | 4.3% | 4.3% | 4.3% | 4.1% | 4.2% | 4.1% | 4.0% | |
100-999 edits | 100.0% | 16.8% | 15.4% | 13.9% | 13.5% | 13.1% | 12.9% | 12.4% | 11.7% | 11.6% | ... | 10.8% | 10.7% | 10.9% | 10.4% | 10.2% | 10.2% | 10.3% | 10.3% | 10.0% | 10.0% | |
1000+ edits | 100.0% | 25.0% | 23.0% | 22.4% | 21.7% | 21.3% | 21.2% | 20.0% | 20.0% | 19.6% | ... | 18.8% | 18.9% | 19.1% | 18.1% | 17.9% | 18.1% | 18.1% | 18.2% | 17.6% | 17.5% | |
desktop wikitext editor | 1-9 edits | 100.0% | 9.8% | 8.0% | 7.1% | 6.5% | 6.1% | 5.7% | 5.4% | 5.2% | 5.0% | ... | 4.2% | 4.1% | 4.1% | 4.0% | 4.0% | 3.9% | 3.9% | 3.8% | 3.8% | 3.7% |
10-99 edits | 100.0% | 16.3% | 14.8% | 13.8% | 13.3% | 12.7% | 12.3% | 11.9% | 11.8% | 11.5% | ... | 10.9% | 10.8% | 10.6% | 10.5% | 10.5% | 10.5% | 10.5% | 10.4% | 10.2% | 10.2% | |
100-999 edits | 100.0% | 22.4% | 20.5% | 19.7% | 19.4% | 18.6% | 18.1% | 17.6% | 17.4% | 17.0% | ... | 16.4% | 16.5% | 16.6% | 16.3% | 16.3% | 16.2% | 16.0% | 16.1% | 15.8% | 15.8% | |
1000+ edits | 100.0% | 35.7% | 34.1% | 33.2% | 32.3% | 32.1% | 31.3% | 31.1% | 31.1% | 31.0% | ... | 30.2% | 29.7% | 29.9% | 29.4% | 29.5% | 29.7% | 29.6% | 28.9% | 29.3% | 29.6% | |
mobile visual editor | 1-9 edits | 100.0% | 2.8% | 2.0% | 1.9% | 1.5% | 1.4% | 1.3% | 1.1% | 1.1% | 1.0% | ... | 0.7% | 0.7% | 0.7% | 0.5% | 0.5% | 0.5% | 0.5% | 0.5% | 0.6% | 0.5% |
10-99 edits | 100.0% | 9.8% | 7.6% | 6.5% | 6.1% | 4.9% | 4.7% | 5.4% | 4.8% | 4.2% | ... | 3.1% | 3.0% | 3.1% | 3.1% | 2.8% | 2.9% | 2.9% | 2.9% | 2.6% | 3.1% | |
100-999 edits | 100.0% | 15.1% | 13.7% | 12.9% | 11.5% | 10.4% | 9.0% | 7.5% | 7.2% | 8.6% | ... | 5.9% | 6.0% | 6.4% | 5.1% | 4.9% | 4.6% | 4.3% | 5.3% | 5.0% | 5.2% | |
1000+ edits | 100.0% | 16.6% | 16.8% | 13.6% | 11.6% | 11.8% | 9.9% | 9.9% | 9.1% | 8.3% | ... | 6.5% | 9.1% | 7.1% | 6.5% | 7.5% | 6.1% | 6.9% | 6.5% | 8.1% | 6.5% | |
mobile wikitext editor | 1-9 edits | 100.0% | 4.1% | 3.2% | 2.8% | 2.5% | 2.3% | 2.1% | 2.0% | 1.9% | 1.8% | ... | 1.5% | 1.4% | 1.3% | 1.3% | 1.3% | 1.3% | 1.3% | 1.2% | 1.2% | 1.2% |
10-99 edits | 100.0% | 10.9% | 9.4% | 8.8% | 8.5% | 7.9% | 7.5% | 7.5% | 7.2% | 7.1% | ... | 5.9% | 5.8% | 5.7% | 5.6% | 5.5% | 5.6% | 5.5% | 5.5% | 5.5% | 5.1% | |
100-999 edits | 100.0% | 16.7% | 15.1% | 15.0% | 14.2% | 13.4% | 13.3% | 12.9% | 12.8% | 12.8% | ... | 11.9% | 11.4% | 11.5% | 11.4% | 10.9% | 10.9% | 11.4% | 11.5% | 10.8% | 10.9% | |
1000+ edits | 100.0% | 20.0% | 18.4% | 17.6% | 17.1% | 16.5% | 16.7% | 16.1% | 16.3% | 16.1% | ... | 15.1% | 14.7% | 14.1% | 14.9% | 14.4% | 15.0% | 14.6% | 14.5% | 14.7% | 14.1% |
16 rows × 26 columns
newbie_retention = retention_by_experience.xs("1-9 edits", level=1)
for_graph = newbie_retention.iloc[:, 1:].T
for_graph.index = range(2, 27)
for_graph.plot()
plt.gca().yaxis.set_major_formatter(charting.pct_fmt)
plt.ylim(ymin=0)
plt.ylabel("percent retained as editors in the given interface")
plt.xlabel("week after first edit with the interface in the past year")
plt.title("Interface retention among inexperienced editors");
# How many people are contributing to these statistics?
pd.DataFrame(weekly_edits["week 1"]).join(experience).groupby(["interface", "initial_experience"]).count()
week 1 | ||
---|---|---|
interface | initial_experience | |
desktop visual editor | 1-9 edits | 256058 |
10-99 edits | 55906 | |
100-999 edits | 19627 | |
1000+ edits | 7967 | |
desktop wikitext editor | 1-9 edits | 589884 |
10-99 edits | 114853 | |
100-999 edits | 57749 | |
1000+ edits | 22566 | |
mobile visual editor | 1-9 edits | 31681 |
10-99 edits | 5255 | |
100-999 edits | 1361 | |
1000+ edits | 493 | |
mobile wikitext editor | 1-9 edits | 194749 |
10-99 edits | 24287 | |
100-999 edits | 11885 | |
1000+ edits | 6942 |
started = may_sessions.query("ready_count >= 1")
completed = started.query("save_success_count >=1 ")
mobile_edits = completed.query("platform == 'phone'")
desktop_edits = completed.query("platform == 'desktop'")
desktop_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(desktop_edits)).map(pct_str)
editor visualeditor 9.9% wikitext 90.1% Name: actions, dtype: object
mobile_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(mobile_edits)).map(pct_str)
editor visualeditor 6.1% wikitext 93.9% Name: actions, dtype: object
However, on desktop this isn't the fairest of comparisons, since the visual editor isn't available on many wikis and many wikis and many namespaces. How does the picture look when we limit it to VE-default wikis?
To get the list of VE-default wikis, we start with the "visualeditor-nondefault" database list. On top of that, we need to add the wikis where visual editor is demoted using settings in InitialiseSettings.php: wmgVisualEditorSingleEditTabSecondaryEditor (enwiki and frwiktionary) and wmgVisualEditorDisableForAnons (enwiki and eswiki).
# Get raw dblist file
r = requests.get("https://raw.githubusercontent.com/wikimedia/operations-mediawiki-config/master/dblists/visualeditor-nondefault.dblist")
# Strip comments
no_comments = re.sub(r"#.*\n", "", r.text)
# Split by line
nonve_dbs = re.split("\n", no_comments)
# Remove the trailing empty line
nonve_dbs.remove("")
# Add our extras
nonve_dbs.extend(["enwiki", "eswiki", "frwiktionary"])
ve_def_edits = desktop_edits.query('wiki not in @nonve_dbs')
ve_def_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(ve_def_edits)).map(pct_str)
editor visualeditor 14.8% wikitext 85.2% Name: actions, dtype: object
ve_nondef_edits = desktop_edits.query('wiki in @nonve_dbs')
ve_nondef_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(ve_nondef_edits)).map(pct_str)
editor visualeditor 5.7% wikitext 94.3% Name: actions, dtype: object
def calc_funnel(df, omitloaded=False, omitsaveintent=False):
results = OrderedDict()
with_inits = df.query("init_count >= 1")
total = len(with_inits)
results["init"] = len(with_inits) / total
with_readies = with_inits.query("ready_count >= 1")
results["ready"] = len(with_readies) / total
with_loaded = with_readies.query("loaded_count >= 1")
if not omitloaded and len(with_loaded) > 0:
results["loaded"] = len(with_loaded) / total
with_readies = with_loaded
with_saveintents = with_readies.query("save_intent_count >= 1")
if not omitsaveintent and len(with_saveintents) > 0:
results["saveIntent"] = len(with_saveintents) / total
with_readies = with_saveintents
with_saveattempts = with_readies.query("save_attempt_count >= 1")
results["saveAttempt"] = len(with_saveattempts) / total
with_savesuccesses = with_saveattempts.query("save_success_count >= 1")
results["saveSuccess"] = len(with_savesuccesses) / total
return pd.DataFrame(results, index = ["percentage_reaching"]).T.applymap(pct_str)
calc_funnel(may_sessions, omitloaded=True, omitsaveintent=True)
percentage_reaching | |
---|---|
init | 100.0% |
ready | 63.9% |
saveAttempt | 7.9% |
saveSuccess | 7.7% |
may_sessions.groupby(["platform", "editor"]).apply(calc_funnel).unstack([0, 1]).fillna("—")
percentage_reaching | ||||
---|---|---|---|---|
platform | desktop | phone | ||
editor | visualeditor | wikitext | visualeditor | wikitext |
init | 100.0% | 100.0% | 100.0% | 100.0% |
loaded | 92.3% | 41.7% | — | — |
ready | 92.3% | 42.0% | 95.4% | 97.0% |
saveAttempt | 12.6% | 10.4% | 17.1% | 2.7% |
saveIntent | 13.2% | — | 18.8% | 3.0% |
saveSuccess | 11.6% | 10.2% | 15.1% | 2.5% |
with_inits = may_sessions.query("platform == 'desktop' & editor == 'wikitext' & init_count >= 1")
len(with_inits)
3340351
It's not because a lot of sessions skip the loaded event.
len(with_inits.query("loaded_count >= 1 & ready_count >= 1")) / len(with_inits.query("ready_count >= 1"))
0.9938517826194514
The desktop wikitext editor has more sessions that drop after init (because there's no ready) but that still have other later events. However, it's not enough to explain the dramatically lower ready rate.
wt_init_no_ready = with_inits.query("ready_count == 0")
wt_init_only = wt_init_no_ready.query("ready_count == 0 & save_intent_count == 0 & save_attempt_count == 0 & save_success_count == 0")
print(pct_str(len(wt_init_only) / len(wt_init_no_ready)))
87.4%
ve_init_no_ready = may_sessions.query("platform == 'desktop' & editor == 'visualeditor' & init_count >= 1 & ready_count == 0")
ve_init_only = ve_init_no_ready.query("ready_count == 0 & save_intent_count == 0 & save_attempt_count == 0 & save_success_count == 0")
print(pct_str(len(ve_init_only) / len(ve_init_no_ready)))
97.6%
def completion_rate(df):
started = df.query("ready_count >= 1")
completed = started.query("save_success_count >= 1")
if len(started) > 0:
return len(completed) / len(started)
else:
return None
may_sessions.groupby(["platform"]).apply(completion_rate).map(pct_str)
platform desktop 22.5% phone 2.7% dtype: object
desktop = may_sessions.query("platform == 'desktop'")
desktop.groupby(["editor"]).apply(completion_rate).map(pct_str)
editor visualeditor 12.6% wikitext 24.6% dtype: object
desktop.groupby(["experience", "editor"]).apply(completion_rate).unstack().applymap(pct_str)
editor | visualeditor | wikitext |
---|---|---|
experience | ||
IP | 5.2% | 6.4% |
0 edits (new editor) | 27.8% | 18.0% |
1-9 edits | 45.3% | 35.5% |
10-99 edits | 57.6% | 48.8% |
100-999 edits | 65.3% | 58.4% |
1000+ edits | 73.3% | 71.7% |
phone = may_sessions.query("platform == 'phone'")
phone.groupby(["editor"]).apply(completion_rate).map(pct_str)
editor visualeditor 15.8% wikitext 2.6% dtype: object
phone.groupby(["experience", "editor"]).apply(completion_rate).unstack().applymap(pct_str)
editor | visualeditor | wikitext |
---|---|---|
experience | ||
IP | 11.4% | 1.5% |
0 edits (new editor) | 17.4% | 21.9% |
1-9 edits | 28.0% | 38.7% |
10-99 edits | 38.1% | 45.8% |
100-999 edits | 52.1% | 55.9% |
1000+ edits | 58.1% | 62.9% |
completed["duration"].head()
114 00:07:54 130 00:00:46 137 00:01:55 144 00:28:50 158 00:00:12 Name: duration, dtype: timedelta64[ns]
completed.groupby(["editor", "platform"])["duration"].apply(np.median)
editor platform visualeditor desktop 00:01:40 phone 00:01:43 wikitext desktop 00:00:38 phone 00:00:48 Name: duration, dtype: timedelta64[ns]
def duration_hist(ax, td_ser):
hist_vals = (
td_ser
.astype("timedelta64[s]")
.replace(0, 1)
)
bins = np.floor(
np.logspace(0, 4, num=40, dtype=int)
)
ax.hist(hist_vals, bins=bins, label=None)
ax.set_xscale("log")
s_ticks = [1, 10, 60, 10*60, 60*60, 150*60]
duration_ticks = FixedLocator(s_ticks)
def to_td(s):
return str(timedelta(seconds=int(s)))
duration_fmt = FuncFormatter(lambda x, pos: to_td(x))
lines = [
("10th percentile ({})", hist_vals.quantile(0.1), "xkcd:pink"),
("50th percentile ({})", hist_vals.median(), "xkcd:red"),
("90th percentile ({})", hist_vals.quantile(0.9), "xkcd:dark red")
]
for label, pos, color in lines:
ax.axvline(x=pos, color=color, label=label.format(to_td(pos)))
ax.legend()
ax.yaxis.set_major_formatter(charting.comma_fmt)
ax.set_ylabel("number of sampled, completed edit sessions")
ax.xaxis.set_major_locator(duration_ticks)
ax.xaxis.set_major_formatter(duration_fmt)
ax.set_xlabel("time to save as %H:%M:%S (log scale)")
fig, ax = plt.subplots()
ax.set_title("Time to save")
duration_hist(ax, completed["duration"])
full_desktop = completed.query("platform == 'desktop'")
full_phone = completed.query("platform == 'phone'")
fig, ax = plt.subplots(2, 1, figsize=(14, 14))
fig.suptitle("Time to save by platform")
desktop_ax = ax[0]
phone_ax = ax[1]
duration_hist(desktop_ax, full_desktop["duration"])
desktop_ax.set_title("Desktops")
duration_hist(phone_ax, full_phone["duration"])
phone_ax.set_title("Phones");
full_desktop_ve = completed.query("platform == 'desktop' & editor == 'visualeditor'")
full_desktop_wte = completed.query("platform == 'desktop' & editor == 'wikitext'")
fig, ax = plt.subplots(2, 1, figsize=(14, 14))
fig.suptitle("Desktop time to save duration by editing interface")
ve_ax = ax[0]
wte_ax = ax[1]
duration_hist(ve_ax, full_desktop_ve["duration"])
ve_ax.set_title("Visual editor")
duration_hist(wte_ax, full_desktop_wte["duration"])
wte_ax.set_title("2010 wikitext editor");