In [3]:
from collections import OrderedDict
from datetime import timedelta
from functools import partial
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, FixedLocator
import numpy as np
import pandas as pd
import requests
from wmfdata import hive, mariadb, charting
from wmfdata.utils import pct_str

charting.set_mpl_style()
In [4]:
null_prop = lambda s: len(s[s.isnull()]) / len(s)
pos_prop = lambda s: len(s[s > 0]) / len(s)

Data loading

Mobile edit proportion

In [19]:
editors_r = hive.run("""
select
    coalesce(event_user_text, event_user_text_historical) as user_name,
    not event_user_is_anonymous as registered,
    count(*) as yearly_edits,
    sum(cast(
        coalesce(mve.editor, "desktop wikitext editor") in ("mobile visual editor", "mobile wikitext editor")
    as int)) / count(*) as mobile_edit_proportion
from wmf.mediawiki_history mh
left join neilpquinn.mob_or_ve_edits mve
on 
    mve.wiki = mh.wiki_db and
    mve.rev_id = mh.revision_id
where
    mh.event_timestamp >= "2017-10" and
    event_entity = "revision" and
    event_type = "create" and
    snapshot = "2018-09"
group by coalesce(event_user_text, event_user_text_historical), event_user_is_anonymous
""")
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-19-fae70611ab4b> in <module>()
     18     snapshot = "2018-09"
     19 group by coalesce(event_user_text, event_user_text_historical), event_user_is_anonymous
---> 20 """)

/srv/home/neilpquinn-wmf/pkg/wmfdata/wmfdata/hive.py in run(cmds, fmt)
     27 
     28         for cmd in cmds:
---> 29             hive_cursor.execute(cmd)
     30             if fmt == "pandas":
     31                 try:

~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in execute(self, operation, parameters, configuration)
    302                            configuration=configuration)
    303         log.debug('Waiting for query to finish')
--> 304         self._wait_to_finish()  # make execute synchronous
    305         log.debug('Query finished')
    306 

~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _wait_to_finish(self)
    371         while True:
    372             req = TGetOperationStatusReq(operationHandle=self._last_operation.handle)
--> 373             resp = self._last_operation._rpc('GetOperationStatus', req)
    374             operation_state = TOperationState._VALUES_TO_NAMES[resp.operationState]
    375 

~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _rpc(self, func_name, request)
    921     def _rpc(self, func_name, request):
    922         self._log_request(func_name, request)
--> 923         response = self._execute(func_name, request)
    924         self._log_response(func_name, response)
    925         err_if_rpc_not_ok(response)

~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _execute(self, func_name, request)
    938                 log.debug('Transport opened')
    939                 func = getattr(self.client, func_name)
--> 940                 return func(request)
    941             except socket.error:
    942                 log.exception('Failed to open transport (tries_left=%s)',

~/venv/lib/python3.5/site-packages/thriftpy/thrift.py in _req(self, _api, *args, **kwargs)
    196         # wait result only if non-oneway
    197         if not getattr(result_cls, "oneway"):
--> 198             return self._recv(_api)
    199 
    200     def _send(self, _api, **kwargs):

~/venv/lib/python3.5/site-packages/thriftpy/thrift.py in _recv(self, _api)
    208 
    209     def _recv(self, _api):
--> 210         fname, mtype, rseqid = self._iprot.read_message_begin()
    211         if mtype == TMessageType.EXCEPTION:
    212             x = TApplicationException()

~/venv/lib/python3.5/site-packages/thriftpy/protocol/binary.py in read_message_begin(self)
    370     def read_message_begin(self):
    371         api, ttype, seqid = read_message_begin(
--> 372             self.trans, strict=self.strict_read)
    373         return api, ttype, seqid
    374 

~/venv/lib/python3.5/site-packages/thriftpy/protocol/binary.py in read_message_begin(inbuf, strict)
    162 
    163 def read_message_begin(inbuf, strict=True):
--> 164     sz = unpack_i32(inbuf.read(4))
    165     if sz < 0:
    166         version = sz & VERSION_MASK

~/venv/lib/python3.5/site-packages/thrift_sasl/__init__.py in read(self, sz)
    157       return ret
    158 
--> 159     self._read_frame()
    160     return ret + self.__rbuf.read(sz - len(ret))
    161 

~/venv/lib/python3.5/site-packages/thrift_sasl/__init__.py in _read_frame(self)
    161 
    162   def _read_frame(self):
--> 163     header = read_all_compat(self._trans, 4)
    164     (length,) = struct.unpack(">I", header)
    165     if self.encode:

~/venv/lib/python3.5/site-packages/thrift_sasl/six.py in <lambda>(trans, sz)
     52     return is_open
     53 
---> 54   read_all_compat = lambda trans, sz: readall(trans.read, sz)

~/venv/lib/python3.5/site-packages/thriftpy/transport/__init__.py in readall(read_fn, sz)
     12     have = 0
     13     while have < sz:
---> 14         chunk = read_fn(sz - have)
     15         have += len(chunk)
     16         buff += chunk

~/venv/lib/python3.5/site-packages/thriftpy/transport/socket.py in read(self, sz)
    106     def read(self, sz):
    107         try:
--> 108             buff = self.sock.recv(sz)
    109         except socket.error as e:
    110             if (e.args[0] == errno.ECONNRESET and

KeyboardInterrupt: 
In [302]:
editors = editors_r.copy()
editors.head()
Out[302]:
user_name registered yearly_edits mobile_edit_proportion
0 ! Bikkit ! True 583 0.0
1 !Alla17 True 45 0.0
2 !Ann07 True 1 0.0
3 !Maciek!Wawa True 2 0.0
4 !Puce456 True 1 1.0
In [303]:
def bin_platform(x):
    if x == 0:
        return "desktop only"
    elif x == 1:
        return "mobile only"
    else:
        return "mixed"
    
editors["platform"] = pd.Categorical(editors["mobile_edit_proportion"].apply(bin_platform))

editors["activity_level"] = pd.cut(
    editors["yearly_edits"], 
    bins=ec_bins[1:], 
    labels=ec_labels[1:],
    right=False
)
In [304]:
reg_editors = editors.query("registered == True")
ip_editors = editors.query("registered == False")
In [305]:
platform_prop = reg_editors.groupby("platform")["user_name"].agg(len) / len(reg_editors)
pd.DataFrame(platform_prop).applymap(pct_str).T
Out[305]:
platform desktop only mixed mobile only
user_name 74.9% 6.2% 18.9%
In [306]:
by_act = reg_editors.groupby(["activity_level", "platform"])["user_name"].agg(len).unstack()
by_act.apply(lambda row: row / row.sum(), axis=1).applymap(pct_str)
Out[306]:
platform desktop only mixed mobile only
activity_level
1-9 edits 74.5% 3.4% 22.1%
10-99 edits 78.5% 15.5% 6.0%
100-999 edits 69.6% 28.6% 1.9%
1000+ edits 66.7% 33.1% 0.1%
In [323]:
fig, ax = plt.subplots()
ax.hist(reg_editors["mobile_edit_proportion"])
ax.margins(y=0.1, tight=True)
ax.yaxis.set_major_formatter(charting.comma_fmt)
ax.xaxis.set_major_formatter(charting.pct_fmt)
ax.set_ylabel("editors")
ax.set_xlabel("mobile edit proportion");
In [361]:
fig, ax = plt.subplots()
ax.hist(reg_editors.query("0 < mobile_edit_proportion < 1.0")["mobile_edit_proportion"])
ax.margins(y=0.1, tight=True)
ax.yaxis.set_major_formatter(charting.comma_fmt)
ax.xaxis.set_major_formatter(charting.pct_fmt)
ax.set_ylabel("editors")
ax.set_xlabel("mobile edit proportion");

Interface retention

In [ ]:
# For all the people who edited Oct 2017–March 2018, find their first edit with each interface 
# and join it to all their edits from Oct 2017-Sep 2018 with the same interface

# WARNING: Initial edit count is unreliable because the database has a separate edit count for each wiki
edits_r = hive.run("""
with first_edits as (
    select
        event_user_text,
        coalesce(editor, "desktop wikitext editor") as editor,
        min(event_timestamp) as first_edit_time,
        min(event_user_revision_count) as initial_edit_count
    from wmf.mediawiki_history mh
    left join neilpquinn.mob_or_ve_edits mve
    on 
        mve.wiki = mh.wiki_db and
        mve.rev_id = mh.revision_id
    where
        event_timestamp >= "2017-10" and
        event_timestamp < "2018-04" and
        not event_user_is_anonymous and
        event_entity = "revision" and
        event_type = "create" and
        mh.snapshot = "2018-09"
    group by event_user_text, editor
)
select
user_name,
interface,
week_number,
min(initial_edit_count) as initial_edit_count,
count(*) as edits
from (
    select
        fe.event_user_text as user_name,
        fe.editor as interface,
        fe.initial_edit_count as initial_edit_count,
        event_timestamp,
        (fe.first_edit_time = mh.event_timestamp) as first_edit,
        -- If datediff is 0-7, week 1; if 8-14, week 2; etc.
        if(
            ceil(datediff(event_timestamp, first_edit_time) / 7) != 0,
            ceil(datediff(event_timestamp, first_edit_time) / 7),
            1
        ) as week_number
    from first_edits fe
    left join wmf.mediawiki_history mh on
        fe.event_user_text = mh.event_user_text and
        mh.snapshot = "2018-09"
    left join neilpquinn.mob_or_ve_edits mve on 
        mve.wiki = mh.wiki_db and
        mve.rev_id = mh.revision_id
    where
        fe.editor = coalesce(mve.editor, "desktop wikitext editor") and
        -- Limit to events with 182 days (26 full weeks) after starting to avoid censoring
        datediff(event_timestamp, first_edit_time) <= 182 and
        event_timestamp >= "2017-10" and
        event_timestamp < "2018-10" and
        event_entity = "revision" and
        event_type = "create"
) edits
group by user_name, interface, week_number
""")
In [226]:
weekly_edits = edits_r.drop(["initial_edit_count"], axis=1)
weekly_edits.head()
Out[226]:
user_name interface week_number edits
0 ! Bikkit ! desktop wikitext editor 14 24
1 ! Bikkit ! desktop wikitext editor 26 6
2 !!Ethereal!! desktop wikitext editor 2 1
3 !!Priyanshu Patel!! desktop wikitext editor 1 2
4 !0Clarke desktop wikitext editor 1 4
In [227]:
weekly_edits = weekly_edits.set_index(["user_name", "interface", "week_number"]).unstack()
weekly_edits = weekly_edits.fillna(0).apply(np.uint32)
weekly_edits.columns = weekly_edits.columns.droplevel(0)
weekly_edits.columns.name = None
weekly_edits = weekly_edits.rename(columns = lambda x: "week {!s}".format(x))
weekly_edits.head()
Out[227]:
week 1 week 2 week 3 week 4 week 5 week 6 week 7 week 8 week 9 week 10 ... week 17 week 18 week 19 week 20 week 21 week 22 week 23 week 24 week 25 week 26
user_name interface
! Bikkit ! desktop wikitext editor 4 0 15 0 2 14 31 6 0 0 ... 9 0 0 0 0 46 47 16 0 6
!! !!ПОМОГИТЕ СЕСТРЕ НА ОПЕРАЦИЮ!! !! desktop wikitext editor 11 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
!!7597JOBV++! desktop wikitext editor 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
!!Ethereal!! desktop wikitext editor 1 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
!!Priyanshu Patel!! desktop wikitext editor 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 26 columns

In [258]:
weekly_retention = weekly_edits.groupby("interface").agg(pos_prop)
weekly_retention.applymap(pct_str)
Out[258]:
week 1 week 2 week 3 week 4 week 5 week 6 week 7 week 8 week 9 week 10 ... week 17 week 18 week 19 week 20 week 21 week 22 week 23 week 24 week 25 week 26
interface
desktop visual editor 100.0% 8.6% 7.0% 6.1% 5.5% 5.1% 4.7% 4.4% 4.1% 3.9% ... 3.1% 3.0% 3.0% 2.9% 2.8% 2.8% 2.7% 2.7% 2.7% 2.7%
desktop wikitext editor 100.0% 12.4% 10.6% 9.7% 9.2% 8.7% 8.3% 8.0% 7.8% 7.5% ... 6.8% 6.8% 6.7% 6.6% 6.6% 6.5% 6.5% 6.4% 6.3% 6.3%
mobile visual editor 100.0% 4.4% 3.4% 3.0% 2.6% 2.3% 2.1% 2.0% 1.9% 1.8% ... 1.3% 1.3% 1.3% 1.1% 1.0% 1.1% 1.1% 1.1% 1.1% 1.1%
mobile wikitext editor 100.0% 5.9% 4.9% 4.4% 4.1% 3.8% 3.6% 3.5% 3.4% 3.3% ... 2.8% 2.7% 2.7% 2.7% 2.6% 2.6% 2.6% 2.5% 2.5% 2.4%

4 rows × 26 columns

In [268]:
retention_for_graph = weekly_retention.iloc[:, 1:].T
retention_for_graph.index = range(2, 27)
retention_for_graph.plot()
plt.gca().yaxis.set_major_formatter(charting.pct_fmt)
plt.ylim(ymin=0)
plt.ylabel("percent retained as editors in the given interface")
plt.xlabel("week after first edit with the interface in the past year")
plt.legend()
Out[268]:
<matplotlib.legend.Legend at 0x7f47c4df5f98>
In [237]:
_229.columns
Out[237]:
Index(['week 1', 'week 2', 'week 3', 'week 4', 'week 5', 'week 6', 'week 7',
       'week 8', 'week 9', 'week 10', 'week 11', 'week 12', 'week 13',
       'week 14', 'week 15', 'week 16', 'week 17', 'week 18', 'week 19',
       'week 20', 'week 21', 'week 22', 'week 23', 'week 24', 'week 25',
       'week 26'],
      dtype='object')
In [246]:
second_week = _229["week 2"]
second_week.name = "second week"
In [249]:
second_month_edits = weekly_edits.loc[:, "week 5":"week 8"].sum(axis=1)
second_month = second_month_edits.groupby("interface").agg(pos_prop).map(pct_str)
second_month.name = "second month"
In [251]:
sixth_month_edits = weekly_edits.loc[:, "week 23":"week 26"].sum(axis=1)
sixth_month = sixth_month_edits.groupby("interface").agg(pos_prop).map(pct_str)
sixth_month.name = "sixth month"
In [252]:
pd.concat([second_week, second_month, sixth_month], axis=1)
Out[252]:
second week second month sixth month
interface
desktop visual editor 8.6% 11.9% 6.4%
desktop wikitext editor 12.4% 16.7% 11.6%
mobile visual editor 4.4% 6.0% 2.8%
mobile wikitext editor 5.9% 8.7% 5.5%

By experience

In [230]:
experience = edits_r[["user_name", "interface", "initial_edit_count"]].copy()
experience = experience.groupby(["user_name", "interface"]).first()
experience["initial_experience"] = pd.cut(
    experience["initial_edit_count"], 
    bins=ec_bins[1:], 
    labels=ec_labels[1:],
    right=False
)
experience = experience.drop(["initial_edit_count"], axis=1)
experience.head()
Out[230]:
initial_experience
user_name interface
! Bikkit ! desktop wikitext editor 1-9 edits
!! !!ПОМОГИТЕ СЕСТРЕ НА ОПЕРАЦИЮ!! !! desktop wikitext editor 1-9 edits
!!7597JOBV++! desktop wikitext editor 1-9 edits
!!Ethereal!! desktop wikitext editor 1-9 edits
!!Priyanshu Patel!! desktop wikitext editor 1-9 edits
In [328]:
retention_by_experience = weekly_edits.join(experience).groupby(
    ["interface", "initial_experience"]
).agg(pos_prop)

retention_by_experience.applymap(pct_str)
Out[328]:
week 1 week 2 week 3 week 4 week 5 week 6 week 7 week 8 week 9 week 10 ... week 17 week 18 week 19 week 20 week 21 week 22 week 23 week 24 week 25 week 26
interface initial_experience
desktop visual editor 1-9 edits 100.0% 6.9% 5.3% 4.5% 4.0% 3.5% 3.1% 2.9% 2.7% 2.5% ... 1.6% 1.6% 1.6% 1.5% 1.5% 1.4% 1.4% 1.4% 1.3% 1.4%
10-99 edits 100.0% 11.1% 9.2% 8.3% 7.6% 7.1% 6.5% 6.1% 5.9% 5.5% ... 4.6% 4.5% 4.5% 4.3% 4.3% 4.3% 4.1% 4.2% 4.1% 4.0%
100-999 edits 100.0% 16.8% 15.4% 13.9% 13.5% 13.1% 12.9% 12.4% 11.7% 11.6% ... 10.8% 10.7% 10.9% 10.4% 10.2% 10.2% 10.3% 10.3% 10.0% 10.0%
1000+ edits 100.0% 25.0% 23.0% 22.4% 21.7% 21.3% 21.2% 20.0% 20.0% 19.6% ... 18.8% 18.9% 19.1% 18.1% 17.9% 18.1% 18.1% 18.2% 17.6% 17.5%
desktop wikitext editor 1-9 edits 100.0% 9.8% 8.0% 7.1% 6.5% 6.1% 5.7% 5.4% 5.2% 5.0% ... 4.2% 4.1% 4.1% 4.0% 4.0% 3.9% 3.9% 3.8% 3.8% 3.7%
10-99 edits 100.0% 16.3% 14.8% 13.8% 13.3% 12.7% 12.3% 11.9% 11.8% 11.5% ... 10.9% 10.8% 10.6% 10.5% 10.5% 10.5% 10.5% 10.4% 10.2% 10.2%
100-999 edits 100.0% 22.4% 20.5% 19.7% 19.4% 18.6% 18.1% 17.6% 17.4% 17.0% ... 16.4% 16.5% 16.6% 16.3% 16.3% 16.2% 16.0% 16.1% 15.8% 15.8%
1000+ edits 100.0% 35.7% 34.1% 33.2% 32.3% 32.1% 31.3% 31.1% 31.1% 31.0% ... 30.2% 29.7% 29.9% 29.4% 29.5% 29.7% 29.6% 28.9% 29.3% 29.6%
mobile visual editor 1-9 edits 100.0% 2.8% 2.0% 1.9% 1.5% 1.4% 1.3% 1.1% 1.1% 1.0% ... 0.7% 0.7% 0.7% 0.5% 0.5% 0.5% 0.5% 0.5% 0.6% 0.5%
10-99 edits 100.0% 9.8% 7.6% 6.5% 6.1% 4.9% 4.7% 5.4% 4.8% 4.2% ... 3.1% 3.0% 3.1% 3.1% 2.8% 2.9% 2.9% 2.9% 2.6% 3.1%
100-999 edits 100.0% 15.1% 13.7% 12.9% 11.5% 10.4% 9.0% 7.5% 7.2% 8.6% ... 5.9% 6.0% 6.4% 5.1% 4.9% 4.6% 4.3% 5.3% 5.0% 5.2%
1000+ edits 100.0% 16.6% 16.8% 13.6% 11.6% 11.8% 9.9% 9.9% 9.1% 8.3% ... 6.5% 9.1% 7.1% 6.5% 7.5% 6.1% 6.9% 6.5% 8.1% 6.5%
mobile wikitext editor 1-9 edits 100.0% 4.1% 3.2% 2.8% 2.5% 2.3% 2.1% 2.0% 1.9% 1.8% ... 1.5% 1.4% 1.3% 1.3% 1.3% 1.3% 1.3% 1.2% 1.2% 1.2%
10-99 edits 100.0% 10.9% 9.4% 8.8% 8.5% 7.9% 7.5% 7.5% 7.2% 7.1% ... 5.9% 5.8% 5.7% 5.6% 5.5% 5.6% 5.5% 5.5% 5.5% 5.1%
100-999 edits 100.0% 16.7% 15.1% 15.0% 14.2% 13.4% 13.3% 12.9% 12.8% 12.8% ... 11.9% 11.4% 11.5% 11.4% 10.9% 10.9% 11.4% 11.5% 10.8% 10.9%
1000+ edits 100.0% 20.0% 18.4% 17.6% 17.1% 16.5% 16.7% 16.1% 16.3% 16.1% ... 15.1% 14.7% 14.1% 14.9% 14.4% 15.0% 14.6% 14.5% 14.7% 14.1%

16 rows × 26 columns

In [330]:
newbie_retention = retention_by_experience.xs("1-9 edits", level=1)

for_graph = newbie_retention.iloc[:, 1:].T
for_graph.index = range(2, 27)

for_graph.plot()
plt.gca().yaxis.set_major_formatter(charting.pct_fmt)
plt.ylim(ymin=0)
plt.ylabel("percent retained as editors in the given interface")
plt.xlabel("week after first edit with the interface in the past year")
plt.title("Interface retention among inexperienced editors");
In [ ]:
 
In [331]:
# How many people are contributing to these statistics?
pd.DataFrame(weekly_edits["week 1"]).join(experience).groupby(["interface", "initial_experience"]).count()
Out[331]:
week 1
interface initial_experience
desktop visual editor 1-9 edits 256058
10-99 edits 55906
100-999 edits 19627
1000+ edits 7967
desktop wikitext editor 1-9 edits 589884
10-99 edits 114853
100-999 edits 57749
1000+ edits 22566
mobile visual editor 1-9 edits 31681
10-99 edits 5255
100-999 edits 1361
1000+ edits 493
mobile wikitext editor 1-9 edits 194749
10-99 edits 24287
100-999 edits 11885
1000+ edits 6942

Editor proportion

In [332]:
started = may_sessions.query("ready_count >= 1")
completed = started.query("save_success_count >=1 ")
mobile_edits = completed.query("platform == 'phone'")
desktop_edits = completed.query("platform == 'desktop'")
In [333]:
desktop_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(desktop_edits)).map(pct_str)
Out[333]:
editor
visualeditor     9.9%
wikitext        90.1%
Name: actions, dtype: object
In [334]:
mobile_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(mobile_edits)).map(pct_str)
Out[334]:
editor
visualeditor     6.1%
wikitext        93.9%
Name: actions, dtype: object

VE-default wikis

However, on desktop this isn't the fairest of comparisons, since the visual editor isn't available on many wikis and many wikis and many namespaces. How does the picture look when we limit it to VE-default wikis?

To get the list of VE-default wikis, we start with the "visualeditor-nondefault" database list. On top of that, we need to add the wikis where visual editor is demoted using settings in InitialiseSettings.php: wmgVisualEditorSingleEditTabSecondaryEditor (enwiki and frwiktionary) and wmgVisualEditorDisableForAnons (enwiki and eswiki).

In [338]:
# Get raw dblist file
r = requests.get("https://raw.githubusercontent.com/wikimedia/operations-mediawiki-config/master/dblists/visualeditor-nondefault.dblist")

# Strip comments
no_comments = re.sub(r"#.*\n", "", r.text)

# Split by line
nonve_dbs = re.split("\n", no_comments)

# Remove the trailing empty line
nonve_dbs.remove("")

# Add our extras
nonve_dbs.extend(["enwiki", "eswiki", "frwiktionary"])
In [340]:
ve_def_edits = desktop_edits.query('wiki not in @nonve_dbs')

ve_def_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(ve_def_edits)).map(pct_str)
Out[340]:
editor
visualeditor    14.8%
wikitext        85.2%
Name: actions, dtype: object
In [341]:
ve_nondef_edits = desktop_edits.query('wiki in @nonve_dbs')
ve_nondef_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(ve_nondef_edits)).map(pct_str)
Out[341]:
editor
visualeditor     5.7%
wikitext        94.3%
Name: actions, dtype: object

Funnel chart

In [23]:
def calc_funnel(df, omitloaded=False, omitsaveintent=False):
    results = OrderedDict()
    
    with_inits = df.query("init_count >= 1") 
    total = len(with_inits)
    results["init"] = len(with_inits) / total
    
    with_readies = with_inits.query("ready_count >= 1")
    results["ready"] = len(with_readies) / total
    
    with_loaded = with_readies.query("loaded_count >= 1")
    if not omitloaded and len(with_loaded) > 0:
        results["loaded"] = len(with_loaded) / total
        with_readies = with_loaded
    
    with_saveintents = with_readies.query("save_intent_count >= 1")
    if not omitsaveintent and len(with_saveintents) > 0:
        results["saveIntent"] = len(with_saveintents) / total
        with_readies = with_saveintents
    
    with_saveattempts = with_readies.query("save_attempt_count >= 1")
    results["saveAttempt"] = len(with_saveattempts) / total
    
    with_savesuccesses = with_saveattempts.query("save_success_count >= 1")
    results["saveSuccess"] = len(with_savesuccesses) / total
    
    return pd.DataFrame(results, index = ["percentage_reaching"]).T.applymap(pct_str)
In [24]:
calc_funnel(may_sessions, omitloaded=True, omitsaveintent=True)
Out[24]:
percentage_reaching
init 100.0%
ready 63.9%
saveAttempt 7.9%
saveSuccess 7.7%
In [25]:
may_sessions.groupby(["platform", "editor"]).apply(calc_funnel).unstack([0, 1]).fillna("—")
Out[25]:
percentage_reaching
platform desktop phone
editor visualeditor wikitext visualeditor wikitext
init 100.0% 100.0% 100.0% 100.0%
loaded 92.3% 41.7%
ready 92.3% 42.0% 95.4% 97.0%
saveAttempt 12.6% 10.4% 17.1% 2.7%
saveIntent 13.2% 18.8% 3.0%
saveSuccess 11.6% 10.2% 15.1% 2.5%

Low ready rate on the desktop wikitext editor

In [28]:
with_inits = may_sessions.query("platform == 'desktop' & editor == 'wikitext' & init_count >= 1")
len(with_inits)
Out[28]:
3340351

It's not because a lot of sessions skip the loaded event.

In [33]:
len(with_inits.query("loaded_count >= 1 & ready_count >= 1")) / len(with_inits.query("ready_count >= 1"))
Out[33]:
0.9938517826194514

The desktop wikitext editor has more sessions that drop after init (because there's no ready) but that still have other later events. However, it's not enough to explain the dramatically lower ready rate.

In [45]:
wt_init_no_ready = with_inits.query("ready_count == 0")
wt_init_only = wt_init_no_ready.query("ready_count == 0 & save_intent_count == 0 & save_attempt_count == 0 & save_success_count == 0")

print(pct_str(len(wt_init_only) / len(wt_init_no_ready)))
87.4%
In [46]:
ve_init_no_ready = may_sessions.query("platform == 'desktop' & editor == 'visualeditor' & init_count >= 1 & ready_count == 0")
ve_init_only = ve_init_no_ready.query("ready_count == 0 & save_intent_count == 0 & save_attempt_count == 0 & save_success_count == 0")

print(pct_str(len(ve_init_only) / len(ve_init_no_ready)))
97.6%

Edit completion

In [20]:
def completion_rate(df):
    started = df.query("ready_count >= 1")
    completed = started.query("save_success_count >= 1")
    if len(started) > 0:
        return len(completed) / len(started)
    else:
        return None

may_sessions.groupby(["platform"]).apply(completion_rate).map(pct_str)
Out[20]:
platform
desktop    22.5%
phone       2.7%
dtype: object

Desktop

In [21]:
desktop = may_sessions.query("platform == 'desktop'")
desktop.groupby(["editor"]).apply(completion_rate).map(pct_str)
Out[21]:
editor
visualeditor    12.6%
wikitext        24.6%
dtype: object
In [22]:
desktop.groupby(["experience", "editor"]).apply(completion_rate).unstack().applymap(pct_str)
Out[22]:
editor visualeditor wikitext
experience
IP 5.2% 6.4%
0 edits (new editor) 27.8% 18.0%
1-9 edits 45.3% 35.5%
10-99 edits 57.6% 48.8%
100-999 edits 65.3% 58.4%
1000+ edits 73.3% 71.7%

Phone

In [349]:
phone = may_sessions.query("platform == 'phone'")
phone.groupby(["editor"]).apply(completion_rate).map(pct_str)
Out[349]:
editor
visualeditor    15.8%
wikitext         2.6%
dtype: object
In [350]:
phone.groupby(["experience", "editor"]).apply(completion_rate).unstack().applymap(pct_str)
Out[350]:
editor visualeditor wikitext
experience
IP 11.4% 1.5%
0 edits (new editor) 17.4% 21.9%
1-9 edits 28.0% 38.7%
10-99 edits 38.1% 45.8%
100-999 edits 52.1% 55.9%
1000+ edits 58.1% 62.9%

Time to save

In [351]:
completed["duration"].head()
Out[351]:
114   00:07:54
130   00:00:46
137   00:01:55
144   00:28:50
158   00:00:12
Name: duration, dtype: timedelta64[ns]
In [352]:
completed.groupby(["editor", "platform"])["duration"].apply(np.median)
Out[352]:
editor        platform
visualeditor  desktop    00:01:40
              phone      00:01:43
wikitext      desktop    00:00:38
              phone      00:00:48
Name: duration, dtype: timedelta64[ns]
In [357]:
def duration_hist(ax, td_ser):
    hist_vals = (
        td_ser
        .astype("timedelta64[s]")
        .replace(0, 1)
    )

    bins = np.floor(
        np.logspace(0, 4, num=40, dtype=int)
    )

    ax.hist(hist_vals, bins=bins, label=None)
    ax.set_xscale("log")

    s_ticks = [1, 10, 60, 10*60, 60*60, 150*60]
    duration_ticks = FixedLocator(s_ticks)
    
    def to_td(s):
        return str(timedelta(seconds=int(s)))
    
    duration_fmt = FuncFormatter(lambda x, pos: to_td(x))
    
    lines = [
        ("10th percentile ({})", hist_vals.quantile(0.1), "xkcd:pink"),
        ("50th percentile ({})", hist_vals.median(), "xkcd:red"),
        ("90th percentile ({})", hist_vals.quantile(0.9), "xkcd:dark red")
        
    ]
    
    for label, pos, color in lines:
        ax.axvline(x=pos, color=color, label=label.format(to_td(pos)))
    
    ax.legend()

    ax.yaxis.set_major_formatter(charting.comma_fmt)
    ax.set_ylabel("number of sampled, completed edit sessions")

    ax.xaxis.set_major_locator(duration_ticks)
    ax.xaxis.set_major_formatter(duration_fmt)
    ax.set_xlabel("time to save as %H:%M:%S (log scale)")
    
In [358]:
fig, ax = plt.subplots()
ax.set_title("Time to save")
duration_hist(ax, completed["duration"])
In [359]:
full_desktop = completed.query("platform == 'desktop'")
full_phone = completed.query("platform == 'phone'")

fig, ax = plt.subplots(2, 1, figsize=(14, 14))

fig.suptitle("Time to save by platform")
desktop_ax = ax[0]
phone_ax = ax[1]

duration_hist(desktop_ax, full_desktop["duration"])
desktop_ax.set_title("Desktops")

duration_hist(phone_ax, full_phone["duration"])
phone_ax.set_title("Phones");
In [360]:
full_desktop_ve = completed.query("platform == 'desktop' & editor == 'visualeditor'")
full_desktop_wte = completed.query("platform == 'desktop' & editor == 'wikitext'")

fig, ax = plt.subplots(2, 1, figsize=(14, 14))

fig.suptitle("Desktop time to save duration by editing interface")
ve_ax = ax[0]
wte_ax = ax[1]

duration_hist(ve_ax, full_desktop_ve["duration"])
ve_ax.set_title("Visual editor")

duration_hist(wte_ax, full_desktop_wte["duration"])
wte_ax.set_title("2010 wikitext editor");
In [ ]: