In [3]:

from collections import OrderedDict
from datetime import timedelta
from functools import partial
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, FixedLocator
import numpy as np
import pandas as pd
import requests
from wmfdata import hive, mariadb, charting
from wmfdata.utils import pct_str

charting.set_mpl_style()

In [4]:

null_prop = lambda s: len(s[s.isnull()]) / len(s)
pos_prop = lambda s: len(s[s > 0]) / len(s)

Data loading¶

Mobile edit proportion¶

In [19]:

editors_r = hive.run("""
select
    coalesce(event_user_text, event_user_text_historical) as user_name,
    not event_user_is_anonymous as registered,
    count(*) as yearly_edits,
    sum(cast(
        coalesce(mve.editor, "desktop wikitext editor") in ("mobile visual editor", "mobile wikitext editor")
    as int)) / count(*) as mobile_edit_proportion
from wmf.mediawiki_history mh
left join neilpquinn.mob_or_ve_edits mve
on 
    mve.wiki = mh.wiki_db and
    mve.rev_id = mh.revision_id
where
    mh.event_timestamp >= "2017-10" and
    event_entity = "revision" and
    event_type = "create" and
    snapshot = "2018-09"
group by coalesce(event_user_text, event_user_text_historical), event_user_is_anonymous
""")

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-19-fae70611ab4b> in <module>()
     18     snapshot = "2018-09"
     19 group by coalesce(event_user_text, event_user_text_historical), event_user_is_anonymous
---> 20 """)

/srv/home/neilpquinn-wmf/pkg/wmfdata/wmfdata/hive.py in run(cmds, fmt)
     27 
     28         for cmd in cmds:
---> 29             hive_cursor.execute(cmd)
     30             if fmt == "pandas":
     31                 try:

~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in execute(self, operation, parameters, configuration)
    302                            configuration=configuration)
    303         log.debug('Waiting for query to finish')
--> 304         self._wait_to_finish()  # make execute synchronous
    305         log.debug('Query finished')
    306 

~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _wait_to_finish(self)
    371         while True:
    372             req = TGetOperationStatusReq(operationHandle=self._last_operation.handle)
--> 373             resp = self._last_operation._rpc('GetOperationStatus', req)
    374             operation_state = TOperationState._VALUES_TO_NAMES[resp.operationState]
    375 

~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _rpc(self, func_name, request)
    921     def _rpc(self, func_name, request):
    922         self._log_request(func_name, request)
--> 923         response = self._execute(func_name, request)
    924         self._log_response(func_name, response)
    925         err_if_rpc_not_ok(response)

~/venv/lib/python3.5/site-packages/impala/hiveserver2.py in _execute(self, func_name, request)
    938                 log.debug('Transport opened')
    939                 func = getattr(self.client, func_name)
--> 940                 return func(request)
    941             except socket.error:
    942                 log.exception('Failed to open transport (tries_left=%s)',

~/venv/lib/python3.5/site-packages/thriftpy/thrift.py in _req(self, _api, *args, **kwargs)
    196         # wait result only if non-oneway
    197         if not getattr(result_cls, "oneway"):
--> 198             return self._recv(_api)
    199 
    200     def _send(self, _api, **kwargs):

~/venv/lib/python3.5/site-packages/thriftpy/thrift.py in _recv(self, _api)
    208 
    209     def _recv(self, _api):
--> 210         fname, mtype, rseqid = self._iprot.read_message_begin()
    211         if mtype == TMessageType.EXCEPTION:
    212             x = TApplicationException()

~/venv/lib/python3.5/site-packages/thriftpy/protocol/binary.py in read_message_begin(self)
    370     def read_message_begin(self):
    371         api, ttype, seqid = read_message_begin(
--> 372             self.trans, strict=self.strict_read)
    373         return api, ttype, seqid
    374 

~/venv/lib/python3.5/site-packages/thriftpy/protocol/binary.py in read_message_begin(inbuf, strict)
    162 
    163 def read_message_begin(inbuf, strict=True):
--> 164     sz = unpack_i32(inbuf.read(4))
    165     if sz < 0:
    166         version = sz & VERSION_MASK

~/venv/lib/python3.5/site-packages/thrift_sasl/__init__.py in read(self, sz)
    157       return ret
    158 
--> 159     self._read_frame()
    160     return ret + self.__rbuf.read(sz - len(ret))
    161 

~/venv/lib/python3.5/site-packages/thrift_sasl/__init__.py in _read_frame(self)
    161 
    162   def _read_frame(self):
--> 163     header = read_all_compat(self._trans, 4)
    164     (length,) = struct.unpack(">I", header)
    165     if self.encode:

~/venv/lib/python3.5/site-packages/thrift_sasl/six.py in <lambda>(trans, sz)
     52     return is_open
     53 
---> 54   read_all_compat = lambda trans, sz: readall(trans.read, sz)

~/venv/lib/python3.5/site-packages/thriftpy/transport/__init__.py in readall(read_fn, sz)
     12     have = 0
     13     while have < sz:
---> 14         chunk = read_fn(sz - have)
     15         have += len(chunk)
     16         buff += chunk

~/venv/lib/python3.5/site-packages/thriftpy/transport/socket.py in read(self, sz)
    106     def read(self, sz):
    107         try:
--> 108             buff = self.sock.recv(sz)
    109         except socket.error as e:
    110             if (e.args[0] == errno.ECONNRESET and

KeyboardInterrupt:

In [302]:

editors = editors_r.copy()
editors.head()

Out[302]:

	user_name	registered	yearly_edits	mobile_edit_proportion
0	! Bikkit !	True	583	0.0
1	!Alla17	True	45	0.0
2	!Ann07	True	1	0.0
3	!Maciek!Wawa	True	2	0.0
4	!Puce456	True	1	1.0

In [303]:

def bin_platform(x):
    if x == 0:
        return "desktop only"
    elif x == 1:
        return "mobile only"
    else:
        return "mixed"
    
editors["platform"] = pd.Categorical(editors["mobile_edit_proportion"].apply(bin_platform))

editors["activity_level"] = pd.cut(
    editors["yearly_edits"], 
    bins=ec_bins[1:], 
    labels=ec_labels[1:],
    right=False
)

In [304]:

reg_editors = editors.query("registered == True")
ip_editors = editors.query("registered == False")

In [305]:

platform_prop = reg_editors.groupby("platform")["user_name"].agg(len) / len(reg_editors)
pd.DataFrame(platform_prop).applymap(pct_str).T

Out[305]:

platform	desktop only	mixed	mobile only
user_name	74.9%	6.2%	18.9%

In [306]:

by_act = reg_editors.groupby(["activity_level", "platform"])["user_name"].agg(len).unstack()
by_act.apply(lambda row: row / row.sum(), axis=1).applymap(pct_str)

Out[306]:

platform	desktop only	mixed	mobile only
activity_level
1-9 edits	74.5%	3.4%	22.1%
10-99 edits	78.5%	15.5%	6.0%
100-999 edits	69.6%	28.6%	1.9%
1000+ edits	66.7%	33.1%	0.1%

In [323]:

fig, ax = plt.subplots()
ax.hist(reg_editors["mobile_edit_proportion"])
ax.margins(y=0.1, tight=True)
ax.yaxis.set_major_formatter(charting.comma_fmt)
ax.xaxis.set_major_formatter(charting.pct_fmt)
ax.set_ylabel("editors")
ax.set_xlabel("mobile edit proportion");

In [361]:

fig, ax = plt.subplots()
ax.hist(reg_editors.query("0 < mobile_edit_proportion < 1.0")["mobile_edit_proportion"])
ax.margins(y=0.1, tight=True)
ax.yaxis.set_major_formatter(charting.comma_fmt)
ax.xaxis.set_major_formatter(charting.pct_fmt)
ax.set_ylabel("editors")
ax.set_xlabel("mobile edit proportion");

Interface retention¶

In [ ]:

# For all the people who edited Oct 2017–March 2018, find their first edit with each interface 
# and join it to all their edits from Oct 2017-Sep 2018 with the same interface

# WARNING: Initial edit count is unreliable because the database has a separate edit count for each wiki
edits_r = hive.run("""
with first_edits as (
    select
        event_user_text,
        coalesce(editor, "desktop wikitext editor") as editor,
        min(event_timestamp) as first_edit_time,
        min(event_user_revision_count) as initial_edit_count
    from wmf.mediawiki_history mh
    left join neilpquinn.mob_or_ve_edits mve
    on 
        mve.wiki = mh.wiki_db and
        mve.rev_id = mh.revision_id
    where
        event_timestamp >= "2017-10" and
        event_timestamp < "2018-04" and
        not event_user_is_anonymous and
        event_entity = "revision" and
        event_type = "create" and
        mh.snapshot = "2018-09"
    group by event_user_text, editor
)
select
user_name,
interface,
week_number,
min(initial_edit_count) as initial_edit_count,
count(*) as edits
from (
    select
        fe.event_user_text as user_name,
        fe.editor as interface,
        fe.initial_edit_count as initial_edit_count,
        event_timestamp,
        (fe.first_edit_time = mh.event_timestamp) as first_edit,
        -- If datediff is 0-7, week 1; if 8-14, week 2; etc.
        if(
            ceil(datediff(event_timestamp, first_edit_time) / 7) != 0,
            ceil(datediff(event_timestamp, first_edit_time) / 7),
            1
        ) as week_number
    from first_edits fe
    left join wmf.mediawiki_history mh on
        fe.event_user_text = mh.event_user_text and
        mh.snapshot = "2018-09"
    left join neilpquinn.mob_or_ve_edits mve on 
        mve.wiki = mh.wiki_db and
        mve.rev_id = mh.revision_id
    where
        fe.editor = coalesce(mve.editor, "desktop wikitext editor") and
        -- Limit to events with 182 days (26 full weeks) after starting to avoid censoring
        datediff(event_timestamp, first_edit_time) <= 182 and
        event_timestamp >= "2017-10" and
        event_timestamp < "2018-10" and
        event_entity = "revision" and
        event_type = "create"
) edits
group by user_name, interface, week_number
""")

In [226]:

weekly_edits = edits_r.drop(["initial_edit_count"], axis=1)
weekly_edits.head()

Out[226]:

	user_name	interface	week_number	edits
0	! Bikkit !	desktop wikitext editor	14	24
1	! Bikkit !	desktop wikitext editor	26	6
2	!!Ethereal!!	desktop wikitext editor	2	1
3	!!Priyanshu Patel!!	desktop wikitext editor	1	2
4	!0Clarke	desktop wikitext editor	1	4

In [227]:

weekly_edits = weekly_edits.set_index(["user_name", "interface", "week_number"]).unstack()
weekly_edits = weekly_edits.fillna(0).apply(np.uint32)
weekly_edits.columns = weekly_edits.columns.droplevel(0)
weekly_edits.columns.name = None
weekly_edits = weekly_edits.rename(columns = lambda x: "week {!s}".format(x))
weekly_edits.head()

Out[227]:

		week 1	week 2	week 3	week 4	week 5	week 6	week 7	week 8	week 9	week 10	...	week 17	week 18	week 19	week 20	week 21	week 22	week 23	week 24	week 25	week 26
user_name	interface
! Bikkit !	desktop wikitext editor	4	0	15	0	2	14	31	6	0	0	...	9	0	0	0	0	46	47	16	0	6
!! !!ПОМОГИТЕ СЕСТРЕ НА ОПЕРАЦИЮ!! !!	desktop wikitext editor	11	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
!!7597JOBV++!	desktop wikitext editor	5	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
!!Ethereal!!	desktop wikitext editor	1	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
!!Priyanshu Patel!!	desktop wikitext editor	2	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

5 rows × 26 columns

In [258]:

weekly_retention = weekly_edits.groupby("interface").agg(pos_prop)
weekly_retention.applymap(pct_str)

Out[258]:

	week 1	week 2	week 3	week 4	week 5	week 6	week 7	week 8	week 9	week 10	...	week 17	week 18	week 19	week 20	week 21	week 22	week 23	week 24	week 25	week 26
interface
desktop visual editor	100.0%	8.6%	7.0%	6.1%	5.5%	5.1%	4.7%	4.4%	4.1%	3.9%	...	3.1%	3.0%	3.0%	2.9%	2.8%	2.8%	2.7%	2.7%	2.7%	2.7%
desktop wikitext editor	100.0%	12.4%	10.6%	9.7%	9.2%	8.7%	8.3%	8.0%	7.8%	7.5%	...	6.8%	6.8%	6.7%	6.6%	6.6%	6.5%	6.5%	6.4%	6.3%	6.3%
mobile visual editor	100.0%	4.4%	3.4%	3.0%	2.6%	2.3%	2.1%	2.0%	1.9%	1.8%	...	1.3%	1.3%	1.3%	1.1%	1.0%	1.1%	1.1%	1.1%	1.1%	1.1%
mobile wikitext editor	100.0%	5.9%	4.9%	4.4%	4.1%	3.8%	3.6%	3.5%	3.4%	3.3%	...	2.8%	2.7%	2.7%	2.7%	2.6%	2.6%	2.6%	2.5%	2.5%	2.4%

4 rows × 26 columns

In [268]:

retention_for_graph = weekly_retention.iloc[:, 1:].T
retention_for_graph.index = range(2, 27)
retention_for_graph.plot()
plt.gca().yaxis.set_major_formatter(charting.pct_fmt)
plt.ylim(ymin=0)
plt.ylabel("percent retained as editors in the given interface")
plt.xlabel("week after first edit with the interface in the past year")
plt.legend()

Out[268]:

<matplotlib.legend.Legend at 0x7f47c4df5f98>

In [237]:

_229.columns

Out[237]:

Index(['week 1', 'week 2', 'week 3', 'week 4', 'week 5', 'week 6', 'week 7',
       'week 8', 'week 9', 'week 10', 'week 11', 'week 12', 'week 13',
       'week 14', 'week 15', 'week 16', 'week 17', 'week 18', 'week 19',
       'week 20', 'week 21', 'week 22', 'week 23', 'week 24', 'week 25',
       'week 26'],
      dtype='object')

In [246]:

second_week = _229["week 2"]
second_week.name = "second week"

In [249]:

second_month_edits = weekly_edits.loc[:, "week 5":"week 8"].sum(axis=1)
second_month = second_month_edits.groupby("interface").agg(pos_prop).map(pct_str)
second_month.name = "second month"

In [251]:

sixth_month_edits = weekly_edits.loc[:, "week 23":"week 26"].sum(axis=1)
sixth_month = sixth_month_edits.groupby("interface").agg(pos_prop).map(pct_str)
sixth_month.name = "sixth month"

In [252]:

pd.concat([second_week, second_month, sixth_month], axis=1)

Out[252]:

	second week	second month	sixth month
interface
desktop visual editor	8.6%	11.9%	6.4%
desktop wikitext editor	12.4%	16.7%	11.6%
mobile visual editor	4.4%	6.0%	2.8%
mobile wikitext editor	5.9%	8.7%	5.5%

By experience¶

In [230]:

experience = edits_r[["user_name", "interface", "initial_edit_count"]].copy()
experience = experience.groupby(["user_name", "interface"]).first()
experience["initial_experience"] = pd.cut(
    experience["initial_edit_count"], 
    bins=ec_bins[1:], 
    labels=ec_labels[1:],
    right=False
)
experience = experience.drop(["initial_edit_count"], axis=1)
experience.head()

Out[230]:

		initial_experience
user_name	interface
! Bikkit !	desktop wikitext editor	1-9 edits
!! !!ПОМОГИТЕ СЕСТРЕ НА ОПЕРАЦИЮ!! !!	desktop wikitext editor	1-9 edits
!!7597JOBV++!	desktop wikitext editor	1-9 edits
!!Ethereal!!	desktop wikitext editor	1-9 edits
!!Priyanshu Patel!!	desktop wikitext editor	1-9 edits

In [328]:

retention_by_experience = weekly_edits.join(experience).groupby(
    ["interface", "initial_experience"]
).agg(pos_prop)

retention_by_experience.applymap(pct_str)

Out[328]:

		week 1	week 2	week 3	week 4	week 5	week 6	week 7	week 8	week 9	week 10	...	week 17	week 18	week 19	week 20	week 21	week 22	week 23	week 24	week 25	week 26
interface	initial_experience
desktop visual editor	1-9 edits	100.0%	6.9%	5.3%	4.5%	4.0%	3.5%	3.1%	2.9%	2.7%	2.5%	...	1.6%	1.6%	1.6%	1.5%	1.5%	1.4%	1.4%	1.4%	1.3%	1.4%
	10-99 edits	100.0%	11.1%	9.2%	8.3%	7.6%	7.1%	6.5%	6.1%	5.9%	5.5%	...	4.6%	4.5%	4.5%	4.3%	4.3%	4.3%	4.1%	4.2%	4.1%	4.0%
	100-999 edits	100.0%	16.8%	15.4%	13.9%	13.5%	13.1%	12.9%	12.4%	11.7%	11.6%	...	10.8%	10.7%	10.9%	10.4%	10.2%	10.2%	10.3%	10.3%	10.0%	10.0%
	1000+ edits	100.0%	25.0%	23.0%	22.4%	21.7%	21.3%	21.2%	20.0%	20.0%	19.6%	...	18.8%	18.9%	19.1%	18.1%	17.9%	18.1%	18.1%	18.2%	17.6%	17.5%
desktop wikitext editor	1-9 edits	100.0%	9.8%	8.0%	7.1%	6.5%	6.1%	5.7%	5.4%	5.2%	5.0%	...	4.2%	4.1%	4.1%	4.0%	4.0%	3.9%	3.9%	3.8%	3.8%	3.7%
	10-99 edits	100.0%	16.3%	14.8%	13.8%	13.3%	12.7%	12.3%	11.9%	11.8%	11.5%	...	10.9%	10.8%	10.6%	10.5%	10.5%	10.5%	10.5%	10.4%	10.2%	10.2%
	100-999 edits	100.0%	22.4%	20.5%	19.7%	19.4%	18.6%	18.1%	17.6%	17.4%	17.0%	...	16.4%	16.5%	16.6%	16.3%	16.3%	16.2%	16.0%	16.1%	15.8%	15.8%
	1000+ edits	100.0%	35.7%	34.1%	33.2%	32.3%	32.1%	31.3%	31.1%	31.1%	31.0%	...	30.2%	29.7%	29.9%	29.4%	29.5%	29.7%	29.6%	28.9%	29.3%	29.6%
mobile visual editor	1-9 edits	100.0%	2.8%	2.0%	1.9%	1.5%	1.4%	1.3%	1.1%	1.1%	1.0%	...	0.7%	0.7%	0.7%	0.5%	0.5%	0.5%	0.5%	0.5%	0.6%	0.5%
	10-99 edits	100.0%	9.8%	7.6%	6.5%	6.1%	4.9%	4.7%	5.4%	4.8%	4.2%	...	3.1%	3.0%	3.1%	3.1%	2.8%	2.9%	2.9%	2.9%	2.6%	3.1%
	100-999 edits	100.0%	15.1%	13.7%	12.9%	11.5%	10.4%	9.0%	7.5%	7.2%	8.6%	...	5.9%	6.0%	6.4%	5.1%	4.9%	4.6%	4.3%	5.3%	5.0%	5.2%
	1000+ edits	100.0%	16.6%	16.8%	13.6%	11.6%	11.8%	9.9%	9.9%	9.1%	8.3%	...	6.5%	9.1%	7.1%	6.5%	7.5%	6.1%	6.9%	6.5%	8.1%	6.5%
mobile wikitext editor	1-9 edits	100.0%	4.1%	3.2%	2.8%	2.5%	2.3%	2.1%	2.0%	1.9%	1.8%	...	1.5%	1.4%	1.3%	1.3%	1.3%	1.3%	1.3%	1.2%	1.2%	1.2%
	10-99 edits	100.0%	10.9%	9.4%	8.8%	8.5%	7.9%	7.5%	7.5%	7.2%	7.1%	...	5.9%	5.8%	5.7%	5.6%	5.5%	5.6%	5.5%	5.5%	5.5%	5.1%
	100-999 edits	100.0%	16.7%	15.1%	15.0%	14.2%	13.4%	13.3%	12.9%	12.8%	12.8%	...	11.9%	11.4%	11.5%	11.4%	10.9%	10.9%	11.4%	11.5%	10.8%	10.9%
	1000+ edits	100.0%	20.0%	18.4%	17.6%	17.1%	16.5%	16.7%	16.1%	16.3%	16.1%	...	15.1%	14.7%	14.1%	14.9%	14.4%	15.0%	14.6%	14.5%	14.7%	14.1%

16 rows × 26 columns

In [330]:

newbie_retention = retention_by_experience.xs("1-9 edits", level=1)

for_graph = newbie_retention.iloc[:, 1:].T
for_graph.index = range(2, 27)

for_graph.plot()
plt.gca().yaxis.set_major_formatter(charting.pct_fmt)
plt.ylim(ymin=0)
plt.ylabel("percent retained as editors in the given interface")
plt.xlabel("week after first edit with the interface in the past year")
plt.title("Interface retention among inexperienced editors");

In [ ]:

In [331]:

# How many people are contributing to these statistics?
pd.DataFrame(weekly_edits["week 1"]).join(experience).groupby(["interface", "initial_experience"]).count()

Out[331]:

		week 1
interface	initial_experience
desktop visual editor	1-9 edits	256058
	10-99 edits	55906
	100-999 edits	19627
	1000+ edits	7967
desktop wikitext editor	1-9 edits	589884
	10-99 edits	114853
	100-999 edits	57749
	1000+ edits	22566
mobile visual editor	1-9 edits	31681
	10-99 edits	5255
	100-999 edits	1361
	1000+ edits	493
mobile wikitext editor	1-9 edits	194749
	10-99 edits	24287
	100-999 edits	11885
	1000+ edits	6942

Editor proportion¶

In [332]:

started = may_sessions.query("ready_count >= 1")
completed = started.query("save_success_count >=1 ")
mobile_edits = completed.query("platform == 'phone'")
desktop_edits = completed.query("platform == 'desktop'")

In [333]:

desktop_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(desktop_edits)).map(pct_str)

Out[333]:

editor
visualeditor     9.9%
wikitext        90.1%
Name: actions, dtype: object

In [334]:

mobile_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(mobile_edits)).map(pct_str)

Out[334]:

editor
visualeditor     6.1%
wikitext        93.9%
Name: actions, dtype: object

VE-default wikis¶

However, on desktop this isn't the fairest of comparisons, since the visual editor isn't available on many wikis and many wikis and many namespaces. How does the picture look when we limit it to VE-default wikis?

To get the list of VE-default wikis, we start with the "visualeditor-nondefault" database list. On top of that, we need to add the wikis where visual editor is demoted using settings in InitialiseSettings.php: wmgVisualEditorSingleEditTabSecondaryEditor (enwiki and frwiktionary) and wmgVisualEditorDisableForAnons (enwiki and eswiki).

In [338]:

# Get raw dblist file
r = requests.get("https://raw.githubusercontent.com/wikimedia/operations-mediawiki-config/master/dblists/visualeditor-nondefault.dblist")

# Strip comments
no_comments = re.sub(r"#.*\n", "", r.text)

# Split by line
nonve_dbs = re.split("\n", no_comments)

# Remove the trailing empty line
nonve_dbs.remove("")

# Add our extras
nonve_dbs.extend(["enwiki", "eswiki", "frwiktionary"])

In [340]:

ve_def_edits = desktop_edits.query('wiki not in @nonve_dbs')

ve_def_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(ve_def_edits)).map(pct_str)

Out[340]:

editor
visualeditor    14.8%
wikitext        85.2%
Name: actions, dtype: object

In [341]:

ve_nondef_edits = desktop_edits.query('wiki in @nonve_dbs')
ve_nondef_edits.groupby("editor")["actions"].agg(lambda grp: len(grp) / len(ve_nondef_edits)).map(pct_str)

Out[341]:

editor
visualeditor     5.7%
wikitext        94.3%
Name: actions, dtype: object

Funnel chart¶

In [23]:

def calc_funnel(df, omitloaded=False, omitsaveintent=False):
    results = OrderedDict()
    
    with_inits = df.query("init_count >= 1") 
    total = len(with_inits)
    results["init"] = len(with_inits) / total
    
    with_readies = with_inits.query("ready_count >= 1")
    results["ready"] = len(with_readies) / total
    
    with_loaded = with_readies.query("loaded_count >= 1")
    if not omitloaded and len(with_loaded) > 0:
        results["loaded"] = len(with_loaded) / total
        with_readies = with_loaded
    
    with_saveintents = with_readies.query("save_intent_count >= 1")
    if not omitsaveintent and len(with_saveintents) > 0:
        results["saveIntent"] = len(with_saveintents) / total
        with_readies = with_saveintents
    
    with_saveattempts = with_readies.query("save_attempt_count >= 1")
    results["saveAttempt"] = len(with_saveattempts) / total
    
    with_savesuccesses = with_saveattempts.query("save_success_count >= 1")
    results["saveSuccess"] = len(with_savesuccesses) / total
    
    return pd.DataFrame(results, index = ["percentage_reaching"]).T.applymap(pct_str)

In [24]:

calc_funnel(may_sessions, omitloaded=True, omitsaveintent=True)

Out[24]:

	percentage_reaching
init	100.0%
ready	63.9%
saveAttempt	7.9%
saveSuccess	7.7%

In [25]:

may_sessions.groupby(["platform", "editor"]).apply(calc_funnel).unstack([0, 1]).fillna("—")

Out[25]:

	percentage_reaching
platform	desktop		phone
editor	visualeditor	wikitext	visualeditor	wikitext
init	100.0%	100.0%	100.0%	100.0%
loaded	92.3%	41.7%	—	—
ready	92.3%	42.0%	95.4%	97.0%
saveAttempt	12.6%	10.4%	17.1%	2.7%
saveIntent	13.2%	—	18.8%	3.0%
saveSuccess	11.6%	10.2%	15.1%	2.5%

Low ready rate on the desktop wikitext editor¶

In [28]:

with_inits = may_sessions.query("platform == 'desktop' & editor == 'wikitext' & init_count >= 1")
len(with_inits)

Out[28]:

It's not because a lot of sessions skip the loaded event.

In [33]:

len(with_inits.query("loaded_count >= 1 & ready_count >= 1")) / len(with_inits.query("ready_count >= 1"))

Out[33]:

0.9938517826194514

The desktop wikitext editor has more sessions that drop after init (because there's no ready) but that still have other later events. However, it's not enough to explain the dramatically lower ready rate.

In [45]:

wt_init_no_ready = with_inits.query("ready_count == 0")
wt_init_only = wt_init_no_ready.query("ready_count == 0 & save_intent_count == 0 & save_attempt_count == 0 & save_success_count == 0")

print(pct_str(len(wt_init_only) / len(wt_init_no_ready)))

87.4%

In [46]:

ve_init_no_ready = may_sessions.query("platform == 'desktop' & editor == 'visualeditor' & init_count >= 1 & ready_count == 0")
ve_init_only = ve_init_no_ready.query("ready_count == 0 & save_intent_count == 0 & save_attempt_count == 0 & save_success_count == 0")

print(pct_str(len(ve_init_only) / len(ve_init_no_ready)))

97.6%

Edit completion¶

In [20]:

def completion_rate(df):
    started = df.query("ready_count >= 1")
    completed = started.query("save_success_count >= 1")
    if len(started) > 0:
        return len(completed) / len(started)
    else:
        return None

may_sessions.groupby(["platform"]).apply(completion_rate).map(pct_str)

Out[20]:

platform
desktop    22.5%
phone       2.7%
dtype: object

Desktop¶

In [21]:

desktop = may_sessions.query("platform == 'desktop'")
desktop.groupby(["editor"]).apply(completion_rate).map(pct_str)

Out[21]:

editor
visualeditor    12.6%
wikitext        24.6%
dtype: object

In [22]:

desktop.groupby(["experience", "editor"]).apply(completion_rate).unstack().applymap(pct_str)

Out[22]:

editor	visualeditor	wikitext
experience
IP	5.2%	6.4%
0 edits (new editor)	27.8%	18.0%
1-9 edits	45.3%	35.5%
10-99 edits	57.6%	48.8%
100-999 edits	65.3%	58.4%
1000+ edits	73.3%	71.7%

Phone¶

In [349]:

phone = may_sessions.query("platform == 'phone'")
phone.groupby(["editor"]).apply(completion_rate).map(pct_str)

Out[349]:

editor
visualeditor    15.8%
wikitext         2.6%
dtype: object

In [350]:

phone.groupby(["experience", "editor"]).apply(completion_rate).unstack().applymap(pct_str)

Out[350]:

editor	visualeditor	wikitext
experience
IP	11.4%	1.5%
0 edits (new editor)	17.4%	21.9%
1-9 edits	28.0%	38.7%
10-99 edits	38.1%	45.8%
100-999 edits	52.1%	55.9%
1000+ edits	58.1%	62.9%

Time to save¶

In [351]:

completed["duration"].head()

Out[351]:

114   00:07:54
130   00:00:46
137   00:01:55
144   00:28:50
158   00:00:12
Name: duration, dtype: timedelta64[ns]

In [352]:

completed.groupby(["editor", "platform"])["duration"].apply(np.median)

Out[352]:

editor        platform
visualeditor  desktop    00:01:40
              phone      00:01:43
wikitext      desktop    00:00:38
              phone      00:00:48
Name: duration, dtype: timedelta64[ns]

In [357]:

def duration_hist(ax, td_ser):
    hist_vals = (
        td_ser
        .astype("timedelta64[s]")
        .replace(0, 1)
    )

    bins = np.floor(
        np.logspace(0, 4, num=40, dtype=int)
    )

    ax.hist(hist_vals, bins=bins, label=None)
    ax.set_xscale("log")

    s_ticks = [1, 10, 60, 10*60, 60*60, 150*60]
    duration_ticks = FixedLocator(s_ticks)
    
    def to_td(s):
        return str(timedelta(seconds=int(s)))
    
    duration_fmt = FuncFormatter(lambda x, pos: to_td(x))
    
    lines = [
        ("10th percentile ({})", hist_vals.quantile(0.1), "xkcd:pink"),
        ("50th percentile ({})", hist_vals.median(), "xkcd:red"),
        ("90th percentile ({})", hist_vals.quantile(0.9), "xkcd:dark red")
        
    ]
    
    for label, pos, color in lines:
        ax.axvline(x=pos, color=color, label=label.format(to_td(pos)))
    
    ax.legend()

    ax.yaxis.set_major_formatter(charting.comma_fmt)
    ax.set_ylabel("number of sampled, completed edit sessions")

    ax.xaxis.set_major_locator(duration_ticks)
    ax.xaxis.set_major_formatter(duration_fmt)
    ax.set_xlabel("time to save as %H:%M:%S (log scale)")
    

In [358]:

fig, ax = plt.subplots()
ax.set_title("Time to save")
duration_hist(ax, completed["duration"])

In [359]:

full_desktop = completed.query("platform == 'desktop'")
full_phone = completed.query("platform == 'phone'")

fig, ax = plt.subplots(2, 1, figsize=(14, 14))

fig.suptitle("Time to save by platform")
desktop_ax = ax[0]
phone_ax = ax[1]

duration_hist(desktop_ax, full_desktop["duration"])
desktop_ax.set_title("Desktops")

duration_hist(phone_ax, full_phone["duration"])
phone_ax.set_title("Phones");

In [360]:

full_desktop_ve = completed.query("platform == 'desktop' & editor == 'visualeditor'")
full_desktop_wte = completed.query("platform == 'desktop' & editor == 'wikitext'")

fig, ax = plt.subplots(2, 1, figsize=(14, 14))

fig.suptitle("Desktop time to save duration by editing interface")
ve_ax = ax[0]
wte_ax = ax[1]

duration_hist(ve_ax, full_desktop_ve["duration"])
ve_ax.set_title("Visual editor")

duration_hist(wte_ax, full_desktop_wte["duration"])
wte_ax.set_title("2010 wikitext editor");

In [ ]: