Citation Usage Event Data

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'

def get_stats(df,groupby):
    all_events_count = df['count'].sum()
    stats = df.groupby(groupby).agg(
    [('days','count'),
    ('total_events','sum'),
    'mean', 
    'median', 
    'min', 
    'max', 
    'std', 
    ('25%', lambda x: x.quantile(.25)), 
    ('50%', lambda x: x.quantile(.5)), 
    ('75%', lambda x: x.quantile(.75)),
    ('perc', lambda x: sum(x)/all_events_count)])
    stats.columns = stats.columns.droplevel()
    return stats.reset_index()

Overview

Session count

In [2]:
# citationusage sessions by date
sessions_query = """
SELECT COUNT(DISTINCT session_id) AS distinct_sessions
FROM citationusage
WHERE wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
"""
sessions = spark.sql(sessions_query.format(event_exclusion_sql,start_date_string, end_date_string))
print('Distinct sessions: ', sessions.toPandas()['distinct_sessions'].sum())
Distinct sessions:  72953065

Events by date and type

In [3]:
# show citationusage events by date and type
events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""

events = spark.sql(events_query.format(event_exclusion_sql,start_date_string, end_date_string))
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
events_pandas = events_df.toPandas()
In [4]:
print('Total events: ', events_pandas['count'].sum())
get_stats(events_pandas,['eventType'])
Total events:  113520376
Out[4]:
eventType days total_events mean median min max std 25% 50% 75% perc
0 extClick 32 50682887 1.583840e+06 1601884 1336032 1782721 128559.418702 1498160.50 1601884 1695299 0.446465
1 fnClick 32 24074581 7.523307e+05 751192 649300 861378 49202.443384 727962.25 751192 785771 0.212073
2 fnHover 32 37590478 1.174702e+06 1219257 865928 1539108 193622.451738 1006661.75 1219257 1340965 0.331134
3 upClick 32 1172430 3.663844e+04 27046 19934 149622 25622.259697 23866.75 27046 37308 0.010328
In [5]:
events_pandas.pivot(index='date', columns='eventType', values='count')
Out[5]:
eventType extClick fnClick fnHover upClick
date
2019-03-22 1564276 725377 1131859 149622
2019-03-23 1369397 667724 883147 20342
2019-03-24 1509326 750329 1018411 23969
2019-03-25 1744801 782508 1365524 82990
2019-03-26 1701660 737057 1342285 28909
2019-03-27 1666356 728824 1321898 27761
2019-03-28 1611850 705904 1267787 25736
2019-03-29 1498888 666005 1122572 23467
2019-03-30 1336032 649300 865928 19934
2019-03-31 1458753 736367 999087 23209
2019-04-01 1696205 786290 1347917 64128
2019-04-02 1682662 747037 1347737 40231
2019-04-03 1670454 738633 1388282 27035
2019-04-04 1646892 729689 1295965 25781
2019-04-05 1569885 698560 1167598 23560
2019-04-06 1384602 686494 884076 20321
2019-04-07 1509870 774512 1017164 25706
2019-04-08 1782721 806068 1381709 28268
2019-04-09 1737629 785598 1354357 27683
2019-04-10 1694997 815365 1330797 26776
2019-04-11 1652196 769221 1322496 25945
2019-04-12 1562013 731778 1174525 24225
2019-04-13 1371961 713839 880946 21390
2019-04-14 1495978 807770 1000841 23310
2019-04-15 1715777 818613 1340525 27057
2019-04-16 1696712 789199 1539108 27887
2019-04-17 1668298 775790 1263989 37014
2019-04-18 1591918 760815 1159988 60986
2019-04-19 1502334 762657 1008602 61486
2019-04-20 1402357 752056 869566 36189
2019-04-21 1453171 813824 926495 38190
2019-04-22 1732916 861378 1269297 53323
In [6]:
# daily count of events for WP:M pages with external links over study period
wpm_events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""

wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = date_to_dt(d)
    daily_wpm_events = spark.sql(
        wpm_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
    wpm_events_rdd = wpm_events_rdd.union(daily_wpm_events.rdd)

wpm_events_merged = sqlContext.createDataFrame(wpm_events_rdd)
wpm_events = wpm_events_merged.toPandas()

WP:M event summary

In [7]:
# summary of events for WP:M pages with external links
print('Total events: ', wpm_events['count'].sum())
get_stats(wpm_events,['eventType'])
Total events:  3322871
Out[7]:
eventType days total_events mean median min max std 25% 50% 75% perc
0 extClick 32 825981 25811.90625 27276.5 18382 30593 4193.178031 21741.50 27276.5 29271.75 0.248575
1 fnClick 32 887662 27739.43750 28612.0 22608 31122 2626.223334 25830.75 28612.0 29741.75 0.267137
2 fnHover 32 1559965 48748.90625 48772.5 32894 60269 8621.973440 41783.50 48772.5 56687.50 0.469463
3 upClick 32 49263 1539.46875 913.5 621 12580 2227.803402 820.00 913.5 1132.25 0.014825

WP:M daily events

In [8]:
# daily event counts for WP:M pages with external links
wpm_events.pivot(index='date', columns='eventType', values='count')
Out[8]:
eventType extClick fnClick fnHover upClick
date
2019-03-22 27130 27753 47375 5851
2019-03-23 20471 23852 37786 683
2019-03-24 23375 26987 44589 896
2019-03-25 30398 29599 59335 1235
2019-03-26 30568 30422 60269 1051
2019-03-27 29707 30741 59600 971
2019-03-28 29682 29298 56639 929
2019-03-29 24527 25752 46813 830
2019-03-30 18647 22608 36714 659
2019-03-31 21674 25275 42956 799
2019-04-01 29135 29591 56833 1391
2019-04-02 30330 30050 58545 1442
2019-04-03 28904 29103 57382 865
2019-04-04 28497 28041 54026 860
2019-04-05 25218 25857 47214 730
2019-04-06 19373 23510 35657 700
2019-04-07 21764 26299 42046 916
2019-04-08 29885 30308 58223 1098
2019-04-09 30593 31122 58366 1077
2019-04-10 29814 30883 55613 911
2019-04-11 28581 29395 53284 880
2019-04-12 25286 26925 46673 762
2019-04-13 18806 22654 35298 705
2019-04-14 21449 26156 40996 827
2019-04-15 28771 29639 54695 969
2019-04-16 28438 30080 54799 1014
2019-04-17 28218 29006 52183 886
2019-04-18 26262 28218 47142 1872
2019-04-19 25757 29488 39727 12580
2019-04-20 18382 23494 32894 621
2019-04-21 18916 25155 36123 1435
2019-04-22 27423 30401 50170 2818
In [9]:
# daily count of events for W pages with ext links over study period
w_events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""

w_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = date_to_dt(d)
    daily_w_events = spark.sql(
        w_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
    w_events_rdd = w_events_rdd.union(daily_w_events.rdd)

w_events_merged = sqlContext.createDataFrame(w_events_rdd)
w_events = w_events_merged.toPandas()

W event summary

In [10]:
# summary of events for W pages with external links
print('Total events: ', w_events['count'].sum())
get_stats(w_events,['eventType'])
Total events:  109974252
Out[10]:
eventType days total_events mean median min max std 25% 50% 75% perc
0 extClick 32 49827889 1557121 1573276 1316582 1752162 124483.889592 1474046.75 1573276 1664884 0.453087
1 fnClick 32 23108212 722131 723895 624928 820998 46869.018210 695899.75 723895 752815 0.210124
2 fnHover 32 35926529 1122704 1165469 826976 1481295 185086.791800 964512.25 1165469 1279761 0.326681
3 upClick 32 1111622 34738 25724 19026 143458 24483.350241 22709.50 25724 35897 0.010108

W daily events

In [11]:
# daily event counts for W pages with external links
w_events.pivot(index='date', columns='eventType', values='count')
Out[11]:
eventType extClick fnClick fnHover upClick
date
2019-03-22 1535905 695599 1081404 143458
2019-03-23 1347885 641895 843147 19347
2019-03-24 1484929 720997 971081 22756
2019-03-25 1713271 750599 1302415 81252
2019-03-26 1669983 704416 1278722 27362
2019-03-27 1635725 696000 1259035 26375
2019-03-28 1581173 674494 1208129 24435
2019-03-29 1473530 638280 1073015 22314
2019-03-30 1316582 624928 826976 19026
2019-03-31 1435934 709110 953775 22086
2019-04-01 1666036 754475 1287996 62344
2019-04-02 1650965 714823 1285843 38363
2019-04-03 1640269 707290 1327765 25749
2019-04-04 1617364 699627 1238942 24518
2019-04-05 1543810 670946 1117829 22570
2019-04-06 1364368 660849 846327 19360
2019-04-07 1487454 746301 972742 24484
2019-04-08 1752162 773643 1320311 26744
2019-04-09 1706339 752262 1292854 26244
2019-04-10 1664500 782356 1272114 25440
2019-04-11 1623127 737892 1266330 24670
2019-04-12 1536334 703104 1125370 23128
2019-04-13 1352825 689532 843750 20411
2019-04-14 1474219 779635 957517 22208
2019-04-15 1686583 786904 1282879 25699
2019-04-16 1667787 757042 1481295 26469
2019-04-17 1639804 744860 1209065 35731
2019-04-18 1565379 730809 1110389 58707
2019-04-19 1476396 731429 966844 48635
2019-04-20 1383804 726794 834892 35315
2019-04-21 1432099 780323 882207 36395
2019-04-22 1701348 820998 1205569 50027

Mobile vs Desktop

In [12]:
# mobile vs desktop events for W pages with external links
w_mode_events_query = """
SELECT to_date(event_time) AS date, mode, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY date, mode, eventType
ORDER BY date, mode, eventType
"""

events = spark.sql(w_mode_events_query.format(event_exclusion_sql,start_date_string, end_date_string))
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
w_events_pandas = events_df.toPandas()
get_stats(w_events_pandas,['mode','eventType'])
Out[12]:
mode eventType days total_events mean median min max std 25% 50% 75% perc
0 desktop extClick 32 29294243 9.154451e+05 974150.5 661195 1102793 160436.363245 749929.00 974150.5 1049703.00 0.266374
1 desktop fnClick 32 7527839 2.352450e+05 255253.5 162052 309010 43789.385081 195226.00 255253.5 267710.25 0.068451
2 desktop fnHover 32 33855425 1.057982e+06 1097533.0 766685 1416905 185721.887148 897499.25 1097533.0 1215819.50 0.307849
3 desktop upClick 32 999363 3.123009e+04 22426.5 15554 140145 24533.609128 19123.75 22426.5 32551.50 0.009087
4 mobile extClick 32 20533646 6.416764e+05 626029.5 561265 757109 56887.431496 591392.75 626029.5 687168.25 0.186713
5 mobile fnClick 32 15580373 4.868867e+05 480579.0 416481 595415 47949.474529 442803.75 480579.0 519503.25 0.141673
6 mobile fnHover 32 2071104 6.472200e+04 64384.0 57461 72650 3955.892616 62148.75 64384.0 67267.25 0.018833
7 mobile upClick 32 112259 3.508094e+03 3466.5 2987 4127 258.408199 3315.25 3466.5 3713.75 0.001021
In [13]:
# mobile vs desktop events for WP:M pages with external links
wpm_mode_events_query = """
SELECT to_date(event_time) AS date, mode, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY date, mode, eventType
ORDER BY date, mode, eventType
"""

events = spark.sql(wpm_mode_events_query.format(event_exclusion_sql,start_date_string, end_date_string))
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
wpm_events_pandas = events_df.toPandas()
get_stats(wpm_events_pandas,['mode','eventType'])
Out[13]:
mode eventType days total_events mean median min max std 25% 50% 75% perc
0 desktop extClick 32 569352 17792.25000 19381.5 10839 22257 3881.859539 14054.75 19381.5 21140.25 0.171343
1 desktop fnClick 32 350362 10948.81250 11809.5 6992 13387 2039.249885 9359.25 11809.5 12495.75 0.105440
2 desktop fnHover 32 1466075 45814.84375 45790.5 30290 56944 8395.114932 38803.50 45790.5 53479.75 0.441207
3 desktop upClick 32 45528 1422.75000 787.5 530 12492 2232.108652 711.00 787.5 982.00 0.013701
4 mobile extClick 32 256629 8019.65625 8070.0 7016 8960 438.177576 7644.75 8070.0 8363.50 0.077231
5 mobile fnClick 32 537300 16790.62500 16899.0 14864 18461 887.853764 16156.00 16899.0 17294.25 0.161698
6 mobile fnHover 32 93890 2934.06250 2937.5 2542 3421 265.305902 2690.75 2937.5 3159.75 0.028256
7 mobile upClick 32 3735 116.71875 118.0 88 155 16.142979 106.00 118.0 127.00 0.001124

Visualization

In [14]:
# Plots inline
%matplotlib inline
In [15]:
w_events_info = get_stats(w_events_pandas,['mode', 'eventType'])
w_events_info['group'] = "W"
wm_events_info = get_stats(wpm_events_pandas,['mode', 'eventType'])
wm_events_info['group'] = "WPM"

all_stats = w_events_info.append(wm_events_info)
all_stats.head()
Out[15]:
mode eventType days total_events mean median min max std 25% 50% 75% perc group
0 desktop extClick 32 29294243 9.154451e+05 974150.5 661195 1102793 160436.363245 749929.00 974150.5 1049703.00 0.266374 W
1 desktop fnClick 32 7527839 2.352450e+05 255253.5 162052 309010 43789.385081 195226.00 255253.5 267710.25 0.068451 W
2 desktop fnHover 32 33855425 1.057982e+06 1097533.0 766685 1416905 185721.887148 897499.25 1097533.0 1215819.50 0.307849 W
3 desktop upClick 32 999363 3.123009e+04 22426.5 15554 140145 24533.609128 19123.75 22426.5 32551.50 0.009087 W
4 mobile extClick 32 20533646 6.416764e+05 626029.5 561265 757109 56887.431496 591392.75 626029.5 687168.25 0.186713 W
In [16]:
events_by_group = all_stats.groupby(["eventType", "group"]).agg("sum")[["perc", "total_events"]].reset_index()
events_by_group["daily_average"] = events_by_group["total_events"]/32
events_by_group.head()
Out[16]:
eventType group perc total_events daily_average
0 extClick W 0.453087 49827889 1.557122e+06
1 extClick WPM 0.248575 825981 2.581191e+04
2 fnClick W 0.210124 23108212 7.221316e+05
3 fnClick WPM 0.267137 887662 2.773944e+04
4 fnHover W 0.326681 35926529 1.122704e+06
In [17]:
sns.set(font_scale=1.2)

def show_values_on_bars(axs):
    def _show_on_single_plot(ax):        
        for p in ax.patches:
            _x = p.get_x() + p.get_width() / 2
            _y = p.get_y() + p.get_height() + 0.005
            value = '{:,.1%}'.format(p.get_height())
            ax.text(_x, _y, value, ha="center") 
    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)


my_pal = {"WPM": "darkorange", "W": "steelblue"}
fig = sns.catplot(y='perc', x='eventType', hue='group', 
                data=events_by_group, kind='bar', palette=my_pal, 
                  legend=False, hue_order=["WPM", "W"])
fig.ax.set(xlabel='Event Type', ylabel='Normalized number of events', 
           title="Distribution of events by type - WPM vs. W")
fig.ax.ticklabel_format(style='plain', axis='y')
vals = plt.gca().get_yticks()
plt.gca().set_yticklabels(['{:,.0%}'.format(x) for x in vals])
plt.gcf().set_size_inches(10, 6)
plt.legend(loc='upper right')

show_values_on_bars(fig.ax)
plt.savefig("events_distribution.pdf")

Statistical significance of the difference

Get events count

In [18]:
w_events_query = """
SELECT mode, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY mode, action
"""

w_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = date_to_dt(d)
    daily_w_events = spark.sql(
        w_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
    w_events_rdd = w_events_rdd.union(daily_w_events.rdd)
    
w_events_merged = sqlContext.createDataFrame(w_events_rdd)

w_events_merged.registerTempTable("w_events_merged")
merge_events_query = """
SELECT mode, eventType, SUM(count) as pageloads_with_event, 'W' as group
FROM w_events_merged
GROUP BY mode, eventType
"""

w_events = spark.sql(merge_events_query)
w_events
Out[18]:
DataFrame[mode: string, eventType: string, pageloads_with_event: bigint, group: string]
In [19]:
wpm_events_query = """
SELECT mode, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY mode, action
"""

wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = date_to_dt(d)
    daily_wpm_events = spark.sql(
        wpm_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
    wpm_events_rdd = wpm_events_rdd.union(daily_wpm_events.rdd)
    
wpm_events_merged = sqlContext.createDataFrame(wpm_events_rdd)

wpm_events_merged.registerTempTable("wpm_events_merged")
merge_events_query = """
SELECT mode, eventType, SUM(count) as pageloads_with_event, 'WPM' as group
FROM wpm_events_merged
GROUP BY mode, eventType
"""

wpm_events = spark.sql(merge_events_query)
wpm_events
Out[19]:
DataFrame[mode: string, eventType: string, pageloads_with_event: bigint, group: string]
In [20]:
all_events = wpm_events.union(w_events).toPandas()
all_events
Out[20]:
mode eventType pageloads_with_event group
0 desktop fnClick 350362 WPM
1 mobile extClick 256629 WPM
2 mobile fnHover 93890 WPM
3 mobile fnClick 537300 WPM
4 desktop extClick 569352 WPM
5 desktop fnHover 1466075 WPM
6 desktop upClick 45528 WPM
7 mobile upClick 3735 WPM
8 desktop fnClick 7527839 W
9 mobile extClick 20533646 W
10 mobile fnHover 2071104 W
11 mobile fnClick 15580373 W
12 desktop extClick 29294243 W
13 desktop fnHover 33855425 W
14 desktop upClick 999363 W
15 mobile upClick 112259 W
In [21]:
from pyspark.sql import functions as fn

w_query = """
SELECT CASE WHEN access_method = 'desktop' THEN 'desktop' ELSE 'mobile' END as mode,
     SUM(view_count) AS total_pageviews, 'W' as group
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
    (SELECT DISTINCT page_id 
    FROM ryanmax.population_w_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY access_method
"""


w_pageviews = spark.sql(w_query.format(start_date_string, end_date_string))\
                .groupBy("mode", "group").agg(fn.sum("total_pageviews").alias("total_pageviews"))
In [22]:
wpm_query = """
SELECT CASE WHEN access_method = 'desktop' THEN 'desktop' ELSE 'mobile' END as mode,
            SUM(view_count) AS total_pageviews, 'WPM' as group
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
    (SELECT DISTINCT page_id 
    FROM ryanmax.population_wpm_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY access_method
"""
wpm_pageviews = spark.sql(wpm_query.format(start_date_string, end_date_string))\
                .groupBy("mode", "group").agg(fn.sum("total_pageviews").alias("total_pageviews"))
In [23]:
all_pageviews = wpm_pageviews.union(w_pageviews).toPandas()
all_pageviews
Out[23]:
mode group total_pageviews
0 mobile WPM 125364759
1 desktop WPM 62650292
2 mobile W 4175643369
3 desktop W 3134600739
In [24]:
events_summary = all_events.merge(all_pageviews, on=['mode', 'group'])
events_summary['pageloads_without_event'] = events_summary['total_pageviews']-events_summary['pageloads_with_event']
events_summary['event_ratio'] = events_summary['pageloads_with_event']/events_summary['total_pageviews']
events_summary['pages_per_event'] = events_summary['total_pageviews']/events_summary['pageloads_with_event']

events_summary
Out[24]:
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
0 desktop fnClick 350362 WPM 62650292 62299930 0.005592 178.815888
1 desktop extClick 569352 WPM 62650292 62080940 0.009088 110.037889
2 desktop fnHover 1466075 WPM 62650292 61184217 0.023401 42.733347
3 desktop upClick 45528 WPM 62650292 62604764 0.000727 1376.082674
4 mobile extClick 256629 WPM 125364759 125108130 0.002047 488.505816
5 mobile fnHover 93890 WPM 125364759 125270869 0.000749 1335.230152
6 mobile fnClick 537300 WPM 125364759 124827459 0.004286 233.323579
7 mobile upClick 3735 WPM 125364759 125361024 0.000030 33564.861847
8 desktop fnClick 7527839 W 3134600739 3127072900 0.002402 416.401140
9 desktop extClick 29294243 W 3134600739 3105306496 0.009345 107.003985
10 desktop fnHover 33855425 W 3134600739 3100745314 0.010801 92.587842
11 desktop upClick 999363 W 3134600739 3133601376 0.000319 3136.598752
12 mobile extClick 20533646 W 4175643369 4155109723 0.004917 203.356158
13 mobile fnHover 2071104 W 4175643369 4173572265 0.000496 2016.143742
14 mobile fnClick 15580373 W 4175643369 4160062996 0.003731 268.006637
15 mobile upClick 112259 W 4175643369 4175531110 0.000027 37196.513144
In [25]:
from IPython.core import display as ICD
import scipy

for mode in events_summary['mode'].unique():
    for eventType in events_summary['eventType'].unique():
        print("\n-------------")
        df = events_summary[(events_summary['mode']==mode)
                       & (events_summary['group'].isin(["WPM", "W"]))
                       & (events_summary['eventType']==eventType)]
        ICD.display(df)
        print("Contingency table:")
        cm = df[['pageloads_with_event', 'pageloads_without_event']].as_matrix()
        print(cm)
        oddsratio, pvalue = scipy.stats.fisher_exact(cm)
        if pvalue < 0.001:
            print("\nOddsRatio: {}, p-value < 0.001".format(oddsratio))
        else:
            print("\nOddsRatio: {}, p-value = {}".format(oddsratio, pvalue))
-------------
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
0 desktop fnClick 350362 WPM 62650292 62299930 0.005592 178.815888
8 desktop fnClick 7527839 W 3134600739 3127072900 0.002402 416.401140
Contingency table:
[[    350362   62299930]
 [   7527839 3127072900]]

OddsRatio: 2.3361306201136354, p-value < 0.001

-------------
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
1 desktop extClick 569352 WPM 62650292 62080940 0.009088 110.037889
9 desktop extClick 29294243 W 3134600739 3105306496 0.009345 107.003985
Contingency table:
[[    569352   62080940]
 [  29294243 3105306496]]

OddsRatio: 0.9721756931564446, p-value < 0.001

-------------
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
2 desktop fnHover 1466075 WPM 62650292 61184217 0.023401 42.733347
10 desktop fnHover 33855425 W 3134600739 3100745314 0.010801 92.587842
Contingency table:
[[   1466075   61184217]
 [  33855425 3100745314]]

OddsRatio: 2.194596122471873, p-value < 0.001

-------------
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
3 desktop upClick 45528 WPM 62650292 62604764 0.000727 1376.082674
11 desktop upClick 999363 W 3134600739 3133601376 0.000319 3136.598752
Contingency table:
[[     45528   62604764]
 [    999363 3133601376]]

OddsRatio: 2.2802983491720803, p-value < 0.001

-------------
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
6 mobile fnClick 537300 WPM 125364759 124827459 0.004286 233.323579
14 mobile fnClick 15580373 W 4175643369 4160062996 0.003731 268.006637
Contingency table:
[[    537300  124827459]
 [  15580373 4160062996]]

OddsRatio: 1.1492877219694864, p-value < 0.001

-------------
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
4 mobile extClick 256629 WPM 125364759 125108130 0.002047 488.505816
12 mobile extClick 20533646 W 4175643369 4155109723 0.004917 203.356158
Contingency table:
[[    256629  125108130]
 [  20533646 4155109723]]

OddsRatio: 0.41508460385598855, p-value < 0.001

-------------
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
5 mobile fnHover 93890 WPM 125364759 125270869 0.000749 1335.230152
13 mobile fnHover 2071104 W 4175643369 4173572265 0.000496 2016.143742
Contingency table:
[[     93890  125270869]
 [   2071104 4173572265]]

OddsRatio: 1.5103419291607243, p-value < 0.001

-------------
mode eventType pageloads_with_event group total_pageviews pageloads_without_event event_ratio pages_per_event
7 mobile upClick 3735 WPM 125364759 125361024 0.000030 33564.861847
15 mobile upClick 112259 W 4175643369 4175531110 0.000027 37196.513144
Contingency table:
[[      3735  125361024]
 [    112259 4175531110]]

OddsRatio: 1.1082012347934034, p-value < 0.001
In [ ]: