# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'
def get_stats(df,groupby):
all_events_count = df['count'].sum()
stats = df.groupby(groupby).agg(
[('days','count'),
('total_events','sum'),
'mean',
'median',
'min',
'max',
'std',
('25%', lambda x: x.quantile(.25)),
('50%', lambda x: x.quantile(.5)),
('75%', lambda x: x.quantile(.75)),
('perc', lambda x: sum(x)/all_events_count)])
stats.columns = stats.columns.droplevel()
return stats.reset_index()
# citationusage sessions by date
sessions_query = """
SELECT COUNT(DISTINCT session_id) AS distinct_sessions
FROM citationusage
WHERE wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
"""
sessions = spark.sql(sessions_query.format(event_exclusion_sql,start_date_string, end_date_string))
print('Distinct sessions: ', sessions.toPandas()['distinct_sessions'].sum())
Distinct sessions: 72953065
# show citationusage events by date and type
events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""
events = spark.sql(events_query.format(event_exclusion_sql,start_date_string, end_date_string))
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
events_pandas = events_df.toPandas()
print('Total events: ', events_pandas['count'].sum())
get_stats(events_pandas,['eventType'])
Total events: 113520376
eventType | days | total_events | mean | median | min | max | std | 25% | 50% | 75% | perc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | extClick | 32 | 50682887 | 1.583840e+06 | 1601884 | 1336032 | 1782721 | 128559.418702 | 1498160.50 | 1601884 | 1695299 | 0.446465 |
1 | fnClick | 32 | 24074581 | 7.523307e+05 | 751192 | 649300 | 861378 | 49202.443384 | 727962.25 | 751192 | 785771 | 0.212073 |
2 | fnHover | 32 | 37590478 | 1.174702e+06 | 1219257 | 865928 | 1539108 | 193622.451738 | 1006661.75 | 1219257 | 1340965 | 0.331134 |
3 | upClick | 32 | 1172430 | 3.663844e+04 | 27046 | 19934 | 149622 | 25622.259697 | 23866.75 | 27046 | 37308 | 0.010328 |
events_pandas.pivot(index='date', columns='eventType', values='count')
eventType | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
date | ||||
2019-03-22 | 1564276 | 725377 | 1131859 | 149622 |
2019-03-23 | 1369397 | 667724 | 883147 | 20342 |
2019-03-24 | 1509326 | 750329 | 1018411 | 23969 |
2019-03-25 | 1744801 | 782508 | 1365524 | 82990 |
2019-03-26 | 1701660 | 737057 | 1342285 | 28909 |
2019-03-27 | 1666356 | 728824 | 1321898 | 27761 |
2019-03-28 | 1611850 | 705904 | 1267787 | 25736 |
2019-03-29 | 1498888 | 666005 | 1122572 | 23467 |
2019-03-30 | 1336032 | 649300 | 865928 | 19934 |
2019-03-31 | 1458753 | 736367 | 999087 | 23209 |
2019-04-01 | 1696205 | 786290 | 1347917 | 64128 |
2019-04-02 | 1682662 | 747037 | 1347737 | 40231 |
2019-04-03 | 1670454 | 738633 | 1388282 | 27035 |
2019-04-04 | 1646892 | 729689 | 1295965 | 25781 |
2019-04-05 | 1569885 | 698560 | 1167598 | 23560 |
2019-04-06 | 1384602 | 686494 | 884076 | 20321 |
2019-04-07 | 1509870 | 774512 | 1017164 | 25706 |
2019-04-08 | 1782721 | 806068 | 1381709 | 28268 |
2019-04-09 | 1737629 | 785598 | 1354357 | 27683 |
2019-04-10 | 1694997 | 815365 | 1330797 | 26776 |
2019-04-11 | 1652196 | 769221 | 1322496 | 25945 |
2019-04-12 | 1562013 | 731778 | 1174525 | 24225 |
2019-04-13 | 1371961 | 713839 | 880946 | 21390 |
2019-04-14 | 1495978 | 807770 | 1000841 | 23310 |
2019-04-15 | 1715777 | 818613 | 1340525 | 27057 |
2019-04-16 | 1696712 | 789199 | 1539108 | 27887 |
2019-04-17 | 1668298 | 775790 | 1263989 | 37014 |
2019-04-18 | 1591918 | 760815 | 1159988 | 60986 |
2019-04-19 | 1502334 | 762657 | 1008602 | 61486 |
2019-04-20 | 1402357 | 752056 | 869566 | 36189 |
2019-04-21 | 1453171 | 813824 | 926495 | 38190 |
2019-04-22 | 1732916 | 861378 | 1269297 | 53323 |
# daily count of events for WP:M pages with external links over study period
wpm_events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""
wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
dt = date_to_dt(d)
daily_wpm_events = spark.sql(
wpm_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
wpm_events_rdd = wpm_events_rdd.union(daily_wpm_events.rdd)
wpm_events_merged = sqlContext.createDataFrame(wpm_events_rdd)
wpm_events = wpm_events_merged.toPandas()
# summary of events for WP:M pages with external links
print('Total events: ', wpm_events['count'].sum())
get_stats(wpm_events,['eventType'])
Total events: 3322871
eventType | days | total_events | mean | median | min | max | std | 25% | 50% | 75% | perc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | extClick | 32 | 825981 | 25811.90625 | 27276.5 | 18382 | 30593 | 4193.178031 | 21741.50 | 27276.5 | 29271.75 | 0.248575 |
1 | fnClick | 32 | 887662 | 27739.43750 | 28612.0 | 22608 | 31122 | 2626.223334 | 25830.75 | 28612.0 | 29741.75 | 0.267137 |
2 | fnHover | 32 | 1559965 | 48748.90625 | 48772.5 | 32894 | 60269 | 8621.973440 | 41783.50 | 48772.5 | 56687.50 | 0.469463 |
3 | upClick | 32 | 49263 | 1539.46875 | 913.5 | 621 | 12580 | 2227.803402 | 820.00 | 913.5 | 1132.25 | 0.014825 |
# daily event counts for WP:M pages with external links
wpm_events.pivot(index='date', columns='eventType', values='count')
eventType | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
date | ||||
2019-03-22 | 27130 | 27753 | 47375 | 5851 |
2019-03-23 | 20471 | 23852 | 37786 | 683 |
2019-03-24 | 23375 | 26987 | 44589 | 896 |
2019-03-25 | 30398 | 29599 | 59335 | 1235 |
2019-03-26 | 30568 | 30422 | 60269 | 1051 |
2019-03-27 | 29707 | 30741 | 59600 | 971 |
2019-03-28 | 29682 | 29298 | 56639 | 929 |
2019-03-29 | 24527 | 25752 | 46813 | 830 |
2019-03-30 | 18647 | 22608 | 36714 | 659 |
2019-03-31 | 21674 | 25275 | 42956 | 799 |
2019-04-01 | 29135 | 29591 | 56833 | 1391 |
2019-04-02 | 30330 | 30050 | 58545 | 1442 |
2019-04-03 | 28904 | 29103 | 57382 | 865 |
2019-04-04 | 28497 | 28041 | 54026 | 860 |
2019-04-05 | 25218 | 25857 | 47214 | 730 |
2019-04-06 | 19373 | 23510 | 35657 | 700 |
2019-04-07 | 21764 | 26299 | 42046 | 916 |
2019-04-08 | 29885 | 30308 | 58223 | 1098 |
2019-04-09 | 30593 | 31122 | 58366 | 1077 |
2019-04-10 | 29814 | 30883 | 55613 | 911 |
2019-04-11 | 28581 | 29395 | 53284 | 880 |
2019-04-12 | 25286 | 26925 | 46673 | 762 |
2019-04-13 | 18806 | 22654 | 35298 | 705 |
2019-04-14 | 21449 | 26156 | 40996 | 827 |
2019-04-15 | 28771 | 29639 | 54695 | 969 |
2019-04-16 | 28438 | 30080 | 54799 | 1014 |
2019-04-17 | 28218 | 29006 | 52183 | 886 |
2019-04-18 | 26262 | 28218 | 47142 | 1872 |
2019-04-19 | 25757 | 29488 | 39727 | 12580 |
2019-04-20 | 18382 | 23494 | 32894 | 621 |
2019-04-21 | 18916 | 25155 | 36123 | 1435 |
2019-04-22 | 27423 | 30401 | 50170 | 2818 |
# daily count of events for W pages with ext links over study period
w_events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""
w_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
dt = date_to_dt(d)
daily_w_events = spark.sql(
w_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
w_events_rdd = w_events_rdd.union(daily_w_events.rdd)
w_events_merged = sqlContext.createDataFrame(w_events_rdd)
w_events = w_events_merged.toPandas()
# summary of events for W pages with external links
print('Total events: ', w_events['count'].sum())
get_stats(w_events,['eventType'])
Total events: 109974252
eventType | days | total_events | mean | median | min | max | std | 25% | 50% | 75% | perc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | extClick | 32 | 49827889 | 1557121 | 1573276 | 1316582 | 1752162 | 124483.889592 | 1474046.75 | 1573276 | 1664884 | 0.453087 |
1 | fnClick | 32 | 23108212 | 722131 | 723895 | 624928 | 820998 | 46869.018210 | 695899.75 | 723895 | 752815 | 0.210124 |
2 | fnHover | 32 | 35926529 | 1122704 | 1165469 | 826976 | 1481295 | 185086.791800 | 964512.25 | 1165469 | 1279761 | 0.326681 |
3 | upClick | 32 | 1111622 | 34738 | 25724 | 19026 | 143458 | 24483.350241 | 22709.50 | 25724 | 35897 | 0.010108 |
# daily event counts for W pages with external links
w_events.pivot(index='date', columns='eventType', values='count')
eventType | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
date | ||||
2019-03-22 | 1535905 | 695599 | 1081404 | 143458 |
2019-03-23 | 1347885 | 641895 | 843147 | 19347 |
2019-03-24 | 1484929 | 720997 | 971081 | 22756 |
2019-03-25 | 1713271 | 750599 | 1302415 | 81252 |
2019-03-26 | 1669983 | 704416 | 1278722 | 27362 |
2019-03-27 | 1635725 | 696000 | 1259035 | 26375 |
2019-03-28 | 1581173 | 674494 | 1208129 | 24435 |
2019-03-29 | 1473530 | 638280 | 1073015 | 22314 |
2019-03-30 | 1316582 | 624928 | 826976 | 19026 |
2019-03-31 | 1435934 | 709110 | 953775 | 22086 |
2019-04-01 | 1666036 | 754475 | 1287996 | 62344 |
2019-04-02 | 1650965 | 714823 | 1285843 | 38363 |
2019-04-03 | 1640269 | 707290 | 1327765 | 25749 |
2019-04-04 | 1617364 | 699627 | 1238942 | 24518 |
2019-04-05 | 1543810 | 670946 | 1117829 | 22570 |
2019-04-06 | 1364368 | 660849 | 846327 | 19360 |
2019-04-07 | 1487454 | 746301 | 972742 | 24484 |
2019-04-08 | 1752162 | 773643 | 1320311 | 26744 |
2019-04-09 | 1706339 | 752262 | 1292854 | 26244 |
2019-04-10 | 1664500 | 782356 | 1272114 | 25440 |
2019-04-11 | 1623127 | 737892 | 1266330 | 24670 |
2019-04-12 | 1536334 | 703104 | 1125370 | 23128 |
2019-04-13 | 1352825 | 689532 | 843750 | 20411 |
2019-04-14 | 1474219 | 779635 | 957517 | 22208 |
2019-04-15 | 1686583 | 786904 | 1282879 | 25699 |
2019-04-16 | 1667787 | 757042 | 1481295 | 26469 |
2019-04-17 | 1639804 | 744860 | 1209065 | 35731 |
2019-04-18 | 1565379 | 730809 | 1110389 | 58707 |
2019-04-19 | 1476396 | 731429 | 966844 | 48635 |
2019-04-20 | 1383804 | 726794 | 834892 | 35315 |
2019-04-21 | 1432099 | 780323 | 882207 | 36395 |
2019-04-22 | 1701348 | 820998 | 1205569 | 50027 |
# mobile vs desktop events for W pages with external links
w_mode_events_query = """
SELECT to_date(event_time) AS date, mode, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY date, mode, eventType
ORDER BY date, mode, eventType
"""
events = spark.sql(w_mode_events_query.format(event_exclusion_sql,start_date_string, end_date_string))
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
w_events_pandas = events_df.toPandas()
get_stats(w_events_pandas,['mode','eventType'])
mode | eventType | days | total_events | mean | median | min | max | std | 25% | 50% | 75% | perc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | desktop | extClick | 32 | 29294243 | 9.154451e+05 | 974150.5 | 661195 | 1102793 | 160436.363245 | 749929.00 | 974150.5 | 1049703.00 | 0.266374 |
1 | desktop | fnClick | 32 | 7527839 | 2.352450e+05 | 255253.5 | 162052 | 309010 | 43789.385081 | 195226.00 | 255253.5 | 267710.25 | 0.068451 |
2 | desktop | fnHover | 32 | 33855425 | 1.057982e+06 | 1097533.0 | 766685 | 1416905 | 185721.887148 | 897499.25 | 1097533.0 | 1215819.50 | 0.307849 |
3 | desktop | upClick | 32 | 999363 | 3.123009e+04 | 22426.5 | 15554 | 140145 | 24533.609128 | 19123.75 | 22426.5 | 32551.50 | 0.009087 |
4 | mobile | extClick | 32 | 20533646 | 6.416764e+05 | 626029.5 | 561265 | 757109 | 56887.431496 | 591392.75 | 626029.5 | 687168.25 | 0.186713 |
5 | mobile | fnClick | 32 | 15580373 | 4.868867e+05 | 480579.0 | 416481 | 595415 | 47949.474529 | 442803.75 | 480579.0 | 519503.25 | 0.141673 |
6 | mobile | fnHover | 32 | 2071104 | 6.472200e+04 | 64384.0 | 57461 | 72650 | 3955.892616 | 62148.75 | 64384.0 | 67267.25 | 0.018833 |
7 | mobile | upClick | 32 | 112259 | 3.508094e+03 | 3466.5 | 2987 | 4127 | 258.408199 | 3315.25 | 3466.5 | 3713.75 | 0.001021 |
# mobile vs desktop events for WP:M pages with external links
wpm_mode_events_query = """
SELECT to_date(event_time) AS date, mode, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY date, mode, eventType
ORDER BY date, mode, eventType
"""
events = spark.sql(wpm_mode_events_query.format(event_exclusion_sql,start_date_string, end_date_string))
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
wpm_events_pandas = events_df.toPandas()
get_stats(wpm_events_pandas,['mode','eventType'])
mode | eventType | days | total_events | mean | median | min | max | std | 25% | 50% | 75% | perc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | desktop | extClick | 32 | 569352 | 17792.25000 | 19381.5 | 10839 | 22257 | 3881.859539 | 14054.75 | 19381.5 | 21140.25 | 0.171343 |
1 | desktop | fnClick | 32 | 350362 | 10948.81250 | 11809.5 | 6992 | 13387 | 2039.249885 | 9359.25 | 11809.5 | 12495.75 | 0.105440 |
2 | desktop | fnHover | 32 | 1466075 | 45814.84375 | 45790.5 | 30290 | 56944 | 8395.114932 | 38803.50 | 45790.5 | 53479.75 | 0.441207 |
3 | desktop | upClick | 32 | 45528 | 1422.75000 | 787.5 | 530 | 12492 | 2232.108652 | 711.00 | 787.5 | 982.00 | 0.013701 |
4 | mobile | extClick | 32 | 256629 | 8019.65625 | 8070.0 | 7016 | 8960 | 438.177576 | 7644.75 | 8070.0 | 8363.50 | 0.077231 |
5 | mobile | fnClick | 32 | 537300 | 16790.62500 | 16899.0 | 14864 | 18461 | 887.853764 | 16156.00 | 16899.0 | 17294.25 | 0.161698 |
6 | mobile | fnHover | 32 | 93890 | 2934.06250 | 2937.5 | 2542 | 3421 | 265.305902 | 2690.75 | 2937.5 | 3159.75 | 0.028256 |
7 | mobile | upClick | 32 | 3735 | 116.71875 | 118.0 | 88 | 155 | 16.142979 | 106.00 | 118.0 | 127.00 | 0.001124 |
# Plots inline
%matplotlib inline
w_events_info = get_stats(w_events_pandas,['mode', 'eventType'])
w_events_info['group'] = "W"
wm_events_info = get_stats(wpm_events_pandas,['mode', 'eventType'])
wm_events_info['group'] = "WPM"
all_stats = w_events_info.append(wm_events_info)
all_stats.head()
mode | eventType | days | total_events | mean | median | min | max | std | 25% | 50% | 75% | perc | group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | desktop | extClick | 32 | 29294243 | 9.154451e+05 | 974150.5 | 661195 | 1102793 | 160436.363245 | 749929.00 | 974150.5 | 1049703.00 | 0.266374 | W |
1 | desktop | fnClick | 32 | 7527839 | 2.352450e+05 | 255253.5 | 162052 | 309010 | 43789.385081 | 195226.00 | 255253.5 | 267710.25 | 0.068451 | W |
2 | desktop | fnHover | 32 | 33855425 | 1.057982e+06 | 1097533.0 | 766685 | 1416905 | 185721.887148 | 897499.25 | 1097533.0 | 1215819.50 | 0.307849 | W |
3 | desktop | upClick | 32 | 999363 | 3.123009e+04 | 22426.5 | 15554 | 140145 | 24533.609128 | 19123.75 | 22426.5 | 32551.50 | 0.009087 | W |
4 | mobile | extClick | 32 | 20533646 | 6.416764e+05 | 626029.5 | 561265 | 757109 | 56887.431496 | 591392.75 | 626029.5 | 687168.25 | 0.186713 | W |
events_by_group = all_stats.groupby(["eventType", "group"]).agg("sum")[["perc", "total_events"]].reset_index()
events_by_group["daily_average"] = events_by_group["total_events"]/32
events_by_group.head()
eventType | group | perc | total_events | daily_average | |
---|---|---|---|---|---|
0 | extClick | W | 0.453087 | 49827889 | 1.557122e+06 |
1 | extClick | WPM | 0.248575 | 825981 | 2.581191e+04 |
2 | fnClick | W | 0.210124 | 23108212 | 7.221316e+05 |
3 | fnClick | WPM | 0.267137 | 887662 | 2.773944e+04 |
4 | fnHover | W | 0.326681 | 35926529 | 1.122704e+06 |
sns.set(font_scale=1.2)
def show_values_on_bars(axs):
def _show_on_single_plot(ax):
for p in ax.patches:
_x = p.get_x() + p.get_width() / 2
_y = p.get_y() + p.get_height() + 0.005
value = '{:,.1%}'.format(p.get_height())
ax.text(_x, _y, value, ha="center")
if isinstance(axs, np.ndarray):
for idx, ax in np.ndenumerate(axs):
_show_on_single_plot(ax)
else:
_show_on_single_plot(axs)
my_pal = {"WPM": "darkorange", "W": "steelblue"}
fig = sns.catplot(y='perc', x='eventType', hue='group',
data=events_by_group, kind='bar', palette=my_pal,
legend=False, hue_order=["WPM", "W"])
fig.ax.set(xlabel='Event Type', ylabel='Normalized number of events',
title="Distribution of events by type - WPM vs. W")
fig.ax.ticklabel_format(style='plain', axis='y')
vals = plt.gca().get_yticks()
plt.gca().set_yticklabels(['{:,.0%}'.format(x) for x in vals])
plt.gcf().set_size_inches(10, 6)
plt.legend(loc='upper right')
show_values_on_bars(fig.ax)
plt.savefig("events_distribution.pdf")
w_events_query = """
SELECT mode, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY mode, action
"""
w_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
dt = date_to_dt(d)
daily_w_events = spark.sql(
w_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
w_events_rdd = w_events_rdd.union(daily_w_events.rdd)
w_events_merged = sqlContext.createDataFrame(w_events_rdd)
w_events_merged.registerTempTable("w_events_merged")
merge_events_query = """
SELECT mode, eventType, SUM(count) as pageloads_with_event, 'W' as group
FROM w_events_merged
GROUP BY mode, eventType
"""
w_events = spark.sql(merge_events_query)
w_events
DataFrame[mode: string, eventType: string, pageloads_with_event: bigint, group: string]
wpm_events_query = """
SELECT mode, action AS eventType, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY mode, action
"""
wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
dt = date_to_dt(d)
daily_wpm_events = spark.sql(
wpm_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
wpm_events_rdd = wpm_events_rdd.union(daily_wpm_events.rdd)
wpm_events_merged = sqlContext.createDataFrame(wpm_events_rdd)
wpm_events_merged.registerTempTable("wpm_events_merged")
merge_events_query = """
SELECT mode, eventType, SUM(count) as pageloads_with_event, 'WPM' as group
FROM wpm_events_merged
GROUP BY mode, eventType
"""
wpm_events = spark.sql(merge_events_query)
wpm_events
DataFrame[mode: string, eventType: string, pageloads_with_event: bigint, group: string]
all_events = wpm_events.union(w_events).toPandas()
all_events
mode | eventType | pageloads_with_event | group | |
---|---|---|---|---|
0 | desktop | fnClick | 350362 | WPM |
1 | mobile | extClick | 256629 | WPM |
2 | mobile | fnHover | 93890 | WPM |
3 | mobile | fnClick | 537300 | WPM |
4 | desktop | extClick | 569352 | WPM |
5 | desktop | fnHover | 1466075 | WPM |
6 | desktop | upClick | 45528 | WPM |
7 | mobile | upClick | 3735 | WPM |
8 | desktop | fnClick | 7527839 | W |
9 | mobile | extClick | 20533646 | W |
10 | mobile | fnHover | 2071104 | W |
11 | mobile | fnClick | 15580373 | W |
12 | desktop | extClick | 29294243 | W |
13 | desktop | fnHover | 33855425 | W |
14 | desktop | upClick | 999363 | W |
15 | mobile | upClick | 112259 | W |
from pyspark.sql import functions as fn
w_query = """
SELECT CASE WHEN access_method = 'desktop' THEN 'desktop' ELSE 'mobile' END as mode,
SUM(view_count) AS total_pageviews, 'W' as group
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_w_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY access_method
"""
w_pageviews = spark.sql(w_query.format(start_date_string, end_date_string))\
.groupBy("mode", "group").agg(fn.sum("total_pageviews").alias("total_pageviews"))
wpm_query = """
SELECT CASE WHEN access_method = 'desktop' THEN 'desktop' ELSE 'mobile' END as mode,
SUM(view_count) AS total_pageviews, 'WPM' as group
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_wpm_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY access_method
"""
wpm_pageviews = spark.sql(wpm_query.format(start_date_string, end_date_string))\
.groupBy("mode", "group").agg(fn.sum("total_pageviews").alias("total_pageviews"))
all_pageviews = wpm_pageviews.union(w_pageviews).toPandas()
all_pageviews
mode | group | total_pageviews | |
---|---|---|---|
0 | mobile | WPM | 125364759 |
1 | desktop | WPM | 62650292 |
2 | mobile | W | 4175643369 |
3 | desktop | W | 3134600739 |
events_summary = all_events.merge(all_pageviews, on=['mode', 'group'])
events_summary['pageloads_without_event'] = events_summary['total_pageviews']-events_summary['pageloads_with_event']
events_summary['event_ratio'] = events_summary['pageloads_with_event']/events_summary['total_pageviews']
events_summary['pages_per_event'] = events_summary['total_pageviews']/events_summary['pageloads_with_event']
events_summary
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
0 | desktop | fnClick | 350362 | WPM | 62650292 | 62299930 | 0.005592 | 178.815888 |
1 | desktop | extClick | 569352 | WPM | 62650292 | 62080940 | 0.009088 | 110.037889 |
2 | desktop | fnHover | 1466075 | WPM | 62650292 | 61184217 | 0.023401 | 42.733347 |
3 | desktop | upClick | 45528 | WPM | 62650292 | 62604764 | 0.000727 | 1376.082674 |
4 | mobile | extClick | 256629 | WPM | 125364759 | 125108130 | 0.002047 | 488.505816 |
5 | mobile | fnHover | 93890 | WPM | 125364759 | 125270869 | 0.000749 | 1335.230152 |
6 | mobile | fnClick | 537300 | WPM | 125364759 | 124827459 | 0.004286 | 233.323579 |
7 | mobile | upClick | 3735 | WPM | 125364759 | 125361024 | 0.000030 | 33564.861847 |
8 | desktop | fnClick | 7527839 | W | 3134600739 | 3127072900 | 0.002402 | 416.401140 |
9 | desktop | extClick | 29294243 | W | 3134600739 | 3105306496 | 0.009345 | 107.003985 |
10 | desktop | fnHover | 33855425 | W | 3134600739 | 3100745314 | 0.010801 | 92.587842 |
11 | desktop | upClick | 999363 | W | 3134600739 | 3133601376 | 0.000319 | 3136.598752 |
12 | mobile | extClick | 20533646 | W | 4175643369 | 4155109723 | 0.004917 | 203.356158 |
13 | mobile | fnHover | 2071104 | W | 4175643369 | 4173572265 | 0.000496 | 2016.143742 |
14 | mobile | fnClick | 15580373 | W | 4175643369 | 4160062996 | 0.003731 | 268.006637 |
15 | mobile | upClick | 112259 | W | 4175643369 | 4175531110 | 0.000027 | 37196.513144 |
from IPython.core import display as ICD
import scipy
for mode in events_summary['mode'].unique():
for eventType in events_summary['eventType'].unique():
print("\n-------------")
df = events_summary[(events_summary['mode']==mode)
& (events_summary['group'].isin(["WPM", "W"]))
& (events_summary['eventType']==eventType)]
ICD.display(df)
print("Contingency table:")
cm = df[['pageloads_with_event', 'pageloads_without_event']].as_matrix()
print(cm)
oddsratio, pvalue = scipy.stats.fisher_exact(cm)
if pvalue < 0.001:
print("\nOddsRatio: {}, p-value < 0.001".format(oddsratio))
else:
print("\nOddsRatio: {}, p-value = {}".format(oddsratio, pvalue))
-------------
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
0 | desktop | fnClick | 350362 | WPM | 62650292 | 62299930 | 0.005592 | 178.815888 |
8 | desktop | fnClick | 7527839 | W | 3134600739 | 3127072900 | 0.002402 | 416.401140 |
Contingency table: [[ 350362 62299930] [ 7527839 3127072900]] OddsRatio: 2.3361306201136354, p-value < 0.001 -------------
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
1 | desktop | extClick | 569352 | WPM | 62650292 | 62080940 | 0.009088 | 110.037889 |
9 | desktop | extClick | 29294243 | W | 3134600739 | 3105306496 | 0.009345 | 107.003985 |
Contingency table: [[ 569352 62080940] [ 29294243 3105306496]] OddsRatio: 0.9721756931564446, p-value < 0.001 -------------
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
2 | desktop | fnHover | 1466075 | WPM | 62650292 | 61184217 | 0.023401 | 42.733347 |
10 | desktop | fnHover | 33855425 | W | 3134600739 | 3100745314 | 0.010801 | 92.587842 |
Contingency table: [[ 1466075 61184217] [ 33855425 3100745314]] OddsRatio: 2.194596122471873, p-value < 0.001 -------------
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
3 | desktop | upClick | 45528 | WPM | 62650292 | 62604764 | 0.000727 | 1376.082674 |
11 | desktop | upClick | 999363 | W | 3134600739 | 3133601376 | 0.000319 | 3136.598752 |
Contingency table: [[ 45528 62604764] [ 999363 3133601376]] OddsRatio: 2.2802983491720803, p-value < 0.001 -------------
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
6 | mobile | fnClick | 537300 | WPM | 125364759 | 124827459 | 0.004286 | 233.323579 |
14 | mobile | fnClick | 15580373 | W | 4175643369 | 4160062996 | 0.003731 | 268.006637 |
Contingency table: [[ 537300 124827459] [ 15580373 4160062996]] OddsRatio: 1.1492877219694864, p-value < 0.001 -------------
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
4 | mobile | extClick | 256629 | WPM | 125364759 | 125108130 | 0.002047 | 488.505816 |
12 | mobile | extClick | 20533646 | W | 4175643369 | 4155109723 | 0.004917 | 203.356158 |
Contingency table: [[ 256629 125108130] [ 20533646 4155109723]] OddsRatio: 0.41508460385598855, p-value < 0.001 -------------
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
5 | mobile | fnHover | 93890 | WPM | 125364759 | 125270869 | 0.000749 | 1335.230152 |
13 | mobile | fnHover | 2071104 | W | 4175643369 | 4173572265 | 0.000496 | 2016.143742 |
Contingency table: [[ 93890 125270869] [ 2071104 4173572265]] OddsRatio: 1.5103419291607243, p-value < 0.001 -------------
mode | eventType | pageloads_with_event | group | total_pageviews | pageloads_without_event | event_ratio | pages_per_event | |
---|---|---|---|---|---|---|---|---|
7 | mobile | upClick | 3735 | WPM | 125364759 | 125361024 | 0.000030 | 33564.861847 |
15 | mobile | upClick | 112259 | W | 4175643369 | 4175531110 | 0.000027 | 37196.513144 |
Contingency table: [[ 3735 125361024] [ 112259 4175531110]] OddsRatio: 1.1082012347934034, p-value < 0.001