Time To Event

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'

Select first event by action type

In [2]:
first_event_query = """
SELECT page_token, page_id, action, MIN(event_offset_time) AS first_event_offset_time
FROM
    citationusage
WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
GROUP BY page_token, page_id, action
"""

citationusage_first_event = spark.sql(
        first_event_query.format(event_exclusion_sql, start_date_string, end_date_string
    ))

Register as table:

In [3]:
citationusage_first_event.registerTempTable("citationusage_first_event")
In [4]:
w_event_time_query = """
SELECT action, 
    CAST(AVG(first_event_offset_time) AS DECIMAL(10,2)) AS average_time_to_event_in_ms, 
    PERCENTILE(first_event_offset_time,0.5) AS median_time_to_event_in_ms,
    STDDEV(first_event_offset_time) as stddev,
    (PERCENTILE(first_event_offset_time,0.75) - PERCENTILE(first_event_offset_time,0.25)) as iqr

FROM 
    citationusage_first_event 
WHERE page_id IN 
        (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
GROUP BY action
"""



spark.sql(
    w_event_time_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    )).show()
+--------+---------------------------+--------------------------+--------------------+--------+
|  action|average_time_to_event_in_ms|median_time_to_event_in_ms|              stddev|     iqr|
+--------+---------------------------+--------------------------+--------------------+--------+
| fnHover|                 1133349.50|                   67672.0|  3.36349580336148E8|189559.0|
| fnClick|                  551408.55|                   56581.0|    8973764.21151973|132473.0|
| upClick|                 1677620.66|                   69882.0|1.7689820370919287E7|249820.0|
|extClick|                  391666.78|                   32758.0| 5.647982637670883E7| 86424.0|
+--------+---------------------------+--------------------------+--------------------+--------+

In [5]:
wpm_event_time_query = """
SELECT action, 
    CAST(AVG(first_event_offset_time) AS DECIMAL(10,2)) AS average_time_to_event_in_ms, 
    PERCENTILE(first_event_offset_time,0.5) AS median_time_to_event_in_ms,
    STDDEV(first_event_offset_time) as stddev,
    (PERCENTILE(first_event_offset_time,0.75) - PERCENTILE(first_event_offset_time,0.25)) as iqr

FROM 
    citationusage_first_event 
WHERE page_id IN 
        (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
GROUP BY action
"""

spark.sql(
    wpm_event_time_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    )).show()
+--------+---------------------------+--------------------------+--------------------+---------+
|  action|average_time_to_event_in_ms|median_time_to_event_in_ms|              stddev|      iqr|
+--------+---------------------------+--------------------------+--------------------+---------+
| fnHover|                 4515701.82|                   76874.0|2.1958250545759377E9|232157.25|
| fnClick|                  881732.14|                   72859.5|1.1722656796857577E7|180704.75|
| upClick|                 2010296.18|                   93005.0|1.8052394892862096E7| 367071.0|
|extClick|                  825780.67|                   47532.0|1.1779610983504394E7|147306.25|
+--------+---------------------------+--------------------------+--------------------+---------+

Statistics

Time to click:

In [6]:
# Plots inline
%matplotlib inline
In [7]:
w_click_time_offset_query = """
SELECT action, event_offset_time
FROM 
    citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
    (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND action = 'extClick'
"""

w_click_time_offset = spark.sql(
    w_click_time_offset_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    )).sample(0.2).toPandas()

w_click_time_offset.head()
Out[7]:
action event_offset_time
0 extClick 78857
1 extClick 315215
2 extClick 16107
3 extClick 255079
4 extClick 784937
In [8]:
wpm_click_time_offset_query = """
SELECT action, event_offset_time
FROM 
    citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
    {}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND action = 'extClick'
"""

wpm_click_time_offset = spark.sql(
    wpm_click_time_offset_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    )).toPandas()

wpm_click_time_offset.head()
Out[8]:
action event_offset_time
0 extClick 4192783
1 extClick 172940
2 extClick 27298
3 extClick 25466
4 extClick 21528
In [9]:
w_click_time_offset = w_click_time_offset[w_click_time_offset.event_offset_time > 0]
wpm_click_time_offset = wpm_click_time_offset[wpm_click_time_offset.event_offset_time > 0]
In [10]:
w_click_time_offset['group'] = "W"
wpm_click_time_offset['group'] = "WPM"
w_click_time_offset['event_offset_time_seconds'] = w_click_time_offset["event_offset_time"]/1000
wpm_click_time_offset['event_offset_time_seconds'] = wpm_click_time_offset["event_offset_time"]/1000

all_event_time = w_click_time_offset.append(wpm_click_time_offset)
In [11]:
# fix data collection errors
error_rate = len(all_event_time[all_event_time.event_offset_time<=0])/len(all_event_time)
print("Proportion of samples with negative time = {}".format(error_rate))
Proportion of samples with negative time = 0.0
In [12]:
from scipy.stats import binned_statistic


def get_normalized_hist(bins, data):
    total_elements = len(data)
    groups = binned_statistic(data, data, bins = bins, statistic = lambda r: len(r) / total_elements)
    bin_centers = pd.Series(groups.bin_edges).rolling(2, 2).mean().dropna()
    return bin_centers, groups.statistic

bins = np.logspace(0, 4, 30, base=10)

w_distribution = get_normalized_hist(bins, w_click_time_offset.event_offset_time_seconds)
wpm_distribution = get_normalized_hist(bins, wpm_click_time_offset.event_offset_time_seconds)

figure(num=None, figsize=(12, 5), dpi=80, facecolor='w', edgecolor='k')

# Histogram
plt.subplot(121)
ax = plt.plot(wpm_distribution[0], wpm_distribution[1], marker='+', label="WPM", color="darkorange", zorder=2)
plt.gca().fill_between(wpm_distribution[0], wpm_distribution[1], color = 'orange', alpha = 0.1)
ax = plt.plot(w_distribution[0], w_distribution[1], marker='.', label="W", color="steelblue", zorder=1)
plt.gca().fill_between(w_distribution[0], w_distribution[1], color = 'blue', alpha = 0.1)
plt.gca().legend(loc="upper right")
plt.gca().set_xscale("log", nonposx='clip')
plt.title("Click-time offset W vs. WPM (Normalized Y)")
plt.xlabel("Click-time offset in seconds (log space - 30 bins)")
plt.ylabel("Portion of clicks")
vals = plt.gca().get_yticks()
plt.gca().set_yticklabels(['{:,.0%}'.format(x) for x in vals])

# Box plot
plt.subplot(122)
my_pal = {"WPM": "darkorange", "W": "steelblue"}
sns.boxplot(x="group", y="event_offset_time_seconds", data=all_event_time, 
            showfliers=False, order=["WPM", "W"], palette=my_pal)
plt.ylabel("Click-time offset (seconds)")
plt.xlabel("Group")
plt.title("Distribution of the click-time offset")


plt.savefig("first_click_time.pdf")

Normality test

In [13]:
from scipy import stats
import math

def test_normality(data):  
    k2, p = stats.normaltest(data)
    alpha = 1e-3
    if p < alpha:  # null hypothesis: x comes from a normal distribution
        print("The null hypothesis can be rejected. p < {:g}".format(alpha))
    else:
        print("The null hypothesis cannot be rejected. p = {:g}".format(p))

print("Normal Test WPM")
test_normality(wpm_click_time_offset.event_offset_time_seconds)

print("\nNormal Test W")
test_normality(w_click_time_offset.event_offset_time_seconds)
Normal Test WPM
The null hypothesis can be rejected. p < 0.001

Normal Test W
The null hypothesis can be rejected. p < 0.001

Mann–Whitney U test

In [14]:
from scipy.stats import mannwhitneyu

# compare distributions
stat, p = mannwhitneyu(w_click_time_offset.event_offset_time_seconds, 
                       wpm_click_time_offset.event_offset_time_seconds,
                       alternative="two-sided")
alpha = 0.001
if p < alpha:
    print('Statistics=%.3f, p < %.3f' % (stat, alpha))
else:
    print('Statistics=%.3f, p = %.3f' % (stat, p))
# interpret
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')
Statistics=3553445487136.500, p < 0.001
Different distribution (reject H0)
In [15]:
print("WPM sample size = {}".format(len(w_click_time_offset)))
print("W sample size = {}".format(len(wpm_click_time_offset)))
WPM sample size = 9961488
W sample size = 825712
In [ ]: