# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'
# pandas float formatting
pd.options.display.float_format = '{:.4f}'.format
# Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type
pm_category_events_query = """
SELECT projmed_categories.category, action, count(*) count
FROM
citationusage,
(SELECT DISTINCT page_id, category
FROM ryanmax.projmed_categories
WHERE projmed_categories.category LIKE '%Class_medicine_articles%'
AND to_date(projmed_categories.dt) = '2019-04-20'
)
AS projmed_categories
WHERE citationusage.page_id = projmed_categories.page_id
AND wiki = 'enwiki'
AND citationusage.page_id IN
(SELECT page_id
FROM ryanmax.population_wpm_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY projmed_categories.category, action
ORDER BY projmed_categories.category, action
"""
pm_category_events = spark.sql(
pm_category_events_query.format(
event_exclusion_sql, start_date_string, end_date_string
))
pm_category_events_rdd = pm_category_events.rdd
pm_category_events_df = sqlContext.createDataFrame(pm_category_events_rdd)
pm_category_events_pandas = pm_category_events_df.toPandas()
pm_category_events_pandas.pivot(index='category', columns='action', values='count')
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
category | ||||
B-Class_medicine_articles | 197327.0000 | 332716.0000 | 608502.0000 | 19064.0000 |
C-Class_medicine_articles | 212225.0000 | 238659.0000 | 413065.0000 | 12359.0000 |
Category-Class_medicine_articles | 12.0000 | 6.0000 | 13.0000 | nan |
Disambig-Class_medicine_articles | 164.0000 | 27.0000 | 29.0000 | 2.0000 |
FA-Class_medicine_articles | 14235.0000 | 29790.0000 | 70200.0000 | 2152.0000 |
FL-Class_medicine_articles | 425.0000 | 911.0000 | 1260.0000 | 40.0000 |
GA-Class_medicine_articles | 26627.0000 | 65354.0000 | 139347.0000 | 8036.0000 |
List-Class_medicine_articles | 12431.0000 | 10669.0000 | 18497.0000 | 187.0000 |
Redirect-Class_medicine_articles | 148.0000 | 98.0000 | 106.0000 | 2.0000 |
Start-Class_medicine_articles | 291457.0000 | 190972.0000 | 287123.0000 | 6710.0000 |
Stub-Class_medicine_articles | 70427.0000 | 17996.0000 | 21208.0000 | 700.0000 |
pm_category_events_pandas.groupby(['category']).agg(
[
('total_events','sum'),
('daily_average', lambda x: sum(x)/days_in_study)
])
count | ||
---|---|---|
total_events | daily_average | |
category | ||
B-Class_medicine_articles | 1157609 | 36175.2812 |
C-Class_medicine_articles | 876308 | 27384.6250 |
Category-Class_medicine_articles | 31 | 0.9688 |
Disambig-Class_medicine_articles | 222 | 6.9375 |
FA-Class_medicine_articles | 116377 | 3636.7812 |
FL-Class_medicine_articles | 2636 | 82.3750 |
GA-Class_medicine_articles | 239364 | 7480.1250 |
List-Class_medicine_articles | 41784 | 1305.7500 |
Redirect-Class_medicine_articles | 354 | 11.0625 |
Start-Class_medicine_articles | 776262 | 24258.1875 |
Stub-Class_medicine_articles | 110331 | 3447.8438 |
pm_category_events_pandas['average'] = pm_category_events_pandas['count'].map(lambda x: x/days_in_study)
pm_category_events_pandas.pivot(index='category', columns='action', values='average')
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
category | ||||
B-Class_medicine_articles | 6166.4688 | 10397.3750 | 19015.6875 | 595.7500 |
C-Class_medicine_articles | 6632.0312 | 7458.0938 | 12908.2812 | 386.2188 |
Category-Class_medicine_articles | 0.3750 | 0.1875 | 0.4062 | nan |
Disambig-Class_medicine_articles | 5.1250 | 0.8438 | 0.9062 | 0.0625 |
FA-Class_medicine_articles | 444.8438 | 930.9375 | 2193.7500 | 67.2500 |
FL-Class_medicine_articles | 13.2812 | 28.4688 | 39.3750 | 1.2500 |
GA-Class_medicine_articles | 832.0938 | 2042.3125 | 4354.5938 | 251.1250 |
List-Class_medicine_articles | 388.4688 | 333.4062 | 578.0312 | 5.8438 |
Redirect-Class_medicine_articles | 4.6250 | 3.0625 | 3.3125 | 0.0625 |
Start-Class_medicine_articles | 9108.0312 | 5967.8750 | 8972.5938 | 209.6875 |
Stub-Class_medicine_articles | 2200.8438 | 562.3750 | 662.7500 | 21.8750 |
numbers will not match [1] because we're limiting to namespace 0 pages with external links
[1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics
# count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)
# numbers will not match [1] because we're limiting to namespace 0 pages with external links
# [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics
pm_category_pages = """
SELECT category, COUNT(DISTINCT page_id) AS pages_w_links
FROM ryanmax.projmed_categories
WHERE category LIKE '%Class_medicine_articles%'
AND to_date(projmed_categories.dt) = '2019-04-20'
AND page_id IN
(SELECT page_id
FROM ryanmax.population_wpm_pages_with_extlinks
)
GROUP BY category
ORDER BY COUNT(*) DESC
"""
pm_cat_counts = spark.sql(pm_category_pages)
cats = sqlContext.createDataFrame(pm_cat_counts.rdd)
cats.toPandas()
category | pages_w_links | |
---|---|---|
0 | Start-Class_medicine_articles | 14484 |
1 | Stub-Class_medicine_articles | 9765 |
2 | C-Class_medicine_articles | 5332 |
3 | B-Class_medicine_articles | 2173 |
4 | List-Class_medicine_articles | 452 |
5 | GA-Class_medicine_articles | 240 |
6 | FA-Class_medicine_articles | 62 |
7 | Disambig-Class_medicine_articles | 17 |
8 | Redirect-Class_medicine_articles | 13 |
9 | FL-Class_medicine_articles | 12 |
10 | Category-Class_medicine_articles | 1 |
numbers will be higher than overall WP:M pageloads since one page may have more than one category
# pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub)
# numbers will be higher than overall WP:M pageloads since one page may have more than one category
pm_category_pageloads_query = """
SELECT projmed_categories.category, sum(view_count) AS total_pageloads, sum(view_count)/{} AS daily_average
FROM
wmf.pageview_hourly,
(SELECT DISTINCT page_id, category
FROM ryanmax.projmed_categories
WHERE projmed_categories.category LIKE '%Class_medicine_articles%'
AND to_date(projmed_categories.dt) = '2019-04-20'
)
AS projmed_categories
WHERE pageview_hourly.page_id = projmed_categories.page_id
AND pageview_hourly.project = 'en.wikipedia'
AND pageview_hourly.agent_type = 'user'
AND pageview_hourly.page_id IN
(SELECT page_id
FROM ryanmax.population_wpm_pages_with_extlinks
)
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY projmed_categories.category
ORDER BY projmed_categories.category
"""
pm_category_pageloads = spark.sql(
pm_category_pageloads_query.format(
days_in_study, start_date_string, end_date_string
))
pm_category_pageloads.toPandas()
category | total_pageloads | daily_average | |
---|---|---|---|
0 | B-Class_medicine_articles | 57892215 | 1809131.7188 |
1 | C-Class_medicine_articles | 51507495 | 1609609.2188 |
2 | Category-Class_medicine_articles | 259 | 8.0938 |
3 | Disambig-Class_medicine_articles | 36927 | 1153.9688 |
4 | FA-Class_medicine_articles | 4389844 | 137182.6250 |
5 | FL-Class_medicine_articles | 121691 | 3802.8438 |
6 | GA-Class_medicine_articles | 9350159 | 292192.4688 |
7 | List-Class_medicine_articles | 1801806 | 56306.4375 |
8 | Redirect-Class_medicine_articles | 25308 | 790.8750 |
9 | Start-Class_medicine_articles | 54741621 | 1710675.6562 |
10 | Stub-Class_medicine_articles | 8056357 | 251761.1562 |
# count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub)
pm_category_links_query = """
SELECT projmed_categories.category,
COUNT(DISTINCT el_from, el_to) AS num_extlinks,
COUNT(DISTINCT el_from, el_to)/COUNT(DISTINCT el_from) as avg_extlinks_per_page
FROM
ryanmax.population_externallinks,
(SELECT DISTINCT page_id, category
FROM ryanmax.projmed_categories
WHERE projmed_categories.category LIKE '%Class_medicine_articles%'
AND to_date(projmed_categories.dt) = '2019-04-20'
)
AS projmed_categories
WHERE population_externallinks.el_from = projmed_categories.page_id
AND population_externallinks.el_from IN
(SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
GROUP BY projmed_categories.category
ORDER BY projmed_categories.category
"""
pm_category_links = spark.sql(pm_category_links_query)
pm_category_links.toPandas()
category | num_extlinks | avg_extlinks_per_page | |
---|---|---|---|
0 | B-Class_medicine_articles | 216159 | 99.4749 |
1 | C-Class_medicine_articles | 256660 | 48.1358 |
2 | Category-Class_medicine_articles | 135 | 135.0000 |
3 | Disambig-Class_medicine_articles | 85 | 5.0000 |
4 | FA-Class_medicine_articles | 13914 | 224.4194 |
5 | FL-Class_medicine_articles | 2588 | 215.6667 |
6 | GA-Class_medicine_articles | 32793 | 136.6375 |
7 | List-Class_medicine_articles | 34091 | 75.4226 |
8 | Redirect-Class_medicine_articles | 119 | 9.1538 |
9 | Start-Class_medicine_articles | 310574 | 21.4426 |
10 | Stub-Class_medicine_articles | 76668 | 7.8513 |