# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'
# Total count of events (by all event types) in InfoBoxes for WP:M pages only
# Total count of events (by all event types) in the Main section for WP:M pages only
pm_infobox_events_query = """
SELECT in_infobox, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count
FROM
citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY in_infobox, action
ORDER BY in_infobox, action
"""
pm_infobox_events = spark.sql(
pm_infobox_events_query.format(
days_in_study, event_exclusion_sql, start_date_string, end_date_string
))
pm_infobox_events.toPandas()
in_infobox | action | total_event_count | daily_average_event_count | |
---|---|---|---|---|
0 | False | extClick | 684612 | 21394.12500 |
1 | False | fnClick | 817320 | 25541.25000 |
2 | False | fnHover | 1533302 | 47915.68750 |
3 | False | upClick | 49263 | 1539.46875 |
4 | True | extClick | 141369 | 4417.78125 |
5 | True | fnClick | 70342 | 2198.18750 |
6 | True | fnHover | 26663 | 833.21875 |
# Total count of events (by all event types) in InfoBoxes for W pages
# Total count of events (by all event types) in the Main section for W pages
w_infobox_events_query = """
SELECT in_infobox, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count
FROM
citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY in_infobox, action
ORDER BY in_infobox, action
"""
w_infobox_events = spark.sql(
w_infobox_events_query.format(
days_in_study, event_exclusion_sql, start_date_string, end_date_string
))
w_infobox_events.toPandas()
in_infobox | action | total_event_count | daily_average_event_count | |
---|---|---|---|---|
0 | False | extClick | 35176829 | 1.099276e+06 |
1 | False | fnClick | 21324424 | 6.663882e+05 |
2 | False | fnHover | 34837380 | 1.088668e+06 |
3 | False | upClick | 1111282 | 3.472756e+04 |
4 | True | extClick | 14651060 | 4.578456e+05 |
5 | True | fnClick | 1783788 | 5.574338e+04 |
6 | True | fnHover | 1089149 | 3.403591e+04 |
7 | True | upClick | 340 | 1.062500e+01 |
# Infobox clicks that occurred under a section heading (e.g. External links)
# no limits
infobox_section_events_query = """
SELECT section_id, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count
FROM
citationusage
WHERE
wiki = 'enwiki'
AND in_infobox = TRUE
{}
AND useragent_is_bot = FALSE
GROUP BY section_id, action
ORDER BY total_event_count desc
"""
infobox_section_events = spark.sql(infobox_section_events_query.format(days_in_study,event_exclusion_sql))
infobox_section_events.show()
+--------------------+--------+-----------------+-------------------------+ | section_id| action|total_event_count|daily_average_event_count| +--------------------+--------+-----------------+-------------------------+ | null|extClick| 14979994| 468124.8125| | null| fnClick| 1849201| 57787.53125| | null| fnHover| 1107570| 34611.5625| | Reception| fnClick| 28389| 887.15625| | Reception| fnHover| 13881| 433.78125| | External_links|extClick| 7268| 227.125| | Timeline|extClick| 4907| 153.34375| | Rankings| fnClick| 4270| 133.4375| | Rankings| fnHover| 3925| 122.65625| | In_popular_culture|extClick| 3107| 97.09375| | Bonnie_Parker|extClick| 2648| 82.75| | Soundtrack|extClick| 2208| 69.0| | Elevator_video|extClick| 2046| 63.9375| | Career|extClick| 2020| 63.125| | History|extClick| 1918| 59.9375| | Assassination|extClick| 1774| 55.4375| | Professional_career|extClick| 1671| 52.21875| |Gesundheit!_Insti...|extClick| 1492| 46.625| | Body_camera_footage|extClick| 1425| 44.53125| | Events|extClick| 1392| 43.5| +--------------------+--------+-----------------+-------------------------+ only showing top 20 rows
# Total count of events (by all event types) for each top-level (H2) section ID for WP:M pages only
# where the event also occurred in an InfoBox
pm_section_events_query = """
SELECT population_wpm_sections.section_h2, action, count(*) count
FROM
citationusage
LEFT JOIN ryanmax.population_wpm_sections
ON
population_wpm_sections.page_id = citationusage.page_id
AND population_wpm_sections.section_id = citationusage.section_id
WHERE
wiki = 'enwiki'
AND in_infobox = TRUE
AND citationusage.page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_wpm_pages_with_extlinks
)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY population_wpm_sections.section_h2, action
ORDER BY count desc
"""
pm_section_events = spark.sql(
pm_section_events_query.format(
event_exclusion_sql, start_date_string, end_date_string
))
pm_section_events_rdd = pm_section_events.rdd
pm_section_events_df = sqlContext.createDataFrame(pm_section_events_rdd)
pm_section_events_pandas = pm_section_events_df.toPandas()
** Limits: WP:M pages and >= 5 events **
section_pda = pm_section_events_pandas.copy()
# replace 'NaN' section_h2 with 'missing'
section_pda.section_h2.fillna(value='-- Infobox event outside of a section --', inplace=True)
# limit to counts of 1K or more
section_pda['count'] = section_pda['count'].astype(int)
df_filtered = section_pda.query('count>=5').copy()
# set precision before pivot
df_filtered['count'] = df_filtered['count'].map(lambda x: '{0:.0f}'.format(x))
df_filtered.pivot(index='section_h2', columns='action', values='count')
action | extClick | fnClick | fnHover |
---|---|---|---|
section_h2 | |||
-- Infobox event outside of a section -- | 135256 | 70104 | 26495 |
ASH_(United_Kingdom) | 5 | None | None |
Academics | None | 20 | 18 |
Battle_with_schools | None | 146 | 48 |
Bibliography | 9 | None | None |
Books | 50 | None | None |
Career | 10 | None | None |
Cause | 12 | 5 | None |
Causes | None | None | 21 |
Clinfowiki | 16 | None | None |
Common_families_of_interleukins | 23 | None | None |
DAN_America | 8 | None | None |
Diagnosis | 15 | None | None |
External_links | 4526 | None | None |
EyeWiki | 13 | None | None |
Family_and_birth | 74 | None | None |
Ganfyd | 12 | None | None |
Health | 7 | None | None |
HemOnc.org | 5 | None | None |
History | 67 | None | None |
History,_society,_and_culture | 55 | None | None |
Institute_for_Functional_Medicine | 124 | None | None |
Legacy | 19 | None | None |
Levels | 34 | None | None |
Me2/Orchestra | 8 | None | None |
Mechanism_of_action | 21 | 5 | 8 |
Medical_use | 7 | None | None |
Mental_Health_Parity_and_Addiction_Equity_Act | 5 | None | None |
Mental_changes_and_brain_damage | 75 | None | None |
Neurosurgery_methods | 6 | None | None |
Nurofen | 9 | None | None |
Occurrence | 213 | None | None |
Origin_of_term | 65 | None | None |
Overdose | None | None | 9 |
Overview | 51 | None | None |
Pathophysiology | 22 | None | None |
Political_career | 7 | None | None |
Rankings_and_reputation | None | 8 | 6 |
References | 13 | None | None |
Rehabilitation_attempts | 6 | None | None |
Rescue | 56 | None | None |
School_of_Medicine | None | None | 9 |
See_also | 5 | None | None |
Selected_publications | 76 | None | None |
Signs_and_symptoms | 162 | None | None |
Society_and_culture | 9 | None | None |
Structure | 10 | None | None |
Toxicity_and_precautions | None | None | 7 |
Tuberculosis_in_early_civilization | 7 | None | None |
Uses | 11 | None | None |
Versions | 15 | None | None |
Virology | None | 7 | 12 |
WikEM | 5 | None | None |
WikiDoc | 24 | None | None |
WikiLectures | 7 | None | None |
ZDoggMD | 53 | None | None |