Section ID data was not limited to top-level (H2) sections during data capture, requiring post-capture processing for section ID click data.
Example of capture issue: https://en.wikipedia.org/wiki/Hepatitis#Signs_and_symptoms. Clicks on links under "Acute hepatitis" were captured with section_id Acute_hepatitis, not Signs_and_symptoms.
Post-capture data augmentation: click event section_ids where mapped to parent H2 section headings. See populate-section-table.ipynb for extraction details.
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'
# "raw" section data from captured events to show extent of the capture issue described above
pm_section_events_raw_query = """
SELECT section_id, action, count(*) count
FROM
citationusage
WHERE page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
AND wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY section_id, action
ORDER BY count desc
LIMIT 100
"""
pm_section_events_raw = spark.sql(
pm_section_events_raw_query.format(
event_exclusion_sql, start_date_string, end_date_string
))
pm_section_events_raw_pandas = pm_section_events_raw.toPandas()
section_pda_raw = pm_section_events_raw_pandas.copy()
# replace 'NaN' section_id with 'missing'
section_pda_raw.section_id.fillna(value='-- missing --', inplace=True)
# limit to counts of 1K or more
df_filtered_raw = section_pda_raw.query('count>3000').copy()
# set precision before pivot
df_filtered_raw['count'] = df_filtered_raw['count'].map(lambda x: '{0:.0f}'.format(x))
df_filtered_raw.pivot(index='section_id', columns='action', values='count')
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
section_id | ||||
-- missing -- | 143490 | 365209 | 523476 | None |
Adverse_effects | None | 4252 | 6343 | None |
Background | None | None | 3496 | None |
Cause | None | 3821 | 7787 | None |
Causes | None | 10930 | 18686 | None |
Classification | None | 3029 | 6620 | None |
Criminal_charges | None | 3025 | None | None |
Diagnosis | None | 7353 | 14525 | None |
Downfall | None | None | 3523 | None |
Early_life | None | None | 3192 | None |
Epidemiology | None | 8049 | 16396 | None |
External_links | 114496 | None | None | None |
Function | None | None | 3082 | None |
Further_reading | 12920 | None | None | None |
Genetics | None | None | 6477 | None |
History | None | 23552 | 48210 | None |
Management | None | None | 4420 | None |
Mechanism | None | None | 4187 | None |
Mechanism_of_action | None | 4698 | 10453 | None |
Medical_uses | None | 5672 | 9901 | None |
Medications | None | None | 3138 | None |
Notes | 14437 | None | None | None |
Pathophysiology | None | 3882 | 10161 | None |
Personal_life | None | 6076 | 5398 | None |
Pharmacodynamics | None | None | 7132 | None |
Pharmacokinetics | None | None | 3705 | None |
Prevention | None | 3202 | 5437 | None |
Prognosis | None | 6368 | 7950 | None |
References | 488685 | None | None | 45536 |
Research | None | 3922 | 6242 | None |
Risk_factors | None | None | 3273 | None |
Side_effects | None | 4008 | 5390 | None |
Signs_and_symptoms | None | 17120 | 30088 | None |
Society_and_culture | None | None | 3369 | None |
Transmission | None | None | 3536 | None |
Treatment | None | 12053 | 15874 | None |
Types | None | None | 3559 | None |
United_States | None | None | 4951 | None |
# count of top-level (H2) section IDs for WP:M pages only
pm_sections_query = """
SELECT section_h2, count(distinct page_id) count
FROM
ryanmax.population_wpm_sections
GROUP BY section_h2
ORDER BY count desc, section_h2
"""
pm_sections = spark.sql(pm_sections_query)
pm_sections.toPandas().head(50)
section_h2 | count | |
---|---|---|
0 | References | 30862 |
1 | External_links | 18460 |
2 | See_also | 13327 |
3 | History | 5666 |
4 | Diagnosis | 4654 |
5 | Treatment | 4263 |
6 | Further_reading | 3165 |
7 | Signs_and_symptoms | 2839 |
8 | Causes | 2149 |
9 | Epidemiology | 2066 |
10 | Prognosis | 1300 |
11 | Career | 1253 |
12 | Notes | 1244 |
13 | Research | 1221 |
14 | Cause | 1209 |
15 | Pathophysiology | 1145 |
16 | Medical_uses | 1016 |
17 | Society_and_culture | 937 |
18 | Biography | 842 |
19 | Prevention | 827 |
20 | Management | 796 |
21 | Early_life | 729 |
22 | Symptoms | 624 |
23 | Side_effects | 607 |
24 | Genetics | 597 |
25 | Mechanism | 593 |
26 | Types | 588 |
27 | Personal_life | 587 |
28 | Pharmacology | 553 |
29 | Bibliography | 543 |
30 | Background | 536 |
31 | Early_life_and_education | 526 |
32 | Publications | 502 |
33 | Presentation | 493 |
34 | Classification | 486 |
35 | Education | 480 |
36 | Chemistry | 470 |
37 | Life | 427 |
38 | Mechanism_of_action | 396 |
39 | Adverse_effects | 388 |
40 | Sources | 387 |
41 | Contraindications | 351 |
42 | Works | 340 |
43 | Awards | 338 |
44 | Interactions | 311 |
45 | Structure | 310 |
46 | Uses | 291 |
47 | Selected_publications | 290 |
48 | Function | 279 |
49 | Overview | 279 |
# Total count of events (by all event types) for each top-level (H2) section ID for WP:M pages only
pm_section_events_query = """
SELECT population_wpm_sections.section_h2, action, count(*) count, count(*)/{} AS daily_average
FROM
citationusage
LEFT JOIN ryanmax.population_wpm_sections
ON
population_wpm_sections.page_id = citationusage.page_id
AND population_wpm_sections.section_id = citationusage.section_id
WHERE
wiki = 'enwiki'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY population_wpm_sections.section_h2, action
ORDER BY count desc
"""
pm_section_events = spark.sql(
pm_section_events_query.format(
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
pm_section_events_pandas = pm_section_events.toPandas()
section_pda = pm_section_events_pandas.copy()
# replace 'NaN' section_h2 with 'missing'
section_pda.section_h2.fillna(value='-- missing --', inplace=True)
pivot=section_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')
section_h2s=pivot.index
pivot
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
section_h2 | ||||
-- missing -- | 144124.0 | 367446.0 | 527633.0 | 6.0 |
Adverse_effects | 206.0 | 8017.0 | 14158.0 | NaN |
Applications | 18.0 | 1187.0 | 2429.0 | NaN |
Background | 53.0 | 1853.0 | 4216.0 | 1.0 |
Biography | 60.0 | 1700.0 | 3864.0 | NaN |
Career | 124.0 | 1801.0 | 3876.0 | NaN |
Cause | 174.0 | 8959.0 | 21897.0 | NaN |
Causes | 386.0 | 20480.0 | 39966.0 | 1.0 |
Characteristics | 66.0 | 1624.0 | 3835.0 | NaN |
Chemistry | 14.0 | 2062.0 | 4100.0 | NaN |
Classification | 1602.0 | 3771.0 | 8233.0 | 2.0 |
Clinical_significance | 19.0 | 1210.0 | 2708.0 | NaN |
Criticism | 3.0 | 1263.0 | 2204.0 | NaN |
Definition | 42.0 | 1644.0 | 3126.0 | NaN |
Definitions | 21.0 | 1299.0 | 2501.0 | NaN |
Description | 173.0 | 1234.0 | 2597.0 | NaN |
Diagnosis | 1836.0 | 15648.0 | 34727.0 | NaN |
Early_life | 28.0 | 1431.0 | 3047.0 | NaN |
Effects | 10.0 | 2519.0 | 4717.0 | NaN |
Epidemiology | 47.0 | 9952.0 | 21259.0 | NaN |
Examples | 243.0 | 1022.0 | 1544.0 | NaN |
Experiments | NaN | 1096.0 | 1573.0 | NaN |
Function | 16.0 | 1538.0 | 4510.0 | NaN |
Genetics | 239.0 | 1293.0 | 3632.0 | NaN |
Health_effects | NaN | 2426.0 | 5195.0 | NaN |
History | 536.0 | 34264.0 | 75457.0 | NaN |
Interactions | 5.0 | 1042.0 | 2046.0 | NaN |
Life | 212.0 | 1532.0 | 4646.0 | NaN |
Management | 11.0 | 7595.0 | 18470.0 | NaN |
Mechanism | 71.0 | 2875.0 | 7266.0 | NaN |
Mechanism_of_action | 79.0 | 3827.0 | 8596.0 | NaN |
Medical_use | 7.0 | 1859.0 | 3987.0 | NaN |
Medical_uses | 27.0 | 14913.0 | 29189.0 | NaN |
Notable_cases | 6.0 | 2097.0 | 1742.0 | NaN |
Overview | 80.0 | 1726.0 | 3172.0 | NaN |
Pathogenesis | 1.0 | 1024.0 | 2869.0 | NaN |
Pathophysiology | 68.0 | 5911.0 | 15846.0 | NaN |
Personal_life | 6.0 | 6333.0 | 5641.0 | NaN |
Pharmacology | 22.0 | 7245.0 | 19622.0 | NaN |
Physiology | 4.0 | 1220.0 | 2799.0 | NaN |
Prevention | 90.0 | 5387.0 | 10765.0 | NaN |
Prognosis | 74.0 | 7022.0 | 9220.0 | NaN |
Research | 121.0 | 6413.0 | 11408.0 | NaN |
Risk_factors | 7.0 | 3796.0 | 7167.0 | NaN |
Safety | 16.0 | 1321.0 | 2336.0 | NaN |
Science_and_technology | NaN | 1135.0 | 2123.0 | NaN |
Side_effects | 21.0 | 5930.0 | 9299.0 | NaN |
Signs_and_symptoms | 810.0 | 27628.0 | 56668.0 | 1.0 |
Society_and_culture | 345.0 | 10760.0 | 20100.0 | NaN |
Structure | 195.0 | 1159.0 | 3023.0 | NaN |
Symptoms | 78.0 | 1622.0 | 2472.0 | NaN |
Theranos | NaN | 5974.0 | 10060.0 | NaN |
Treatment | 443.0 | 19942.0 | 29533.0 | NaN |
Treatments | 9.0 | 1016.0 | 1394.0 | NaN |
Types | 345.0 | 4721.0 | 9512.0 | NaN |
Uses | 34.0 | 4604.0 | 9748.0 | NaN |
section_pda.pivot(index='section_h2', columns='action', values='daily_average').loc[section_h2s]
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
section_h2 | ||||
-- missing -- | 4503.87500 | 11482.68750 | 16488.53125 | 0.18750 |
Adverse_effects | 6.43750 | 250.53125 | 442.43750 | NaN |
Applications | 0.56250 | 37.09375 | 75.90625 | NaN |
Background | 1.65625 | 57.90625 | 131.75000 | 0.03125 |
Biography | 1.87500 | 53.12500 | 120.75000 | NaN |
Career | 3.87500 | 56.28125 | 121.12500 | NaN |
Cause | 5.43750 | 279.96875 | 684.28125 | NaN |
Causes | 12.06250 | 640.00000 | 1248.93750 | 0.03125 |
Characteristics | 2.06250 | 50.75000 | 119.84375 | NaN |
Chemistry | 0.43750 | 64.43750 | 128.12500 | NaN |
Classification | 50.06250 | 117.84375 | 257.28125 | 0.06250 |
Clinical_significance | 0.59375 | 37.81250 | 84.62500 | NaN |
Criticism | 0.09375 | 39.46875 | 68.87500 | NaN |
Definition | 1.31250 | 51.37500 | 97.68750 | NaN |
Definitions | 0.65625 | 40.59375 | 78.15625 | NaN |
Description | 5.40625 | 38.56250 | 81.15625 | NaN |
Diagnosis | 57.37500 | 489.00000 | 1085.21875 | NaN |
Early_life | 0.87500 | 44.71875 | 95.21875 | NaN |
Effects | 0.31250 | 78.71875 | 147.40625 | NaN |
Epidemiology | 1.46875 | 311.00000 | 664.34375 | NaN |
Examples | 7.59375 | 31.93750 | 48.25000 | NaN |
Experiments | NaN | 34.25000 | 49.15625 | NaN |
Function | 0.50000 | 48.06250 | 140.93750 | NaN |
Genetics | 7.46875 | 40.40625 | 113.50000 | NaN |
Health_effects | NaN | 75.81250 | 162.34375 | NaN |
History | 16.75000 | 1070.75000 | 2358.03125 | NaN |
Interactions | 0.15625 | 32.56250 | 63.93750 | NaN |
Life | 6.62500 | 47.87500 | 145.18750 | NaN |
Management | 0.34375 | 237.34375 | 577.18750 | NaN |
Mechanism | 2.21875 | 89.84375 | 227.06250 | NaN |
Mechanism_of_action | 2.46875 | 119.59375 | 268.62500 | NaN |
Medical_use | 0.21875 | 58.09375 | 124.59375 | NaN |
Medical_uses | 0.84375 | 466.03125 | 912.15625 | NaN |
Notable_cases | 0.18750 | 65.53125 | 54.43750 | NaN |
Overview | 2.50000 | 53.93750 | 99.12500 | NaN |
Pathogenesis | 0.03125 | 32.00000 | 89.65625 | NaN |
Pathophysiology | 2.12500 | 184.71875 | 495.18750 | NaN |
Personal_life | 0.18750 | 197.90625 | 176.28125 | NaN |
Pharmacology | 0.68750 | 226.40625 | 613.18750 | NaN |
Physiology | 0.12500 | 38.12500 | 87.46875 | NaN |
Prevention | 2.81250 | 168.34375 | 336.40625 | NaN |
Prognosis | 2.31250 | 219.43750 | 288.12500 | NaN |
Research | 3.78125 | 200.40625 | 356.50000 | NaN |
Risk_factors | 0.21875 | 118.62500 | 223.96875 | NaN |
Safety | 0.50000 | 41.28125 | 73.00000 | NaN |
Science_and_technology | NaN | 35.46875 | 66.34375 | NaN |
Side_effects | 0.65625 | 185.31250 | 290.59375 | NaN |
Signs_and_symptoms | 25.31250 | 863.37500 | 1770.87500 | 0.03125 |
Society_and_culture | 10.78125 | 336.25000 | 628.12500 | NaN |
Structure | 6.09375 | 36.21875 | 94.46875 | NaN |
Symptoms | 2.43750 | 50.68750 | 77.25000 | NaN |
Theranos | NaN | 186.68750 | 314.37500 | NaN |
Treatment | 13.84375 | 623.18750 | 922.90625 | NaN |
Treatments | 0.28125 | 31.75000 | 43.56250 | NaN |
Types | 10.78125 | 147.53125 | 297.25000 | NaN |
Uses | 1.06250 | 143.87500 | 304.62500 | NaN |
# count of events by event type and access mode for each top-level (H2) section ID for WP:M pages only
pm_section_events_mode_query = """
SELECT population_wpm_sections.section_h2, action, mode, count(*) count
FROM
citationusage
LEFT JOIN ryanmax.population_wpm_sections
ON
population_wpm_sections.page_id = citationusage.page_id
AND population_wpm_sections.section_id = citationusage.section_id
WHERE
wiki = 'enwiki'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY population_wpm_sections.section_h2, action, mode
ORDER BY count desc
"""
pm_section_events_mode = spark.sql(
pm_section_events_mode_query.format(
event_exclusion_sql, start_date_string, end_date_string
))
pm_section_events_mode_pandas = pm_section_events_mode.toPandas()
desktop_pda = pm_section_events_mode_pandas.query('mode == "desktop"').copy()
# replace 'NaN' section_h2 with 'missing'
desktop_pda.section_h2.fillna(value='-- missing --', inplace=True)
desktop_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
section_h2 | ||||
-- missing -- | 65483.0 | 123684.0 | 485050.0 | 6.0 |
Adverse_effects | 49.0 | 2487.0 | 13283.0 | NaN |
Background | 25.0 | 1024.0 | 4057.0 | NaN |
Cause | 96.0 | 3624.0 | 20839.0 | NaN |
Causes | 207.0 | 7194.0 | 37401.0 | NaN |
Chemistry | 9.0 | 1232.0 | 3941.0 | NaN |
Classification | 377.0 | 1697.0 | 7789.0 | 1.0 |
Diagnosis | 722.0 | 6724.0 | 32950.0 | NaN |
Epidemiology | 33.0 | 5265.0 | 20627.0 | NaN |
Function | 9.0 | 1096.0 | 4392.0 | NaN |
Health_effects | NaN | 1019.0 | 4856.0 | NaN |
History | 330.0 | 18914.0 | 72711.0 | NaN |
Management | 5.0 | 3046.0 | 17377.0 | NaN |
Mechanism | 40.0 | 1564.0 | 6992.0 | NaN |
Mechanism_of_action | 54.0 | 2371.0 | 8291.0 | NaN |
Medical_uses | 14.0 | 5291.0 | 27188.0 | NaN |
Overview | 45.0 | 1003.0 | 3055.0 | NaN |
Pathophysiology | 38.0 | 3170.0 | 15170.0 | NaN |
Personal_life | 3.0 | 1389.0 | 5229.0 | NaN |
Pharmacology | 14.0 | 4039.0 | 18964.0 | NaN |
Prevention | 38.0 | 2140.0 | 10100.0 | NaN |
Prognosis | 48.0 | 2126.0 | 8709.0 | NaN |
Research | 70.0 | 2977.0 | 10980.0 | NaN |
Risk_factors | 4.0 | 1524.0 | 6698.0 | NaN |
Side_effects | 15.0 | 1775.0 | 8654.0 | NaN |
Signs_and_symptoms | 240.0 | 8395.0 | 52838.0 | 1.0 |
Society_and_culture | 167.0 | 3998.0 | 19013.0 | NaN |
Theranos | NaN | 1840.0 | 9671.0 | NaN |
Treatment | 135.0 | 5844.0 | 27318.0 | NaN |
Types | 170.0 | 2025.0 | 8863.0 | NaN |
Uses | 16.0 | 1978.0 | 9257.0 | NaN |
mobile_pda = pm_section_events_mode_pandas.query('mode == "mobile"').copy()
# replace 'NaN' section_h2 with 'missing'
mobile_pda.section_h2.fillna(value='-- missing --', inplace=True)
mobile_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
section_h2 | ||||
-- missing -- | 78641.0 | 243762.0 | 42583.0 | NaN |
Adverse_effects | 157.0 | 5530.0 | 875.0 | NaN |
Career | 38.0 | 1044.0 | 132.0 | NaN |
Cause | 78.0 | 5335.0 | 1058.0 | NaN |
Causes | 179.0 | 13286.0 | 2565.0 | 1.0 |
Characteristics | 4.0 | 1009.0 | 204.0 | NaN |
Classification | 1225.0 | 2074.0 | 444.0 | 1.0 |
Diagnosis | 1114.0 | 8924.0 | 1777.0 | NaN |
Early_life | 12.0 | 1090.0 | 193.0 | NaN |
Effects | 10.0 | 1641.0 | 305.0 | NaN |
Epidemiology | 14.0 | 4687.0 | 632.0 | NaN |
Health_effects | NaN | 1407.0 | 339.0 | NaN |
History | 206.0 | 15350.0 | 2746.0 | NaN |
Management | 6.0 | 4549.0 | 1093.0 | NaN |
Mechanism | 31.0 | 1311.0 | 274.0 | NaN |
Mechanism_of_action | 25.0 | 1456.0 | 305.0 | NaN |
Medical_use | 2.0 | 1095.0 | 232.0 | NaN |
Medical_uses | 13.0 | 9622.0 | 2001.0 | NaN |
Notable_cases | 3.0 | 1633.0 | 126.0 | NaN |
Pathophysiology | 30.0 | 2741.0 | 676.0 | NaN |
Personal_life | 3.0 | 4944.0 | 412.0 | NaN |
Pharmacology | 8.0 | 3206.0 | 658.0 | NaN |
Prevention | 52.0 | 3247.0 | 665.0 | NaN |
Prognosis | 26.0 | 4896.0 | 511.0 | NaN |
Research | 51.0 | 3436.0 | 428.0 | NaN |
Risk_factors | 3.0 | 2272.0 | 469.0 | NaN |
Side_effects | 6.0 | 4155.0 | 645.0 | NaN |
Signs_and_symptoms | 570.0 | 19233.0 | 3830.0 | NaN |
Society_and_culture | 178.0 | 6762.0 | 1087.0 | NaN |
Symptoms | 55.0 | 1138.0 | 168.0 | NaN |
Theranos | NaN | 4134.0 | 389.0 | NaN |
Treatment | 308.0 | 14098.0 | 2215.0 | NaN |
Types | 175.0 | 2696.0 | 649.0 | NaN |
Uses | 18.0 | 2626.0 | 491.0 | NaN |
# event counts for pages with Adverse_effects section H2s
adverse_events_query = """
SELECT citationusage.page_id, population_page_titles_20190420.page_title, count(*) AS event_count
FROM
citationusage
LEFT JOIN ryanmax.population_wpm_sections
ON
population_wpm_sections.page_id = citationusage.page_id
AND population_wpm_sections.section_id = citationusage.section_id
LEFT JOIN ryanmax.population_page_titles_20190420
ON
population_page_titles_20190420.page_id = citationusage.page_id
WHERE
wiki = 'enwiki'
AND population_wpm_sections.section_h2 = 'Adverse_effects'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY citationusage.page_id, population_page_titles_20190420.page_title
ORDER BY event_count desc
"""
adverse_events = spark.sql(
adverse_events_query.format(
event_exclusion_sql, start_date_string, end_date_string
))
adverse_events_pandas = adverse_events.toPandas()
print('total events for Adverse_effects: ',adverse_events_pandas['event_count'].sum())
adverse_events_pandas.head(20)
total events for Adverse_effects: 22381
page_id | page_title | event_count | |
---|---|---|---|
0 | 1481886 | Cannabis (drug) | 1198 |
1 | 10024 | MDMA | 1099 |
2 | 1245311 | Finasteride | 511 |
3 | 17537 | Lysergic acid diethylamide | 464 |
4 | 83406 | Paracetamol | 433 |
5 | 141915 | Fentanyl | 393 |
6 | 724730 | Clonazepam | 375 |
7 | 8718425 | Circumcision | 339 |
8 | 155627 | Ibuprofen | 332 |
9 | 178197 | Statin | 315 |
10 | 253720 | Metformin | 310 |
11 | 229985 | Isotretinoin | 280 |
12 | 22071 | Nonsteroidal anti-inflammatory drug | 263 |
13 | 1525 | Aspirin | 262 |
14 | 4781 | Benzodiazepine | 245 |
15 | 185272 | Atypical antipsychotic | 237 |
16 | 201310 | Citalopram | 229 |
17 | 520574 | Venlafaxine | 220 |
18 | 31690663 | Quinolone antibiotic | 219 |
19 | 875202 | Mitragyna speciosa | 216 |