wmf.pageview_hourly
table# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'
# pandas float formatting
pd.options.display.float_format = '{:.4f}'.format
wpm_query = """
SELECT access_method, SUM(view_count) AS total_pageviews, SUM(view_count)/{} AS daily_average_pageviews
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_wpm_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY access_method
"""
wpm_pageviews = spark.sql(wpm_query.format(days_in_study, start_date_string, end_date_string))
wpm_pageviews_rdd = wpm_pageviews.rdd
wpm_pageviews_df = sqlContext.createDataFrame(wpm_pageviews_rdd)
wpm_pageviews_pandas = wpm_pageviews_df.toPandas()
wpm_pageviews_pandas
access_method | total_pageviews | daily_average_pageviews | |
---|---|---|---|
0 | desktop | 62650292 | 1957821.6250 |
1 | mobile web | 124214376 | 3881699.2500 |
2 | mobile app | 1150383 | 35949.4688 |
w_query = """
SELECT access_method, SUM(view_count) AS total_pageviews, SUM(view_count)/{} AS daily_average_pageviews
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_w_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY access_method
"""
w_pageviews = spark.sql(w_query.format(days_in_study,start_date_string, end_date_string))
w_pageviews_rdd = w_pageviews.rdd
w_pageviews_df = sqlContext.createDataFrame(w_pageviews_rdd)
w_pageviews_pandas = w_pageviews_df.toPandas()
w_pageviews_pandas
access_method | total_pageviews | daily_average_pageviews | |
---|---|---|---|
0 | desktop | 3134600739 | 97956273.0938 |
1 | mobile web | 4120797860 | 128774933.1250 |
2 | mobile app | 54845509 | 1713922.1562 |
# most visited WP:M pages with external links
top_wpm_query = """
SELECT pageview_hourly.page_id, population_page_titles_20190420.page_title, SUM(pageview_hourly.view_count) AS total_pageviews, SUM(pageview_hourly.view_count)/{} AS daily_average_pageviews
FROM wmf.pageview_hourly
LEFT JOIN ryanmax.population_page_titles_20190420 ON pageview_hourly.page_id = population_page_titles_20190420.page_id
WHERE project = 'en.wikipedia'
AND pageview_hourly.page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_wpm_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY pageview_hourly.page_id, population_page_titles_20190420.page_title
ORDER BY total_pageviews DESC
LIMIT 50
"""
top_wpm = spark.sql(top_wpm_query.format(days_in_study,start_date_string, end_date_string))
top_wpm.limit(50).toPandas()
page_id | page_title | total_pageviews | daily_average_pageviews | |
---|---|---|---|---|
0 | 43573275 | Elizabeth Holmes | 1349527 | 42172.7188 |
1 | 41779862 | Theranos | 546969 | 17092.7812 |
2 | 58911 | Measles | 388340 | 12135.6250 |
3 | 18079 | Leonardo da Vinci | 380421 | 11888.1562 |
4 | 27546 | Sexual intercourse | 371290 | 11602.8125 |
5 | 37556 | Asperger syndrome | 336547 | 10517.0938 |
6 | 791546 | Ketogenic diet | 319326 | 9978.9375 |
7 | 4488176 | Factitious disorder imposed on another | 314542 | 9829.4375 |
8 | 4501 | Black Death | 308057 | 9626.7812 |
9 | 56880920 | Ramesh Balwani | 305280 | 9540.0000 |
10 | 52135 | Pneumonia | 292179 | 9130.5938 |
11 | 1232575 | Suicide methods | 290488 | 9077.7500 |
12 | 44990 | Ryan White | 260600 | 8143.7500 |
13 | 56483 | Tourette syndrome | 245082 | 7658.8125 |
14 | 50601 | Cystic fibrosis | 241014 | 7531.6875 |
15 | 1232085 | Cannabidiol | 240093 | 7502.9062 |
16 | 149223 | Borderline personality disorder | 239415 | 7481.7188 |
17 | 26652964 | Intersex | 229836 | 7182.3750 |
18 | 63522 | Crohn's disease | 226383 | 7074.4688 |
19 | 30653 | Tuberculosis | 222910 | 6965.9375 |
20 | 4531 | Bipolar disorder | 221744 | 6929.5000 |
21 | 10024 | MDMA | 206809 | 6462.7812 |
22 | 50603 | Multiple sclerosis | 206073 | 6439.7812 |
23 | 17537 | Lysergic acid diethylamide | 198790 | 6212.1875 |
24 | 1481886 | Cannabis (drug) | 197614 | 6175.4375 |
25 | 5069516 | HIV/AIDS | 196495 | 6140.4688 |
26 | 141915 | Fentanyl | 195843 | 6120.0938 |
27 | 7701 | Cocaine | 195679 | 6114.9688 |
28 | 244113 | Lyme disease | 192052 | 6001.6250 |
29 | 27790 | Schizophrenia | 189400 | 5918.7500 |
30 | 158400 | Sepsis | 186060 | 5814.3750 |
31 | 22228064 | Parkinson's disease | 183926 | 5747.6875 |
32 | 26743 | Sigmund Freud | 182681 | 5708.7812 |
33 | 482534 | Gabapentin | 181754 | 5679.8125 |
34 | 318049 | Fibromyalgia | 179980 | 5624.3750 |
35 | 42043 | Project MKUltra | 175753 | 5492.2812 |
36 | 101965 | Psoriasis | 175601 | 5487.5312 |
37 | 40017873 | Diabetes mellitus | 175558 | 5486.1875 |
38 | 83406 | Paracetamol | 174939 | 5466.8438 |
39 | 491494 | Ankylosing spondylitis | 173697 | 5428.0312 |
40 | 37195273 | Trypophobia | 172821 | 5400.6562 |
41 | 65847 | Vitiligo | 172270 | 5383.4375 |
42 | 25 | Autism | 169124 | 5285.1250 |
43 | 11038318 | Methamphetamine | 166435 | 5201.0938 |
44 | 443800 | Shingles | 164727 | 5147.7188 |
45 | 649100 | Adderall | 164180 | 5130.6250 |
46 | 250858 | Tramadol | 162858 | 5089.3125 |
47 | 20556798 | Myocardial infarction | 160886 | 5027.6875 |
48 | 19375577 | Amyotrophic lateral sclerosis | 160844 | 5026.3750 |
49 | 234806 | Diazepam | 158085 | 4940.1562 |