# population_infobox table
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# Load shared notebook defaults (study dates, common SQL exclusions, parquet paths).
# NOTE(review): IPython magic — this cell only runs inside a Jupyter/IPython session.
%run -i 'data-defaults.py'
# English Wikipedia XML dumps to compare: snapshots of 2019-04-01 and 2019-04-20.
WIKIPEDIA_XML_DUMPS = ['enwiki-20190401-pages-articles-multistream.xml.bz2',
'enwiki-20190420-pages-articles-multistream.xml.bz2']
# Matches an infobox template opening such as "{{Infobox settlement": the second
# "{" of "{{", optional spaces, then "infobox " (case-insensitive). Note it
# requires a trailing space, so "{{Infobox|..." would not match — presumably
# acceptable for this study; verify against the dump if exact counts matter.
INFOBOX_REGEX = re.compile(r'\{ *infobox ', re.IGNORECASE)
def count_infobox(entity, date):
    """Count infobox-template openings in a single page revision.

    entity: one parsed <page> row from the spark-xml reader, exposing
        ``.id`` and ``.revision.text._VALUE`` (the raw wikitext).
    date: dump snapshot date string (YYYYMMDD) used to tag the result.

    Returns a Row with page_id, infobox_count and dt.
    """
    wikitext = entity.revision.text._VALUE
    matches = INFOBOX_REGEX.findall(wikitext)
    return Row(page_id=entity.id, infobox_count=len(matches), dt=date)
# Build one RDD of per-page infobox counts across all dump snapshots.
infobox_rdd = sc.emptyRDD()
for dump_file in WIKIPEDIA_XML_DUMPS:  # renamed from `file` (Py2 builtin shadow)
    wikipedia = sqlContext.read.format('com.databricks.spark.xml') \
        .options(rowTag='page').load(dump_file)
    # Pull the YYYYMMDD snapshot date out of the dump file name.
    dump_date = re.search(r'.*(\d{8}).*', dump_file).group(1)
    # Keep only live articles: main namespace, not a redirect, non-empty text.
    articles = wikipedia \
        .filter("ns = '0'") \
        .filter("redirect._title is null") \
        .filter("revision.text._VALUE is not null") \
        .filter("length(revision.text._VALUE) > 0")
    # Union the mapped RDD directly — the original round-tripped through a
    # temporary DataFrame (createDataFrame(...).rdd) for no benefit. Because
    # the RDD is lazy, bind dump_date as a default argument so each
    # iteration's value is captured (avoids the late-binding-closure bug).
    infobox_rdd = infobox_rdd.union(
        articles.rdd.map(lambda entity, d=dump_date: count_infobox(entity, d)))
# Merge all snapshots into one DataFrame and pull it down as pandas.
infobox_merged = sqlContext.createDataFrame(infobox_rdd)
infobox = infobox_merged.toPandas()
infobox
dt | infobox_count | page_id | |
---|---|---|---|
0 | 20190401 | 0 | 12 |
1 | 20190401 | 1 | 25 |
2 | 20190401 | 0 | 39 |
3 | 20190401 | 1 | 290 |
4 | 20190401 | 2 | 303 |
5 | 20190401 | 0 | 305 |
6 | 20190401 | 2 | 307 |
7 | 20190401 | 1 | 308 |
8 | 20190401 | 0 | 309 |
9 | 20190401 | 1 | 316 |
10 | 20190401 | 2 | 324 |
11 | 20190401 | 1 | 330 |
12 | 20190401 | 2 | 332 |
13 | 20190401 | 0 | 334 |
14 | 20190401 | 0 | 336 |
15 | 20190401 | 1 | 339 |
16 | 20190401 | 1 | 340 |
17 | 20190401 | 1 | 344 |
18 | 20190401 | 1 | 358 |
19 | 20190401 | 0 | 359 |
20 | 20190401 | 0 | 569 |
21 | 20190401 | 0 | 572 |
22 | 20190401 | 0 | 573 |
23 | 20190401 | 0 | 579 |
24 | 20190401 | 0 | 580 |
25 | 20190401 | 1 | 586 |
26 | 20190401 | 0 | 590 |
27 | 20190401 | 0 | 593 |
28 | 20190401 | 1 | 594 |
29 | 20190401 | 1 | 595 |
... | ... | ... | ... |
11686877 | 20190420 | 1 | 60552524 |
11686878 | 20190420 | 1 | 60552528 |
11686879 | 20190420 | 0 | 60552534 |
11686880 | 20190420 | 0 | 60552561 |
11686881 | 20190420 | 1 | 60552576 |
11686882 | 20190420 | 1 | 60552592 |
11686883 | 20190420 | 1 | 60552597 |
11686884 | 20190420 | 0 | 60552601 |
11686885 | 20190420 | 1 | 60552606 |
11686886 | 20190420 | 0 | 60552607 |
11686887 | 20190420 | 1 | 60552611 |
11686888 | 20190420 | 1 | 60552619 |
11686889 | 20190420 | 0 | 60552630 |
11686890 | 20190420 | 1 | 60552632 |
11686891 | 20190420 | 1 | 60552643 |
11686892 | 20190420 | 1 | 60552649 |
11686893 | 20190420 | 1 | 60552656 |
11686894 | 20190420 | 1 | 60552665 |
11686895 | 20190420 | 1 | 60552674 |
11686896 | 20190420 | 1 | 60552678 |
11686897 | 20190420 | 1 | 60552679 |
11686898 | 20190420 | 1 | 60552689 |
11686899 | 20190420 | 1 | 60552690 |
11686900 | 20190420 | 1 | 60552709 |
11686901 | 20190420 | 1 | 60552711 |
11686902 | 20190420 | 1 | 60552724 |
11686903 | 20190420 | 1 | 60552730 |
11686904 | 20190420 | 0 | 60552736 |
11686905 | 20190420 | 1 | 60552745 |
11686906 | 20190420 | 1 | 60552764 |
11686907 rows × 3 columns
# write infobox counts data to a table for later use
# (temp view is session-scoped; CREATE TABLE ... AS SELECT persists it in the
# ryanmax database so later cells/sessions can query it)
infobox_merged.createOrReplaceTempView("temp_infobox_count")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.infobox_count")
sqlContext.sql("CREATE TABLE ryanmax.infobox_count AS SELECT * FROM temp_infobox_count")
DataFrame[]
# Spot-check outliers: pages whose text opens more than 25 infobox templates.
infobox.query('infobox_count>25')
dt | infobox_count | page_id | |
---|---|---|---|
5011 | 20190401 | 49 | 10577 |
6537 | 20190401 | 58 | 13696 |
49147 | 20190401 | 34 | 102962 |
49165 | 20190401 | 27 | 102985 |
49565 | 20190401 | 29 | 103782 |
162390 | 20190401 | 52 | 373334 |
164886 | 20190401 | 34 | 381506 |
169633 | 20190401 | 28 | 397295 |
276936 | 20190401 | 78 | 782670 |
285251 | 20190401 | 32 | 841648 |
293703 | 20190401 | 239 | 871024 |
309385 | 20190401 | 34 | 931088 |
324731 | 20190401 | 27 | 996290 |
346730 | 20190401 | 34 | 1086718 |
350842 | 20190401 | 35 | 1104268 |
350865 | 20190401 | 36 | 1104332 |
350906 | 20190401 | 35 | 1104505 |
350910 | 20190401 | 35 | 1104515 |
350918 | 20190401 | 38 | 1104564 |
351288 | 20190401 | 35 | 1106128 |
351294 | 20190401 | 33 | 1106178 |
351311 | 20190401 | 33 | 1106241 |
351326 | 20190401 | 26 | 1106299 |
351346 | 20190401 | 29 | 1106375 |
351506 | 20190401 | 28 | 1106937 |
357754 | 20190401 | 34 | 1130287 |
358597 | 20190401 | 36 | 1133844 |
358688 | 20190401 | 39 | 1134236 |
506704 | 20190401 | 28 | 1847173 |
526287 | 20190401 | 80 | 1945597 |
... | ... | ... | ... |
11306108 | 20190420 | 31 | 55569842 |
11308223 | 20190420 | 35 | 55595865 |
11309929 | 20190420 | 123 | 55619912 |
11315945 | 20190420 | 179 | 55697297 |
11321659 | 20190420 | 26 | 55763651 |
11325516 | 20190420 | 30 | 55809734 |
11332444 | 20190420 | 27 | 55887267 |
11342183 | 20190420 | 28 | 56002822 |
11353183 | 20190420 | 119 | 56142356 |
11357978 | 20190420 | 129 | 56192995 |
11371695 | 20190420 | 145 | 56363828 |
11377401 | 20190420 | 142 | 56434290 |
11385423 | 20190420 | 130 | 56532039 |
11385904 | 20190420 | 126 | 56538063 |
11389134 | 20190420 | 136 | 56577813 |
11413632 | 20190420 | 334 | 56887146 |
11414205 | 20190420 | 145 | 56891883 |
11435248 | 20190420 | 28 | 57147301 |
11472465 | 20190420 | 139 | 57437661 |
11475831 | 20190420 | 122 | 57482185 |
11479726 | 20190420 | 141 | 57536902 |
11485287 | 20190420 | 136 | 57620451 |
11486673 | 20190420 | 146 | 57637971 |
11492147 | 20190420 | 128 | 57706926 |
11501751 | 20190420 | 126 | 57839087 |
11506322 | 20190420 | 71 | 57897975 |
11520191 | 20190420 | 27 | 58072995 |
11537552 | 20190420 | 30 | 58344889 |
11564773 | 20190420 | 75 | 58742596 |
11652475 | 20190420 | 43 | 60089397 |
729 rows × 3 columns
# calculate rate of change in pages-with-infobox counts between the two dumps
q = """
SELECT COUNT(DISTINCT page_id) AS pages_w_infobox, dt AS extract_date
FROM
ryanmax.infobox_count
GROUP BY extract_date
ORDER BY extract_date
"""
ibs = spark.sql(q)
# ibs is already a DataFrame — convert straight to pandas; the original
# sqlContext.createDataFrame(ibs.rdd) round-trip added nothing.
ibs_pd = ibs.toPandas()
# Relative change from the first snapshot (row 0) to the second (row 1);
# ORDER BY extract_date above guarantees the row order.
print('Rate of change: ',
      (ibs_pd['pages_w_infobox'][1] - ibs_pd['pages_w_infobox'][0]) / ibs_pd['pages_w_infobox'][0])
ibs_pd
Rate of change: 0.00149698163222
pages_w_infobox | extract_date | |
---|---|---|
0 | 5839083 | 20190401 |
1 | 5847824 | 20190420 |
# write data for 2019-04-20 and report out the infobox page count
# (original comment said "link count" — this cell counts pages with infoboxes)
ibs = spark.sql("SELECT * FROM ryanmax.infobox_count WHERE dt = '20190420'")
# ibs is already a DataFrame; the original createDataFrame(ibs.rdd) round-trip
# was redundant. Keep the ibs_df name for any downstream references.
ibs_df = ibs
ibs_df.createOrReplaceTempView("temp_infobox")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.population_infobox")
sqlContext.sql("CREATE TABLE ryanmax.population_infobox AS SELECT * FROM temp_infobox")
spark.sql("select COUNT(DISTINCT page_id) AS pages_w_infobox from ryanmax.population_infobox").show()
+---------------+ |pages_w_infobox| +---------------+ | 5847824| +---------------+
# Count of pages with infoboxes in W pages with external links
# ("W" = the population of Wikipedia article pages with at least one external
# link; population_w_pages_with_extlinks is built elsewhere — presumably
# upstream in this study's pipeline)
w_infobox_query = """
SELECT COUNT(DISTINCT page_id) AS pages_w_infobox
FROM
ryanmax.population_infobox
WHERE page_id IN
(SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
"""
w_infobox = spark.sql(w_infobox_query)
w_infobox.show()
+---------------+ |pages_w_infobox| +---------------+ | 5209322| +---------------+
# Count of pages with infoboxes in WP:M pages with external links
# (WP:M presumably = WikiProject Medicine pages — confirm against the table's
# builder; population_wpm_pages_with_extlinks is created elsewhere)
pm_infobox_query = """
SELECT COUNT(DISTINCT page_id) AS pages_w_infobox
FROM
ryanmax.population_infobox
WHERE page_id IN
(SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
"""
pm_infobox = spark.sql(pm_infobox_query)
pm_infobox.show()
+---------------+ |pages_w_infobox| +---------------+ | 32597| +---------------+
# Number of infoboxes per page in W pages with external links
# Summary stats over infobox_count: min/max, mean (2 dp), exact median via
# PERCENTILE(..., 0.5), standard deviation, and IQR (75th − 25th percentile).
w_infobox_cnts_query = """
SELECT MIN(infobox_count) as min,
MAX(infobox_count) as max,
CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,
PERCENTILE(infobox_count,0.5) AS median_infobox_count,
STDDEV(infobox_count) as stddev_infobox_count,
(PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count
FROM
ryanmax.population_infobox
WHERE page_id IN
(SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
"""
w_infobox_cnts = spark.sql(w_infobox_cnts_query)
w_infobox_cnts.show()
+---+---+---------------------+--------------------+--------------------+-----------------+ |min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count| +---+---+---------------------+--------------------+--------------------+-----------------+ | 0|346| 0.67| 1.0| 0.8573307965874148| 1.0| +---+---+---------------------+--------------------+--------------------+-----------------+
# Number of infoboxes per page in WP:M pages with external links
# Same summary stats as the W-population cell above, restricted to the
# WP:M population table.
pm_infobox_cnts_query = """
SELECT MIN(infobox_count) as min,
MAX(infobox_count) as max,
CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,
PERCENTILE(infobox_count,0.5) AS median_infobox_count,
STDDEV(infobox_count) as stddev_infobox_count,
(PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count
FROM
ryanmax.population_infobox
WHERE page_id IN
(SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
"""
pm_infobox_cnts = spark.sql(pm_infobox_cnts_query)
pm_infobox_cnts.show()
+---+---+---------------------+--------------------+--------------------+-----------------+ |min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count| +---+---+---------------------+--------------------+--------------------+-----------------+ | 0| 11| 0.49| 0.0| 0.5235718191828268| 1.0| +---+---+---------------------+--------------------+--------------------+-----------------+