page_titles
Builds the page_titles table from a single Wikipedia dump file: enwiki-20190420-pages-articles-multistream.xml.bz2
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'
# Path to the bzip2-compressed English Wikipedia XML dump to parse.
# NOTE(review): presumably resolved relative to the notebook's working directory — confirm.
WIKIPEDIA_XML_DUMP = 'enwiki-20190420-pages-articles-multistream.xml.bz2'
def page_title(entity):
    """Map one parsed <page> element to a Row of (page_id, page_title).

    `entity` is a spark-xml row for a <page> tag; only its `id` and
    `title` attributes are read. Returns a pyspark.sql Row suitable for
    sqlContext.createDataFrame.
    """
    # The original body assigned `page_title = entity.title` first, which
    # shadowed this function's own name and was never used — removed.
    return Row(page_id=entity.id, page_title=entity.title)
# Parse the dump with spark-xml, producing one DataFrame row per <page> element.
wikipedia = (
    sqlContext.read.format('com.databricks.spark.xml')
    .options(rowTag='page')
    .load(WIKIPEDIA_XML_DUMP)
)

# Restrict to real articles: main namespace (ns 0), not a redirect,
# and with non-empty revision text.
articles = (
    wikipedia
    .filter("ns = '0'")
    .filter("redirect._title is null")
    .filter("revision.text._VALUE is not null")
    .filter("length(revision.text._VALUE) > 0")
)

# Project each article down to (page_id, page_title) rows.
page_titles = sqlContext.createDataFrame(articles.rdd.map(page_title))
page_titles.show()
+-------+--------------------+ |page_id| page_title| +-------+--------------------+ | 12| Anarchism| | 25| Autism| | 39| Albedo| | 290| A| | 303| Alabama| | 305| Achilles| | 307| Abraham Lincoln| | 308| Aristotle| | 309|An American in Paris| | 316|Academy Award for...| | 324| Academy Awards| | 330| Actrius| | 332| Animalia (book)| | 334|International Ato...| | 336| Altruism| | 339| Ayn Rand| | 340| Alain Connes| | 344| Allan Dwan| | 358| Algeria| | 359|List of Atlas Shr...| +-------+--------------------+ only showing top 20 rows
# save page titles data to a table for later use
page_titles.registerTempTable("temp_page_titles")

# Rebuild the persistent table from scratch each run: drop any stale copy,
# then materialize the temp view into it.
drop_sql = "DROP TABLE IF EXISTS ryanmax.population_page_titles_20190420"
create_sql = "CREATE TABLE ryanmax.population_page_titles_20190420 AS SELECT * FROM temp_page_titles"
sqlContext.sql(drop_sql)
sqlContext.sql(create_sql)
DataFrame[]
# sanity check: 1:1 ratio of page_id to page_title
# (distinct counts of both columns should match if titles are unique per page)
sanity_query = """
select count(distinct page_id) as page_ids, count(distinct page_title) as page_titles
from ryanmax.population_page_titles_20190420
"""
sqlContext.sql(sanity_query).show()
+--------+-----------+ |page_ids|page_titles| +--------+-----------+ | 5847824| 5847824| +--------+-----------+