Play along: http://bit.ly/ardc-glam-data
Hit space to go forward, shift+space to go back...
Eeek! We're going to run live code during this presentation. To run a code cell (like the one below), just hover over it and then click on the play icon that pops up in the left margin. Do this for every code cell you see!
import requests
import pandas as pd
import json
import altair as alt
from tqdm import tnrange, trange
import folium
from folium.plugins import HeatMapWithTime
from IPython.display import display, HTML
# Trove API key used for every request below.
# NOTE(review): hard-coded credential — acceptable for a live workshop demo,
# but never commit a real key; prefer an environment variable or config file.
api_key = 'ju3rgk0jp354ikmh'
# Render Altair charts inline in a classic Jupyter notebook.
alt.renderers.enable('notebook')
# Fetch newspaper article counts broken down by state from the Trove API.
# A single-space query matches every article; n=1 because we only want the
# facet counts, not individual records.
params = {
    'q': ' ',
    'zone': 'newspaper',
    'encoding': 'json',
    'facet': 'state',
    'n': '1',
    'key': api_key
}
response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)
data = response.json()

# Dig the state facet terms out of the nested JSON and tabulate them.
df = pd.DataFrame(data['response']['zone'][0]['facets']['facet']['term'])

# Facet counts arrive as strings — coerce to numbers so they can be charted,
# and expand 'ACT' to the full name used by the GeoJSON state boundaries.
df['count'] = pd.to_numeric(df['count'], errors='coerce')
df = df.replace('ACT', 'Australian Capital Territory')
df.head()
# Load the Australian state boundary polygons.
with open('data/aus_state.geojson', "r") as fh:
    states = json.load(fh)

# Choropleth: shade each state by its total article count, joining the
# GeoJSON's STATE_NAME property onto the 'display' column of df.
c = (
    alt.Chart(alt.Data(values=states['features']))
    .mark_geoshape(stroke='black', strokeWidth=0.2)
    .encode(
        color=alt.Color(
            'count:Q',
            scale=alt.Scale(scheme='greenblue'),
            legend=alt.Legend(title='Total articles')
        )
    )
    .transform_lookup(
        lookup='properties.STATE_NAME',
        from_=alt.LookupData(df, 'display', ['count'])
    )
    .project(type='mercator')
    .properties(width=600, height=400)
)
c
# How many newspaper articles mention 'influenza' in total?
params = {
    'q': 'influenza',
    'zone': 'newspaper',
    'encoding': 'json',
    'facet': 'category',
    'n': '1',
    'key': api_key
}
response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)
data = response.json()

# The grand total sits alongside the records in the first (and only) zone.
total = int(data['response']['zone'][0]['records']['total'])

# Show the number as big HTML in the notebook.
display(HTML('<p style="padding-top:30px;">There are <span style="font-size:500%;">{:,}</span> articles Tim.</p>'.format(total)))
# Bar chart of the category facet from the previous query.
df = pd.DataFrame(data['response']['zone'][0]['facets']['facet']['term'])
df['count'] = pd.to_numeric(df['count'], errors='coerce')

c1 = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('count:Q', title='Number of articles'),
        y=alt.Y('display:N', title='Category'),
        tooltip=['count:Q']
    )
)
# Show the chart
c1
# Count 'influenza' articles per year, decade by decade.
# Trove's l-decade facet takes the first three digits of a year,
# so 188..193 covers the 1880s through the 1930s (end is exclusive).
start = 188
end = 194
years = []
params = {
    'q': 'influenza',  # the search term (this copy of the params queries influenza, not everything)
    'facet': 'year',  # ask for per-year counts
    'zone': 'newspaper',
    'l-category': 'Article',
    'key': api_key,
    'encoding': 'json',
    'n': 0  # no individual records needed, just the facets
}
# trange is tqdm's range — shows a progress bar while we loop over decades.
for decade in trange(start, end):
    params['l-decade'] = decade
    response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)
    data = response.json()
    # Accumulate the year facet terms from each decade into one list.
    years += data['response']['zone'][0]['facets']['facet']['term']
df = pd.DataFrame(years)
df.head()
# Years come back as strings like '1889' — parse them into datetimes so
# Altair treats the x axis as temporal.
df['display'] = pd.to_datetime(df['display'], format='%Y', errors='coerce')

# Line chart of influenza article counts per year.
c2 = alt.Chart(df).mark_line().encode(
    x='display:T',
    y='count:Q',
    tooltip=[
        alt.Tooltip('display:T', format='%Y', title='Year'),
        alt.Tooltip('count:Q', title='Articles')
    ]
)
# Show the chart
c2
We could try dividing the number of results by the total number of articles published that year...
# Fetch the total number of articles per year (any subject) over the same
# decades, so the influenza counts can be expressed as a proportion of
# everything published that year.
params['q'] = ' '  # a single space matches every article
years = []
for decade in trange(start, end):
    params['l-decade'] = decade
    response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)
    data = response.json()
    years += data['response']['zone'][0]['facets']['facet']['term']
df2 = pd.DataFrame(years)

# BUG FIX: parse df2's own year strings. The original called
# pd.to_datetime(df['display'], ...) — df, not df2 — which silently copied
# the already-parsed influenza column across by index alignment instead of
# converting the baseline data.
df2['display'] = pd.to_datetime(df2['display'], format='%Y', errors='coerce')

# Join influenza counts (count_x) with overall counts (count_y) on year,
# then compute the per-year proportion.
merged = pd.merge(df, df2, on='display')
merged['count_x'] = pd.to_numeric(merged['count_x'], errors='coerce')
merged['count_y'] = pd.to_numeric(merged['count_y'], errors='coerce')
merged['proportion'] = merged['count_x'] / merged['count_y']
merged.head()
# Plot influenza articles as a share of all articles published each year.
c3 = alt.Chart(merged).mark_line().encode(
    x='display:T',
    y='proportion:Q',
    tooltip=[
        alt.Tooltip('display:T', title='Year'),
        alt.Tooltip('proportion:Q', title='Proportion')
    ]
)
# Show the chart
c3
# Set things up: load the gazetteer mapping each newspaper title to a
# place name and lat/lon coordinates.
locations = pd.read_csv('data/trove-newspaper-titles-locations.csv', names=['title_id', 'title', 'state', 'place_id', 'place', 'lat', 'lon'])
# Some titles appear more than once in the CSV; keep one location per title.
locations.drop_duplicates(subset=['title_id'], keep='first', inplace=True)
hm_series = []  # one list of [lat, lon] points per month, for the animated heatmap
time_index = []  # matching 'YYYY-M' frame labels for the heatmap
start = 1918
end = 1919
params = {
    'q': 'influenza',  # the search term (not the match-everything space query)
    'facet': 'title',  # counts per newspaper title, so we can map them
    'l-category': 'Article',
    'zone': 'newspaper',
    'key': api_key,
    'encoding': 'json',
    'n': 0  # facets only, no individual records
}
# For each month of 1918-1919, count 'influenza' articles per newspaper
# title, attach each title's coordinates, and build one list of [lat, lon]
# points per month for the time-stepped heatmap.
for year in range(start, end + 1):
    params['l-year'] = year
    for month in trange(1, 13):
        params['l-month'] = month
        time_index.append('{}-{}'.format(year, month))
        response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)
        data = response.json()
        facets = data['response']['zone'][0]['facets']['facet']['term']
        df = pd.DataFrame(facets)
        df = df[['display', 'count']]
        df.columns = ['title_id', 'total']
        df['total'] = pd.to_numeric(df['total'], errors='coerce')
        # Attach place names and coordinates; titles without a known
        # location get NaN keys and are dropped by the groupby below.
        df_located = pd.merge(df, locations, on='title_id', how='left')
        df_totals = df_located.groupby(['place', 'lat', 'lon']).sum()
        hm_data = []
        for place in df_totals.index:
            # FIX: single-label .loc lookup instead of chained indexing, and
            # force the aggregated count to int — list repetition below raises
            # TypeError if the summed column has a float dtype (which
            # to_numeric(errors='coerce') can produce via NaN).
            total = int(df_totals.loc[place, 'total'])
            # place is a (place, lat, lon) tuple; repeat the coordinate once
            # per article so the heatmap weights each place by its count.
            hm_data += [[place[1], place[2]]] * total
        hm_series.append(hm_data)
# Base map centred on Australia.
m1 = folium.Map(location=[-30, 135], zoom_start=4)

# Animate the monthly point lists built above as a time-stepped heatmap,
# one frame per 'YYYY-M' label in time_index.
heatmap = HeatMapWithTime(
    hm_series,
    index=time_index,
    radius=10,
    auto_play=True
)
heatmap.add_to(m1)
# Show the map
m1
# Baseline: total articles per year (any subject) across the 1900s-1940s,
# for comparison with the influenza series.
start = 190
end = 195
years = []
params = {
    'q': ' ',  # a single space matches every article
    'facet': 'year',
    'zone': 'newspaper',
    'l-category': 'Article',
    'key': api_key,
    'encoding': 'json',
    'n': 0
}
# Walk decade by decade (190 = the 1900s), accumulating the year facets.
for decade in trange(start, end):
    params['l-decade'] = decade
    data = requests.get('http://api.trove.nla.gov.au/v2/result', params=params).json()
    years.extend(data['response']['zone'][0]['facets']['facet']['term'])
df = pd.DataFrame(years)
df.head()
# Parse the year strings to datetimes and plot the baseline totals.
df['display'] = pd.to_datetime(df['display'], format='%Y', errors='coerce')

c4 = alt.Chart(df).mark_line().encode(
    x='display:T',
    y='count:Q',
    tooltip=[
        alt.Tooltip('display:T', format='%Y', title='Year'),
        alt.Tooltip('count:Q', title='Articles')
    ]
)
# Show the chart
c4