QueryPic deconstructed

Visualise searches in Trove's digitised newspapers

QueryPic is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.

This is a deconstructed, extended, and hackable version of QueryPic.

In [ ]:
import requests
from requests.exceptions import HTTPError, Timeout
import os
import ipywidgets as widgets
from operator import itemgetter # used for sorting
import pandas as pd # makes manipulating the data easier
import altair as alt
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm_notebook
from IPython.display import display, HTML, FileLink, clear_output
import math
from collections import OrderedDict
import time

# The CSV and HTML files this notebook saves are written to ./data,
# so create it up front (nothing happens if it is already there)
os.makedirs('data', exist_ok=True)

# Render Altair charts with the 'notebook' renderer
alt.renderers.enable('notebook')

# Shared HTTP session: responses with status 502, 503 or 504 are retried
# (up to 5 retries, with a backoff between attempts) for both URL schemes
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s = requests.Session()
for prefix in ('http://', 'https://'):
    s.mount(prefix, HTTPAdapter(max_retries=retries))

Enter your Trove API key

Get your own Trove API key and enter it below.

In [ ]:
# Text box holding the user's Trove API key; the request code in later
# cells reads it through api_key.value
api_key = widgets.Text(
    description='API key:',
    placeholder='Enter your Trove API key',
    disabled=False,
)
display(api_key)
In [ ]:
# Base parameters for every search request; get_data() copies this dictionary
# and adds the API key and the query-specific values to the copy
params = {
    'q': ' ',  # A space to search for everything
    'facet': 'year',
    'zone': 'newspaper',
    'l-category': 'Article',
    'encoding': 'json',
    # NOTE(review): presumably asks for zero article records so that only
    # the facet counts come back -- confirm against the Trove API docs
    'n': 0,
}

# Output areas: one for progress messages and the chart, one for the
# download link and save controls
results, save_data = widgets.Output(), widgets.Output()

# Holds the combined results dataframe once a chart has been created
df = None
In [ ]:
def get_results(params):
    '''
    Send a search request to the Trove API and return the decoded response.
    Parameters:
        params  - dictionary of query string parameters for the request
    Returns:
        JSON formatted response data from Trove API
    Raises:
        requests.exceptions.HTTPError if the API answers with an error status
    '''
    api_url = 'https://api.trove.nla.gov.au/v2/result'
    # Go through the shared session so server errors are retried automatically
    reply = s.get(api_url, params=params, timeout=30)
    # Fail loudly on an error status instead of trying to parse an error page
    reply.raise_for_status()
    # print(reply.url) # This shows us the url that's sent to the API
    return reply.json()

def get_facets(data, year_range=None):
    '''
    Loop through facets in Trove API response, saving terms and counts.
    Parameters:
        data        - JSON formatted response data from Trove API
        year_range  - optional (first_year, last_year) pair, both inclusive;
                      when omitted the current value of the date_range
                      slider is used
    Returns:
        A list of dictionaries containing: 'year', 'total_results', sorted by year.
        The list is empty when the response has no usable facet terms.
    '''
    if year_range is None:
        year_range = date_range.value
    first_year, last_year = year_range[0], year_range[1]
    facets = []
    try:
        for term in data['response']['zone'][0]['facets']['facet']['term']:
            # Parse the year once and reuse it for the range test and the row
            year = int(term['display'])
            if first_year <= year <= last_year:
                facets.append({'year': year, 'total_results': int(term['count'])})
        facets.sort(key=itemgetter('year'))
    except TypeError:
        # Best effort: a response without the expected nested facet structure
        # just yields whatever was collected so far (normally nothing).
        # NOTE(review): presumably this is what a search with no hits looks
        # like -- confirm against real API output
        pass
    return facets

def combine_totals(query_data, total_data):
    '''
    Take facets data from the query search and a blank search (ie everything) for a decade and combine them.

    Rows are matched on year rather than on list position: the query search
    can have fewer year facets than the blank search (a year with no matching
    articles may simply be absent), and pairing by position would then attach
    the wrong year's total.

    Parameters:
        query_data    - JSON response data from a query search
        total_data    - JSON response data from a blank search
    Returns:
        A list of dictionaries containing: 'year', 'total_results', 'total_articles'.
        A query year with no corresponding total is left out, because no
        proportion can be calculated for it.
    '''
    query_rows = get_facets(query_data)
    # Index the blank-search counts by year for direct lookup
    articles_by_year = {row['year']: row['total_results'] for row in get_facets(total_data)}
    combined_data = []
    for query_row in query_rows:
        year = query_row['year']
        if year not in articles_by_year:
            continue
        query_row['total_articles'] = articles_by_year[year]
        combined_data.append(query_row)
    return combined_data

def year_totals(params):
    '''
    Generate a dataset for a search query, harvesting the year facets one decade at a time.

    For each decade in the selected date range two requests are made: one with
    the query and one with a blank query. The blank request keeps every other
    parameter, so the totals are subject to the same filters as the query.

    Parameters:
        params    - dictionary of Trove API parameters; params['q'] is the search query.
                    The dictionary is not modified.
    Returns:
        A list of dictionaries, sorted by year, containing: 'year', 'total_results', 'total_articles'
    '''
    # Decades are numbered by dropping the last digit of the year (1803 -> 180)
    start_decade = date_range.value[0] // 10
    end_decade = date_range.value[1] // 10 + 1
    # Work on copies so the caller's dictionary is left exactly as it was passed in
    query_params = dict(params)
    blank_params = dict(params)
    blank_params['q'] = ' '
    totals = []
    with results:
        for decade in tqdm_notebook(range(start_decade, end_decade)):
            query_params['l-decade'] = decade
            blank_params['l-decade'] = decade
            query_data = get_results(query_params)
            total_data = get_results(blank_params)
            totals.extend(combine_totals(query_data, total_data))
    totals.sort(key=itemgetter('year'))
    return totals

Set a date range

In [ ]:
# Slider for the first and last year to chart; get_facets() and
# year_totals() read the selection through date_range.value
date_range = widgets.IntRangeSlider(
    value=[1803, 1954],
    min=1803,
    max=2018,
    step=1,
    description='Date range:',
    orientation='horizontal',
    layout=widgets.Layout(width='50%'),
    readout=True,
    readout_format='0<4d',
    continuous_update=False,
    disabled=False,
)
display(date_range)

Add your search queries

You can just add a single search query to see how the number of matching articles varies over time. But you can also compare frequencies between queries, states, and newspapers:

  • Compare queries — cat vs dog
  • Compare states — swimmers in NSW, Victoria, and Queensland
  • Compare newspapers — protectionism in The Age vs The Argus
In [ ]:
# Queries added so far, in the order they were entered
queries = []
# Output area that lists the queries as they are added
out = widgets.Output()

@out.capture()
def add_query(b):
    '''
    Button handler: add the contents of the query box to the list of queries.

    Surrounding whitespace is removed, and an empty box is ignored so that
    clicking the button by accident does not add a blank query.

    Parameters:
        b  - the button that was clicked (unused)
    '''
    new_query = query.value.strip()
    if not new_query:
        return
    queries.append(new_query)
    # Empty the box ready for the next query
    query.value = ''
    print('Query {}: {}'.format(len(queries), queries[-1]))

# Text box for typing a query
query = widgets.Text(
    placeholder='Enter your query then click the button to add',
    disabled=False,
)

# Button that hands the contents of the text box to add_query()
query_button = widgets.Button(
    description='Add query',
    tooltip='Click to add query',
    icon='',
    disabled=False,
)
query_button.on_click(add_query)

# Help text shown underneath the query box
query_tip = widgets.HTML(
    value="A query can be anything you'd enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want."
)
In [ ]:
def get_titles(b):
    '''
    Button handler: load the list of newspaper titles from the Trove API
    into the titles selection widget, sorted by title.

    The request goes through the shared session (so server errors are
    retried) and over https, in line with get_results().

    Parameters:
        b  - the button that was clicked (unused)
    Raises:
        requests.exceptions.HTTPError if the API answers with an error status
    '''
    request_params = {
        'encoding': 'json',
        'key': api_key.value
    }
    response = s.get('https://api.trove.nla.gov.au/v2/newspaper/titles', params=request_params, timeout=30)
    response.raise_for_status()
    newspapers = response.json()['response']['records']['newspaper']
    # Each option is a (label, value) pair; the value keeps the id for the
    # search and the title for the chart legend
    title_list = [(t['title'], {'id': t['id'], 'title': t['title']}) for t in newspapers]
    title_list.sort(key=itemgetter(0))
    # Selection widgets take a list of pairs directly, so no mapping is needed
    titles.options = title_list
    
# Query to run against each of the selected newspapers
title_query = widgets.Text(
    description='Search for:',
    placeholder='Enter your query',
    disabled=False,
)

# Newspaper list; it starts with a single placeholder entry and is filled
# in by get_titles()
titles = widgets.SelectMultiple(
    description='In:',
    options=['Click on button to load titles'],
    rows=10,
    layout=widgets.Layout(width='50%'),
    disabled=False,
)

# Button that loads the newspaper titles
titles_button = widgets.Button(
    description='Load titles',
    tooltip='Click to load titles',
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    icon='',
    disabled=False,
)
titles_button.on_click(get_titles)

# Help text shown underneath the newspaper list
titles_tip = widgets.HTML(
    value='Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple newspapers to compare.'
)
In [ ]:
# Query to run against each of the selected states
state_query = widgets.Text(
    description='Search for:',
    placeholder='Enter your query',
    disabled=False,
)

# State list; get_data() sends each selected entry as the 'l-state' parameter
states = widgets.SelectMultiple(
    description='In:',
    options=[
        'ACT',
        'New South Wales',
        'Queensland',
        'South Australia',
        'Northern Territory',
        'Tasmania',
        'Victoria',
        'Western Australia',
        'National',
        'International',
    ],
    rows=10,
    layout=widgets.Layout(width='50%'),
    disabled=False,
)

# Help text shown underneath the state list
states_tip = widgets.HTML(
    value='Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple states to compare.'
)
In [ ]:
def plot_raw_results(width=700, height=400):
    '''
    Build a line chart of the raw number of matching articles per year,
    with one line per value of the 'query' column.

    Parameters:
        width   - chart width (default 700)
        height  - chart height (default 400)
    Returns:
        An interactive Altair chart of the module-level dataframe df
    '''
    year_axis = alt.X('year:Q', axis=alt.Axis(format='c', title='Year'))
    count_axis = alt.Y('total_results:Q', axis=alt.Axis(format=',d', title='Number of articles'))
    series = alt.Color('query', legend=alt.Legend(title=''))
    hover = [
        alt.Tooltip('query', title='Query:'),
        alt.Tooltip('year:Q', title='Year'),
        alt.Tooltip('total_results:Q', title='Articles', format=','),
    ]
    lines = alt.Chart(df).mark_line(point=True).encode(
        x=year_axis,
        y=count_axis,
        color=series,
        tooltip=hover,
    )
    return lines.properties(width=width, height=height).interactive()

def plot_relative_results(width=700, height=400):
    '''
    Build a line chart of matching articles as a proportion of all articles
    per year, with one line per value of the 'query' column.

    Parameters:
        width   - chart width (default 700)
        height  - chart height (default 400)
    Returns:
        An interactive Altair chart of the module-level dataframe df
    '''
    year_axis = alt.X('year:Q', axis=alt.Axis(format='c', title='Year'))
    share_axis = alt.Y('PercentOfTotal:Q', axis=alt.Axis(format='.2%', title='Percentage of total articles'))
    series = alt.Color('query', legend=alt.Legend(title=''))
    hover = [
        alt.Tooltip('query', title='Query:'),
        alt.Tooltip('year:Q', title='Year'),
        alt.Tooltip('PercentOfTotal:Q', title='Articles', format='.2%'),
    ]
    lines = alt.Chart(df).mark_line(point=True).encode(
        x=year_axis,
        y=share_axis,
        color=series,
        tooltip=hover,
    )
    sized = lines.properties(width=width, height=height)
    # PercentOfTotal is not a dataframe column; the chart calculates it
    # from the two counts in each row
    with_share = sized.transform_calculate(
        PercentOfTotal="datum.total_results / datum.total_articles"
    )
    return with_share.interactive()
In [ ]:
def clear_all(b):
    '''
    Button handler: reset every input and remove all output.

    Clears the state and newspaper selections, both of their query boxes,
    the list of added queries, the chart area, and the download/save area.

    Parameters:
        b  - the button that was clicked (unused)
    '''
    states.value = []
    state_query.value = ''
    titles.value = []
    title_query.value = ''
    out.clear_output()
    queries.clear()
    results.clear_output()
    # The download link and save controls belong to the chart that was just
    # removed, so take them away as well
    save_data.clear_output()

def _collect_trace(search_params, label, message):
    '''
    Run one search across the selected date range and return its yearly
    totals as a dataframe whose 'query' column is set to label.
    '''
    with results:
        display(HTML(message))
    # Pass a copy so every search starts from the same parameters
    trace = pd.DataFrame(year_totals(search_params.copy()))
    trace['query'] = label
    return trace


def get_data(b):
    '''
    Button handler: run the searches for the selected tab, combine the
    results into the module-level dataframe df, then display the chart,
    a CSV download link and the save-chart controls.

    Tab 0 compares the added queries, tab 1 compares one query across the
    selected states, and tab 2 compares one query across the selected
    newspapers. If nothing matched, 'No results!' is displayed and df is
    left as it was.

    Parameters:
        b  - the button that was clicked (unused)
    '''
    global df
    results.clear_output()
    # Remove the link and controls of any previous chart; otherwise each run
    # would add another set underneath
    save_data.clear_output()
    traces = []
    q_params = params.copy()
    q_params['key'] = api_key.value
    if tab.selected_index == 0:
        for search_term in queries:
            q_params['q'] = search_term
            traces.append(_collect_trace(q_params, search_term, 'Searching for {}...'.format(search_term)))
    elif tab.selected_index == 1:
        q_params['q'] = state_query.value
        for state in states.value:
            q_params['l-state'] = state
            traces.append(_collect_trace(q_params, state, 'Searching in {}...'.format(state)))
    elif tab.selected_index == 2:
        q_params['q'] = title_query.value
        for title in titles.value:
            q_params['l-title'] = title['id']
            traces.append(_collect_trace(q_params, title['title'], 'Searching in {}...'.format(title['title'])))
    # A search that matched nothing gives an empty frame; with only those
    # (or no searches at all) there is nothing to chart
    traces = [trace for trace in traces if not trace.empty]
    if not traces:
        with results:
            display(HTML('No results!'))
        return
    df = pd.concat(traces, ignore_index=True)
    # Reset the dropdown before clearing: if this changes its value,
    # change_chart() draws into results, and the clear below then replaces
    # that drawing instead of leaving a second copy of the chart
    chart_type.value = 'proportion'
    results.clear_output(wait=True)
    chart = plot_relative_results()
    csv_file = save_as_csv()
    with results:
        display(chart_type)
        display(chart)
    with save_data:
        display(HTML('Download data:'), FileLink(csv_file))
        display(widgets.HBox([save_chart_button, save_chart_width, save_chart_height]))

def save_chart(b):
    '''
    Button handler: save the currently selected chart type as a timestamped
    HTML file in the data directory and display a link to it.

    The chart is rebuilt at the size entered in the width and height boxes.

    Parameters:
        b  - the button that was clicked (unused)
    '''
    width = save_chart_width.value
    height = save_chart_height.value
    if chart_type.value == 'proportion':
        chart = plot_relative_results(width, height)
    else:
        chart = plot_raw_results(width, height)
    filename = 'data/querypic-{}.html'.format(int(time.time()))
    chart.save(filename)
    # Display inside an Output widget: output produced by a widget callback
    # outside one is not reliably shown in the notebook. The link appears
    # next to the CSV download link and the save controls.
    with save_data:
        display(HTML('View HTML version:'), FileLink(filename))
    
        
def save_as_csv():
    '''
    Write the module-level dataframe df to a timestamped CSV file in the
    data directory, without the index column.

    Returns:
        The path of the file that was written
    '''
    timestamp = int(time.time())
    csv_path = 'data/querypic-{}.csv'.format(timestamp)
    df.to_csv(csv_path, index=False)
    return csv_path

def change_chart(o):
    '''
    Observer for the chart type dropdown: redraw the chart in the results
    area according to the dropdown's current value.

    Parameters:
        o  - the change notification from the dropdown (unused)
    '''
    show_proportion = chart_type.value == 'proportion'
    # wait=True keeps the old chart on screen until the new one is ready
    results.clear_output(wait=True)
    chart = plot_relative_results() if show_proportion else plot_raw_results()
    with results:
        display(chart_type)
        display(chart)

# Dropdown for switching between raw counts and proportions of all articles
chart_type = widgets.Dropdown(
        options=[('Raw number of results', 'raw'), ('Proportion of total articles', 'proportion')],
        value='proportion'
    )

# Only react to changes of the selected value. Without names= the observer
# is called for every trait that changes, so one selection would redraw the
# chart several times.
chart_type.observe(change_chart, names='value')
    
# Button that resets all inputs and output
clear_all_button = widgets.Button(
    description='Clear all',
    tooltip='Clear current queries',
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    icon='',
    disabled=False,
)

# Button that runs the searches and draws the chart
get_data_button = widgets.Button(
    description='Create chart',
    tooltip='Create chart',
    button_style='primary',  # 'success', 'info', 'warning', 'danger' or ''
    icon='',
    disabled=False,
)

# Button that saves the chart as an HTML file
save_chart_button = widgets.Button(
    description='Save chart',
    tooltip='Save chart as HTML',
    button_style='primary',  # 'success', 'info', 'warning', 'danger' or ''
    icon='',
    disabled=False,
)

# Size of the saved chart; the minimums match the on-screen defaults
save_chart_width = widgets.BoundedIntText(
    description='Width',
    value=700,
    min=700,
    max=2000,
    step=100,
    disabled=False,
)

save_chart_height = widgets.BoundedIntText(
    description='Height',
    value=400,
    min=400,
    max=1500,
    step=100,
    disabled=False,
)

# Connect each button to its handler
for button, handler in (
    (clear_all_button, clear_all),
    (get_data_button, get_data),
    (save_chart_button, save_chart),
):
    button.on_click(handler)

# One panel per kind of comparison
tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])
tab2 = widgets.VBox([state_query, states, states_tip])
tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])

tab = widgets.Tab(children=[tab1, tab2, tab3])
for tab_index, tab_label in enumerate(('Compare queries', 'Compare states', 'Compare newspapers')):
    tab.set_title(tab_index, tab_label)

# Full interface: tabs, action buttons, chart area, then download/save area
action_row = widgets.HBox([get_data_button, clear_all_button])
display(widgets.VBox([tab, action_row, results, save_data]))