Notebook

QueryPic deconstructed¶

Visualise searches in Trove's digitised newspapers¶

QueryPic is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.

This is a deconstructed, extended, and hackable version of QueryPic.

In [ ]:

import math
import os
import time
from collections import OrderedDict
from operator import itemgetter  # used for sorting

import altair as alt
import ipywidgets as widgets
import pandas as pd  # makes manipulating the data easier
import requests
from IPython.display import HTML, FileLink, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# Saved CSV files and charts are written to ./data, so create it up front.
os.makedirs("data", exist_ok=True)

# Shared HTTP session: responses with a 502, 503 or 504 status are retried
# (up to 5 retries, backoff_factor=1) before the request is given up on.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s = requests.Session()
for scheme in ("http://", "https://"):
    # Each scheme gets its own adapter carrying the same retry policy.
    s.mount(scheme, HTTPAdapter(max_retries=retries))

In [ ]:

%%capture
# Load environment variables (the testing cell at the end of the notebook reads
# GW_STATUS and TROVE_API_KEY) via the dotenv IPython extension.
# %%capture keeps any output produced by this cell out of the notebook.
%load_ext dotenv
%dotenv

Enter your Trove API key¶

Get your own Trove API key and enter it below.

In [ ]:

# Text box for the user's Trove API key; get_data() and get_titles() read
# api_key.value when they build their API requests.
api_key = widgets.Text(
    placeholder="Enter your Trove API key", description="API key:", disabled=False
)
display(api_key)

In [ ]:

# Base parameters shared by every search request. get_data() works on a copy of
# this dictionary, adding the API key, the query, and any state/title limit.
params = {
    "q": " ",  # A space to search for everything
    "facet": "year",  # get_facets() reads the per-year terms from this facet
    "zone": "newspaper",
    # 'l-category': 'Article',
    "encoding": "json",
    "n": 0,  # presumably asks for no article records, only facet counts -- TODO confirm
}

# Output areas: `results` shows progress messages and the chart; `save_data`
# shows the CSV download link and the chart-saving controls.
results = widgets.Output()
save_data = widgets.Output()
# Combined results for all searches; assigned by get_data() once data is harvested.
df = None

In [ ]:

def get_results(params):
    """
    Send a search request to the Trove API and return the decoded response.
    The request goes through the shared session `s` with a 30 second timeout.
    Parameters:
        params  - dictionary of query-string parameters for the API
    Returns:
        JSON formatted response data from Trove API
    Raises:
        requests.exceptions.HTTPError if the API responds with an error status.
    """
    api_url = "https://api.trove.nla.gov.au/v2/result"
    response = s.get(api_url, params=params, timeout=30)
    response.raise_for_status()
    # Uncomment to see the url that's sent to the API:
    # print(response.url)
    return response.json()


def get_facets(data):
    """
    Pull the per-year article counts out of a Trove API response.
    Only years inside the range currently selected in the `date_range`
    widget are kept.
    Parameters:
        data  - JSON formatted response data from Trove API
    Returns:
        A list of dictionaries containing: 'year', 'total_results',
        ordered by year. Empty if the response has no usable facet terms.
    """
    facets = []
    try:
        terms = data["response"]["zone"][0]["facets"]["facet"]["term"]
        for term in terms:
            year = int(term["display"])
            if date_range.value[0] <= year <= date_range.value[1]:
                facets.append({"year": year, "total_results": int(term["count"])})
        facets.sort(key=itemgetter("year"))
    except TypeError:
        # The response did not have the expected nested structure (for example
        # a level that is not a dictionary); return whatever was collected.
        pass
    return facets


def combine_totals(query_data, total_data):
    """
    Take facets data from the query search and a blank search (ie everything) for a decade and combine them.
    Rows are matched on their 'year' value rather than on list position, so a
    query with hits in only some years of the decade still gets the right
    total for each of those years.
    Parameters:
        query_data    - JSON response data from a query search
        total_data    - JSON response data from a blank search
    Returns:
        A list of dictionaries containing: 'year', 'total_results', 'total_articles'.
        'total_articles' is None when the blank search has no entry for that year.
    """
    query_rows = get_facets(query_data)
    # Index the blank-search counts by year so each query row can look up its own total.
    totals_by_year = {
        row["year"]: row["total_results"] for row in get_facets(total_data)
    }
    combined_data = []
    for query_row in query_rows:
        query_row["total_articles"] = totals_by_year.get(query_row["year"])
        combined_data.append(query_row)
    return combined_data


def year_totals(params):
    """
    Generate a dataset for a search query.
    For every decade touched by the selected date range, two requests are made:
    one with the query and one with a blank query (ie everything), so that each
    year's results can be compared with the total number of articles.
    Progress is shown in the `results` output area.
    Parameters:
        params   - dictionary of Trove API parameters; params["q"] is the search query.
                   The dictionary is not modified.
    Returns:
        A list of dictionaries containing: 'year', 'total_results', 'total_articles',
        sorted by year.
    """
    totals = []
    # The 'l-decade' values are the years with the final digit dropped (eg 1803 -> 180).
    start_decade = math.floor(date_range.value[0] / 10)
    end_decade = math.floor(date_range.value[1] / 10) + 1
    with results:
        for decade in tqdm(range(start_decade, end_decade)):
            # Build fresh dictionaries for each request instead of editing the caller's.
            query_params = {**params, "l-decade": decade}
            # Same limits, but a blank query to get the total number of articles.
            total_params = {**query_params, "q": " "}
            query_data = get_results(query_params)
            total_data = get_results(total_params)
            totals.extend(combine_totals(query_data, total_data))
    totals.sort(key=itemgetter("year"))
    return totals

Set a date range¶

In [ ]:

# Slider for choosing the first and last year to include. get_facets() and
# year_totals() read the selected pair of years from date_range.value.
date_range = widgets.IntRangeSlider(
    value=[1803, 1954],
    min=1803,
    max=2018,
    step=1,
    description="Date range:",
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="0<4d",
    layout=widgets.Layout(width="50%"),
)
display(date_range)

Add your search queries¶

You can just add a single search query to see how the number of matching articles varies over time. But you can also compare frequencies between queries, states, and newspapers:

Compare queries — cat vs dog
Compare states — swimmers in NSW, Victoria, and Queensland
Compare newspapers — protectionism in The Age vs The Argus

In [ ]:

# Queries added so far, in the order they were entered; get_data() runs one
# search per entry when the "Compare queries" tab is selected.
queries = []
# Output area that lists the queries as they are added.
out = widgets.Output()


@out.capture()
def add_query(b):
    """
    Button handler: store the text currently in the `query` box, empty the box,
    and print a numbered confirmation (captured by the `out` output area).
    Parameters:
        b  - the button that was clicked (unused)
    """
    new_query = query.value
    queries.append(new_query)
    query.value = ""
    print(f"Query {len(queries)}: {new_query}")


# Text box for typing a query on the "Compare queries" tab.
query = widgets.Text(
    placeholder="Enter your query then click the button to add",
    disabled=False,
)

# Clicking this button calls add_query(), which stores the current text.
query_button = widgets.Button(
    description="Add query", disabled=False, tooltip="Click to add query", icon=""
)

query_button.on_click(add_query)
# Help text displayed with the query box.
query_tip = widgets.HTML(
    value="A query can be anything you'd enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want."
)

In [ ]:

def get_titles(b):
    """
    Button handler: load the list of newspaper titles from the Trove API into
    the `titles` selection widget, sorted alphabetically by title.
    The request uses the shared retrying session `s`, over HTTPS and with a
    30 second timeout, in the same way as get_results().
    Parameters:
        b  - the button that was clicked (unused)
    Raises:
        requests.exceptions.HTTPError if the API responds with an error status.
    """
    params = {"encoding": "json", "key": api_key.value}
    response = s.get(
        "https://api.trove.nla.gov.au/v2/newspaper/titles", params=params, timeout=30
    )
    response.raise_for_status()
    data = response.json()
    # Each option is a (label, value) pair; the value keeps both id and title
    # because get_data() needs the id for the search and the title for the label.
    title_list = [
        (t["title"], {"id": t["id"], "title": t["title"]})
        for t in data["response"]["records"]["newspaper"]
    ]
    title_list.sort(key=itemgetter(0))
    # Building a dictionary keyed on title means newspapers sharing exactly the
    # same title collapse into a single entry.
    titles_sorted = OrderedDict(title_list)
    titles.options = titles_sorted


# Query text used for every newspaper selected on the "Compare newspapers" tab.
title_query = widgets.Text(
    placeholder="Enter your query",
    description="Search for:",
    disabled=False,
)
# Multi-select list of newspapers. It starts with a placeholder entry;
# get_titles() replaces the options with the titles loaded from the API.
titles = widgets.SelectMultiple(
    options=["Click on button to load titles"],
    rows=10,
    description="In:",
    disabled=False,
    layout=widgets.Layout(width="50%"),
)
# Clicking this button calls get_titles() to fill the list above.
titles_button = widgets.Button(
    description="Load titles",
    disabled=False,
    button_style="",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Click to load titles",
    icon="",
)
titles_button.on_click(get_titles)
# Help text displayed with the newspaper list.
titles_tip = widgets.HTML(
    value="Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple newspapers to compare."
)

In [ ]:

# Query text used for every state selected on the "Compare states" tab.
state_query = widgets.Text(
    placeholder="Enter your query",
    description="Search for:",
    disabled=False,
)

# Multi-select list of states. get_data() sends each selected entry to the
# API as the 'l-state' parameter, so the strings are passed through as-is.
states = widgets.SelectMultiple(
    options=[
        "ACT",
        "New South Wales",
        "Queensland",
        "South Australia",
        "Northern Territory",
        "Tasmania",
        "Victoria",
        "Western Australia",
        "National",
        "International",
    ],
    rows=10,
    description="In:",
    disabled=False,
    layout=widgets.Layout(width="50%"),
)

# Help text displayed with the states list.
states_tip = widgets.HTML(
    value="Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple states to compare."
)

In [ ]:

def plot_raw_results(width=700, height=400):
    """
    Build a line chart of the number of matching articles per year, with one
    coloured line per value of the 'query' column in the global `df`.
    Parameters:
        width   - chart width
        height  - chart height
    Returns:
        An Altair chart
    """
    lines = alt.Chart(df).mark_line(point=True)
    # The "c" format displays years without thousands separators.
    x_axis = alt.X("year:Q", axis=alt.Axis(format="c", title="Year"))
    y_axis = alt.Y(
        "total_results:Q",
        axis=alt.Axis(format=",d", title="Number of articles"),
    )
    line_colour = alt.Color("query", legend=alt.Legend(title=""))
    hover_details = [
        alt.Tooltip("query", title="Query:"),
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("total_results:Q", title="Articles", format=","),
    ]
    encoded = lines.encode(
        x=x_axis, y=y_axis, color=line_colour, tooltip=hover_details
    )
    return encoded.properties(width=width, height=height)


def plot_relative_results(width=700, height=400):
    """
    Build a line chart showing, for each year, matching articles as a
    proportion of all articles (total_results / total_articles), with one
    coloured line per value of the 'query' column in the global `df`.
    Parameters:
        width   - chart width
        height  - chart height
    Returns:
        An Altair chart
    """
    lines = alt.Chart(df).mark_line(point=True)
    # The "c" format displays years without thousands separators.
    x_axis = alt.X("year:Q", axis=alt.Axis(format="c", title="Year"))
    y_axis = alt.Y(
        "PercentOfTotal:Q",
        axis=alt.Axis(format=".2%", title="Percentage of total articles"),
    )
    line_colour = alt.Color("query", legend=alt.Legend(title=""))
    hover_details = [
        alt.Tooltip("query", title="Query:"),
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("PercentOfTotal:Q", title="Articles", format=".2%"),
    ]
    encoded = lines.encode(
        x=x_axis, y=y_axis, color=line_colour, tooltip=hover_details
    )
    sized = encoded.properties(width=width, height=height)
    # PercentOfTotal is calculated by the chart itself from the two count columns.
    return sized.transform_calculate(
        PercentOfTotal="datum.total_results / datum.total_articles"
    )

In [ ]:

def clear_all(b):
    """
    Button handler: reset the state and newspaper selections and their query
    boxes, forget all added queries, and empty the output areas.
    Parameters:
        b  - the button that was clicked (unused)
    """
    # Deselect everything in the multi-select lists
    for selector in (states, titles):
        selector.value = []
    # Empty the query boxes on the states and newspapers tabs
    for text_box in (state_query, title_query):
        text_box.value = ""
    # Forget the queries added on the first tab
    queries.clear()
    # Remove the query listing, the chart, and the download links
    for output_area in (out, results, save_data):
        output_area.clear_output()


def _fetch_trace(search_params, label, message):
    """
    Run one search and return its yearly totals as a dataframe whose 'query'
    column is set to `label`. `message` is shown in the `results` output area
    before the search starts.
    """
    with results:
        display(HTML(message))
    # year_totals() gets its own copy so that later changes to search_params
    # can't affect a search that has already been set up.
    trace = pd.DataFrame(year_totals(search_params.copy()))
    trace["query"] = label
    return trace


def get_data(b):
    """
    Button handler: run the searches defined on the currently selected tab,
    combine the results into the global dataframe `df`, then display the chart
    and the download/save controls.
    Tab 0 runs one search per added query, tab 1 one search per selected state,
    and tab 2 one search per selected newspaper.
    Parameters:
        b  - the button that was clicked (unused)
    """
    global df
    results.clear_output()
    save_data.clear_output()
    q_params = params.copy()
    q_params["key"] = api_key.value
    traces = []
    if tab.selected_index == 0:
        for query_text in queries:
            q_params["q"] = query_text
            traces.append(
                _fetch_trace(
                    q_params, query_text, "Searching for {}...".format(query_text)
                )
            )
    elif tab.selected_index == 1:
        q_params["q"] = state_query.value
        for state in states.value:
            q_params["l-state"] = state
            traces.append(
                _fetch_trace(q_params, state, "Searching in {}...".format(state))
            )
    elif tab.selected_index == 2:
        q_params["q"] = title_query.value
        for title in titles.value:
            q_params["l-title"] = title["id"]
            traces.append(
                _fetch_trace(
                    q_params,
                    title["title"],
                    "Searching in {}...".format(title["title"]),
                )
            )
    # Searches that matched nothing produce dataframes without any rows; leave
    # them out so that "nothing found at all" is reported instead of charted.
    traces = [trace for trace in traces if not trace.empty]
    if not traces:
        with results:
            display(HTML("No results!"))
        return
    df = pd.concat(traces, ignore_index=True)
    # Reset the dropdown before clearing the output: if it was showing
    # "proportion", this assignment fires change_chart(), which draws a chart of
    # its own. Clearing afterwards leaves exactly one chart on screen.
    chart_type.value = "raw"
    results.clear_output()
    chart = plot_raw_results()
    csv_file = save_as_csv()
    with results:
        display(chart_type)
        display(chart)
    with save_data:
        display(HTML(f'Download data: <a href="{csv_file}" download>{csv_file}</a>'))
        display(widgets.HBox([save_chart_button, save_chart_width, save_chart_height]))


def save_chart(b):
    """
    Button handler: save the chart type currently selected in the dropdown as
    an HTML file in the data directory, at the size set in the width and
    height boxes, and show a link to the file.
    Parameters:
        b  - the button that was clicked (unused)
    """
    if chart_type.value == "proportion":
        make_chart = plot_relative_results
    else:
        make_chart = plot_raw_results
    chart = make_chart(save_chart_width.value, save_chart_height.value)
    # A timestamp in the name keeps earlier saves from being overwritten
    filename = f"data/querypic-{int(time.time())}.html"
    chart.save(filename)
    with save_data:
        display(HTML("View HTML version:"), FileLink(filename))


def save_as_csv():
    """
    Write the global dataframe `df` to a timestamped CSV file in the data directory.
    Returns:
        The path of the file that was written
    """
    timestamp = int(time.time())
    filename = f"data/querypic-{timestamp}.csv"
    df.to_csv(filename, index=False)
    return filename


def change_chart(o):
    """
    Observer for the chart type dropdown: redraw the chart in the `results`
    output area using the currently selected chart type.
    Parameters:
        o  - the change notification (unused)
    """
    # wait=True holds the old output until the new chart is ready to replace it
    results.clear_output(wait=True)
    show_proportion = chart_type.value == "proportion"
    make_chart = plot_relative_results if show_proportion else plot_raw_results
    chart = make_chart()
    with results:
        display(chart_type)
        display(chart)


# Dropdown for switching the displayed chart between raw article counts and
# the proportion of all articles; save_chart() and change_chart() read its value.
chart_type = widgets.Dropdown(
    options=[
        ("Raw number of results", "raw"),
        ("Proportion of total articles", "proportion"),
    ],
    value="raw",
)

# Only listen for changes to `value`. Without `names`, the handler is called
# for every trait that changes, so a single selection (which updates the
# index and label as well as the value) would redraw the chart several times.
chart_type.observe(change_chart, names="value")

# Resets the form; wired to clear_all() below.
clear_all_button = widgets.Button(
    description="Clear all",
    disabled=False,
    button_style="",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Clear current queries",
    icon="",
)

# Runs the searches and draws the chart; wired to get_data() below.
get_data_button = widgets.Button(
    description="Create chart",
    disabled=False,
    button_style="primary",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Create chart",
    icon="",
)

# Saves the current chart as HTML; wired to save_chart() below and displayed
# by get_data() once there is a chart to save.
save_chart_button = widgets.Button(
    description="Save chart",
    disabled=False,
    button_style="primary",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Save chart as HTML",
    icon="",
)

# Width and height used by save_chart() for the saved chart.
save_chart_width = widgets.BoundedIntText(
    value=700, min=700, max=2000, step=100, description="Width", disabled=False
)

save_chart_height = widgets.BoundedIntText(
    value=400, min=400, max=1500, step=100, description="Height", disabled=False
)

# Connect the buttons to their handlers.
clear_all_button.on_click(clear_all)
get_data_button.on_click(get_data)
save_chart_button.on_click(save_chart)
# One panel per kind of comparison. The order matters: get_data() uses
# tab.selected_index (0, 1 or 2) to decide which set of inputs to read.
tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])
tab2 = widgets.VBox([state_query, states, states_tip])
tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])

tab = widgets.Tab(children=[tab1, tab2, tab3])
tab.set_title(0, "Compare queries")
tab.set_title(1, "Compare states")
tab.set_title(2, "Compare newspapers")
# Show the whole interface: tabs, action buttons, then the two output areas.
display(
    widgets.VBox(
        [tab, widgets.HBox([get_data_button, clear_all_button]), results, save_data]
    )
)

In [ ]:

# TESTING
# Only runs when the environment has GW_STATUS set to "dev" and provides a
# TROVE_API_KEY: fills in the key, adds the query "cat", and clicks the
# buttons so that the whole notebook is exercised without manual input.

if os.getenv("GW_STATUS") == "dev" and os.getenv("TROVE_API_KEY"):
    api_key.value = os.getenv("TROVE_API_KEY")
    query.value = "cat"
    query_button.click()
    get_data_button.click()

Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.