#!/usr/bin/env python
# coding: utf-8

# # QueryPic deconstructed
# #### Visualise searches in Trove's digitised newspapers

# [QueryPic](http://dhistory.org/querypic/) is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.
#
# This is a deconstructed, extended, and hackable version of QueryPic.

# In[ ]:


import math
import os
import time
from collections import OrderedDict
from html import escape
from operator import itemgetter  # used for sorting

import altair as alt
import ipywidgets as widgets
import pandas as pd  # makes manipulating the data easier
import requests
from IPython.display import HTML, FileLink, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# Make sure data directory exists
os.makedirs("data", exist_ok=True)

# Create a session that will automatically retry on server errors
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))


# In[ ]:


get_ipython().run_cell_magic('capture', '', '# Load env variables\n%load_ext dotenv\n%dotenv\n')


# ### Enter your Trove API key
#
# Get your own [Trove API key](https://trove.nla.gov.au/about/create-something/using-api) and enter it below.

# In[ ]:


api_key = widgets.Text(
    placeholder="Enter your Trove API key", description="API key:", disabled=False
)
display(api_key)


# In[ ]:


# Base parameters shared by every API request; copied and extended per search.
params = {
    "q": " ",  # A space to search for everything
    "facet": "year",
    "zone": "newspaper",
    # 'l-category': 'Article',
    "encoding": "json",
    "n": 0,
}

# Output areas for progress/charts and for download links respectively.
results = widgets.Output()
save_data = widgets.Output()
# Holds the combined results of the most recent successful search.
df = None


# In[ ]:


def get_results(params):
    """
    Get JSON response data from the Trove API.

    Parameters:
        params - dictionary of query parameters to send to the API
    Returns:
        JSON formatted response data from Trove API
    Raises:
        requests.HTTPError if the API responds with an error status
    """
    response = s.get(
        "https://api.trove.nla.gov.au/v2/result", params=params, timeout=30
    )
    response.raise_for_status()
    # print(response.url) # This shows us the url that's sent to the API
    data = response.json()
    return data


def get_facets(data):
    """
    Loop through facets in Trove API response, saving terms and counts.

    Only years inside the range currently selected in the `date_range`
    slider are kept.

    Parameters:
        data - JSON formatted response data from Trove API
    Returns:
        A list of dictionaries containing: 'year', 'total_results',
        sorted by year
    """
    start_year, end_year = date_range.value
    facets = []
    try:
        for term in data["response"]["zone"][0]["facets"]["facet"]["term"]:
            year = int(term["display"])
            if start_year <= year <= end_year:
                facets.append({"year": year, "total_results": int(term["count"])})
        facets.sort(key=itemgetter("year"))
    except TypeError:
        # NOTE(review): presumably raised when the response carries no year
        # facets in the expected shape (e.g. no matches) -- confirm against
        # the API. Whatever was collected so far is returned.
        pass
    return facets


def combine_totals(query_data, total_data):
    """
    Take facets data from the query search and a blank search (ie everything)
    for a decade and combine them.

    Rows are matched on 'year' rather than on list position, because the two
    facet lists need not contain the same years (a query may have no hits in
    a year for which articles exist).

    Parameters:
        query_data - JSON response data from a query search
        total_data - JSON response data from a blank search
    Returns:
        A list of dictionaries containing: 'year', 'total_results',
        'total_articles'. 'total_articles' is None if the blank search
        returned no count for that year.
    """
    totals_by_year = {
        row["year"]: row["total_results"] for row in get_facets(total_data)
    }
    combined_data = []
    for query_row in get_facets(query_data):
        query_row["total_articles"] = totals_by_year.get(query_row["year"])
        combined_data.append(query_row)
    return combined_data


def year_totals(params):
    """
    Generate a dataset for a search query.

    The search is run one decade at a time over the range selected in the
    `date_range` slider. For each decade two requests are made: one with the
    supplied query and one with a blank query (to get the total number of
    articles). Progress is shown in the `results` output area.

    Parameters:
        params - dictionary of API parameters, including the query in 'q'
                 (not modified)
    Returns:
        A list of dictionaries, one per year, containing: 'year',
        'total_results', 'total_articles' -- sorted by year.
    """
    totals = []
    start_decade = math.floor(date_range.value[0] / 10)
    end_decade = math.floor(date_range.value[1] / 10) + 1
    # Work on copies so the caller's dictionary is left untouched.
    query_params = params.copy()
    total_params = {**query_params, "q": " "}
    with results:
        for decade in tqdm(range(start_decade, end_decade)):
            query_params["l-decade"] = decade
            total_params["l-decade"] = decade
            query_data = get_results(query_params)
            total_data = get_results(total_params)
            totals.extend(combine_totals(query_data, total_data))
    totals.sort(key=itemgetter("year"))
    return totals


# ### Set a date range

# In[ ]:


date_range = widgets.IntRangeSlider(
    value=[1803, 1954],
    min=1803,
    max=2018,
    step=1,
    description="Date range:",
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="0<4d",
    layout=widgets.Layout(width="50%"),
)
display(date_range)


# ### Add your search queries
#
# You can just add a single search query to see how the number of matching articles varies over time. But you can also compare frequencies between queries, states, and newspapers:
#
# * Compare queries — `cat` vs `dog`
# * Compare states — `swimmers` in NSW, Victoria, and Queensland
# * Compare newspapers — `protectionism` in *The Age* vs *The Argus*

# In[ ]:


# Queries added via the "Compare queries" tab.
queries = []
out = widgets.Output()


@out.capture()
def add_query(b):
    """
    Button callback: add the text in the query box to the list of queries,
    clear the box, and print a confirmation in the `out` output area.
    """
    queries.append(query.value)
    query.value = ""
    print(f"Query {len(queries)}: {queries[-1]}")


query = widgets.Text(
    placeholder="Enter your query then click the button to add",
    disabled=False,
)

query_button = widgets.Button(
    description="Add query", disabled=False, tooltip="Click to add query", icon=""
)

query_button.on_click(add_query)

query_tip = widgets.HTML(
    value="A query can be anything you'd enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want."
)


# In[ ]:


def get_titles(b):
    """
    Button callback: load the list of newspaper titles from the Trove API
    and use it to populate the `titles` selection widget.

    Each option is labelled with the newspaper's title and carries a
    dictionary with the newspaper's 'id' and 'title' as its value.

    Raises:
        requests.HTTPError if the API responds with an error status
    """
    title_params = {"encoding": "json", "key": api_key.value}
    # Use the shared session so this request gets the same retry behaviour
    # as the search requests, and fail loudly on an error response.
    response = s.get(
        "https://api.trove.nla.gov.au/v2/newspaper/titles",
        params=title_params,
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()
    title_list = [
        (t["title"], {"id": t["id"], "title": t["title"]})
        for t in data["response"]["records"]["newspaper"]
    ]
    title_list.sort(key=itemgetter(0))
    titles_sorted = OrderedDict(title_list)
    titles.options = titles_sorted


title_query = widgets.Text(
    placeholder="Enter your query",
    description="Search for:",
    disabled=False,
)

titles = widgets.SelectMultiple(
    options=["Click on button to load titles"],
    rows=10,
    description="In:",
    disabled=False,
    layout=widgets.Layout(width="50%"),
)

titles_button = widgets.Button(
    description="Load titles",
    disabled=False,
    button_style="",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Click to load titles",
    icon="",
)

titles_button.on_click(get_titles)

titles_tip = widgets.HTML(
    value="Use Shift or Cmd/Ctrl to select multiple newspapers to compare."
)


# In[ ]:


state_query = widgets.Text(
    placeholder="Enter your query",
    description="Search for:",
    disabled=False,
)

states = widgets.SelectMultiple(
    options=[
        "ACT",
        "New South Wales",
        "Queensland",
        "South Australia",
        "Northern Territory",
        "Tasmania",
        "Victoria",
        "Western Australia",
        "National",
        "International",
    ],
    rows=10,
    description="In:",
    disabled=False,
    layout=widgets.Layout(width="50%"),
)

states_tip = widgets.HTML(
    value="Use Shift or Cmd/Ctrl to select multiple states to compare."
)


# In[ ]:


def plot_raw_results(width=700, height=400):
    """
    Build a line chart of the number of matching articles per year,
    with one line per value of the 'query' column in the global `df`.

    Parameters:
        width - chart width
        height - chart height
    Returns:
        An Altair chart
    """
    chart = (
        alt.Chart(df)
        .mark_line(point=True)
        .encode(
            x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
            y=alt.Y(
                "total_results:Q",
                axis=alt.Axis(format=",d", title="Number of articles"),
            ),
            color=alt.Color("query", legend=alt.Legend(title="")),
            tooltip=[
                alt.Tooltip("query", title="Query:"),
                alt.Tooltip("year:Q", title="Year"),
                alt.Tooltip("total_results:Q", title="Articles", format=","),
            ],
        )
        .properties(width=width, height=height)
    )
    return chart


def plot_relative_results(width=700, height=400):
    """
    Build a line chart of matching articles per year as a proportion of all
    articles for that year (total_results / total_articles), with one line
    per value of the 'query' column in the global `df`.

    Parameters:
        width - chart width
        height - chart height
    Returns:
        An Altair chart
    """
    chart = (
        alt.Chart(df)
        .mark_line(point=True)
        .encode(
            x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
            y=alt.Y(
                "PercentOfTotal:Q",
                axis=alt.Axis(format=".2%", title="Percentage of total articles"),
            ),
            color=alt.Color("query", legend=alt.Legend(title="")),
            tooltip=[
                alt.Tooltip("query", title="Query:"),
                alt.Tooltip("year:Q", title="Year"),
                alt.Tooltip("PercentOfTotal:Q", title="Articles", format=".2%"),
            ],
        )
        .properties(width=width, height=height)
        .transform_calculate(
            PercentOfTotal="datum.total_results / datum.total_articles"
        )
    )
    return chart


# In[ ]:


def clear_all(b):
    """
    Button callback: reset the selections and query boxes on the states and
    newspapers tabs, empty the list of queries, and clear all output areas.
    """
    states.value = []
    state_query.value = ""
    titles.value = []
    title_query.value = ""
    out.clear_output()
    queries.clear()
    results.clear_output()
    save_data.clear_output()


def _get_trace(q_params, label, message):
    """
    Run one search and return its yearly totals as a dataframe whose
    'query' column is set to `label`. `message` is shown (HTML-escaped)
    in the `results` output area before the search starts.
    """
    with results:
        display(HTML(escape(message)))
    df_totals = pd.DataFrame(year_totals(q_params))
    df_totals["query"] = label
    return df_totals


def get_data(b):
    """
    Button callback: run the searches defined on the currently selected tab,
    store the combined results in the global `df`, save them as a CSV file,
    and display a chart together with the download/save controls.

    If no search returns any data, "No results!" is displayed and `df` is
    left unchanged.
    """
    global df
    results.clear_output()
    save_data.clear_output()
    traces = []
    q_params = params.copy()
    q_params["key"] = api_key.value
    if tab.selected_index == 0:
        # Compare queries
        for query in queries:
            q_params["q"] = query
            traces.append(
                _get_trace(q_params, query, "Searching for {}...".format(query))
            )
    elif tab.selected_index == 1:
        # Compare states
        q_params["q"] = state_query.value
        for state in states.value:
            q_params["l-state"] = state
            traces.append(
                _get_trace(q_params, state, "Searching in {}...".format(state))
            )
    elif tab.selected_index == 2:
        # Compare newspapers
        q_params["q"] = title_query.value
        for title in titles.value:
            q_params["l-title"] = title["id"]
            traces.append(
                _get_trace(
                    q_params,
                    title["title"],
                    "Searching in {}...".format(title["title"]),
                )
            )

    # Searches that matched nothing produce empty frames; there is nothing
    # to chart or save unless at least one search returned data.
    traces = [trace for trace in traces if not trace.empty]
    if not traces:
        with results:
            display(HTML("No results!"))
        return

    df = pd.concat(traces, ignore_index=True)
    # Reset the dropdown *before* clearing the output: if its value changes,
    # the observer draws a chart of its own, which would otherwise be left
    # on screen alongside the one displayed below.
    chart_type.value = "raw"
    results.clear_output()
    chart = plot_raw_results()
    csv_file = save_as_csv()
    with results:
        display(chart_type)
        display(chart)
    with save_data:
        display(
            HTML(f'Download data: <a href="{csv_file}" download>{csv_file}</a>')
        )
        display(
            widgets.HBox([save_chart_button, save_chart_width, save_chart_height])
        )


def save_chart(b):
    """
    Button callback: save the chart type currently selected in the dropdown
    as an HTML file in the data directory, using the width and height from
    the save controls, and display a link to the file.
    """
    width = save_chart_width.value
    height = save_chart_height.value
    if chart_type.value == "proportion":
        chart = plot_relative_results(width, height)
    else:
        chart = plot_raw_results(width, height)
    filename = "data/querypic-{}.html".format(int(time.time()))
    chart.save(filename)
    with save_data:
        display(HTML("View HTML version:"), FileLink(filename))


def save_as_csv():
    """
    Save the global `df` as a timestamped CSV file in the data directory.

    Returns:
        The path of the file that was written
    """
    filename = "data/querypic-{}.csv".format(int(time.time()))
    df.to_csv(filename, index=False)
    return filename


def change_chart(o):
    """
    Observer callback: redraw the chart in the `results` output area using
    the chart type currently selected in the dropdown.
    """
    results.clear_output(wait=True)
    if chart_type.value == "proportion":
        chart = plot_relative_results()
    else:
        chart = plot_raw_results()
    with results:
        display(chart_type)
        display(chart)


chart_type = widgets.Dropdown(
    options=[
        ("Raw number of results", "raw"),
        ("Proportion of total articles", "proportion"),
    ],
    value="raw",
)

# Only react to changes of the selected value; without `names` the callback
# fires for every trait that changes, redrawing the chart several times.
chart_type.observe(change_chart, names="value")

clear_all_button = widgets.Button(
    description="Clear all",
    disabled=False,
    button_style="",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Clear current queries",
    icon="",
)

get_data_button = widgets.Button(
    description="Create chart",
    disabled=False,
    button_style="primary",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Create chart",
    icon="",
)

save_chart_button = widgets.Button(
    description="Save chart",
    disabled=False,
    button_style="primary",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Save chart as HTML",
    icon="",
)

save_chart_width = widgets.BoundedIntText(
    value=700, min=700, max=2000, step=100, description="Width", disabled=False
)

save_chart_height = widgets.BoundedIntText(
    value=400, min=400, max=1500, step=100, description="Height", disabled=False
)

clear_all_button.on_click(clear_all)
get_data_button.on_click(get_data)
save_chart_button.on_click(save_chart)

tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])
tab2 = widgets.VBox([state_query, states, states_tip])
tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])

tab = widgets.Tab(children=[tab1, tab2, tab3])
tab.set_title(0, "Compare queries")
tab.set_title(1, "Compare states")
tab.set_title(2, "Compare newspapers")

display(
    widgets.VBox(
        [tab, widgets.HBox([get_data_button, clear_all_button]), results, save_data]
    )
)


# In[ ]:


# TESTING
if os.getenv("GW_STATUS") == "dev" and os.getenv("TROVE_API_KEY"):
    api_key.value = os.getenv("TROVE_API_KEY")
    query.value = "cat"
    query_button.click()
    get_data_button.click()


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).
# Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).