#!/usr/bin/env python # coding: utf-8 # # Harvesting series # In[8]: import time import csv import os import math import string import requests import pandas as pd from PIL import Image, ImageOps from requests import ConnectionError from recordsearch_tools.utilities import retry from recordsearch_tools.client import RSSearchClient, RSSeriesClient from tinydb import TinyDB, Query try: from io import BytesIO except ImportError: from StringIO import StringIO from IPython.display import Image as DImage from IPython.core.display import HTML # Plotly helps us make pretty charts import plotly.offline as py import plotly.graph_objs as go # This lets Plotly draw charts in cells py.init_notebook_mode() # In[2]: # What series do you want to harvest? # Insert the series id between the quotes. series = 'B13' # ## The harvesting code # In[13]: class SeriesHarvester(): def __init__(self, series, control=None, images_only=False): self.series = series self.control = control if not images_only: self.total_pages = None self.pages_complete = 0 self.client = RSSearchClient() self.prepare_harvest() self.db = TinyDB('data/db-{}.json'.format(self.series.replace('/', '-'))) self.items = self.db.table('items') self.images = self.db.table('images') def get_total(self): return self.client.total_results def prepare_harvest(self): if self.control: self.client.search(series=self.series, control=self.control) else: self.client.search(series=self.series) total_results = self.client.total_results print('{} items'.format(total_results)) self.total_pages = math.floor(int(total_results) / self.client.results_per_page) + 1 print(self.total_pages) @retry(ConnectionError, tries=20, delay=10, backoff=1) def start_harvest(self, page=None): Record = Query() if not page: page = self.pages_complete + 1 while self.pages_complete < self.total_pages: if self.control: response = self.client.search(series=self.series, page=page, control=self.control, sort='9') else: response = self.client.search(series=self.series, page=page, sort='9') for result in response['results']: self.items.upsert(result, Record.identifier == result['identifier']) self.pages_complete += 1 page += 1 print('{} pages complete'.format(self.pages_complete)) time.sleep(1) @retry(ConnectionError, tries=20, delay=10, backoff=1) def harvest_images(self): Record = Query() items = self.items.search(Record.digitised_status == True) headers = {'User-Agent': 'Mozilla/5.0'} for item in items: directory = os.path.join('data', 'images', '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-').replace(' ', '-'), item['identifier'])) if not os.path.exists(directory): os.makedirs(directory) for page in range(1, item['digitised_pages'] + 1): filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page) print('{}, p. {}'.format(item['identifier'], page)) if not os.path.exists(filename): img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page) response = requests.get(img_url, headers=headers, stream=True, verify=False) response.raise_for_status() try: image = Image.open(BytesIO(response.content)) except IOError: print('Not an image') else: width, height = image.size image.save(filename) del response image_meta = { 'image_id': '{}-{}'.format(item['identifier'], page), 'identifier': item['identifier'], 'page': page, 'width': width, 'height': height } self.images.upsert(image_meta, Record.image_id == image_meta['image_id']) print('Image saved') time.sleep(1) # In[4]: def harvest_series(series): h = SeriesHarvester(series=series) h.start_harvest() # In[5]: def harvest_large_series(series, control_range=None): ''' RecordSearch will not return more than 20,000 results. If a series has more than 20,000 items you'll need to break it up. The easiest way to do this is to add a param for control_symbol. This function will break break a series harvest down into a series of harvests -- using each letter and number with a wildcard as the control_symbol parameter. This should be enough to harvest most large series, but in some cases you might need to supply a custom list of control_symbol prefixes. ''' if not control_range: control_range = [letter + '*' for letter in string.ascii_uppercase] + [str(number) + '*' for number in range(0, 10)] for control in control_range: print(control) h = SeriesHarvester(series=series, control=control) h.start_harvest() # In[16]: def harvest_images(series, images_only=False): h = SeriesHarvester(series=series, images_only=images_only) h.harvest_images() # ## Functions to process harvested data # In[5]: def convert_to_df(series): ''' Get the series data from TinyDB and save as a Pandas dataframe. Also flattens the date dictionary, and does a bit of ordering. ''' # Load the series db db = TinyDB('data/db-{}.json'.format(series.replace('/', '-'))) items = db.table('items') # Let's convert the database into a simple list item_list = [i for i in items] # Now let's turm that list into a Pandas Dataframe df = pd.DataFrame(item_list) # The 'contents_date' column is a dictionary, we need to flatten this out so we can easily work with the values df = pd.concat([df, pd.DataFrame((d for idx, d in df['contents_dates'].iteritems()))], axis=1) # Delete the old date field del df['contents_dates'] # Rename column df.rename({'date_str': 'contents_dates'}, axis=1, inplace=True) # Put columns in preferred order df = df[['identifier', 'series', 'control_symbol', 'title', 'contents_dates', 'start_date', 'end_date', 'access_status', 'location', 'digitised_status', 'digitised_pages']] df.sort_values(['identifier']) return df # In[6]: def save_as_csv(series): ''' Converts harvested data in TinyDB to a CSV file, via a Pandas dataframe. ''' df = convert_to_df(series) df.to_csv('data/{}.csv'.format(series.replace('/', '-')), index=False) # ## Running the harvests # In[1]: # This is a list of series relating to the White Australia Policy # Large series (> 20,000 items) will be harvested separately series_list = ['B6003', 'BP343/15', 'D2860', 'D5036', 'D596', 'E752', 'J2481', 'J2482', 'J2483', 'J3115', 'K1145', 'P437', 'P526', 'PP4/2', 'PP6/1', 'SP11/26', 'SP11/6', 'SP115/1', 'SP115/10', 'SP42/1', 'SP726/1', 'ST84/1'] # In[ ]: # Loop through series list, harvesting each in turn for s in series_list: harvest_series(s) # In[17]: # Loop through series list, harvesting each in turn for s in ['B13']: harvest_images(s, images_only=True) # In[ ]: # B13 is > 20,000 items harvest_large_series('B13') # In[3]: # A1 is a large series that needs a custom control range to harvest # This generates a list of control_symbol prefixes that should break it down into harvestable chunks (< 20,000 items) # This should also work with series that use the the current year as the prefix, eg 1935/190 # Just feed this range to harvest_large_series() -- eg. harvest_large_series('A1', control_range) control_range = [letter + '*' for letter in string.ascii_uppercase] + [str(num) + '*' for num in range(2,10)] + ['1{}*'.format(num2) for num2 in [str(num) for num in range(0,9)]] + ['19{}*'.format(num2) for num2 in [str(num) for num in range(0,10)]] print(control_range) # In[ ]: # Harvest A1 harvest_large_series('A1', control_range) # ## Saving as CSV-formatted files # In[9]: # Assuming you've already harvested the series! for s in series_list: save_as_csv(s) # In[10]: # Don't forget B13 save_as_csv('B13') # In[ ]: