# define a function for downloading pictures from a given IA volume
def ia_picture_download(item_id, out_dir=None):
    """Find (and optionally download) pages with picture blocks in an IA volume.

    Scans the volume's gzipped Abbyy OCR file for pages containing at least
    one ``blockType="Picture"`` block, then (optionally) fetches those page
    images through the IIIF endpoint.

    :param item_id: unique Internet Archive volume identifier
    :param out_dir: destination directory for images; if None, no download.
        If supplied and the directory already exists, downloading is skipped
        (reasonable inference that the volume was already processed);
        otherwise the directory is created here, so the caller must have
        permission to create it.
    :return: list of page indices with one or more blockType=Picture in the
        Abbyy OCR data, or None if no Abbyy file could be retrieved
    """
    print("[{}] Starting processing".format(item_id))
    # Use command-line client to see available metadata formats:
    # `ia metadata formats VOLUME_ID`
    # for this lesson, only the Abbyy file is needed
    returned_files = list(ia.get_files(item_id, formats=["Abbyy GZ"]))
    # make sure something got returned
    if returned_files:
        abbyy_file = returned_files[0].name
    else:
        print("[{}] Could not get Abbyy file".format(item_id))
        return None
    # download the abbyy file to CWD
    ia.download(item_id, formats=["Abbyy GZ"], ignore_existing=True,
                destdir=os.getcwd(), no_directory=True)
    # collect the pages with at least one picture block
    img_pages = []
    with gzip.open(abbyy_file) as fp:
        tree = ET.parse(fp)
        document = tree.getroot()
        for i, page in enumerate(document):
            for block in page:
                # .get() handles blocks without a blockType attribute
                # (cleaner than try/except KeyError per block)
                if block.get('blockType') == 'Picture':
                    img_pages.append(i)
                    break  # one Picture block is enough for this page
    # 0 is not a valid page for making GET requests to IA,
    # yet sometimes it's in the zipped Abbyy file
    img_pages = [page for page in img_pages if page > 0]
    # track for download progress report
    total_pages = len(img_pages)
    # OCR files are huge, so just delete once we have pagelist
    os.remove(abbyy_file)
    # if out_dir is not None, then also download page images
    if out_dir:
        # return if folder already exists (reasonable inference that volume already processed)
        if os.path.isdir(out_dir):
            print("[{}] Directory already exists.".format(item_id))
            return img_pages
        # otherwise, create folder to put the images
        print("[{}] Making directory {}".format(item_id, out_dir))
        os.makedirs(out_dir)
        # https://iiif.archivelab.org/iiif/documentation
        urls = ["https://iiif.archivelab.org/iiif/{}${}/full/full/0/default.jpg".format(item_id, page)
                for page in img_pages]
        # no direct page download through API, DIY
        # NOTE: the original `zip(range(1, total_pages), ...)` produced only
        # total_pages - 1 tuples, silently skipping the last page; enumerate
        # covers every (page, url) pair
        for i, (page, url) in enumerate(zip(img_pages, urls)):
            rsp = requests.get(url, allow_redirects=True)
            if rsp.status_code == 200:
                print("[{}] Downloading page {} ({}/{})".format(item_id, page, i+1, total_pages))
                with open(os.path.join(out_dir, str(page) + ".jpg"), "wb") as fp:
                    fp.write(rsp.content)
    # return list of pages with 1+ picture blocks
    return img_pages