# define a function for downloading pictures from a given IA volume
def ia_picture_download(item_id, out_dir=None):
    """Find (and optionally download) pages with picture blocks in an IA volume.

    Scans the volume's gzipped Abbyy OCR file for pages containing at least
    one ``blockType="Picture"`` block, then (optionally) fetches those page
    images through the IIIF endpoint.

    :param item_id: unique Internet Archive volume identifier
    :param out_dir: destination directory for images; if None, no download.
        If supplied and the directory already exists, downloading is skipped
        (reasonable inference that the volume was already processed);
        otherwise the directory is created here, so the caller must have
        permission to create it.
    :return: list of page indices with one or more blockType=Picture in the
        Abbyy OCR data, or None if no Abbyy file could be retrieved
    """
    print("[{}] Starting processing".format(item_id))
    # Use command-line client to see available metadata formats:
    # `ia metadata formats VOLUME_ID`
    # for this lesson, only the Abbyy file is needed
    returned_files = list(ia.get_files(item_id, formats=["Abbyy GZ"]))
    # make sure something got returned
    if returned_files:
        abbyy_file = returned_files[0].name
    else:
        print("[{}] Could not get Abbyy file".format(item_id))
        return None
    # download the abbyy file to CWD
    ia.download(item_id, formats=["Abbyy GZ"], ignore_existing=True,
                destdir=os.getcwd(), no_directory=True)
    # collect the pages with at least one picture block
    img_pages = []
    with gzip.open(abbyy_file) as fp:
        tree = ET.parse(fp)
        document = tree.getroot()
        for i, page in enumerate(document):
            for block in page:
                # .get() handles blocks without a blockType attribute
                # (cleaner than try/except KeyError per block)
                if block.get('blockType') == 'Picture':
                    img_pages.append(i)
                    break  # one Picture block is enough for this page
    # 0 is not a valid page for making GET requests to IA,
    # yet sometimes it's in the zipped Abbyy file
    img_pages = [page for page in img_pages if page > 0]
    # track for download progress report
    total_pages = len(img_pages)
    # OCR files are huge, so just delete once we have pagelist
    os.remove(abbyy_file)
    # if out_dir is not None, then also download page images
    if out_dir:
        # return if folder already exists (reasonable inference that volume already processed)
        if os.path.isdir(out_dir):
            print("[{}] Directory already exists.".format(item_id))
            return img_pages
        # otherwise, create folder to put the images
        print("[{}] Making directory {}".format(item_id, out_dir))
        os.makedirs(out_dir)
        # https://iiif.archivelab.org/iiif/documentation
        urls = ["https://iiif.archivelab.org/iiif/{}${}/full/full/0/default.jpg".format(item_id, page)
                for page in img_pages]
        # no direct page download through API, DIY
        # NOTE: the original `zip(range(1, total_pages), ...)` produced only
        # total_pages - 1 tuples, silently skipping the last page; enumerate
        # covers every (page, url) pair
        for i, (page, url) in enumerate(zip(img_pages, urls)):
            rsp = requests.get(url, allow_redirects=True)
            if rsp.status_code == 200:
                print("[{}] Downloading page {} ({}/{})".format(item_id, page, i+1, total_pages))
                with open(os.path.join(out_dir, str(page) + ".jpg"), "wb") as fp:
                    fp.write(rsp.content)
    # return list of pages with 1+ picture blocks
    return img_pages