#!/usr/bin/env python
# coding: utf-8

# # Creating a plaintext Odyssey on the fly with Perseus Table of Contents
#
# Patrick J. Burns 10.1.2017

# I recently read an article on sentence length in Greek hexameter poetry by [Dee Clayman from 1981](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1627358) ("Sentence Length in Greek Hexameter Poetry" in Hexameter Studies, *Quantitative Linguistics* 11), an interesting article in many ways and one that I will blog about at greater length in the near future. For now, I will just say that it is a data-driven study and an excellent example of computational/philological/literary-critical work in Classics from nearly four decades ago.
#
# In the article, Clayman presents a series of charts that I wanted to replicate, including this one on "Sentences One Line in Length in Greek Hexameter Poetry." I could do this sort of thing relatively easily for Latin hexameters using CLTK and the plaintext Latin Library corpus that I've [written about at *Disiecta Membra*](https://disiectamembra.wordpress.com/2016/08/11/working-with-the-latin-library-corpus-in-cltk/). But I didn't have a plaintext Greek corpus at hand, and I decided that I should probably build one.

# ![Figure from Clayman 1981](img/clayman.jpg)
#
# *Figure from Clayman's 1981 study*

# I also just happened to teach a seminar last week on using Python to scrape XML by URL, and I thought this would make a good example of an intermediate-level scraping project.
#
# The [Perseus Digital Library](http://www.perseus.tufts.edu) provides open-access XML texts of many of the hexameter texts from Clayman's article that I wanted to test. We could, I suppose, cut and paste the texts from the browser. But for the *Odyssey* that would be almost three hundred pages chunked by section. Even chunked by book, we'd have to work through 24 pages. And we'd have to do that for every work we wanted to scrape.
#
# Fortunately, the library also provides a Table of Contents that gives us a map of all the individual sections. With these TOC files, we can use Python to build a list of URLs for the sections, scrape these pages, extract the lines of poetry, and finally stitch the results together. Python and [lxml](http://lxml.de) are well suited to this task.

# ![Perseus XML Table of Contents](img/perseus-toc.png)
#
# *Perseus XML Table of Contents for Homer's* Odyssey*, available [here](http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0135%3Abook%3D1).*

# Below is a first pass at handling two Greek hexameter poems from Perseus: Homer's *Odyssey* and Hesiod's *Shield of Heracles*. In an upcoming post/notebook, I will collect all of the hexameter poems from Clayman's study and show how we can use Python to replicate her study.

# ## Getting a plaintext *Odyssey*

# In[1]:

# Imports

import urllib.request
import time

from collections.abc import Iterable
from lxml import etree
from pprint import pprint

# In[2]:

# Constants

perseus_xml_base_url = "http://www.perseus.tufts.edu/hopper/xmlchunk?doc="

# In[3]:

# Homer's Odyssey TOC XML
odyssey_toc_url = "http://www.perseus.tufts.edu/hopper/xmltoc?doc=Perseus%3Atext%3A1999.01.0135%3Abook%3D1%3Acard%3D1"

# Hesiod's Shield TOC XML
shield_toc_url = "http://www.perseus.tufts.edu/hopper/xmltoc?doc=Perseus%3Atext%3A1999.01.0127%3Acard%3D1"
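# It can help to see what one of these TOC documents actually looks like before we parse it. The cell below is just a sanity check, not part of the pipeline proper: it fetches the *Odyssey* TOC and pretty-prints the opening of the document so that the nested `<chunk>` elements the code below relies on are visible.

# In[ ]:

# Optional sanity check: fetch the Odyssey TOC and inspect the nested
# <chunk> structure that the parsing code below depends on.

with urllib.request.urlopen(odyssey_toc_url) as f:
    toc_preview = etree.fromstring(f.read())

print(etree.tostring(toc_preview, pretty_print=True, encoding='unicode')[:500])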
# In[4]:

def check_for_books(root):
    """
    Some poems are single, self-contained works (e.g. Hesiod's Shield);
    others are divided into books (e.g. Homer's Odyssey). This tests for
    the presence of a <chunk> element with the attribute type='book', so
    that book-level information can be retained when parsing.
    """
    if root.findall(".//chunk[@type='book']"):
        return True
    return False

# In[5]:

with urllib.request.urlopen(odyssey_toc_url) as f:
    perseus_toc_xml = f.read()

root = etree.fromstring(perseus_toc_xml)

# In[6]:

# Get list of refs from <chunk> elements

if check_for_books(root):
    books = root.findall(".//chunk[@type='book']")
    booknames = [book.find('head').text for book in books]
else:
    books = [root]
    booknames = ['work']

book_refs = []

for book in books:
    chunks = book.findall('chunk')
    refs = [chunk.attrib['ref'] for chunk in chunks]
    book_refs.append(refs)

# In[7]:

print(book_refs[0])  # Example of retrieved refs from TOC for Odyssey 1

# In[8]:

# Get xml for each ref

book_sections = []

for book_ref in book_refs:
    book_section_xml = []
    for ref in book_ref:
        # print(ref)  # Uncomment if you want to see the progress
        time.sleep(.1)  # Be polite to the Perseus servers
        with urllib.request.urlopen(perseus_xml_base_url + ref) as f:
            book_section_xml.append(f.read())
    book_sections.append(book_section_xml)

# In[9]:

# Example XML from Odyssey 1, Section 1
print(book_sections[0][0].decode('utf-8')[:1000])

# In[10]:

# Some helper functions

def check_for_lb(root):
    """
    Some poetry in the Perseus XML has lines delimited by <lb/> and some
    by <l>. This tests for the presence of <lb/>, so that the right
    parser is used below.
    """
    if root.findall(".//lb"):
        return True
    return False

# Need this helper function to retrieve the text of lines which have
# intervening child elements.

def node_text(node):
    """https://stackoverflow.com/a/7500304/1816347"""
    if node.text:
        result = node.text
    else:
        result = ''
    for child in node:
        if child.tail is not None:
            result += child.tail
    return result

# In[11]:

# Extract the lines of poetry from each section's XML

book_lines = []

for section in book_sections:
    section_lines = []
    for xml in section:
        root = etree.fromstring(xml)
        if check_for_lb(root):
            lines = root.findall('.//lb')
            lines = [line.tail for line in lines]
            lines = ['\n' if line is None else line for line in lines]
            section_lines.append(lines)
        else:
            lines = root.findall('.//l')
            lines = [node_text(line) for line in lines]
            lines = ['\n' if line is None else line for line in lines]
            section_lines.append(lines)
    book_lines.append(section_lines)

print(book_lines[0][0][:25])

# In[12]:

def flatten(l):
    """https://stackoverflow.com/a/2158532/1816347"""
    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el

# In[13]:

plaintext = flatten(book_lines)
print("\n".join(list(plaintext))[:1000])
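# The same TOC-parsing and section-fetching steps are repeated verbatim below for the *Shield*. If you plan to scrape many works, the repetition could be wrapped in a pair of helpers along these lines. This is only a sketch: the function names `get_book_refs` and `fetch_book_sections` are my own, not anything provided by Perseus.

# In[ ]:

# A possible refactor (sketch): the same steps work for any Perseus TOC
# URL, so they can be bundled into reusable functions. The names here
# are illustrative only.

def get_book_refs(toc_url):
    """Fetch a Perseus TOC and return one list of section refs per book."""
    with urllib.request.urlopen(toc_url) as f:
        toc_root = etree.fromstring(f.read())
    if check_for_books(toc_root):
        toc_books = toc_root.findall(".//chunk[@type='book']")
    else:
        toc_books = [toc_root]
    return [[chunk.attrib['ref'] for chunk in book.findall('chunk')]
            for book in toc_books]

def fetch_book_sections(refs_by_book, delay=.1):
    """Fetch the section XML for every ref, preserving book divisions."""
    sections = []
    for refs in refs_by_book:
        section_xml = []
        for ref in refs:
            time.sleep(delay)  # Be polite to the Perseus servers
            with urllib.request.urlopen(perseus_xml_base_url + ref) as f:
                section_xml.append(f.read())
        sections.append(section_xml)
    return sections

# Usage, e.g.:
# shield_sections = fetch_book_sections(get_book_refs(shield_toc_url))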
# ## Getting a plaintext *Shield*

# In[14]:

with urllib.request.urlopen(shield_toc_url) as f:
    perseus_toc_xml = f.read()

root = etree.fromstring(perseus_toc_xml)

# Get list of refs from <chunk> elements

if check_for_books(root):
    books = root.findall(".//chunk[@type='book']")
    booknames = [book.find('head').text for book in books]
else:
    books = [root]
    booknames = ['work']

book_refs = []

for book in books:
    chunks = book.findall('chunk')
    refs = [chunk.attrib['ref'] for chunk in chunks]
    book_refs.append(refs)

# In[15]:

# Get xml for each ref

book_sections = []

for book_ref in book_refs:
    book_section_xml = []
    for ref in book_ref:
        # print(ref)  # Uncomment if you want to see the progress
        time.sleep(.1)  # Be polite to the Perseus servers
        with urllib.request.urlopen(perseus_xml_base_url + ref) as f:
            book_section_xml.append(f.read())
    book_sections.append(book_section_xml)

# In[16]:

# Extract the lines of poetry from each section's XML

book_lines = []

for section in book_sections:
    section_lines = []
    for xml in section:
        root = etree.fromstring(xml)
        if check_for_lb(root):
            lines = root.findall('.//lb')
            lines = [line.tail for line in lines]
            lines = ['\n' if line is None else line for line in lines]
            section_lines.append(lines)
        else:
            lines = root.findall('.//l')
            lines = [node_text(line) for line in lines]
            lines = ['\n' if line is None else line for line in lines]
            section_lines.append(lines)
    book_lines.append(section_lines)

print(book_lines[0][0][:25])

# In[17]:

plaintext = flatten(book_lines)
print("\n".join(list(plaintext))[:1000])  # TODO: Handle '\n' placeholders better
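# As the TODO above notes, the '\n' placeholders (standing in for lines whose text could not be recovered directly) deserve better handling: joining on newlines leaves blank lines and stray whitespace in the output. Below is a minimal cleanup sketch, assuming we simply want one verse per line with blanks dropped; the output filename is hypothetical.

# In[ ]:

# Minimal cleanup sketch: re-flatten (the generator above was consumed
# by the print), strip whitespace, drop the empty/'\n' placeholder
# lines, and write the result to a plaintext file. The filename is
# illustrative only.

cleaned = [line.strip() for line in flatten(book_lines)]
cleaned = [line for line in cleaned if line]

with open('shield.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(cleaned))

print(cleaned[:5])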