OU OpenLearn materials are published in XML form, which allows some degree of structured access to the document contents.
For example, we can construct a database of images used across OpenLearn unit material, or a list of quotes, or a list of activities.
This notebook is a very first pass, just scraping images, and not adding as much metadata to the table (eg parent course) as it should. As and when I get time to tinker, I'll work on this... ;-)
#You will probably need to pip install requests_cache lxml scraperwikiimport requests
import requests_cache
requests_cache.install_cache('openlearn_cache')
from urllib.parse import urlsplit, urlunsplit
import unicodedata
from lxml import etree
import os
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///openlearn.sqlite'
#os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///scraperwiki.sqlite'
import scraperwiki
#Example page
xmlurl='http://www.open.edu/openlearn/people-politics-law/politics-policy-people/sociology/the-politics-devolution/altformat-ouxml'
c=requests.get(xmlurl)
Routines for parsing the OU XML.
#===
#via http://stackoverflow.com/questions/5757201/help-or-advice-me-get-started-with-lxml/5899005#5899005
def flatten(el):
if el is None: return
result = [ (el.text or "") ]
for sel in el:
result.append(flatten(sel))
result.append(sel.tail or "")
return unicodedata.normalize("NFKD", "".join(result)) or ' '
#===
def droptable(table):
print("Trying to drop table '{}'".format(table))
try:
scraperwiki.sqlite.execute('drop table if exists "{}"'.format(table))
except:
pass
print('...{} dropped'.format(table))
def _course_code(xml_content):
root = etree.fromstring(xml_content)
return flatten(root.find('.//CourseCode'))
def _xml_figures(xml_content, coursecode='', pageurl='',dbsave=True):
figdicts=[]
try:
root = etree.fromstring(xml_content)
except: return False
figures=root.findall('.//Figure')
#??Note that acknowledgements to figures are provided at the end of the XML file with only informal free text/figure number identifers available for associating a particular acknowledgement/copyright assignment with a given image. It would be so much neater if this could be bundled up with the figure itself, or if the figure and the acknowledgement could share the same unique identifier?
figdict={}
for figure in figures:
figdict = {'xpageurl':pageurl,'caption':'','src':'','coursecode':coursecode,
'desc':'','owner':'','item':'','itemack':''}
img=figure.find('Image')
#The image url as given does not resolve - we need to add in provided hash info
figdict['srcurl']=img.get('src')
figdict['x_folderhash']=img.get('x_folderhash')
figdict['x_contenthash']=img.get('x_contenthash')
if figdict['x_contenthash'] is not None and figdict['x_contenthash'] is not None:
path = urlsplit(figdict['srcurl'])
sp=path.path.split('/')
path=path._replace( path='/'.join(sp[:-1]+[figdict['x_folderhash'],figdict['x_contenthash']]+sp[-1:]))
figdict['imgurl']=urlunsplit(path)
else:figdict['imgurl']=''
xsrc=img.get('x_imagesrc')
figdict['caption']=flatten(figure.find('Caption'))
#in desc, need to find a way of stripping <Number> element from start of description
figdict['desc']=flatten(figure.find('Description'))
#<SourceReference><ItemRights><OwnerRef/><ItemRef/><ItemAcknowledgement/></ItemRights></SourceReference>
ref=figure.find('SourceReference')
if ref is not None:
rights=ref.find('ItemRights')
if rights is not None:
figdict['owner']=flatten(rights.find('ItemRights'))
figdict['item']=flatten(rights.find('ItemRights'))
figdict['itemack']=flatten(rights.find('ItemAcknowledgement'))
#print( 'figures',xsrc,caption,desc,src)
figdicts.append(figdict)
if dbsave : scraperwiki.sqlite.save(unique_keys=[],table_name='xmlfigures',data=figdict)
return figdicts
#http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/dd203_1_001i.jpg
#http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/0c10275d/2c1a8d77/dd203_1_001i.jpg
#Here's an exampl of the figure data as parsed
_xml_figures(c.content,dbsave=False)
[{'caption': ' Figure 1 Presentation of the Treaty of the Union between England and Scotland to Queen Anne © National Galleries of Scotland', 'coursecode': '', 'desc': 'Figure 1', 'imgurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/0c10275d/2c1a8d77/dd203_1_001i.jpg', 'item': '', 'itemack': '', 'owner': '', 'src': '', 'srcurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/dd203_1_001i.jpg', 'x_contenthash': '2c1a8d77', 'x_folderhash': '0c10275d', 'xpageurl': ''}, {'caption': ' Figure 2 Illustration of Owain Glyndwr, by permission of Llyfrgell Genedlaethol Cymru/The National Library of Wales', 'coursecode': '', 'desc': ' Figure 2', 'imgurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/0c10275d/3d1e1b88/dd203_1_002i.jpg', 'item': '', 'itemack': '', 'owner': '', 'src': '', 'srcurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/dd203_1_002i.jpg', 'x_contenthash': '3d1e1b88', 'x_folderhash': '0c10275d', 'xpageurl': ''}, {'caption': 'Figure 3 Contemporary wall mural of the Battle of the Boyne in Northern Ireland © Nezumi Dumousseau', 'coursecode': '', 'desc': 'Figure 3', 'imgurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/0c10275d/a64b5f52/dd203_1_003i.jpg', 'item': '', 'itemack': '', 'owner': '', 'src': '', 'srcurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/dd203_1_003i.jpg', 'x_contenthash': 'a64b5f52', 'x_folderhash': '0c10275d', 'xpageurl': ''}, {'caption': ' Figure 4 Referendum campaigners for devolution in Scotland, 1997 © Sutton-Hibbert/Rex Features', 'coursecode': '', 'desc': 'Figure 4', 'imgurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/0c10275d/ddcfba15/dd203_1_004i.jpg', 'item': '', 'itemack': '', 'owner': '', 'src': '', 'srcurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/dd203_1_004i.jpg', 'x_contenthash': 'ddcfba15', 'x_folderhash': '0c10275d', 'xpageurl': ''}, {'caption': None, 'coursecode': '', 'desc': None, 'imgurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/1b9129f0/d3c986e6/ol_skeleton_keeponlearning_image.jpg', 'item': '', 'itemack': '', 'owner': '', 'src': '', 'srcurl': 'http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/ol_skeleton_keeponlearning_image.jpg', 'x_contenthash': 'd3c986e6', 'x_folderhash': '1b9129f0', 'xpageurl': ''}]
OpenLearn publish an OPML feed of units. It used to be hierarchical, grouping unitis in to topics, now it seems to be flat with links to units as well as topic feeds. At some point, I'll grab the topic feeds and use it to generate lookup tables from topics to units.
def getUnitLocations():
#The OPML file lists all OpenLearn units by topic area
srcUrl='http://openlearn.open.ac.uk/rss/file.php/stdfeed/1/full_opml.xml'
tree = etree.parse(srcUrl)
root = tree.getroot()
items=root.findall('.//body/outline')
#Handle each topic area separately?
#The OPML is linear and mixes links to content twith links to topic feeds
#Need to harvest by topic?
for item in items:
tt = item.get('text')
#print( tt)
it=item.get('text')
if it.startswith('Unit content for'):
it=it.replace('Unit content for','')
url=item.get('htmlUrl')
rssurl=item.get('xmlUrl')
#print(url)
xmlurl=url.replace('content-section-0','altformat-ouxml')
#print(xmlurl)
c=requests.get(xmlurl)
_xml_figures(c.content)
#droptable('xmlfigures')
getUnitLocations()
There are other things we can scrape data about as well as images:
<Quote>...</Quote>
)<Activity><Heading></Heading><Timing><Hours></Hours><Minutes></Minutes></Timing><Question></Question><Discussion></Discussion></Activity>
)<Box><Heading></Heading> ...<SourceReference></SourceReference></Box>
)<CourseCode></CourseCode>
and <CourseTitle></CourseTitle>
)<BackMatter><References><Reference></Reference></References></BackMatter>
)import pandas as pd
import sqlite3
conn = sqlite3.connect('openlearn.sqlite')
pd.read_sql('SELECT * FROM xmlfigures LIMIT 3', conn)
xpageurl | caption | src | coursecode | desc | owner | item | itemack | srcurl | x_folderhash | x_contenthash | imgurl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | None | None | http://www.open.edu/openlearn/ocw/pluginfile.p... | 1b9129f0 | d3c986e6 | http://www.open.edu/openlearn/ocw/pluginfile.p... | ||||||
1 | Figure 1 Presentation of the Treaty of the Un... | Figure 1 | http://www.open.edu/openlearn/ocw/pluginfile.p... | 0c10275d | 2c1a8d77 | http://www.open.edu/openlearn/ocw/pluginfile.p... | ||||||
2 | Figure 2 Illustration of Owain Glyndwr, by pe... | Figure 2 | http://www.open.edu/openlearn/ocw/pluginfile.p... | 0c10275d | 3d1e1b88 | http://www.open.edu/openlearn/ocw/pluginfile.p... |
We can use Simon Willison's rather wonderful datasette package to create a server that provides a query interface to the sqlite database. For example, here's a temporary one for demo purposes (this URL is subject to change and my Heroku limits may also max out)...
https://sheltered-journey-73156.herokuapp.com/openlearn-29f1575/xmlfigures
It would be nice if it was as easy to push a datesette to Reclaim Hosting as it is to push one to Heroku or Zeit Now...
#! /Users/ajh59/anaconda3/bin/pip install scriptedforms
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///scraperwiki.sqlite'