This worksheet shows how to query the Internet Archive's search API for JSON results and how to download files from the archive.
# Standard-library modules (Python 2) for HTTP requests, JSON parsing,
# regular expressions and pretty-printing.
import urllib2
import json
import re
import pprint
# Shorthand helpers used throughout the worksheet.
pp = pprint.PrettyPrinter(indent=4).pprint  # pretty-print any Python object
Q = urllib2.quote    # percent-encode a string for safe use inside a URL
U = urllib2.unquote  # inverse: decode a percent-encoded string
We construct a URL-encoded query string (could we also POST the query as JSON?).
query = Q("title:(alice in wonderland) AND format:(djvu)")
columns = "&".join([Q(s) for s in "fl[]=identifier fl[]=source fl[]=title".split()])
params = "rows=100&page=1&output=json"
url = "http://archive.org/advancedsearch.php?q="+query+"&"+columns+"&"+params
print url
http://archive.org/advancedsearch.php?q=title%3A%28alice%20in%20wonderland%29%20AND%20format%3A%28djvu%29&fl%5B%5D%3Didentifier&fl%5B%5D%3Dsource&fl%5B%5D%3Dtitle&rows=100&page=1&output=json
# could we also post the query?
#jdata = json.dumps({"username":"...", "password":"..."})
#urllib2.urlopen("http://www.example.com/", jdata)
Now we read and parse the response.
# Download the search results and keep only the "response" object
# embedded in the returned JSON document.
raw = urllib2.urlopen(url).read()
response = json.loads(raw)["response"]
response.keys()
[u'start', u'numFound', u'docs']
# Total number of matching documents reported by the search API.
response["numFound"]
43
# All 43 matches fit within one page (rows=100), so "docs" holds every hit.
len(response["docs"])
43
Each doc contains a title and an identifier (we asked for those):
for i,e in enumerate(response["docs"][:10]):
print i,e["title"],e["identifier"]
0 Alice In Wonderland caralic 1 Alice's Adventures In Wonderland AlicesAdventuresInWonderland 2 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_917 3 Alice in Wonderland aliceinwonderla00carrgoog 4 Alice's Adventures in Wonderland alicesadventure00jackgoog 5 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_841 6 Alice's adventures in Wonderland alicesadventure00tenngoog 7 Alice's Adventures in Wonderland alicesadventures00011gut 8 Alice's adventures in Wonderland adventuresalices00carrrich 9 Alice in Wonderland aliceinwonderlan00carriala
# Pick one hit (index 5) and keep its archive.org item identifier
# for the detail and download requests below.
identifier = response["docs"][5]["identifier"]
identifier
u'AlicesAdventuresInWonderland_841'
Once we have the identifier for a document, we can retrieve more info about it, again in JSON.
hit = urllib2.urlopen("http://archive.org/details/"+Q(identifier)+"?output=json").read()
hit = json.loads(hit)
print hit.keys()
print hit["server"]
[u'files', u'misc', u'server', u'item', u'creativecommons', u'dir', u'metadata'] ia701208.us.archive.org
We're particularly interested in the files.
# hit["files"] maps each file path to a metadata dict; inspect the
# metadata keys of the first file to see what is recorded per file.
print hit["files"].items()[0][1].keys()
[u'sha1', u'format', u'source', u'mtime', u'crc32', u'md5', u'original', u'size']
The file list contains information about formats, sources, sizes, etc. We're looking for text.
fname = None
for k,v in hit["files"].items():
print repr(v["format"]),repr(v["source"]),k
if v["format"]=="DjVuTXT": fname = k
print fname
u'DjVu' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.djvu u'Abbyy GZ' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_abbyy.gz u'Image Container PDF' u'original' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.pdf u'Metadata' u'original' /AlicesAdventuresInWonderland_841_meta.xml u'Single Page Processed JP2 ZIP' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_jp2.zip u'DjVuTXT' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt u'Scandata' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_scandata.xml u'EPUB' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.epub u'Metadata' u'original' /AlicesAdventuresInWonderland_841_files.xml u'Animated GIF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.gif u'Djvu XML' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.xml u'Additional Text PDF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_text.pdf /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt
We can retrieve files from the archive.org/download
URL, combining the identifier for the document and the specific file name.
# Download the text file; fname already starts with "/", so it can be
# appended to the identifier directly.  Decode the bytes as UTF-8.
download_url = "http://archive.org/download/" + Q(identifier) + fname
text = urllib2.urlopen(download_url).read().decode("utf-8")
text[:400]
u'\n\n\n1 \n\n\n\n\nwtx \n\n\n\n\n\n% \xa7eb vtfy "tired of $LM&$ \nby nzr sisfer* ojl. tdthlmnh \n\ndo : once or "twice, sit ka.A \nfittfottL tufa i&& Irotk ktv \nS^^Mt Si ^ r w<t ^ riding, frat it \nk<U Ko pictures or conversation*- in lt } ajruL wh&*. is tfa, \nU&& of a- (rook t -ikoiL^kir Alice, , wii&out- pictures &<r can.* \n-VtrScrtio-ns t So ska, MCLS cons.ttle.rino> <-7i. fi\xa3r own niind, \n^aS w&^ as S/te- Could, fa'
We can now continue to process this text, for example with NLTK.
# Split the raw OCR text into tokens with NLTK's default word tokenizer.
import nltk
tokenize = nltk.tokenize.word_tokenize
tokens = tokenize(text)
tokens[200:300]
[u'.', u'\u25a0', u'/I', u'nor', u'way', u'"', u'to', u'Uar', u'^', u'U', u'T', u'^', u'Ub', u'say', u'~', u'6', u'>', u'rfs', u'\xab', u'#', u"'", u'cU', u'\xb1', u'r', u',', u'dear', u';', u'*', u'UtL', u'U', u'too', u'\xa3', u'*', u'\xa3', u'e', u'r', u'(', u'vji', u'*', u'n.', u'$', u'U', u'idLca.', u'3', u'i&', u'A', u'ovtr', u'*', u'.', u'\xa3', u'&-', u'-WO-rcLS', u')', u'tir', u'occurred', u'&', u'A-', u'*', u'*', u'*', u'that', u's', u'^', u'e', u'oll', u'^', u'H', u'tfi', u'kavt', u'woTuLkfttL', u'at', u'-tiiis', u',', u'(', u'rat', u'ai', u'Ofb', u'tirae', u',', u'l&', u'alt', u'\u2022', u'seemed', u'auitl', u'natural', u'}', u'>', u'bu.t', u'wAe', u'*', u'.', u'\xb1', u'kt', u'raUit', u'actadly', u'-took', u'QL', u'w', u'atch.', u'out']