import urllib2
import json
import re
import pprint

# helpers: a pretty-printer and shorthands for URL quoting/unquoting
pp = pprint.PrettyPrinter(indent=4).pprint
Q = urllib2.quote
U = urllib2.unquote

# build the advanced-search URL: query, requested fields, paging, JSON output;
# quote only the values so the "=" and "&" separators stay literal
query = Q("title:(alice in wonderland) AND format:(djvu)")
columns = "&".join(["fl[]="+Q(f) for f in "identifier source title".split()])
params = "rows=100&page=1&output=json"
url = "http://archive.org/advancedsearch.php?q="+query+"&"+columns+"&"+params
print url

# could we also post the query?  (see the POST sketch at the end of this file)
#jdata = json.dumps({"username":"...", "password":"..."})
#urllib2.urlopen("http://www.example.com/", jdata)

# fetch the search results and unwrap the "response" object
response = urllib2.urlopen(url).read()
response = json.loads(response)["response"]

# interactive inspection of the result set
response.keys()
response["numFound"]
len(response["docs"])

# show the first few hits
for i,e in enumerate(response["docs"][:10]):
    print i,e["title"],e["identifier"]

# pick one hit and fetch its item metadata
identifier = response["docs"][5]["identifier"]
identifier

hit = urllib2.urlopen("http://archive.org/details/"+Q(identifier)+"?output=json").read()
hit = json.loads(hit)
print hit.keys()
print hit["server"]
print hit["files"].items()[0][1].keys()

# find the plain-text (DjVuTXT) file among the item's files
fname = None
for k,v in hit["files"].items():
    print repr(v["format"]),repr(v["source"]),k
    if v["format"]=="DjVuTXT":
        fname = k
print fname

# download the text; the file keys in the details JSON already carry a
# leading "/", so no extra separator is needed between identifier and fname
text = urllib2.urlopen("http://archive.org/download/"+Q(identifier)+fname).read()
text = text.decode("utf-8")
text[:400]

# tokenize the plain text with NLTK
import nltk
tokens = nltk.tokenize.word_tokenize(text)
tokens[200:300]
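
# A sketch answering the "could we also post the query?" note above: the same
# search sent as POSTed form data instead of a GET query string.  This assumes
# archive.org's advancedsearch.php accepts the same parameters via POST, which
# is an untested assumption here; the helper name search_post is made up for
# illustration.
import urllib
import urllib2
import json

def search_post(q, fields=("identifier", "source", "title"), rows=100, page=1):
    # build the form body; one "fl[]" entry per requested field
    form = [("q", q), ("rows", rows), ("page", page), ("output", "json")]
    form += [("fl[]", f) for f in fields]
    data = urllib.urlencode(form)
    # passing a data argument makes urllib2 issue a POST instead of a GET
    reply = urllib2.urlopen("http://archive.org/advancedsearch.php", data).read()
    return json.loads(reply)["response"]

#posted = search_post("title:(alice in wonderland) AND format:(djvu)")
#print posted["numFound"]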