import urllib2
import json
import re
import pprint

# helpers: a pretty-printer and shorthands for URL quoting/unquoting
pp = pprint.PrettyPrinter(indent=4).pprint
Q = urllib2.quote
U = urllib2.unquote

# build the advanced-search URL: query, requested fields, paging, JSON output;
# quote only the values so the "=" and "&" separators stay literal
query = Q("title:(alice in wonderland) AND format:(djvu)")
columns = "&".join(["fl[]="+Q(f) for f in "identifier source title".split()])
params = "rows=100&page=1&output=json"
url = "http://archive.org/advancedsearch.php?q="+query+"&"+columns+"&"+params
print url

# could we also post the query?  (see the POST sketch at the end of this file)
#jdata = json.dumps({"username":"...", "password":"..."})
#urllib2.urlopen("http://www.example.com/", jdata)

# fetch the search results and unwrap the "response" object
response = urllib2.urlopen(url).read()
response = json.loads(response)["response"]

# interactive inspection of the result set
response.keys()
response["numFound"]
len(response["docs"])

# show the first few hits
for i,e in enumerate(response["docs"][:10]):
    print i,e["title"],e["identifier"]

# pick one hit and fetch its item metadata
identifier = response["docs"][5]["identifier"]
identifier

hit = urllib2.urlopen("http://archive.org/details/"+Q(identifier)+"?output=json").read()
hit = json.loads(hit)
print hit.keys()
print hit["server"]
print hit["files"].items()[0][1].keys()

# find the plain-text (DjVuTXT) file among the item's files
fname = None
for k,v in hit["files"].items():
    print repr(v["format"]),repr(v["source"]),k
    if v["format"]=="DjVuTXT":
        fname = k
print fname

# download the text; the file keys in the details JSON already carry a
# leading "/", so no extra separator is needed between identifier and fname
text = urllib2.urlopen("http://archive.org/download/"+Q(identifier)+fname).read()
text = text.decode("utf-8")
text[:400]

# tokenize the plain text with NLTK
import nltk
tokens = nltk.tokenize.word_tokenize(text)
tokens[200:300]
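
# A sketch answering the "could we also post the query?" note above: the same
# search sent as POSTed form data instead of a GET query string.  This assumes
# archive.org's advancedsearch.php accepts the same parameters via POST, which
# is an untested assumption here; the helper name search_post is made up for
# illustration.
import urllib
import urllib2
import json

def search_post(q, fields=("identifier", "source", "title"), rows=100, page=1):
    # build the form body; one "fl[]" entry per requested field
    form = [("q", q), ("rows", rows), ("page", page), ("output", "json")]
    form += [("fl[]", f) for f in fields]
    data = urllib.urlencode(form)
    # passing a data argument makes urllib2 issue a POST instead of a GET
    reply = urllib2.urlopen("http://archive.org/advancedsearch.php", data).read()
    return json.loads(reply)["response"]

#posted = search_post("title:(alice in wonderland) AND format:(djvu)")
#print posted["numFound"]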