#Importing libraries import matplotlib.pyplot as plt import pandas as pd from itertools import islice import numpy as np #Useful way of priting dataframe using html from IPython.core.display import HTML import IPython.core.display #Connect to database, our data is in mysql hosted on ec2 cluster. Since the size of data was huge hence it was essential to use a database. import MySQLdb db = MySQLdb.connect(host="bookhunters.cicwejvpogpp.us-east-1.rds.amazonaws.com", port=3306, user="bookhunters", passwd="wwod13pw!", db="bookhunters", charset='utf8') db.set_character_set('utf8') cursor = db.cursor() cursor.execute('SET NAMES utf8') cursor.execute('SET CHARACTER SET utf8') # execute SQL select statement to fetch results from database cursor.execute("select 'Hathtrust' , count(1) from ht_books union select 'Gutenberg', count(1) from gut_books union select 'Open library', count(1) from ol_books") results = cursor.fetchall() #Store results in a dataframe l = list(results) df = pd.DataFrame(l, columns=["Source","Count"]) df = df.set_index(df.Source) df df['Count'].plot(kind = 'bar',title="Total books") #Plot graphs based on publish date cursor.execute("select pubdate,'Hathitrust' as Source, COUNT(1) from ht_books group by pubdate, 'Hathitrust' union select publish_date,'Openlibrary' as Source, COUNT(1) from ol_books group by publish_date,'Openlibrary' union select created,'Gutenberg' as Source, COUNT(1) from gut_books group by created,'Gutenberg'") dates = cursor.fetchall() dateslist = list(dates) dfdates=pd.DataFrame(dateslist,columns=["Year","Source","Count"]) #display(HTML(dfdates[:-25].to_html())) #Parsing the dates field to fetch only the year, remove redundant text. import re def returnnumber(num): try: tmp = re.findall(("(\d{4})") ,num) if (len(tmp)==1): return int("".join(tmp)) else: return None except: return None #Removing null fields from the dataframe to add another column called FilteredDate to store parsed date. dfdates.dropna() s = pd.Series() s = dfdates.Year.apply(returnnumber) dfdates["FilteredDate"]=s dfdates.sort('FilteredDate') dfdates = dfdates[(dfdates.FilteredDate>1990) & (dfdates.FilteredDate<2007)] #display(HTML(dfdates.to_html())) #Plot the books publishes by year between 1990 and 2002 d = dfdates.pivot_table('Count', rows='FilteredDate', cols='Source', aggfunc=sum) d d.plot(title="Total books collection by year", linewidth=2.5 ) def ipynb_input(varname, prompt=''): """Prompt user for input and assign string val to given variable name.""" js_code = (""" var value = prompt("{prompt}",""); var py_code = "{varname} = '" + value + "'"; IPython.notebook.kernel.execute(py_code); """).format(prompt=prompt, varname=varname) return IPython.core.display.Javascript(js_code) ipynb_input("fromyear", prompt='Enter the from year: ') ipynb_input("toyear", prompt='Enter the to year: ') dfdatesuser = dfdates[(dfdates.FilteredDate>int(fromyear)) & (dfdates.FilteredDate3: tmp= l[-3:] else: tmp=l return langdict[tmp] except: return 'Other' dflang_gb6= dflang_gb[dflang_gb.Language.str.strip()!=""].sort('Count')[::-1][:6] dflang_ol6 = dflang_ol[dflang_ol.Language.str.strip()!=""].sort('Count')[::-1][:6] dflang_ht6 = dflang_ht[dflang_ht.Language.str.strip()!=""].sort('Count')[::-1][:6] dflang_gb6['Lang']= dflang_gb[dflang_gb.Language.str.strip()!=""].sort('Count')[::-1][:6].Language.apply(language) dflang_ol6['Lang']= dflang_ol[dflang_ol.Language.str.strip()!=""].sort('Count')[::-1][:6].Language.apply(language) dflang_ht6['Lang']= dflang_ht[dflang_ht.Language.str.strip()!=""].sort('Count')[::-1][:6].Language.apply(language) df_mer_lang = dflang_gb6.append(dflang_ol6).append(dflang_ht6) df_mer_lang = df_mer_lang.pivot_table('Count', rows='Lang', cols='Source', aggfunc=sum) df_mer_lang.fillna(0) fig = plt.figure() fig.set_size_inches(10,10) ax1 = fig.add_subplot(2, 2, 1) df_mer_lang.fillna(0)['Gutenberg'].plot(kind="barh", title="Gutenberg",color='r') ax2 = fig.add_subplot(2, 2, 2) df_mer_lang.fillna(0)['Openlibrary'].plot(kind="barh", title="Openlibrary",color='b') ax3 = fig.add_subplot(2, 2, 3) print "\n" print "\n" df_mer_lang.fillna(0)['Hathitrust'].plot(kind="barh", title="Hathitrust",color='g') query = " ".join(["select COUNT(FORM) AS formatcnt, FORM as format, Source", "from( select case WHEN gut_files.FORMAT LIKE '%epub%' then 'epub'", "else case WHEN gut_files.FORMAT LIKE '%pdf%' THEN 'pdf' ", "else case WHEN gut_files.format like '%text%' THEN 'text' END END END AS FORM , 'Gutenberg' as Source", "from gut_files) a", "group by FORM , Source", "union SELECT count('PDF') as formatcnt,'pdf' as format, 'Hathitrust'as Source from ht_books where Access='allow' group by 'PDF','Hathitrust' ", "union select count('epub') as formatcnt,'epub' as format, 'Openlibrary' as Source from ol_books group by 'epub','Openlibrary' ", "union select count('text') as formatcnt, 'text' as format, 'Openlibrary' as Source from ol_books group by 'text','Openlibrary' ", "union select count('pdf') as formatcnt, 'pdf' as format, 'Openlibrary' as Source from ol_books group by 'pdf','Openlibrary' " ]) cursor.execute(query) formatres = list(cursor.fetchall()) dfformatres = pd.DataFrame(formatres,columns=['Count','Format','Source']) dfformatres = dfformatres.pivot_table('Count', rows='Format', cols='Source', aggfunc=sum) dfformatres.plot(kind="bar", stacked=True) def maketable(df): imagedict={6:"http://fchasen.com/cal/open-data/icons/pdf-icon.png", 8:"http://fchasen.com/cal/open-data/icons/epub-icon.png", 7:"http://fchasen.com/cal/open-data/icons/text-icon.png"} columns=df.columns table ="" table +=" " for i in columns: table += "".format(i) table += "" for k,s in df.iterrows(): table +="" table += "".format(k) for colnum,j in enumerate(s): #print type(j) try: if (colnum in [6,7,8]): if j is not None and j[:4]=="http": table += "".format(j,imagedict[colnum]) else: table += "" else: table += "".format(j.encode('utf-8')) except: table += "".format(j) table += "" table += "
Results{0}
{0}{0}{0}
" return table def searchBook(q, o="title"): if o == "title": g = "title" ol = "title" h = "Title" if o == "author": ol = "ol_authors.name" g = "creator" h = "Imprint" query = "".join(["select distinct 'Gutenberg' as source, g.title as title, " , "g.creator as author, g.lang as language,'Unknown' as pubdate, 'public domain' as rights, ", "(select about from gut_files where format like '%pdf%' and etext_id=g.etext_id limit 1) as pdf, ", "(select about from gut_files where format like '%text%' and etext_id=g.etext_id limit 1) as text, ", "(select about from gut_files where format like '%epub%' and etext_id=g.etext_id limit 1) as epub ", "from gut_books g ", "where lower("+g+") like lower('%"+q+"%') " , "union ", "select distinct 'Hathitrust' as source, Title as title, ", "'Unknown' as author, lang as language, PubDate as pubdate, case Access when 'allow' then 'public domain' else 'non public' end as rights, ", "concat('http://hdl.handle.net/2027/',VolumeID) as pdf,'N/A' as epub, 'N/A' as text from ", "ht_books where lower("+h+") like lower('%"+q+"%') and Access = 'allow' ", "union ", "select distinct 'OpenLibrary' as source, title, name as author, ", "language, publish_date as pubdate, 'Unknown' as rights, ", "concat('http://www.archive.org/download/',ocaid, '/', ocaid, '.pdf') as pdf, ", "concat('http://www.archive.org/download/',ocaid, '/', ocaid, '_djvu.txt') as text, ", "concat('http://www.archive.org/download/',ocaid, '/', ocaid, '.epub') as epub ", "from ol_books LEFT JOIN ol_authors ON ol_books.author_key = ol_authors.key where lower("+ol+") like lower('%"+q+"%')"]) cursor.execute(query) res = list(cursor.fetchall()) if(len(res)): dfresult = pd.DataFrame(res ,columns=['source','title', 'author','language','pubdate','rights','pdf','text','epub'] ) dfresult['language']=dfresult.language.apply(language) print "Total Results:",len(dfresult) return dfresult else: print "No results returned, try again" return False inputHTML = ''' ''' js_code = (""" var $body = $("body"); $body.on('click', '#book-submit', function() { var $search = $('#book-search'), $select = $('#book-select'); var py_code1 = "queryterm = '" + $search.val() + "'"; var py_code2 = "on = '" + $select.val() + "'"; console.log('submit:', $search.val(), $select.val()); IPython.notebook.kernel.execute(py_code1); IPython.notebook.kernel.execute(py_code2); }); """) IPython.core.display.Javascript(js_code) HTML(inputHTML) dfresult = searchBook(queryterm, on) source = dfresult.groupby('source') source['source'].value_counts().plot(kind="bar", title="Count distribution of searched results") lbl= dfresult['language'].value_counts().index #dfresult.value_counts().plot(kind="bar") pie(dfresult['language'].value_counts(), labels=lbl, autopct='%1.1f%%') #type(dfresult['language']) display(HTML(maketable(dfresult))) if on == "title": h = "Title" if on == "author": h = "Imprint"#Fetching books only from hathitrust query = "".join(["select Title as title, 'Unknown' as subtitle, 'Unknown' as author,", "lang as language, PubDate as pubdate, Access as rights, concat('http://hdl.handle.net/2027/',VolumeID) as link,", "'pdf' as format from ht_books where "+h+" like '%" , queryterm, "%'"]) query cursor.execute(query) htres = list(cursor.fetchall()) dfht= pd.DataFrame(htres , columns=['title','subtitle','author','language','pubdate','rights','link','format']) #dislaying results from hathitrust display(HTML(dfht.to_html())) dfht.pubdate.apply(returnnumber).value_counts().plot(kind="barh", title = "number of books published per year for the queries book") dfht.language.apply(language) lbl = dfht.language.apply(language).value_counts().index pie(dfht.language.apply(language).value_counts(), labels=lbl, autopct='%1.1f%%')