#Importing libraries

import matplotlib.pyplot as plt
import pandas as pd
from itertools import islice
import numpy as np
#Useful way of priting dataframe using html
from IPython.core.display import HTML
import IPython.core.display


#Connect to database, our data is in mysql hosted on ec2 cluster. Since the size of data was huge hence it was essential to use a database.
import MySQLdb
db = MySQLdb.connect(host="bookhunters.cicwejvpogpp.us-east-1.rds.amazonaws.com", port=3306, user="bookhunters", passwd="wwod13pw!",
db="bookhunters", charset='utf8')
db.set_character_set('utf8')
cursor = db.cursor()
cursor.execute('SET NAMES utf8')
cursor.execute('SET CHARACTER SET utf8')

# execute SQL select statement to fetch results from database
cursor.execute("select 'Hathtrust' , count(1) from ht_books union select 'Gutenberg', count(1) from gut_books union select 'Open library', count(1) from ol_books")
results = cursor.fetchall()

#Store results in a dataframe
l = list(results)
df = pd.DataFrame(l, columns=["Source","Count"])
df = df.set_index(df.Source)
df

df['Count'].plot(kind = 'bar',title="Total books")

#Plot graphs based on publish date
cursor.execute("select pubdate,'Hathitrust' as Source, COUNT(1) from ht_books group by pubdate, 'Hathitrust' union select publish_date,'Openlibrary' as Source, COUNT(1) from ol_books group by publish_date,'Openlibrary' union select created,'Gutenberg' as Source, COUNT(1) from gut_books group by created,'Gutenberg'")
dates = cursor.fetchall()


dateslist = list(dates)
dfdates=pd.DataFrame(dateslist,columns=["Year","Source","Count"])
#display(HTML(dfdates[:-25].to_html()))

#Parsing the dates field to fetch only the year, remove redundant text.
import re
def returnnumber(num): 
    try:
        tmp = re.findall(("(\d{4})") ,num)
        if (len(tmp)==1):
            return int("".join(tmp))
        else:
            return None
    except:
        return None
    

#Removing null fields from the dataframe to add another column called FilteredDate to store parsed date.
dfdates.dropna()
s = pd.Series()
s = dfdates.Year.apply(returnnumber)
dfdates["FilteredDate"]=s

dfdates.sort('FilteredDate')
dfdates = dfdates[(dfdates.FilteredDate>1990) & (dfdates.FilteredDate<2007)]
#display(HTML(dfdates.to_html()))

#Plot the books publishes by year between 1990 and 2002
d = dfdates.pivot_table('Count', rows='FilteredDate', cols='Source', aggfunc=sum)
d

d.plot(title="Total books collection by year", linewidth=2.5 )

def ipynb_input(varname, prompt=''):
    """Prompt user for input and assign string val to given variable name."""
    js_code = ("""
        var value = prompt("{prompt}","");
        var py_code = "{varname} = '" + value + "'";
        IPython.notebook.kernel.execute(py_code);
    """).format(prompt=prompt, varname=varname)
    return IPython.core.display.Javascript(js_code)

ipynb_input("fromyear", prompt='Enter the from year: ')

ipynb_input("toyear", prompt='Enter the to year: ')

dfdatesuser = dfdates[(dfdates.FilteredDate>int(fromyear)) & (dfdates.FilteredDate<int(toyear))]
duser = dfdatesuser.pivot_table('Count', rows='FilteredDate', cols='Source', aggfunc=sum)
duser.plot(title="Books collection by year", linewidth=2.5 )

cursor.execute("select lang as lang,count(1) as count,'Hathitrust' as Source from ht_books where lang is not null group by (lang)") 
dflang_ht = pd.DataFrame(list(cursor.fetchall()),columns=['Language','Count','Source'])
cursor.execute("select lang as lang ,count(1) as count,'Gutenberg' as Source from gut_books where lang is not null group by (lang)")
dflang_gb = pd.DataFrame(list(cursor.fetchall()),columns=['Language','Count','Source'])
cursor.execute("select language as lang,count(1) as count, 'Openlibrary' as Source from ol_books where language is not null group by (language)")
dflang_ol = pd.DataFrame(list(cursor.fetchall()),columns=['Language','Count','Source'])

#Function to translate date abbreviation to their full form
def language(l):
    langdict= { 'eng':'English',
                'ger':'German',
                'fre':'French',
                'chi':'Chinese',
                'jpn':'Japan',
                'rus':'Russian',
                'en':'English',
                'fr':'French',
                'de':'Dutch',
                'nl':'Dutch',
                'pt':'Portugese',
                'spa':'Spanish',
                'ita':'Italian',
                'lat':'Latin',
                'fr':'French',
                'fi':'Finnish'    }
    try:
        if len(l)>3:
            tmp= l[-3:]
        else:
            tmp=l
        return langdict[tmp]
    except:
        return 'Other'
    

dflang_gb6= dflang_gb[dflang_gb.Language.str.strip()!=""].sort('Count')[::-1][:6]
dflang_ol6 = dflang_ol[dflang_ol.Language.str.strip()!=""].sort('Count')[::-1][:6]
dflang_ht6 = dflang_ht[dflang_ht.Language.str.strip()!=""].sort('Count')[::-1][:6]
dflang_gb6['Lang']= dflang_gb[dflang_gb.Language.str.strip()!=""].sort('Count')[::-1][:6].Language.apply(language)
dflang_ol6['Lang']= dflang_ol[dflang_ol.Language.str.strip()!=""].sort('Count')[::-1][:6].Language.apply(language)
dflang_ht6['Lang']= dflang_ht[dflang_ht.Language.str.strip()!=""].sort('Count')[::-1][:6].Language.apply(language)
df_mer_lang = dflang_gb6.append(dflang_ol6).append(dflang_ht6)

df_mer_lang = df_mer_lang.pivot_table('Count', rows='Lang', cols='Source', aggfunc=sum)
df_mer_lang.fillna(0)

fig = plt.figure()
fig.set_size_inches(10,10)
ax1 = fig.add_subplot(2, 2, 1)
df_mer_lang.fillna(0)['Gutenberg'].plot(kind="barh", title="Gutenberg",color='r')
ax2 = fig.add_subplot(2, 2, 2)
df_mer_lang.fillna(0)['Openlibrary'].plot(kind="barh", title="Openlibrary",color='b')
ax3 = fig.add_subplot(2, 2, 3)
print "\n"
print "\n"
df_mer_lang.fillna(0)['Hathitrust'].plot(kind="barh", title="Hathitrust",color='g')

query = " ".join(["select COUNT(FORM) AS formatcnt, FORM as format, Source",
"from( select case WHEN gut_files.FORMAT LIKE '%epub%' then 'epub'", 
"else case WHEN gut_files.FORMAT LIKE '%pdf%' THEN 'pdf' ",
"else case WHEN gut_files.format like '%text%' THEN 'text'  END END END AS FORM ,  'Gutenberg' as Source",
"from gut_files) a",
"group by FORM , Source",
"union SELECT count('PDF') as formatcnt,'pdf'  as format, 'Hathitrust'as Source from ht_books where Access='allow' group by 'PDF','Hathitrust' ",
"union select count('epub') as formatcnt,'epub' as format, 'Openlibrary' as Source from ol_books group by 'epub','Openlibrary' ",
"union select count('text') as formatcnt, 'text' as format, 'Openlibrary' as Source from ol_books group by 'text','Openlibrary' ",
"union select count('pdf') as formatcnt, 'pdf' as format, 'Openlibrary' as Source from ol_books group by 'pdf','Openlibrary' "
])
cursor.execute(query)
formatres = list(cursor.fetchall())
dfformatres = pd.DataFrame(formatres,columns=['Count','Format','Source'])


dfformatres = dfformatres.pivot_table('Count', rows='Format', cols='Source', aggfunc=sum)

dfformatres.plot(kind="bar", stacked=True)

def maketable(df):
    imagedict={6:"http://fchasen.com/cal/open-data/icons/pdf-icon.png",
               8:"http://fchasen.com/cal/open-data/icons/epub-icon.png",
               7:"http://fchasen.com/cal/open-data/icons/text-icon.png"}
    columns=df.columns
    
    table ="<table><thead>"
    table +=" <tr><th>Results</th>"
    for i in columns:
        table += "<th>{0}</th>".format(i)
    table += "</tr></thead><tbody>"
    for k,s  in df.iterrows():
        table +="<tr>"
        table +=    "<th>{0}</th>".format(k)
    
        for colnum,j in enumerate(s):
            #print type(j)
            try:
                if (colnum in [6,7,8]):
                    if j is not None and j[:4]=="http":
                        table +=    "<td style='word-break:break-word;'><a href='{0}' target='_blank' style='word-break:break-word;'><img src='{1}' style='height:50px; margin: 0 auto;'/></td>".format(j,imagedict[colnum])
                    else:
                        table +=    "<td style='text-align:center'><img src='http://fchasen.com/cal/open-data/icons/none-icon.png' style='height:30px; margin: 10px auto;'/></td>"
                else:
                    table +=    "<td>{0}</td>".format(j.encode('utf-8'))
            except:
                table +=    "<td>{0}</td>".format(j)
        table += "</tr>"

    table += "</tbody></table>"
    return table


def searchBook(q, o="title"):
    if o == "title":
        g = "title"
        ol = "title"
        h = "Title"
        
    if o == "author":
        ol = "ol_authors.name"
        g = "creator"
        h = "Imprint" 
        
    
    query = "".join(["select distinct 'Gutenberg' as source, g.title as title, " ,
    "g.creator as author,  g.lang as language,'Unknown' as pubdate, 'public domain' as rights, ",
    "(select about from gut_files where format like '%pdf%' and etext_id=g.etext_id limit 1) as pdf, ",
    "(select about from gut_files where format like '%text%' and etext_id=g.etext_id limit 1) as text, ",
    "(select about from gut_files where format like '%epub%' and etext_id=g.etext_id limit 1) as epub ",
    "from gut_books g  ",
    "where lower("+g+") like lower('%"+q+"%')  "  ,
    "union ",
    "select distinct 'Hathitrust' as source, Title as title,  ",
    "'Unknown' as author, lang as language, PubDate as pubdate, case Access when 'allow' then 'public domain' else 'non public' end as rights, ",
    "concat('http://hdl.handle.net/2027/',VolumeID) as pdf,'N/A' as epub, 'N/A' as text from ",
    "ht_books where lower("+h+") like lower('%"+q+"%') and Access = 'allow' ",
    "union ",
    "select distinct 'OpenLibrary' as source, title, name as author, ",
    "language, publish_date as pubdate, 'Unknown' as rights, ",
    "concat('http://www.archive.org/download/',ocaid, '/', ocaid, '.pdf') as pdf, ",
    "concat('http://www.archive.org/download/',ocaid, '/', ocaid, '_djvu.txt') as text, ",
    "concat('http://www.archive.org/download/',ocaid, '/', ocaid, '.epub') as epub ",
    "from ol_books LEFT JOIN ol_authors ON ol_books.author_key = ol_authors.key where lower("+ol+") like lower('%"+q+"%')"])
    
    cursor.execute(query)
    res = list(cursor.fetchall())
    
    if(len(res)):
        dfresult = pd.DataFrame(res ,columns=['source','title', 'author','language','pubdate','rights','pdf','text','epub'] )
        dfresult['language']=dfresult.language.apply(language)
        print "Total Results:",len(dfresult)
        return dfresult
    else:
        print "No results returned, try again"
        return False

inputHTML = '''
<input id="book-search" stype="width:400px">
    <select id="book-select">
       <option value="title">Title</option>
      <option value="author">Author</option>
    </select> 
<input type="submit" id="book-submit" value="Search">'''

js_code = ("""
    var $body = $("body");
        
    $body.on('click', '#book-submit', function() {
        var $search = $('#book-search'),
            $select = $('#book-select');
        
        var py_code1 = "queryterm = '" + $search.val() + "'";
        var py_code2 = "on = '" + $select.val() + "'";

        console.log('submit:', $search.val(), $select.val());
        
        IPython.notebook.kernel.execute(py_code1);
        IPython.notebook.kernel.execute(py_code2);

    });
    

    """)


IPython.core.display.Javascript(js_code)

HTML(inputHTML)

dfresult = searchBook(queryterm, on)

source = dfresult.groupby('source')
source['source'].value_counts().plot(kind="bar", title="Count distribution of searched results")


lbl= dfresult['language'].value_counts().index
#dfresult.value_counts().plot(kind="bar")
pie(dfresult['language'].value_counts(), labels=lbl, autopct='%1.1f%%')
#type(dfresult['language'])

display(HTML(maketable(dfresult)))

if on == "title":
    h = "Title"
        
if on == "author":
    h = "Imprint"#Fetching books only from hathitrust


query = "".join(["select Title as title, 'Unknown' as subtitle, 'Unknown' as author,",
"lang as language, PubDate as pubdate, Access as rights, concat('http://hdl.handle.net/2027/',VolumeID) as link,",
"'pdf' as format from ht_books where "+h+" like '%" ,
queryterm,
"%'"])
query
cursor.execute(query)
htres = list(cursor.fetchall())
dfht= pd.DataFrame(htres , columns=['title','subtitle','author','language','pubdate','rights','link','format'])

#dislaying results from hathitrust
display(HTML(dfht.to_html()))

dfht.pubdate.apply(returnnumber).value_counts().plot(kind="barh", title = "number of books published per year for the queries book")

dfht.language.apply(language)
lbl = dfht.language.apply(language).value_counts().index
pie(dfht.language.apply(language).value_counts(), labels=lbl, autopct='%1.1f%%')