# Input data files, keyed by a short name. Each value is a path fragment
# that file_path() joins onto the parent directory.
DATA_FILES = {
    "nondigitized": "Project/Data/non_google_pd_pdus.xml",
    "fullhathifiles": "Project/Data/hathi_full_20130301.txt",
}

import os
import sys
import pandas as pd
import numpy as np
def file_path(key):
    """Return the path of the data file registered under *key*.

    The path fragment is looked up in ``DATA_FILES`` and joined onto
    ``os.pardir``, so the result is relative to the parent of the current
    working directory. An unknown *key* raises ``KeyError``.
    """
    relative_location = DATA_FILES[key]
    return os.path.join(os.pardir, relative_location)


# Notebook-style echo of the resolved path; the returned value is discarded.
file_path("nondigitized")

# In the original file, the xmlns:dc= URL was replaced with xmlns:dc="#"; this works.
# Loading the XML file data into a MySQL database still needs to be worked on.
from lxml import etree
from collections import defaultdict
import pandas as pd
import sqlite3
from pandas.io import sql
# Parse the non-digitized records file and flatten every record element into
# one dict (its XML attributes plus the text of its child elements), then
# load the dicts into a DataFrame with one row per record.
tree = etree.parse(file_path("nondigitized"))
root = tree.getroot()
finaldictlist = []
# Iterate the elements directly: lxml deprecates getchildren().
for record in root:
    if not isinstance(record.tag, str):
        # Comments and processing instructions carry a non-string tag.
        continue
    tmpdict = dict(record.attrib)
    for ch in record:
        if not isinstance(ch.tag, str):
            continue
        # The input file declares xmlns:dc="#", so namespaced tags look like
        # "{#}creator"; drop that prefix to get the bare field name.
        fieldname = ch.tag.replace("{#}", "")
        if fieldname in ('identifier', 'description'):
            # These two fields are deliberately left out of the table.
            continue
        # A tag that occurs more than once in a record overwrites the
        # earlier value, so only the last occurrence is kept.
        tmpdict[fieldname] = ch.text
    finaldictlist.append(tmpdict)
dfnondigitized = pd.DataFrame(finaldictlist)

# Count the publications per author and keep the authors with more than 50
creatercount = dfnondigitized['creator'].value_counts()
maxpublications = creatercount.loc[creatercount.gt(50)]
maxpublications

# Plot a horizontal bar chart of the top 10 authors with their number of
# publications. value_counts() already orders the counts from highest to
# lowest, so the first 10 rows are the top 10 (the previous [:11] slice
# plotted 11 authors).
import matplotlib
maxpublications[:10].plot(kind='barh', rot=0)

# Select the DataFrame rows that belong to the author with the most publications
# (the first label of maxpublications, which is ordered by descending count).
tmp = maxpublications.index
topauthor = dfnondigitized.loc[dfnondigitized['creator'].eq(tmp[0])]

# For the top author, plot how many books were published in each year.
import matplotlib.pyplot as plt
fig = plt.figure()
fig.set_size_inches(5, 5)
# Take the first run of digits from every date string. Dates without any
# digits (or missing dates) become NaN, which value_counts() leaves out,
# instead of raising as the earlier re.findall(...)[0] lookup did.
topauthorpub = topauthor['date'].str.extract(r"(\d+)", expand=False).value_counts()
# '19' is presumably what remains of partially known dates such as "19--"
# (TODO confirm against the data); errors='ignore' keeps the drop from
# raising when no such label is present.
topauthorpub.drop('19', errors='ignore').sort_index().plot()

# Plot the share of books per access-rights value as a pie chart.
# pyplot is imported here so that figure() and pie() resolve without relying
# on the names that an interactive %pylab session injects.
import matplotlib.pyplot as plt
rights = dfnondigitized['rights'].value_counts()
plt.figure(1, figsize=(6, 6))
# Derive the labels from the counted values so each slice is labelled with
# its own category regardless of ordering. The 'pd'/'pdus' codes are an
# assumption based on the data file name (TODO confirm); any other value is
# shown unchanged.
rights_names = {'pd': 'Public Domain', 'pdus': 'Public Domain US'}
labels = [rights_names.get(code, code) for code in rights.index]
# Leave the largest slice in place and pull every other slice out slightly;
# with two categories this is the original (0, 0.05).
explode = [0 if position == 0 else 0.05 for position in range(len(rights))]
plt.pie(rights, labels=labels, explode=explode, autopct='%1.1f%%')

# Top five languages in which the books are available
lang = dfnondigitized.loc[:, 'language'].value_counts()
dflang = lang.to_frame()
lang.iloc[:5]

# Number of books for each value of the 'type' column
typedata = dfnondigitized.loc[:, 'type'].value_counts()
typedata