import os
import sqlite3
import sys
from collections import defaultdict

import matplotlib
import numpy as np
import pandas as pd
from lxml import etree
from pandas.io import sql

# Data files used by this analysis, relative to the project root.
DATA_FILES = {
    "nondigitized": "Project/Data/non_google_pd_pdus.xml",
    "fullhathifiles": "Project/Data/hathi_full_20130301.txt",
}

# In the XML file the `xmlns:dc=` URL was replaced by xmlns:dc="#", so every
# element tag reported by lxml carries the "{#}" namespace prefix.
DC_NAMESPACE_PREFIX = "{#}"

# Child elements that are deliberately left out of the DataFrame.
SKIPPED_TAGS = ("identifier", "description")

# Authors need more than this many publications to be kept in maxpublications.
MIN_PUBLICATIONS = 50


def file_path(key):
    """Return the path of the data file registered under *key*.

    The path is built relative to the parent directory (``os.pardir``).

    Raises:
        KeyError: if *key* is not a key of ``DATA_FILES``.
    """
    return os.path.join(os.pardir, DATA_FILES[key])


def _record_to_dict(record):
    """Flatten one XML record into a dict.

    The result holds the record's attributes plus, for every child element
    not listed in ``SKIPPED_TAGS``, the child's text keyed by its tag with
    the namespace prefix removed.
    """
    row = dict(record.attrib)
    for child in record:
        name = child.tag.replace(DC_NAMESPACE_PREFIX, "")
        if name in SKIPPED_TAGS:
            continue
        # A tag that occurs more than once overwrites the earlier value.
        row[name] = child.text
    return row


# Bare expressions below display their value when run interactively (REPL /
# notebook cell); they are no-ops when the file is run as a script.
file_path("nondigitized")

# TODO: loading the XML data into a SQL database still needs to be worked on;
# for now the records are only loaded into a DataFrame.
tree = etree.parse(file_path("nondigitized"))
root = tree.getroot()
finaldictlist = [_record_to_dict(record) for record in root]
finaldictlist
dfnondigitized = pd.DataFrame(finaldictlist)

# Find the authors with the most publications.
creatercount = dfnondigitized['creator'].value_counts()
maxpublications = creatercount[creatercount > MIN_PUBLICATIONS]
maxpublications

# Plot the top 10 authors along with their number of publications.
maxpublications.head(10).plot(kind='barh', rot=0)

# Fetch the DataFrame rows of the most prolific author. value_counts() sorts
# in descending order, so the first index entry is the top author; this raises
# IndexError when no author exceeds MIN_PUBLICATIONS.
tmp = maxpublications.index
topauthor = dfnondigitized[dfnondigitized['creator'] == tmp[0]]
# For the top author, plot a graph of when several books were published over the years.
import re

import matplotlib.pyplot as plt

# Matches the first run of digits in a date string.
_DIGITS = re.compile(r"\d+")


def _first_number(text):
    """Return the first run of digits in *text*, or None if there is none.

    Values that are not strings (e.g. a missing date) also yield None
    instead of raising.
    """
    try:
        match = _DIGITS.search(text)
    except TypeError:
        return None
    return match.group(0) if match else None


fig = plt.figure(figsize=(5, 5))
# Count the top author's publications per leading number of the date field;
# dates without any digits are skipped.
topauthorpub = pd.Series(
    [year for year in map(_first_number, topauthor['date']) if year is not None],
    dtype=object,
).value_counts()
# The '19' bucket is excluded from the plot.
# NOTE(review): presumably '19' comes from incomplete dates such as "19--" — confirm.
topauthorpub[topauthorpub.index != '19'].sort_index().plot()

# Plot the books by access rights, on a new figure so the pie chart is not
# drawn onto the line plot above.
rights = dfnondigitized['rights'].value_counts()
plt.figure(figsize=(6, 6))
# NOTE(review): labels/explode assume exactly two rights values, ordered by
# descending count as (public domain, public domain US) — verify against the data.
labels = 'Public Domain', 'Public Domain US'
explode = (0, 0.05)
plt.pie(rights, labels=labels, explode=explode, autopct='%1.1f%%')

# Top five languages in which the books are available.
# Bare expressions display their value when run interactively (REPL /
# notebook cell); they are no-ops when the file is run as a script.
lang = dfnondigitized['language'].value_counts()
dflang = pd.DataFrame(lang)
lang.head(5)

# Statistics on the various types of books available.
typedata = dfnondigitized['type'].value_counts()
typedata