In [15]:
# Data files used in this notebook, keyed by a short name.
DATA_FILES = dict(
    nondigitized="Project/Data/non_google_pd_pdus.xml",
    fullhathifiles="Project/Data/hathi_full_20130301.txt",
)
In [62]:
import os
import sys
import pandas as pd
import numpy as np
def file_path(key):
    """Return the path of the data file registered under ``key``.

    The entry is looked up in ``DATA_FILES`` and joined onto the parent
    directory (``os.pardir``). An unknown key raises ``KeyError``.
    """
    relative_location = DATA_FILES[key]
    return os.path.join(os.pardir, relative_location)
In [19]:
# Show what the "nondigitized" entry resolves to.
file_path(key="nondigitized")
Out[19]:
'../Project/Data/non_google_pd_pdus.xml'
In [5]:
# In the original XML file, the xmlns:dc= URL was replaced with xmlns:dc="#"; with that change the parsing below works.
# TODO: loading the XML file data into a MySQL database still needs to be worked on.
from lxml import etree
from collections import defaultdict
import pandas as pd
import sqlite3
from pandas.io import sql
# Parse the non-digitized records XML and flatten every record into one dict:
# the record's XML attributes plus the text of each child element, except
# <identifier> and <description>. Child tags carry a "{#}" namespace prefix,
# which is stripped so the DataFrame columns get plain names.
SKIPPED_TAGS = ('identifier', 'description')

tree = etree.parse(file_path("nondigitized"))
root = tree.getroot()
finaldictlist = []
# Iterate the elements directly; getchildren() is deprecated in lxml.
for record in root:
    # Comments / processing instructions have a non-string .tag: not records.
    if not isinstance(record.tag, str):
        continue
    fields = dict(record.attrib)
    for child in record:
        if not isinstance(child.tag, str):
            continue
        column = child.tag.replace("{#}", "")
        if column in SKIPPED_TAGS:
            continue
        # A tag repeated inside one record keeps only its last value.
        fields[column] = child.text
    finaldictlist.append(fields)
dfnondigitized = pd.DataFrame(finaldictlist)
In [6]:
# Count records per creator and keep the creators with more than 50 of them.
creatercount = dfnondigitized['creator'].value_counts()
maxpublications = creatercount.loc[lambda counts: counts > 50]
maxpublications
Out[6]:
Oliphant, Mrs. (Margaret), 1828-1897.                           197
Abdülhamid II, Sultan of the Turks, 1842-1918, former owner.    192
Valpy, Abraham John, 1787-1854.                                 170
Cairns Collection of American Women Writers.                    156
United States. Office of the Federal Register.                  146
Freeman, A. C. (Abraham Clark), 1843-1911.                      146
Lytton, Edward Bulwer Lytton, Baron, 1803-1873.                 145
Scott, Walter, Sir, 1771-1832.                                  136
Dickens, Charles, 1812-1870.                                    116
Saintsbury, George, 1845-1933.                                  114
Lawrence J. Gutter Collection of Chicagoana (University of Illinois at Chicago) ICIU    111
Defoe, Daniel, 1661?-1731.                                      110
James, G. P. R. (George Payne Rainsford), 1801?-1860.           108
Balzac, Honoré de, 1799-1850.                                   108
Canada. Parliament.                                             107
Lever, Charles James, 1806-1872.                                103
Black, William, 1841-1898.                                      101
Chalmers, Alexander, 1759-1834.                                 101
Whyte-Melville, G. J. (George John), 1821-1878.                 100
Eliot, George, 1819-1880.                                        98
Thackeray, William Makepeace, 1811-1863.                         96
Wood, Henry, Mrs., 1814-1887.                                    95
Leigh, Oliver Herbrand Gordon.                                   89
Baring-Gould, S. (Sabine), 1834-1924.                            88
Swift, Jonathan, 1667-1745.                                      84
Russell, William Clark, 1844-1911.                               81
Carlyle, Thomas, 1795-1881.                                      79
Kipling, Rudyard, 1865-1936.                                     77
Braddon, M. E. (Mary Elizabeth), 1837-1915.                      77
Disraeli, Benjamin, Earl of Beaconsfield, 1804-1881.             76
Lang, Andrew, 1844-1912.                                         75
Ouida, 1839-1908.                                                75
Longfellow, Henry Wadsworth, 1807-1882.                          75
Besant, Walter, Sir, 1836-1901.                                  74
Gore, Mrs. (Catherine Grace Frances), 1799-1861.                 74
Cooper, James Fenimore, 1789-1851.                               73
Kingsley, Charles, 1819-1875.                                    72
Dumas, Alexandre, 1802-1870.                                     71
Reid, Mayne, 1818-1883.                                          71
Swedenborg, Emanuel, 1688-1772.                                  71
Stevenson, Robert Louis, 1850-1894.                              71
Johnson, Samuel, 1709-1784.                                      69
Ruskin, John, 1819-1900.                                         69
Yonge, Charlotte Mary, 1823-1901.                                69
Harte, Bret, 1836-1902.                                          66
New York (State). Court of Appeals                               66
Hewlett, Maurice, 1861-1923.                                     66
Smart, Hawley, 1833-1893.                                        65
Moore, Thomas, 1779-1852.                                        64
MacDonald, George, 1824-1905.                                    64
Karl Baedeker (Firm)                                             63
Braddon, M. E. (Mary Elizabeth), 1835-1915.                      63
Phillpotts, Eden, 1862-1960.                                     63
Meyerhof, Max, 1874-1945, former owner.                          63
Irving, Washington, 1783-1859.                                   61
Burke, Edmund, 1729-1797.                                        61
Ridpath, John Clark, 1840-1900.                                  61
Great Britain. Laws, statutes, etc.                              61
Hunt, Leigh, 1784-1859.                                          60
Collins, Wilkie, 1824-1889.                                      60
Payn, James, 1830-1898.                                          60
Burton, Richard Francis, Sir, 1821-1890.                         59
Ohio.                                                            59
Schiller, Friedrich, 1759-1805.                                  59
Riley, James Whitcomb, 1849-1916.                                59
Meredith, George, 1828-1909.                                     58
Mühlbach, L. (Luise), 1814-1873.                                 58
Fitzgerald, Percy Hetherington, 1834-1925.                       58
Shakespeare, William, 1564-1616.                                 58
Grant, James, 1822-1887.                                         57
Arthur, T. S. (Timothy Shay), 1809-1885.                         56
Henty, G. A. (George Alfred), 1832-1902.                         56
Traill, H. D. (Henry Duff), 1842-1900.                           55
Abbott, Jacob, 1803-1879.                                        54
Bennett, Arnold, 1867-1931.                                      54
Tabor, Eliza.                                                    54
Dobson, Austin, 1840-1921.                                       54
Byron, George Gordon Byron, Baron, 1788-1824.                    53
Königlich Bayerische Akademie der Wissenschaften. Historische Kommission.     53
Lecky, William Edward Hartpole, 1838-1903.                       53
Walls, Thomas H., tr.                                            52
Rouillu, Charles August, 1883- ed.                               52
Crockett, S. R. (Samuel Rutherford), 1860-1914.                  52
Gladstone, W. E. (William Ewart), 1809-1898.                     52
Length: 84
In [7]:
#Plot a horizontal bar chart of the top 10 authors with their number of publications
import matplotlib
# maxpublications is ordered by descending count, so its first 10 entries are
# the top 10 authors; head(10) keeps the chart in line with "top 10".
maxpublications.head(10).plot(kind='barh',rot=0)
Out[7]:
<matplotlib.axes.AxesSubplot at 0x10f1f1c50>
In [152]:
# Select the rows of dfnondigitized whose creator is the first entry of
# maxpublications (the author with the most publications).
tmp = maxpublications.index
is_top_author = dfnondigitized['creator'] == tmp[0]
topauthor = dfnondigitized[is_top_author]
In [153]:
#For the top author, plot how many books were published in each year.
import matplotlib.pyplot as plt
fig = plt.figure()
fig.set_size_inches(5,5)
# Use the first run of digits in each date string as the year. str.extract
# yields NaN for dates that are missing or contain no digits, and
# value_counts() leaves those out, so such rows no longer raise.
topauthorpub = topauthor['date'].str.extract(r"(\d+)", expand=False).value_counts()
# '19' is excluded from the plot (presumably from incomplete dates -- TODO
# confirm); errors='ignore' keeps the cell working when no such entry exists.
topauthorpub.drop('19', errors='ignore').sort_index().plot()
Out[153]:
<matplotlib.axes.AxesSubplot at 0x11cd39050>
In [97]:
#Plot the share of books by access rights as a pie chart
# Use the pyplot namespace explicitly: bare figure()/pie() only exist when the
# kernel was started in pylab mode and fail with NameError on a fresh kernel.
import matplotlib.pyplot as plt
rights = dfnondigitized['rights'].value_counts()
plt.figure(1, figsize=(6,6))
# NOTE(review): labels are positional -- they assume value_counts() returns
# exactly two rights values with the public-domain one first; confirm against
# rights.index.
labels = 'Public Domain', 'Public Domain US'
# Pull the second wedge slightly away from the centre.
explode=(0, 0.05)
plt.pie(rights,labels=labels, explode=explode,autopct='%1.1f%%')
Out[97]:
([<matplotlib.patches.Wedge at 0x112ec8650>,
  <matplotlib.patches.Wedge at 0x112ec8c10>],
 [<matplotlib.text.Text at 0x112ec8a10>,
  <matplotlib.text.Text at 0x112ecc090>],
 [<matplotlib.text.Text at 0x112ec8ad0>,
  <matplotlib.text.Text at 0x112ec8b90>])
In [121]:
# Count books per language and show the five most common languages.
lang = dfnondigitized['language'].value_counts()
dflang = pd.DataFrame(lang)
lang.head(5)
Out[121]:
eng    107510
fre      2947
ger      2703
spa       652
ita       584
In [155]:
# Number of records for each value of the 'type' column.
type_column = dfnondigitized['type']
typedata = type_column.value_counts()
typedata
Out[155]:
Text                                                            113133
Collection                                                        3024
Authors' presentation copies (Provenance) rbprov IU-R              141
Three deckers. rbgenr                                               60
Publishers' cloth bindings (Binding) New York (State) New York 20th century. rbbin        49
Publishers' advertisements. rbgenr                                  33
Historical fiction. gsafd                                           22
Mystery and detective fiction Great Britain. rbgenr                 19
Periodicals. rbgenr                                                 18
Publishers' cloth bindings (Binding) Illinois Chicago 19th century. rbbin        17
Publishers' cloth bindings (Binding) New York (State) New York 19th century. rbbin        17
Juvenile literature. rbgenr                                         15
Bookplates (Provenance). rbprov                                     15
Author's presentation copies (Provenance). rbprov IU-R              13
California, Southern Imprints Specimens. local                      13
...
Gilt edges. rbbin                                               1
Juvenile literature Vermont Woodstock 1823. rbgenr              1
Short stories. gsafd                                            1
Unopened books (Binding). rbbin                                 1
Presentation inscriptions. rbprov                               1
Annotations (Provenance) 20th century. rbprov CLUW              1
Guidebooks Washington (State) Seattle 20th century. rbgenr      1
Travel literature Canada 19th century. rbgenr                   1
Headpieces (layout features) 16th century. aat.                 1
Publishers' copies (Provenance) rbprov.                         1
Publishers' paper bindings (Binding) Indiana Indianapolis 20th century. rbbin    1
Ink stamps (Provenance). rbprov                                 1
Political fiction. lcsh                                         1
Devotional literature. rbgenr                                   1
Religious fiction. lcsh                                         1
Length: 218