import pandas as pd
# Authors, written "Surname, Forename", whose rows are fetched from the meta table via getMeta().
authors = ['Dickens, Charles', 'Collins, Wilkie', 'Scott, Walter', 'Meredith, George', 'Conrad, Joseph', 'Cather, Willa']
# SQLite database URL handed to pd.read_sql() by getMeta() and getText().
database = 'sqlite:////home/jon/Code/gitenberg-scrape/pg-text-7.db'
def getMeta(author):
    """Return every row of the ``meta`` table whose author equals *author*.

    Parameters
    ----------
    author : str
        Author name to match against the ``author`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows; empty when no row has that author.
    """
    # Bind the name as a query parameter rather than pasting it into the SQL
    # text, so names containing quote characters cannot break (or alter) the
    # statement.
    query = 'select * from meta where author=?'
    meta = pd.read_sql(query, database, params=(author,))
    return meta
# Load all of Dickens's metadata rows, then narrow them to the fifteen novels.
dickensMeta = getMeta(authors[0])
# Row labels (in dickensMeta) of the novels to keep, in the order they should appear.
dickensNovelIDs = [3, 13, 34, 12, 28, 35, 19, 14, 36, 15, 33, 1, 40, 23, 2]
# Select rows and columns in one step; reset_index() renumbers the rows from 0
# and keeps the old labels in an 'index' column.
dNovelsMeta = dickensMeta.loc[dickensNovelIDs, ['title', 'gr_pubDate', 'id']].reset_index()
dickensBib = pd.read_csv('dickens.csv')
# The assignment aligns on row label, so CSV row i is paired with the i-th novel above
# (assumes dickens.csv lists the novels in that same order -- TODO confirm).
dNovelsMeta['wp_pubDate'] = dickensBib['Year completed']
def getText(id):
    """Return the stored text for the work with the given id, or None.

    Parameters
    ----------
    id
        Identifier to match against the ``id`` column of the ``text`` table.
        It is converted with ``str()`` before the comparison.

    Returns
    -------
    The ``text`` value of the first matching row, or None when no row matches.
    """
    # Bind the value instead of interpolating it into the SQL text. str() keeps
    # the comparison identical to the previous '"%s" % id' formatting.
    query = 'select text from text where id=?'
    result = pd.read_sql(query, database, params=(str(id),))
    if len(result) > 0:
        return result['text'].iloc[0]
    return None
# Fetch the full text of every novel by id; getText() gives None when the text table has no row for it.
dNovelsMeta['text'] = dNovelsMeta['id'].apply(getText)
# Inspect the row labelled 15.
# NOTE(review): the reset_index() earlier renumbers rows from 0, so label 15 presumably
# only exists in an earlier notebook state -- confirm when running top to bottom.
dNovelsMeta.loc[15]
title Hard Times gr_pubDate 1854-- id 786.0 wp_pubDate NaN text None Name: 15, dtype: object
dickensMeta[dickensMeta.title=='Hard Times'].title
15 Hard Times Name: title, dtype: object
dNovelsMeta = dNovelsMeta.reset_index()
# getText() returned None for Hard Times, so its text is loaded from a local file instead.
# The with-block closes the file handle once the text has been read.
with open('/home/jon/Code/gitenberg-scrape/hard-times.txt') as hardTimesFile:
    hardTimes = hardTimesFile.read()
# Address the row by title rather than by a hard-coded label: after reset_index()
# the label 15 no longer refers to Hard Times, and DataFrame.set_value() has been
# removed from pandas.
dNovelsMeta.loc[dNovelsMeta['title'] == 'Hard Times', 'text'] = hardTimes
title gr_pubDate id \ 3 The Pickwick Papers 1837-- 580.0 13 Oliver Twist 1838-- 730.0 34 Nicholas Nickleby 1839-- 967.0 12 The Old Curiosity Shop 1840-- 700.0 28 Barnaby Rudge: A Tale of the Riots of 'Eighty -- 917.0 35 Martin Chuzzlewit 1844-- 968.0 19 Dombey and Son 1848-- 821.0 14 David Copperfield 1849-- 766.0 36 Bleak House 1853-- 1023.0 15 Hard Times 1854-- 786.0 33 Little Dorrit 1855-- 963.0 1 A Tale of Two Cities 1859-- 98.0 40 Great Expectations 1860-- 1400.0 23 Our Mutual Friend 1865-11- 883.0 2 The Mystery of Edwin Drood 1870-- 564.0 wp_pubDate text 3 1841.0 \n\n\n\n\n\n\n\nTHE PICKWICK PAPERS\n\n\nBy Ch... 13 1865.0 \n\n\n\n\n\n\n\n\nOLIVER TWIST\n\nOR\n\nTHE PA... 34 NaN \n\n\n\n\nTHE LIFE AND ADVENTURES OF NICHOLAS ... 12 1861.0 \n\n\n\n\n\n\n\n\nThe Old Curiosity Shop\n\nBy... 28 NaN \n\n\n\n\nBARNABY RUDGE\n\nA TALE OF THE RIOTS... 35 NaN \n\n\n\n\nLIFE AND ADVENTURES OF MARTIN CHUZZL... 19 NaN \n\n\n\n\n\n\n\nDOMBEY AND SON\n\nBy Charles D... 14 1870.0 \n\n\n\n\nDAVID COPPERFIELD\n\n\nBy Charles Di... 36 NaN and revised by Thomas Berger and Joseph E. Loe... 15 NaN The Project Gutenberg eBook, Hard Times, by C... 33 NaN \n\n\n\n\nLITTLE DORRIT\n\nBy Charles Dickens\... 1 1839.0 \n\n\n\n\n\n\n\nA TALE OF TWO CITIES\n\nA STOR... 40 NaN \n\n\n\n\nGREAT EXPECTATIONS\n\n[1867 Edition]... 23 NaN \n\n\n\n\nOUR MUTUAL FRIEND\n\nCharles Dickens... 2 1839.0 \n\nTranscribed from the Chapman and Hall, 191...
dNovelsMeta
index title gr_pubDate id \ 0 3 The Pickwick Papers 1837-- 580.0 1 13 Oliver Twist 1838-- 730.0 2 34 Nicholas Nickleby 1839-- 967.0 3 12 The Old Curiosity Shop 1840-- 700.0 4 28 Barnaby Rudge: A Tale of the Riots of 'Eighty -- 917.0 5 35 Martin Chuzzlewit 1844-- 968.0 6 19 Dombey and Son 1848-- 821.0 7 14 David Copperfield 1849-- 766.0 8 36 Bleak House 1853-- 1023.0 9 15 Hard Times 1854-- 786.0 10 33 Little Dorrit 1855-- 963.0 11 1 A Tale of Two Cities 1859-- 98.0 12 40 Great Expectations 1860-- 1400.0 13 23 Our Mutual Friend 1865-11- 883.0 14 2 The Mystery of Edwin Drood 1870-- 564.0 wp_pubDate text 0 1837 \n\n\n\n\n\n\n\nTHE PICKWICK PAPERS\n\n\nBy Ch... 1 1839 \n\n\n\n\n\n\n\n\nOLIVER TWIST\n\nOR\n\nTHE PA... 2 1839 \n\n\n\n\nTHE LIFE AND ADVENTURES OF NICHOLAS ... 3 1841 \n\n\n\n\n\n\n\n\nThe Old Curiosity Shop\n\nBy... 4 1841 \n\n\n\n\nBARNABY RUDGE\n\nA TALE OF THE RIOTS... 5 1844 \n\n\n\n\nLIFE AND ADVENTURES OF MARTIN CHUZZL... 6 1848 \n\n\n\n\n\n\n\nDOMBEY AND SON\n\nBy Charles D... 7 1850 \n\n\n\n\nDAVID COPPERFIELD\n\n\nBy Charles Di... 8 1853 and revised by Thomas Berger and Joseph E. Loe... 9 1854 The Project Gutenberg eBook, Hard Times, by C... 10 1857 \n\n\n\n\nLITTLE DORRIT\n\nBy Charles Dickens\... 11 1859 \n\n\n\n\n\n\n\nA TALE OF TWO CITIES\n\nA STOR... 12 1861 \n\n\n\n\nGREAT EXPECTATIONS\n\n[1867 Edition]... 13 1865 \n\n\n\n\nOUR MUTUAL FRIEND\n\nCharles Dickens... 14 1870 \n\nTranscribed from the Chapman and Hall, 191...
def _slugTitle(title):
    """Turn a book title into the lower-case, hyphenated form used in file names."""
    # Drop one leading article. Matching the trailing space too keeps titles
    # that merely begin with those letters (e.g. "Armadale") intact.
    for article in ('The ', 'An ', 'A '):
        if title.startswith(article):
            title = title[len(article):]
            break
    # Keep only the main title, discarding any subtitle after a colon.
    title = title.split(':')[0]
    return title.lower().replace(' ', '-')


def writeFiles(df, author):
    """Write each row's text to ``<author>/<author>-<year>-<title-slug>.txt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``title``, ``wp_pubDate`` and ``text``.
    author : str
        Used both as the output directory name and as the file-name prefix.

    Notes
    -----
    The title slug has its leading article and any subtitle removed. Earlier
    code computed that shortened title and then discarded it.
    Rows whose ``text`` is not a string (e.g. None because no text was found)
    are reported and skipped instead of aborting the whole batch.
    """
    import os  # local import: keeps the block self-contained

    # Create the output directory on first use instead of failing in open().
    os.makedirs(author, exist_ok=True)
    for _, row in df.iterrows():
        text = row['text']
        if not isinstance(text, str):
            print('Skipping %r: no text available' % (row['title'],))
            continue
        filename = "%s/%s-%s-%s.txt" % (author, author, row['wp_pubDate'], _slugTitle(row['title']))
        # Explicit encoding: the texts contain non-ASCII characters.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text)
# Write one text file per Dickens novel under the dickens/ directory.
writeFiles(dNovelsMeta, 'dickens')
# Load the metadata rows for Wilkie Collins (authors[1]) and list their titles.
collinsMeta = getMeta(authors[1])
collinsMeta.title
0 The Moonstone 1 The Haunted Hotel: A Mystery of Modern Venice 2 The Woman in White 3 No Name 4 Man and Wife 5 The Black Robe 6 A Rogue's Life 7 Miss or Mrs.? 8 The Law and the Lady 9 The New Magdalen 10 The Two Destinies 11 The Frozen Deep 12 After Dark 13 The Evil Genius: A Domestic Story 14 My Lady's Money 15 "I Say No" 16 Little Novels 17 Armadale 18 The Queen of Hearts 19 The Legacy of Cain 20 A Fair Penitent 21 Antonina; Or, The Fall of Rome 22 Poor Miss Finch 23 Jezebel's Daughter 24 The Guilty River 25 Basil 26 Blind Love 27 The Dead Alive 28 Heart and Science: A Story of the Present Time 29 Hide and Seek 30 The Fallen Leaves 31 L'hôtel hanté 32 John Jagon henki tahi kuollutko vai elävä? 33 Ilman menestyksettä: Joulukertomus 34 Sisar Rosa 35 Rambles Beyond Railways; or, Notes in Cornwall... 36 Kamala yösija 37 The Dead Secret: A Novel 38 My Miscellanies, Vol. 1 (of 2) 39 My Miscellanies, Vol. 2 (of 2) 40 Valkopukuinen nainen I: Perheromaani 41 Valkopukuinen nainen II: Perheromaani Name: title, dtype: object
# Year for each selected Collins title, built as a title -> year mapping.
collinsWP = pd.Series(dict([
    ('Basil', 1852),
    ('Hide and Seek', 1854),
    ('The Dead Secret', 1856),
    ('The Woman in White', 1860),
    ('Armadale', 1866),
    ('No Thoroughfare', 1867),
    ('The Moonstone', 1868),
    ('Man and Wife', 1870),
    ('Poor Miss Finch', 1872),
    ('The Dead Alive', 1874),
    ('The Law and the Lady', 1875),
    ('The Black Robe', 1881),
    ('Blind Love', 1889),
]))
collinsWP
Armadale 1866 Basil 1852 Blind Love 1889 Hide and Seek 1854 Man and Wife 1870 No Thoroughfare 1867 Poor Miss Finch 1872 The Black Robe 1881 The Dead Alive 1874 The Dead Secret 1856 The Law and the Lady 1875 The Moonstone 1868 The Woman in White 1860 dtype: int64
def lookup(title):
    """Return the ``id`` of the row in ``collinsMeta`` whose title contains *title*.

    Parameters
    ----------
    title : str
        Literal text to search for in the ``title`` column.

    Returns
    -------
    The matching row's ``id``, or None when no title contains *title*.

    Raises
    ------
    ValueError
        If several titles contain *title* and none of them equals it exactly.
    """
    titles = collinsMeta['title']
    # regex=False: titles are plain text, so characters such as "?", "(" or "."
    # must not be interpreted as regular-expression syntax.
    matches = collinsMeta[titles.str.contains(title, regex=False, na=False)]
    if len(matches) > 1:
        # Several candidates: an exact title match settles the ambiguity.
        exact = matches[matches['title'] == title]
        if len(exact) == 1:
            matches = exact
    if len(matches) == 0:
        return None
    if len(matches) > 1:
        raise ValueError('%r matches %d titles: %s'
                         % (title, len(matches), list(matches['title'])))
    return matches['id'].item()
lookup('Basil')
'4605.0'
collinsWP
Armadale 1866 Basil 1852 Blind Love 1889 Hide and Seek 1854 Man and Wife 1870 No Thoroughfare 1867 Poor Miss Finch 1872 The Black Robe 1881 The Dead Alive 1874 The Dead Secret 1856 The Law and the Lady 1875 The Moonstone 1868 The Woman in White 1860 dtype: int64
# Preview the metadata id found for each title. Only the titles are needed, so
# iterate over the index (Series.iteritems() has been removed from pandas).
for title in collinsWP.index:
    try:
        print(title, lookup(title))
    except ValueError:
        # lookup() can raise ValueError when a title has no single match;
        # report it and carry on with the remaining titles.
        print(title, None)
Armadale 1895.0 Basil 4605.0 Blind Love 7890.0 Hide and Seek 7893.0 Man and Wife 1586.0
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-100-d97b2bcc313a> in <module>() 1 for title, year in collinsWP.iteritems(): ----> 2 print(title, lookup(title)) <ipython-input-86-cb4a295caca1> in lookup(title) 1 def lookup(title): ----> 2 return collinsMeta[collinsMeta.title.str.contains(title)]['id'].item() ~/.local/lib/python3.6/site-packages/pandas/core/base.py in item(self) 812 """ 813 try: --> 814 return self.values.item() 815 except IndexError: 816 # copy numpy's message here because Py26 raises an IndexError ValueError: can only convert an array of size 1 to a Python scalar
# Turn the title -> year Series into a one-column frame (still indexed by title).
collinsWPDF = pd.DataFrame(collinsWP)
# Preview the frame without 'No Thoroughfare', the title that lookup() could not
# resolve. The index holds titles at this point, so the row is named by its
# title; an integer label such as 5 does not exist here. The result is not assigned.
collinsWPDF.drop('No Thoroughfare')
0 | title | wp_pubYear | |
---|---|---|---|
0 | 1866 | Armadale | 1866 |
1 | 1852 | Basil | 1852 |
2 | 1889 | Blind Love | 1889 |
3 | 1854 | Hide and Seek | 1854 |
4 | 1870 | Man and Wife | 1870 |
6 | 1872 | Poor Miss Finch | 1872 |
7 | 1881 | The Black Robe | 1881 |
8 | 1874 | The Dead Alive | 1874 |
9 | 1856 | The Dead Secret | 1856 |
10 | 1875 | The Law and the Lady | 1875 |
11 | 1868 | The Moonstone | 1868 |
12 | 1860 | The Woman in White | 1860 |
collinsWPDF['title'] = collinsWPDF.index
collinsWPDF
0 | title | |
---|---|---|
Armadale | 1866 | Armadale |
Basil | 1852 | Basil |
Blind Love | 1889 | Blind Love |
Hide and Seek | 1854 | Hide and Seek |
Man and Wife | 1870 | Man and Wife |
No Thoroughfare | 1867 | No Thoroughfare |
Poor Miss Finch | 1872 | Poor Miss Finch |
The Black Robe | 1881 | The Black Robe |
The Dead Alive | 1874 | The Dead Alive |
The Dead Secret | 1856 | The Dead Secret |
The Law and the Lady | 1875 | The Law and the Lady |
The Moonstone | 1868 | The Moonstone |
The Woman in White | 1860 | The Woman in White |
# Move the title index into a column and renumber the rows from 0.
collinsWPDF = collinsWPDF.reset_index()
# The year column is still named 0 (it came from the unnamed Series); copy it to a readable name.
collinsWPDF['wp_pubYear'] = collinsWPDF[0]
# The 'index' column produced by reset_index() duplicates 'title'.
collinsWPDF = collinsWPDF.drop('index', axis=1)
# Remove 'No Thoroughfare' by title rather than by the positional label 5, which
# only identifies that row for one particular row order. Row labels of the
# remaining rows are left unchanged.
collinsWPDF = collinsWPDF[collinsWPDF['title'] != 'No Thoroughfare']
collinsWPDF
index | 0 | title | wp_pubYear | |
---|---|---|---|---|
0 | 0 | 1866 | Armadale | 1866 |
1 | 1 | 1852 | Basil | 1852 |
2 | 2 | 1889 | Blind Love | 1889 |
3 | 3 | 1854 | Hide and Seek | 1854 |
4 | 4 | 1870 | Man and Wife | 1870 |
6 | 6 | 1872 | Poor Miss Finch | 1872 |
7 | 7 | 1881 | The Black Robe | 1881 |
8 | 8 | 1874 | The Dead Alive | 1874 |
9 | 9 | 1856 | The Dead Secret | 1856 |
10 | 10 | 1875 | The Law and the Lady | 1875 |
11 | 11 | 1868 | The Moonstone | 1868 |
12 | 12 | 1860 | The Woman in White | 1860 |
# Resolve each title to its metadata id, then fetch the text stored under that id.
collinsWPDF['id'] = collinsWPDF.title.apply(lookup)
collinsWPDF['text'] = collinsWPDF['id'].apply(getText)
# writeFiles() reads the year from a column named wp_pubDate, so mirror wp_pubYear under that name.
collinsWPDF['wp_pubDate'] = collinsWPDF['wp_pubYear']
collinsWPDF.wp_pubDate
0 1866 1 1852 2 1889 3 1854 4 1870 6 1872 7 1881 8 1874 9 1856 10 1875 11 1868 12 1860 Name: wp_pubDate, dtype: int64
writeFiles(collinsWPDF, 'collins')
authors[3]
'Meredith, George'
meredithMeta = getMeta(authors[3])
meredithCSV = pd.read_csv('meredith.csv')
meredithCSV
title | year | |
---|---|---|
0 | The Shaving of Shagpat | 1856 |
1 | Farina | 1857 |
2 | The Ordeal of Richard Feverel | 1859 |
3 | Evan Harrington | 1861 |
4 | Emilia in England | 1864 |
5 | Rhoda Fleming | 1865 |
6 | Vittoria | 1867 |
7 | The Adventures of Harry Richmond | 1871 |
8 | Beauchamp's Career | 1875 |
9 | The House on the Beach | 1877 |
10 | The Case of General Ople and Lady Camper | 1877 |
11 | The Tale of Chloe | 1879 |
12 | The Egoist | 1879 |
13 | The Tragic Comedians | 1880 |
14 | Diana of the Crossways | 1885 |
15 | One of our Conquerors | 1891 |
16 | Lord Ormont and his Aminta | 1894 |
17 | The Amazing Marriage | 1895 |
18 | Celt and Saxon | 1910 |
def meredithLookup(title):
    """Return the ``id`` of the row in ``meredithMeta`` matching *title*, or None.

    A row matches when its title contains *title* as literal text, ignoring
    case, so capitalisation differences between the CSV and the metadata do
    not prevent a match. When several rows match, those whose title contains
    'Complete' are preferred; the first remaining row is used.

    Parameters
    ----------
    title : str
        Title text to search for.

    Returns
    -------
    The ``id`` of the chosen row, or None when nothing matches.
    """
    titles = meredithMeta['title']
    matches = meredithMeta[titles.str.contains(title, case=False, regex=False, na=False)]
    if len(matches) == 0:
        return None
    if len(matches) > 1:
        # Build the mask from the already-filtered frame so it lines up with
        # the rows being filtered (avoids pandas' "Boolean Series key will be
        # reindexed" warning).
        complete = matches[matches['title'].str.contains('Complete', regex=False)]
        if len(complete) > 0:
            matches = complete
    # Positional selection: row labels of a filtered frame need not include 0.
    best = matches.iloc[[0]]
    metaID = best['id']
    metaTitle = best['title']
    print(title, '---', metaID, metaTitle, '\n\n')
    return metaID.item()
# Resolve every CSV title to a metadata id; titles with no match get None.
meredithCSV['id'] = meredithCSV.title.apply(meredithLookup)
The Shaving of Shagpat --- 10 4405.0 Name: id, dtype: object 10 The Shaving of Shagpat; an Arabian entertainme... Name: title, dtype: object Farina --- 97 4492.0 Name: id, dtype: object 97 Farina Name: title, dtype: object The Ordeal of Richard Feverel --- 17 4412.0 Name: id, dtype: object 17 The Ordeal of Richard Feverel — Complete Name: title, dtype: object Evan Harrington --- 39 4434.0 Name: id, dtype: object 39 Evan Harrington — Complete Name: title, dtype: object Emilia in England --- 25 4420.0 Name: id, dtype: object 25 Sandra Belloni (originally Emilia in England) ... Name: title, dtype: object Rhoda Fleming --- 31 4426.0 Name: id, dtype: object 31 Rhoda Fleming — Complete Name: title, dtype: object Vittoria --- 48 4443.0 Name: id, dtype: object 48 Vittoria — Complete Name: title, dtype: object The Adventures of Harry Richmond --- 57 4452.0 Name: id, dtype: object 57 The Adventures of Harry Richmond — Complete Name: title, dtype: object Beauchamp's Career --- 65 4460.0 Name: id, dtype: object 65 Beauchamp's Career — Complete Name: title, dtype: object The House on the Beach --- 100 4495.0 Name: id, dtype: object 100 The House on the Beach: A Realistic Tale Name: title, dtype: object The Case of General Ople and Lady Camper --- 98 4493.0 Name: id, dtype: object 98 The Case of General Ople and Lady Camper Name: title, dtype: object The Tale of Chloe --- 99 4494.0 Name: id, dtype: object 99 The Tale of Chloe: An Episode in the History o... Name: title, dtype: object The Egoist --- 5 1684.0 Name: id, dtype: object 5 The Egoist: A Comedy in Narrative Name: title, dtype: object The Tragic Comedians --- 69 4464.0 Name: id, dtype: object 69 The Tragic Comedians: A Study in a Well-known ... 
Name: title, dtype: object Diana of the Crossways --- 75 4470.0 Name: id, dtype: object 75 Diana of the Crossways — Complete Name: title, dtype: object The Amazing Marriage --- 93 4488.0 Name: id, dtype: object 93 The Amazing Marriage — Complete Name: title, dtype: object Celt and Saxon --- 96 4491.0 Name: id, dtype: object 96 Celt and Saxon — Complete Name: title, dtype: object
/home/jon/.local/lib/python3.6/site-packages/ipykernel_launcher.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index. after removing the cwd from sys.path.
meredithCSV
title | year | id | |
---|---|---|---|
0 | The Shaving of Shagpat | 1856 | 4405.0 |
1 | Farina | 1857 | 4492.0 |
2 | The Ordeal of Richard Feverel | 1859 | 4412.0 |
3 | Evan Harrington | 1861 | 4434.0 |
4 | Emilia in England | 1864 | 4420.0 |
5 | Rhoda Fleming | 1865 | 4426.0 |
6 | Vittoria | 1867 | 4443.0 |
7 | The Adventures of Harry Richmond | 1871 | 4452.0 |
8 | Beauchamp's Career | 1875 | 4460.0 |
9 | The House on the Beach | 1877 | 4495.0 |
10 | The Case of General Ople and Lady Camper | 1877 | 4493.0 |
11 | The Tale of Chloe | 1879 | 4494.0 |
12 | The Egoist | 1879 | 1684.0 |
13 | The Tragic Comedians | 1880 | 4464.0 |
14 | Diana of the Crossways | 1885 | 4470.0 |
15 | One of our Conquerors | 1891 | None |
16 | Lord Ormont and his Aminta | 1894 | None |
17 | The Amazing Marriage | 1895 | 4488.0 |
18 | Celt and Saxon | 1910 | 4491.0 |
# Row 15 ('One of our Conquerors') got no id from the lookup above, so copy it by
# hand from metadata row 81. DataFrame.set_value() has been removed from pandas;
# .at is the label-based scalar setter.
# NOTE(review): the table displayed after this cell also shows an id for row 16,
# which presumably came from a second, unrecorded run of this cell -- confirm.
meredithCSV.at[15, 'id'] = meredithMeta.loc[81].id
meredithCSV
title | year | id | |
---|---|---|---|
0 | The Shaving of Shagpat | 1856 | 4405.0 |
1 | Farina | 1857 | 4492.0 |
2 | The Ordeal of Richard Feverel | 1859 | 4412.0 |
3 | Evan Harrington | 1861 | 4434.0 |
4 | Emilia in England | 1864 | 4420.0 |
5 | Rhoda Fleming | 1865 | 4426.0 |
6 | Vittoria | 1867 | 4443.0 |
7 | The Adventures of Harry Richmond | 1871 | 4452.0 |
8 | Beauchamp's Career | 1875 | 4460.0 |
9 | The House on the Beach | 1877 | 4495.0 |
10 | The Case of General Ople and Lady Camper | 1877 | 4493.0 |
11 | The Tale of Chloe | 1879 | 4494.0 |
12 | The Egoist | 1879 | 1684.0 |
13 | The Tragic Comedians | 1880 | 4464.0 |
14 | Diana of the Crossways | 1885 | 4470.0 |
15 | One of our Conquerors | 1891 | 4476.0 |
16 | Lord Ormont and his Aminta | 1894 | 4482.0 |
17 | The Amazing Marriage | 1895 | 4488.0 |
18 | Celt and Saxon | 1910 | 4491.0 |
# Fetch each novel's text by id, and expose the CSV's year under the wp_pubDate name that writeFiles() reads.
meredithCSV['text'] = meredithCSV.id.apply(getText)
meredithCSV['wp_pubDate'] = meredithCSV['year']
# Write one text file per novel under the meredith/ directory.
writeFiles(meredithCSV, 'meredith')
authors[4]
'Conrad, Joseph'
conradMeta = getMeta(authors[4])
conradMeta
LCC | author | authoryearofbirth | authoryearofdeath | downloads | formats | id | languages | lcsh | ... | gr_image_url | gr_small_image_url | am_title | am_author | am_cat | am_cat_ancestors | am_genre | am_editorial_reviews | am_sales_rank | am_isbn | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 212 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 5481 | {'text/html; charset=utf-8': 'http://www.guten... | 219.0 | ['en'] | {'Psychological fiction', 'Trading posts -- Fi... | ... | https://images.gr-assets.com/books/1392799983m... | https://images.gr-assets.com/books/1392799983s... | Heart of Darkness (Dover Thrift Editions) | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['<div> <div> <P>Although Polish by birth, Jos... | 1416 | 0486264645 | |
1 | 213 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 339 | {'text/plain; charset=utf-8': 'http://www.gute... | 220.0 | ['en'] | {'Fugitives from justice -- Fiction', 'Psychol... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Heart of Darkness and The Secret Sharer (Banta... | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['<i>Heart Of Darkness</i>. The story of the c... | 180198 | 0553212141 | |
2 | 442 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 189 | {'text/plain; charset=utf-8': 'http://www.gute... | 451.0 | ['en'] | {'Sea stories', 'Ship captains -- Fiction'} | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | The Shadow-Line: A Confession (The Cambridge E... | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ["Joseph Conrad's short novel The Shadow-Line:... | 1922208 | 1107024420 | |
3 | 484 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 27 | {'text/html; charset=utf-8': 'http://www.guten... | 493.0 | ['en'] | {'Man-woman relationships -- Fiction', 'Seafar... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Falk: A Reminiscence | Joseph Conrad | Literary | [<Element {http://webservices.amazon.com/AWSEC... | ['Falk: A Reminiscence is a humorous engaging ... | 8541480 | 1542575060 | |
4 | 485 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 19 | {'text/html; charset=utf-8': 'http://www.guten... | 494.0 | ['en'] | {'Man-woman relationships -- Fiction', 'Ship c... | ... | https://images.gr-assets.com/books/1423751323m... | https://images.gr-assets.com/books/1423751323s... | To-morrow (Penguin Little Black Classics) by J... | Joseph Conrad | Subjects | [<Element {http://webservices.amazon.com/AWSEC... | [] | None | ||
5 | 486 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 101 | {'text/plain; charset=utf-8': 'http://www.gute... | 495.0 | ['en'] | {'Man-woman relationships -- Fiction'} | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Amy Foster | Joseph Conrad | Antiques & Collectibles | [<Element {http://webservices.amazon.com/AWSEC... | ['Amy Foster'] | 2580252 | 1470157535 | |
6 | 516 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 127 | {'text/plain; charset=utf-8': 'http://www.gute... | 525.0 | ['en'] | {'Fiction', 'Sea stories', 'Adventure stories'} | ... | https://images.gr-assets.com/books/1475304733m... | https://images.gr-assets.com/books/1475304733s... | Youth A Narrative | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['This anthology is a thorough introduction to... | 1360553 | 1466200448 | |
7 | 517 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 1962 | {'text/html': 'http://www.gutenberg.org/ebooks... | 526.0 | ['en'] | {'Psychological fiction', 'Trading posts -- Fi... | ... | https://images.gr-assets.com/books/1392799983m... | https://images.gr-assets.com/books/1392799983s... | Heart of Darkness (Dover Thrift Editions) | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['<div> <div> <P>Although Polish by birth, Jos... | 1416 | 0486264645 | |
8 | 518 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 40 | {'text/html; charset=utf-8': 'http://www.guten... | 527.0 | ['en'] | {'Older men -- Fiction', 'Ship captains -- Fic... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Youth; Heart of Darkness; The End of the Tethe... | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['<b>Conrad\'s aim was by "the power of the wr... | 562090 | 0140185135 | |
9 | 623 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 108 | {'text/plain; charset=utf-8': 'http://www.gute... | 638.0 | ['en'] | {'Psychological fiction', 'Clerks -- Fiction',... | ... | https://images.gr-assets.com/books/1305500836m... | https://images.gr-assets.com/books/1305500836s... | An Outcast of the Islands | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['This work has been selected by scholars as b... | 2745059 | 1346655405 | |
10 | 672 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 110 | {'text/plain; charset=utf-8': 'http://www.gute... | 687.0 | ['en'] | {'Conrad, Joseph, 1857-1924', 'Novelists, Engl... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | The Mirror of the Sea and A Personal Record (T... | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['<em>The Mirror of the Sea</em> offers, in Co... | 2173716 | 0192817299 | |
11 | 705 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 116 | {'text/html': 'http://www.gutenberg.org/files/... | 720.0 | ['en'] | {'Psychological fiction', 'Borneo -- Fiction',... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Almayer's Folly (Annotated): A Story of an Eas... | Joseph Conrad | Literature & Fiction | [<Element {http://webservices.amazon.com/AWSEC... | ["Almayer's Folly, published in 1895, is Josep... | 1898545 | 1534812083 | |
12 | 957 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 888 | {'text/html': 'http://www.gutenberg.org/files/... | 974.0 | ['en'] | {'Bombings -- Fiction', 'Anarchists -- Fiction... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Secret Agent (Wordsworth Classics) | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ["With an Introduction and Notes by Hugh Epste... | 49462 | 1853260657 | |
13 | 1036 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 35 | {'text/html': 'http://www.gutenberg.org/files/... | 1053.0 | ['en'] | {'Fiction', 'Short stories, English'} | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Within the Tides; Tales. [1916] | Joseph Conrad | World | [<Element {http://webservices.amazon.com/AWSEC... | ['Leopold is delighted to publish this classic... | None | ||
14 | 1038 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 61 | {'image/jpeg': 'http://www.gutenberg.org/cache... | 1055.0 | ['en'] | {'Indian Ocean -- Fiction', 'Ship captains -- ... | ... | https://images.gr-assets.com/books/1328250713m... | https://images.gr-assets.com/books/1328250713s... | 'Twixt Land & Sea: Tales--A Smile of Fortune, ... | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | [] | 2332388 | ||
15 | 1041 | {'G', 'PR'} | Conrad, Joseph | 1857 | 1924 | 89 | {'text/plain; charset=utf-8': 'http://www.gute... | 1058.0 | ['en'] | {'Conrad, Joseph, 1857-1924 -- Travel', 'Seafa... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | The Mirror of the Sea | Joseph Conrad | Travelers & Explorers | [<Element {http://webservices.amazon.com/AWSEC... | ['First published in 1906, The Mirror of the S... | 1042752 | 1533398992 | |
16 | 1066 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 50 | {'text/html': 'http://www.gutenberg.org/files/... | 1083.0 | ['en'] | {'Illegal arms transfers -- Fiction', 'Sea sto... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | The Arrow of Gold: A Story Between Two Notes (... | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ["<p>Reflecting Conrad's genius for narrative ... | 3198078 | 081221885X | |
17 | 1125 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 203 | {'application/x-mobipocket-ebook': 'http://www... | 1142.0 | ['en'] | {'Typhoons -- Fiction', 'Sea stories', 'Ship c... | ... | https://images.gr-assets.com/books/1312065477m... | https://images.gr-assets.com/books/1312065477s... | Typhoon and Other Stories (Everyman's Library) | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['Joseph Conrad’s long experience as a working... | 267063 | 067940547X | |
18 | 1126 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 86 | {'text/html': 'http://www.gutenberg.org/files/... | 1143.0 | ['en'] | {'Essays', 'Conrad, Joseph, 1857-1924', 'Liter... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Notes on Life & Letters | Joseph Conrad | Action & Adventure | [<Element {http://webservices.amazon.com/AWSEC... | ['One of the greatest English writers of the 1... | 2781548 | 1519643950 | |
19 | 1185 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 219 | {'text/plain': 'http://www.gutenberg.org/ebook... | 1202.0 | ['en'] | {'Short stories', 'Adventure stories, English'} | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Tales of Unrest (Annotated) | Joseph Conrad | Literature & Fiction | [<Element {http://webservices.amazon.com/AWSEC... | ['Tales of Unrest is a collection of short sto... | 2581462 | 1534836403 | |
20 | 1299 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 14 | {'text/html; charset=utf-8': 'http://www.guten... | 1316.0 | ['en'] | {'Authors, English -- 19th century -- Biograph... | ... | Some Reminiscences | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['As a general rule we do not want much encour... | 8711533 | 1514856158 | |||
21 | 1459 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 63 | {'text/html': 'http://www.gutenberg.org/files/... | 1476.0 | ['en'] | {'Love stories', 'Psychological fiction', 'You... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Chance: A Tale in Two Parts (Oxford World's Cl... | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['<em>Chance</em>(1914) was the first of Conra... | 1139414 | 019954977X | |
22 | 1694 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 38 | {'text/html; charset=utf-8': 'http://www.guten... | 1712.0 | ['en'] | {'Love stories', 'Malay Archipelago -- Fiction... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | The Rescue: A Romance of the Shallows | Joseph Conrad | Subjects | [<Element {http://webservices.amazon.com/AWSEC... | ['Shipped from UK, please allow 10 to 21 busin... | 3890418 | ||
23 | 2003 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 425 | {'text/html; charset=utf-8': 'http://www.guten... | 2021.0 | ['en'] | {'Latin America -- Fiction', 'Sea stories', 'S... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Nostromo: A Tale of the Seaboard (Classic Novels) | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['Nostromo A Tale of the Seaboard ... | None | 1494250985 | |
24 | 2263 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 79 | {'text/plain; charset=utf-8': 'http://www.gute... | 2305.0 | ['en'] | {'Short stories', 'Adventure and adventurers -... | ... | A set of six. By: Joseph Conrad: A Set of Six.... | Joseph Conrad | Literature & Fiction | [<Element {http://webservices.amazon.com/AWSEC... | ['A Set of Six. (collection of story): Gaspar ... | 6227962 | 1542786436 | |||
25 | 2438 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 157 | {'text/html; charset=utf-8': 'http://www.guten... | 2480.0 | ['en'] | {'Bombings -- Fiction', 'Russia -- History -- ... | ... | https://images.gr-assets.com/books/1328865695m... | https://images.gr-assets.com/books/1328865695s... | Under Western Eyes (Penguin Classics) | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['"It was I who removed de P- this morning." W... | 697577 | 0141441941 | |
26 | 5572 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 617 | {'text/plain; charset=utf-8': 'http://www.gute... | 5658.0 | ['en'] | {'Indonesia -- Fiction', 'British -- Indonesia... | ... | https://images.gr-assets.com/books/1372366969m... | https://images.gr-assets.com/books/1372366969s... | Lord Jim (Jovian Press) | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ["Haunted by the memory of a moment of lost ne... | 622612 | 1547129255 | |
27 | 6287 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 100 | {'text/html; charset=utf-8': 'http://www.guten... | 6378.0 | ['en'] | {'Indonesia -- Fiction', 'Psychological fictio... | ... | https://images.gr-assets.com/books/1497951498m... | https://images.gr-assets.com/books/1497951498s... | Victory: An Island Tale (Penguin Classics) | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ["<b>One of the most powerful and psychologica... | 298625 | 0241189659 | |
28 | 8576 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 16 | {'text/html; charset=utf-8': 'http://www.guten... | 8736.0 | ['en'] | {'Chile -- History -- War of Independence, 181... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | A set of six. By: Joseph Conrad: A Set of Six.... | Joseph Conrad | Literature & Fiction | [<Element {http://webservices.amazon.com/AWSEC... | ['A Set of Six. (collection of story): Gaspar ... | 6235465 | 1542786436 | |
29 | 14192 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 86 | {'text/html': 'http://www.gutenberg.org/ebooks... | 14888.0 | ['en'] | {'Science fiction'} | ... | The Inheritors | Joseph Conrad | Historical | [<Element {http://webservices.amazon.com/AWSEC... | ['The Inheritors is a quasi-science fiction no... | 1967267 | 1481901990 | |||
30 | 16922 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 25 | {'image/jpeg': 'http://www.gutenberg.org/cache... | 17620.0 | ['en'] | {'Historical fiction'} | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | The Point of Honor; A Military Tale | Joseph Conrad | Subjects | [<Element {http://webservices.amazon.com/AWSEC... | ['Leopold is delighted to publish this classic... | None | ||
31 | 16923 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 38 | {'application/x-mobipocket-ebook': 'http://www... | 17621.0 | ['en'] | {'England -- Drama'} | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | One Day More: A Play in One Act (Classic One A... | Joseph Conrad | Dramas & Plays | [<Element {http://webservices.amazon.com/AWSEC... | ['One Day More, A Play in One Act, By... | 2689878 | 1494978474 | |
32 | 16944 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 40 | {'text/html; charset=iso-8859-1': 'http://www.... | 17642.0 | ['en'] | {'Fiction'} | ... | Romance: By Joseph Conrad - Illustrated | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['<h2>How is this book unique?</h2> <ol><li>F... | 717210 | 1521135223 | |||
33 | 17033 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 243 | {'text/plain; charset=utf-8': 'http://www.gute... | 17731.0 | ['en'] | {'West Indians -- England -- Fiction', 'Psycho... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | The Nigger of the Narcissus, a Tale of the For... | Mr Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['James Wait is a dying West Indian black sail... | 1544594 | 1532720351 | |
34 | 17034 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 37 | {'text/html; charset=us-ascii': 'http://www.gu... | 17732.0 | ['en'] | {'Short stories'} | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Tales Of Hearsay | Joseph Conrad | Literary | [<Element {http://webservices.amazon.com/AWSEC... | ['Tales Of Hearsay'] | 2908590 | 1522927182 | |
35 | 19351 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 10 | {'application/rdf+xml': 'http://www.gutenberg.... | 20150.0 | ['en'] | {'Conrad, Joseph, 1857-1924 -- Criticism and i... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Notes on My Books | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['One of the greatest English writers of the 1... | 7219191 | 1519644507 | |
36 | 22474 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 20 | {'text/plain; charset=us-ascii': 'http://www.g... | 23506.0 | ['en'] | {'Love stories', 'Psychological fiction', 'You... | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | Chance: A Tale in Two Parts (Oxford World's Cl... | Joseph Conrad | Classics | [<Element {http://webservices.amazon.com/AWSEC... | ['<em>Chance</em>(1914) was the first of Conra... | 1172889 | 019954977X | |
37 | 27921 | {'PR'} | Conrad, Joseph | 1857 | 1924 | 37 | {'text/plain; charset=us-ascii': 'http://www.g... | 29156.0 | ['en'] | {'Indexes'} | ... | https://s.gr-assets.com/assets/nophoto/book/11... | https://s.gr-assets.com/assets/nophoto/book/50... | The Project Gutenberg Works of Joseph Conrad A... | None | World | [<Element {http://webservices.amazon.com/AWSEC... | ['Unlike some other reproductions of classic t... | 8673322 | 1318920051 | |
38 | 45578 | set() | Conrad, Joseph | 1857 | 1924 | 1 | {'text/html': 'http://www.gutenberg.org/ebooks... | 46819.0 | ['fi'] | set() | ... | Voimaihminen: Vaiheita Chilen vapaussodasta (F... | Joseph Conrad | War | [<Element {http://webservices.amazon.com/AWSEC... | ['Joseph Conrad on puolalaista sukujuurta, syn... | None | ||||
39 | 45855 | set() | Conrad, Joseph | 1857 | 1924 | 3 | {'text/html': 'http://www.gutenberg.org/ebooks... | 47096.0 | ['fi'] | set() | ... | Tuulentupia (Finnish Edition) | Joseph Conrad | Literary | [<Element {http://webservices.amazon.com/AWSEC... | ['Tuulentupia'] | None | 1522927336 |
40 rows × 62 columns
conradCSV = pd.read_csv('conrad.csv')
conradCSV
title | wp_pubDate | |
---|---|---|
0 | Almayer's Folly | 1895 |
1 | An Outcast of the Islands | 1896 |
2 | The Nigger of the Narcissus | 1897 |
3 | Heart of Darkness | 1899 |
4 | Lord Jim | 1900 |
5 | Typhoon | 1902 |
6 | The End of the Tether | 1902 |
7 | Nostromo | 1904 |
8 | The Secret Agent | 1907 |
9 | Under Western Eyes | 1911 |
10 | Chance | 1913 |
11 | Victory | 1915 |
12 | The Shadow Line | 1917 |
13 | The Arrow of Gold | 1919 |
14 | The Rescue | 1920 |
15 | The Rover | 1923 |
16 | Suspense: A Napoleonic Novel | 1925 |
def conradLookup(title, metaTable=None):
    """Return the ``id`` of the record in ``conradMeta`` matching *title*.

    *title* is searched for as a literal substring of the ``title`` column.
    When several records match, one whose title contains 'Complete' is
    preferred; otherwise the first match is used.

    Parameters
    ----------
    title : str
        Title (or leading part of a title) to search for.
    metaTable : pandas.DataFrame, optional
        Table with ``title`` and ``id`` columns.  Defaults to the
        module-level ``conradMeta``.

    Returns
    -------
    The ``id`` value of the chosen record, or ``None`` if nothing matches.
    """
    if metaTable is None:
        metaTable = conradMeta

    # regex=False: titles are plain text, so characters such as '?' or '('
    # must not be read as a pattern.  na=False: rows without a title simply
    # do not match instead of poisoning the boolean mask.
    matches = metaTable[
        metaTable.title.str.contains(title, regex=False, na=False)
    ]
    if len(matches) == 0:
        return None

    if len(matches) > 1:
        # Build the mask from `matches` itself so that it lines up with the
        # rows being filtered; masking with a Series from the full table
        # makes pandas warn "Boolean Series key will be reindexed".
        complete = matches[
            matches.title.str.contains('Complete', regex=False, na=False)
        ]
        if len(complete) > 0:
            matches = complete

    # Always settle on exactly one record so a scalar is returned.
    metaID = matches.id.iloc[0]
    metaTitle = matches.title.iloc[0]
    if hasattr(metaID, 'item'):
        # Unwrap numpy scalars into plain Python values.
        metaID = metaID.item()

    print(title, '---', metaID, metaTitle, '\n\n')
    return metaID
# Look up an id for every listed title; conradLookup returns None for titles
# it cannot find, and prints a trace line for each one it does find.
conradCSV['id'] = conradCSV.title.apply(conradLookup)
Almayer's Folly --- 11 720.0 Name: id, dtype: object 11 Almayer's Folly: A Story of an Eastern River Name: title, dtype: object An Outcast of the Islands --- 9 638.0 Name: id, dtype: object 9 An Outcast of the Islands Name: title, dtype: object Heart of Darkness --- 219.0 Heart of Darkness Lord Jim --- 26 5658.0 Name: id, dtype: object 26 Lord Jim Name: title, dtype: object Typhoon --- 17 1142.0 Name: id, dtype: object 17 Typhoon Name: title, dtype: object The End of the Tether --- 8 527.0 Name: id, dtype: object 8 The End of the Tether Name: title, dtype: object Nostromo --- 23 2021.0 Name: id, dtype: object 23 Nostromo: A Tale of the Seaboard Name: title, dtype: object The Secret Agent --- 12 974.0 Name: id, dtype: object 12 The Secret Agent: A Simple Tale Name: title, dtype: object Under Western Eyes --- 25 2480.0 Name: id, dtype: object 25 Under Western Eyes Name: title, dtype: object Chance --- 1476.0 Chance: A Tale in Two Parts Victory --- 27 6378.0 Name: id, dtype: object 27 Victory: An Island Tale Name: title, dtype: object The Shadow Line --- 2 451.0 Name: id, dtype: object 2 The Shadow Line: A Confession Name: title, dtype: object The Arrow of Gold --- 16 1083.0 Name: id, dtype: object 16 The Arrow of Gold: A Story Between Two Notes Name: title, dtype: object The Rescue --- 22 1712.0 Name: id, dtype: object 22 The Rescue: A Romance of the Shallows Name: title, dtype: object
/home/jon/.local/lib/python3.6/site-packages/ipykernel_launcher.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index. after removing the cwd from sys.path.
# Display the table with the new `id` column.
conradCSV
title | wp_pubDate | id | |
---|---|---|---|
0 | Almayer's Folly | 1895 | 720.0 |
1 | An Outcast of the Islands | 1896 | 638.0 |
2 | The Nigger of the Narcissus | 1897 | None |
3 | Heart of Darkness | 1899 | 219.0 |
4 | Lord Jim | 1900 | 5658.0 |
5 | Typhoon | 1902 | 1142.0 |
6 | The End of the Tether | 1902 | 527.0 |
7 | Nostromo | 1904 | 2021.0 |
8 | The Secret Agent | 1907 | 974.0 |
9 | Under Western Eyes | 1911 | 2480.0 |
10 | Chance | 1913 | 1476.0 |
11 | Victory | 1915 | 6378.0 |
12 | The Shadow Line | 1917 | 451.0 |
13 | The Arrow of Gold | 1919 | 1083.0 |
14 | The Rescue | 1920 | 1712.0 |
15 | The Rover | 1923 | None |
16 | Suspense: A Napoleonic Novel | 1925 | None |
# Check whether any catalogue title mentions 'Suspense' at all.
conradMeta[conradMeta.title.str.contains('Suspense')].id
Series([], Name: id, dtype: object)
# Fill in by hand the id that the title lookup could not find for row 2.
# DataFrame.set_value was deprecated and then removed in pandas 1.0;
# `.at` is the supported scalar setter.
conradCSV.at[2, 'id'] = 17731.
# `.at` assignment returns nothing, so display the frame explicitly.
conradCSV
title | wp_pubDate | id | |
---|---|---|---|
0 | Almayer's Folly | 1895 | 720.0 |
1 | An Outcast of the Islands | 1896 | 638.0 |
2 | The Nigger of the Narcissus | 1897 | 17731 |
3 | Heart of Darkness | 1899 | 219.0 |
4 | Lord Jim | 1900 | 5658.0 |
5 | Typhoon | 1902 | 1142.0 |
6 | The End of the Tether | 1902 | 527.0 |
7 | Nostromo | 1904 | 2021.0 |
8 | The Secret Agent | 1907 | 974.0 |
9 | Under Western Eyes | 1911 | 2480.0 |
10 | Chance | 1913 | 1476.0 |
11 | Victory | 1915 | 6378.0 |
12 | The Shadow Line | 1917 | 451.0 |
13 | The Arrow of Gold | 1919 | 1083.0 |
14 | The Rescue | 1920 | 1712.0 |
15 | The Rover | 1923 | None |
16 | Suspense: A Napoleonic Novel | 1925 | None |
# Fetch the text for each id; getText returns None when its query finds no row.
conradCSV['text'] = conradCSV.id.apply(getText)
conradCSV
title | wp_pubDate | id | text | |
---|---|---|---|---|
0 | Almayer's Folly | 1895 | 720.0 | \n\n\n\n\nTranscribed from the 1915 T. Fisher ... |
1 | An Outcast of the Islands | 1896 | 638.0 | \n\n\n\n\nAN OUTCAST OF THE ISLANDS\n\nby Jose... |
2 | The Nigger of the Narcissus | 1897 | 17731 | \n\n\n\n\nTHE NIGGER of THE NARCISSUS\n\nA TAL... |
3 | Heart of Darkness | 1899 | 219.0 | \n\n\n\n\nHEART OF DARKNESS\n\nBy Joseph Conra... |
4 | Lord Jim | 1900 | 5658.0 | \n\n\n\n\nLORD JIM\n\nBY JOSEPH CONRAD\n\n\n\n... |
5 | Typhoon | 1902 | 1142.0 | \n\n\n\n\n[The other stories included in this ... |
6 | The End of the Tether | 1902 | 527.0 | \n\n\n\n\nTHE END OF THE TETHER\n\n\nBy Joseph... |
7 | Nostromo | 1904 | 2021.0 | \n\n\n\n\nNOSTROMO\n\nA TALE OF THE SEABOARD\n... |
8 | The Secret Agent | 1907 | 974.0 | \n\nTranscribed from the 1907 Methuen & Co edi... |
9 | Under Western Eyes | 1911 | 2480.0 | \n\n\n\n\nUNDER WESTERN EYES\n\nby JOSEPH CONR... |
10 | Chance | 1913 | 1476.0 | \n\n\n\n\nTranscribed form the 1914 Methuen & ... |
11 | Victory | 1915 | 6378.0 | \n\n\n\n\nVICTORY: AN ISLAND TALE\n\nBy Joseph... |
12 | The Shadow Line | 1917 | 451.0 | \n\n\n\n\nTHE SHADOW-LINE\n\nA CONFESSION\n\nB... |
13 | The Arrow of Gold | 1919 | 1083.0 | \n\nTranscribed from the 1921 T. Fisher Unwin ... |
14 | The Rescue | 1920 | 1712.0 | \n\n\n\n\nTHE RESCUE\n\nA ROMANCE OF THE SHALL... |
15 | The Rover | 1923 | None | None |
16 | Suspense: A Napoleonic Novel | 1925 | None | None |
# .loc slicing is label-inclusive, so this passes rows 0-14 and leaves out
# rows 15 and 16, which have no text.
# NOTE(review): writeFiles is defined outside this section — presumably it
# writes one file per row; confirm.
writeFiles(conradCSV.loc[:14], 'conrad')
authors[5]
'Cather, Willa'
# Fetch the `meta` rows for authors[5] and list their titles.
catherMeta = getMeta(authors[5])
catherMeta.title
0 O Pioneers! 1 The Song of the Lark 2 Alexander's Bridge 3 My Antonia 4 The Troll Garden, and Selected Stories 5 One of Ours 6 Youth and the Bright Medusa 7 My Ántonia 8 A Collection of Stories, Reviews and Essays Name: title, dtype: object
def getAusMeta(author, database="sqlite:////home/jon/Code/gitenberg-scrape/pgaus.db"):
    """Return every row of the ``pgaus`` table whose author equals *author*.

    Parameters
    ----------
    author : str
        Author name exactly as stored in the table's author column.
    database : str or DB connection, optional
        Anything ``pandas.read_sql`` accepts as ``con``.  Defaults to the
        pgaus SQLite file used by this notebook.

    Returns
    -------
    pandas.DataFrame
        The matching rows (empty if there are none).
    """
    # Bind the author as a query parameter instead of formatting it into the
    # SQL text, so names containing quote characters cannot break the query.
    query = 'select * from pgaus where author=?'
    return pd.read_sql(query, database, params=(author,))
# Load the pgaus rows for this author (note the "First Last" name form here).
catherAusMeta = getAusMeta('Willa Cather')
# NOTE(review): the recorded output below shows the pgaus columns
# (DateAdded, IDa, txtURL, ...), so this cell probably displayed
# catherAusMeta when it was last run — confirm which frame is intended.
catherMeta
index | DateAdded | Title and Author | IDa | IDb | URLOther | URLs | htmlURL | Title | txtURL | Author | Notes | Subtitle | ShortTitle | AuthorAlt | Text | HTML | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 904 | Jul 2013 | The Enchanted Bluff, Willa Cather | [130437xx.xxx] | 2894A | http://gutenberg.net.au/ebooks13/1304371h.html | http://gutenberg.net.au/ebooks13/1304371h.html | http://gutenberg.net.au/ebooks13/1304371h.html | The Enchanted Bluff | None | Willa Cather | None | None | None | None | None | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 T... |
1 | 2432 | Nov 2006 | The Professor's House, Willa Cather | [060849xx.xxx] | 1367A | http://gutenberg.net.au/ebooks06/0608491.txt o... | http://gutenberg.net.au/ebooks06/0608491.txt o... | None | The Professor's House | http://gutenberg.net.au/ebooks06/0608491.txt | Willa Cather | None | None | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... | None |
2 | 3283 | May 2005 | Not Under Forty, by Willa Cather | [050044xx.xxx] | 0439A | http://gutenberg.net.au/ebooks05/0500441.txt o... | http://gutenberg.net.au/ebooks05/0500441.txt o... | http://gutenberg.net.au/ebooks05/0500441h.html | Not Under Forty | http://gutenberg.net.au/ebooks05/0500441.txt | Willa Cather | None | None | None | None | <table width="45%" border ="0">\r\n<tr>\r\n... | None |
3 | 3295 | Apr 2005 | My Mortal Enemy, by Willa Cather | [050032xx.xxx] | 0427A | http://gutenberg.net.au/ebooks05/0500321.txt o... | http://gutenberg.net.au/ebooks05/0500321.txt o... | http://gutenberg.net.au/ebooks05/0500321h.html | My Mortal Enemy | http://gutenberg.net.au/ebooks05/0500321.txt | Willa Cather | None | None | None | None | <table width="45%" border ="0">\r\n<tr>\r\n... | None |
4 | 3575 | Dec 2002 | Obscure Destinies, by Willa Cather | [020113xx.xxx] | 0147A | http://gutenberg.net.au/ebooks02/0201131.txt o... | http://gutenberg.net.au/ebooks02/0201131.txt o... | None | Obscure Destinies | http://gutenberg.net.au/ebooks02/0201131.txt | Willa Cather | None | None | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... | None |
5 | 3605 | Nov 2002 | The Professor's House, by Willa Cather | [020083xx.xxx] | 0117A | http://gutenberg.net.au/ebooks02/0200831.txt o... | http://gutenberg.net.au/ebooks02/0200831.txt o... | None | The Professor's House | http://gutenberg.net.au/ebooks02/0200831.txt | Willa Cather | None | None | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... | None |
6 | 3612 | Oct 2002 | Shadows on the Rock, by Willa Cather | [020076xx.xxx] | 0110A | http://gutenberg.net.au/ebooks02/0200761.txt o... | http://gutenberg.net.au/ebooks02/0200761.txt o... | None | Shadows on the Rock | http://gutenberg.net.au/ebooks02/0200761.txt | Willa Cather | None | None | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... | None |
7 | 3639 | Jul 2002 | Death Comes for the Archbishop, by Willa Cather | [020049xx.xxx] | 0083A | http://gutenberg.net.au/ebooks02/0200491.txt o... | http://gutenberg.net.au/ebooks02/0200491.txt o... | http://gutenberg.net.au/ebooks02/0200491h.html | Death Comes for the Archbishop | http://gutenberg.net.au/ebooks02/0200491.txt | Willa Cather | None | None | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... | None |
8 | 3640 | Jul 2002 | Lucy Gayheart, by Willa Cather | [020048xx.xxx] | 0082A | http://gutenberg.net.au/ebooks02/0200481.txt o... | http://gutenberg.net.au/ebooks02/0200481.txt o... | http://gutenberg.net.au/ebooks02/0200481h.html | Lucy Gayheart | http://gutenberg.net.au/ebooks02/0200481.txt | Willa Cather | None | None | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... | None |
9 | 3642 | Jul 2002 | Sapphira and the Slave Girl, Willa Cather | [020046xx.xxx] | 0080A | http://gutenberg.net.au/ebooks02/0200461.txt o... | http://gutenberg.net.au/ebooks02/0200461.txt o... | http://gutenberg.net.au/ebooks02/0200461h.html | Sapphira and the Slave Girl | http://gutenberg.net.au/ebooks02/0200461.txt | Willa Cather | None | None | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... | None |
10 | 3643 | Jul 2002 | A Lost Lady, by Willa Cather | [020045xx.xxx] | 0079A | http://gutenberg.net.au/ebooks02/0200451.txt o... | http://gutenberg.net.au/ebooks02/0200451.txt o... | http://gutenberg.net.au/ebooks02/0200451h.html | A Lost Lady | http://gutenberg.net.au/ebooks02/0200451.txt | Willa Cather | None | None | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... | None |
# Load the hand-made list of Cather titles; the frame shown below has
# `title` and `wp_pubDate` columns.
catherCSV = pd.read_csv('cather.csv')
catherCSV
title | wp_pubDate | |
---|---|---|
0 | Alexander's Bridge | 1912 |
1 | O Pioneers! | 1913 |
2 | The Song of the Lark | 1915 |
3 | My Ántonia | 1918 |
4 | One of Ours | 1922 |
5 | A Lost Lady | 1923 |
6 | The Professor's House | 1925 |
7 | My Mortal Enemy | 1926 |
8 | Death Comes for the Archbishop | 1927 |
9 | Shadows on the Rock | 1931 |
10 | Lucy Gayheart | 1935 |
11 | Sapphira and the Slave Girl | 1940 |
def catherLookup(title):
    """Return the rows of ``catherMeta`` whose ``Title`` equals *title* exactly."""
    exactMatch = catherMeta.Title == title
    return catherMeta[exactMatch]
# Try the exact-match lookup on one title (the recorded result has no rows).
catherLookup("Alexander's Bridge")
index | DateAdded | Title and Author | IDa | IDb | URLOther | URLs | htmlURL | Title | txtURL | Author | Notes | Subtitle | ShortTitle | AuthorAlt | Text | HTML |
---|
def catherLookup(title, metaTable=None):
    """Return the ``id`` of the record in ``catherMeta`` matching *title*.

    *title* is searched for as a literal substring of the ``title`` column.
    When several records match, one whose title contains 'Complete' is
    preferred; otherwise the first match is used.

    Parameters
    ----------
    title : str
        Title (or leading part of a title) to search for.
    metaTable : pandas.DataFrame, optional
        Table with ``title`` and ``id`` columns.  Defaults to the
        module-level ``catherMeta``.

    Returns
    -------
    The ``id`` value of the chosen record, or ``None`` if nothing matches.
    """
    if metaTable is None:
        metaTable = catherMeta

    # regex=False: titles such as 'O Pioneers!' are plain text, not patterns.
    # na=False: rows without a title simply do not match.
    matches = metaTable[
        metaTable.title.str.contains(title, regex=False, na=False)
    ]
    if len(matches) == 0:
        return None

    if len(matches) > 1:
        # The mask must come from `matches`, not the full table, so that it
        # is aligned with the rows being filtered.
        complete = matches[
            matches.title.str.contains('Complete', regex=False, na=False)
        ]
        if len(complete) > 0:
            matches = complete

    # Always settle on exactly one record so a scalar is returned.
    metaID = matches.id.iloc[0]
    metaTitle = matches.title.iloc[0]
    if hasattr(metaID, 'item'):
        # Unwrap numpy scalars into plain Python values.
        metaID = metaID.item()

    print(title, '---', metaID, metaTitle, '\n\n')
    return metaID
# Look up an id for every listed title; titles with no match get None.
catherCSV['pgID'] = catherCSV.title.apply(catherLookup)
Alexander's Bridge --- 2 94.0 Name: id, dtype: object 2 Alexander's Bridge Name: title, dtype: object O Pioneers! --- 0 24.0 Name: id, dtype: object 0 O Pioneers! Name: title, dtype: object The Song of the Lark --- 1 44.0 Name: id, dtype: object 1 The Song of the Lark Name: title, dtype: object My Ántonia --- 7 19810.0 Name: id, dtype: object 7 My Ántonia Name: title, dtype: object One of Ours --- 5 2369.0 Name: id, dtype: object 5 One of Ours Name: title, dtype: object
# Display the table with the new `pgID` column.
catherCSV
title | wp_pubDate | pgID | |
---|---|---|---|
0 | Alexander's Bridge | 1912 | 94.0 |
1 | O Pioneers! | 1913 | 24.0 |
2 | The Song of the Lark | 1915 | 44.0 |
3 | My Ántonia | 1918 | 19810.0 |
4 | One of Ours | 1922 | 2369.0 |
5 | A Lost Lady | 1923 | None |
6 | The Professor's House | 1925 | None |
7 | My Mortal Enemy | 1926 | None |
8 | Death Comes for the Archbishop | 1927 | None |
9 | Shadows on the Rock | 1931 | None |
10 | Lucy Gayheart | 1935 | None |
11 | Sapphira and the Slave Girl | 1940 | None |
# Fetch the text for each pgID; getText returns None when its query finds no row.
catherCSV['text'] = catherCSV.pgID.apply(getText)
def catherAusLookup(title, ausMeta=None):
    """Return the ``Text`` stored for the record whose ``Title`` equals *title*.

    Some titles are catalogued more than once; in that case the first record
    that actually carries text is used instead of giving up.

    Parameters
    ----------
    title : str
        Exact title to look for.
    ausMeta : pandas.DataFrame, optional
        Table with ``Title`` and ``Text`` columns.  Defaults to the
        module-level ``catherAusMeta``.

    Returns
    -------
    str or None
        The text, or ``None`` when no record matches or none has any text.
    """
    if ausMeta is None:
        ausMeta = catherAusMeta
    # dropna() discards records whose Text is missing, so a duplicate entry
    # without text cannot hide one that has it.
    texts = ausMeta.loc[ausMeta.Title == title, 'Text'].dropna()
    if texts.empty:
        return None
    return texts.iloc[0]
# Only rows 5 onward (those with no pgID above) are looked up in the pgaus
# table; rows 0-4 are absent from the result and so end up as NaN.
catherCSV['ausText'] = catherCSV[5:].title.apply(catherAusLookup)
catherCSV
title | wp_pubDate | pgID | text | ausText | |
---|---|---|---|---|---|
0 | Alexander's Bridge | 1912 | 94.0 | \n\n\n\n\nALEXANDER'S BRIDGE\n\nby Willa Cathe... | NaN |
1 | O Pioneers! | 1913 | 24.0 | \n\n\n\n\nO PIONEERS!\n\nby Willa Cather\n\n\n... | NaN |
2 | The Song of the Lark | 1915 | 44.0 | \n\n\n\n\nSONG OF THE LARK\n\nBy Willa Cather\... | NaN |
3 | My Ántonia | 1918 | 19810.0 | \n\n\n\n\nMy Antonia\n\nBy Willa Sibert Cather... | NaN |
4 | One of Ours | 1922 | 2369.0 | \n\n\n\n\nOne of Ours\n\nby Willa Cather\n\n\n... | NaN |
5 | A Lost Lady | 1923 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
6 | The Professor's House | 1925 | None | None | None |
7 | My Mortal Enemy | 1926 | None | None | <table width="45%" border ="0">\r\n<tr>\r\n... |
8 | Death Comes for the Archbishop | 1927 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
9 | Shadows on the Rock | 1931 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
10 | Lucy Gayheart | 1935 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
11 | Sapphira and the Slave Girl | 1940 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
# Fill in row 6 by hand from pgaus record 1.
# DataFrame.set_value was deprecated and then removed in pandas 1.0;
# `.at` is the supported scalar setter.
catherCSV.at[6, 'ausText'] = catherAusMeta.loc[1]['Text']
# `.at` assignment returns nothing, so display the frame explicitly.
catherCSV
title | wp_pubDate | pgID | text | ausText | |
---|---|---|---|---|---|
0 | Alexander's Bridge | 1912 | 94.0 | \n\n\n\n\nALEXANDER'S BRIDGE\n\nby Willa Cathe... | NaN |
1 | O Pioneers! | 1913 | 24.0 | \n\n\n\n\nO PIONEERS!\n\nby Willa Cather\n\n\n... | NaN |
2 | The Song of the Lark | 1915 | 44.0 | \n\n\n\n\nSONG OF THE LARK\n\nBy Willa Cather\... | NaN |
3 | My Ántonia | 1918 | 19810.0 | \n\n\n\n\nMy Antonia\n\nBy Willa Sibert Cather... | NaN |
4 | One of Ours | 1922 | 2369.0 | \n\n\n\n\nOne of Ours\n\nby Willa Cather\n\n\n... | NaN |
5 | A Lost Lady | 1923 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
6 | The Professor's House | 1925 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
7 | My Mortal Enemy | 1926 | None | None | <table width="45%" border ="0">\r\n<tr>\r\n... |
8 | Death Comes for the Archbishop | 1927 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
9 | Shadows on the Rock | 1931 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
10 | Lucy Gayheart | 1935 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
11 | Sapphira and the Slave Girl | 1940 | None | None | \r\n\r\n<table width="45%" border ="0">\r\n... |
# Rows 0-4 are the ones whose text came from getText.
# NOTE(review): writeFiles is defined outside this section — presumably it
# writes one file per row from the `text` column; confirm.
writeFiles(catherCSV[:5], 'cather')
def writeFilesAus(df, author):
    """Write each row's ``ausText`` to ``<author>/<author>-<year>-<slug>.txt``.

    The slug is the row's title with a leading article ('The ' or 'A ') and
    any subtitle after a colon removed, lower-cased, with spaces replaced by
    hyphens.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to write; must have ``title``, ``wp_pubDate`` and ``ausText``
        columns.
    author : str
        Used both as the output directory name and as the filename prefix.
    """
    import os  # local import so this block does not depend on file-level imports

    # Make sure the target directory exists before writing into it.
    os.makedirs(author, exist_ok=True)

    for _, row in df.iterrows():
        title = row['title']
        # Match the article together with its trailing space so that titles
        # merely beginning with those letters ("Alexander's Bridge") are
        # left alone, and strip exactly the matched length.
        for article in ('The ', 'A '):
            if title.startswith(article):
                title = title[len(article):]
                break
        # Drop any subtitle.
        title = title.split(':')[0].strip()
        # Build the slug from the cleaned title (not from row.title again,
        # which would throw the cleaning away).
        slug = title.lower().replace(' ', '-')
        filename = "%s/%s-%s-%s.txt" % (author, author, row['wp_pubDate'], slug)
        # Explicit encoding: the texts may contain non-ASCII characters.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(row['ausText'])
# Rows 5 onward are the ones whose text came from the pgaus table.
writeFilesAus(catherCSV[5:], 'cather')