This is a script to download the plain text of the Septuagint from Sacred Texts
import os,sys,re,collections
from lxml import html
import requests
# Build an ordered mapping of book title -> book index URL by scraping the
# links on the Septuagint index page.
base_url = 'http://sacred-texts.com/bib/sep'
top_url = '{}/index.htm'.format(base_url)
page = requests.get(top_url)
# Fail loudly on an HTTP error instead of scraping links out of an error page.
page.raise_for_status()
tree = html.fromstring(page.content)
books = collections.OrderedDict()
# Links that precede the one labelled 'Genesis' are skipped; every link from
# 'Genesis' onwards is recorded as a book.
start = False
for x in tree.iter('a'):
    # Concatenate the text of the anchor and of every element nested in it.
    link_text = ''.join(y.text or '' for y in x.iter())
    if not start:
        if link_text != 'Genesis':
            continue
        start = True
    link = x.get('href')
    if link is None:
        # An anchor without an href has no page to fetch; storing it would
        # produce a URL ending in '/None'.
        continue
    books[link_text] = '{}/{}'.format(base_url, link)
print(', '.join(books))
Genesis, Exodus, Leviticus, Numbers, Deuteronomy, Joshua B, Joshua A, Judges B, Judges A, Ruth, 1 Samuel, 2 Samuel, 1 Kings, 2 Kings, 1 Chronicles, 2 Chronicles, 1 Esdras, 2 Esdras, Esther, Judith, Tobit BA, Tobit S, 1 Macabees, 2 Macabees, 3 Macabees, 4 Macabees, Psalms, Odes, Proverbs, Ecclesiastes, Song of Solomon, Job, Wisdom, Sirach, Psalms of Solomon, Hosea, Micah, Amos, Joel, Jonah, Obadiah, Nahum, Habakkuk, Zephaniah, Haggai, Zechariah, Malachi, Isaiah, Jeremiah, Baruch, Epistle of Jeremiah, Lamentations, Ezekiel, Bel and the Dragon, Bel and the Dragon Th, Daniel, Daniel Th, Susanna, Susanna Th
# Maps book title -> {chapter number: chapter URL}; filled in by getchapters().
chapters = collections.defaultdict(dict)


def getchapters(book):
    """Record the chapter URLs of ``book`` in the module-level ``chapters``.

    Fetches the book's index page (``books[book]``), looks at every link
    inside a ``<p>`` element, and for each link whose text starts with
    "<book> Chapter <number>" stores the link URL under that chapter number.
    Afterwards prints the highest chapter number found (0 when no link
    matched).

    Args:
        book: Book title; must be a key of the module-level ``books`` mapping.

    Raises:
        KeyError: If ``book`` is not a key of ``books``.
        requests.HTTPError: If the book index page returns an error status.
    """
    page = requests.get(books[book])
    page.raise_for_status()
    tree = html.fromstring(page.content)
    # Escape the title so it is matched literally even if it contains regex
    # metacharacters.
    chfilter = re.compile(re.escape(book) + ' Chapter ([0-9]+)')
    for p in tree.iter('p'):
        for x in p.iter('a'):
            link_text = ''.join(y.text or '' for y in x.iter())
            match = chfilter.match(link_text)
            if match:
                chnum = int(match.group(1))
                link = x.get('href')
                chapters[book][chnum] = '{}/{}'.format(base_url, link)
    # The figure printed is the highest chapter number, not a count of the
    # chapters found. default=0 avoids a ValueError when nothing matched.
    print('{}: {} chapters'.format(book, max(chapters[book], default=0)))
# Populate the chapter index for every book collected from the index page.
for book in books:
    getchapters(book)
Genesis: 50 chapters Exodus: 40 chapters Leviticus: 27 chapters Numbers: 36 chapters Deuteronomy: 34 chapters Joshua B: 24 chapters Joshua A: 19 chapters Judges B: 21 chapters Judges A: 21 chapters Ruth: 4 chapters 1 Samuel: 31 chapters 2 Samuel: 24 chapters 1 Kings: 22 chapters 2 Kings: 25 chapters 1 Chronicles: 29 chapters 2 Chronicles: 36 chapters 1 Esdras: 9 chapters 2 Esdras: 23 chapters Esther: 10 chapters Judith: 16 chapters Tobit BA: 14 chapters Tobit S: 14 chapters 1 Macabees: 16 chapters 2 Macabees: 15 chapters 3 Macabees: 7 chapters 4 Macabees: 18 chapters Psalms: 151 chapters Odes: 14 chapters Proverbs: 36 chapters Ecclesiastes: 12 chapters Song of Solomon: 8 chapters Job: 42 chapters Wisdom: 19 chapters Sirach: 51 chapters Psalms of Solomon: 18 chapters Hosea: 14 chapters Micah: 7 chapters Amos: 9 chapters Joel: 4 chapters Jonah: 4 chapters Obadiah: 1 chapters Nahum: 3 chapters Habakkuk: 3 chapters Zephaniah: 3 chapters Haggai: 2 chapters Zechariah: 14 chapters Malachi: 3 chapters Isaiah: 66 chapters Jeremiah: 52 chapters Baruch: 5 chapters Epistle of Jeremiah: 1 chapters Lamentations: 5 chapters Ezekiel: 48 chapters Bel and the Dragon: 1 chapters Bel and the Dragon Th: 1 chapters Daniel: 12 chapters Daniel Th: 12 chapters Susanna: 1 chapters Susanna Th: 1 chapters
def getchapter(book, chapter):
    """Download one chapter and return its text as a list of strings.

    Args:
        book: Book title; a key of the module-level ``chapters`` mapping.
        chapter: Chapter number; a key of ``chapters[book]``.

    Returns:
        A list whose first item is a header made of the book title and the
        chapter number surrounded by newlines, followed by the text content
        of each ``<p>`` element of the chapter page, in document order.

    Raises:
        KeyError: If no URL is recorded for ``book`` / ``chapter``.
        requests.HTTPError: If the chapter page returns an error status.
    """
    url = chapters[book][chapter]
    page = requests.get(url)
    page.raise_for_status()
    # page.content is the raw response bytes, so assigning page.encoding has
    # no effect on how they are parsed. Hand the encoding to the parser
    # instead so the page is always decoded as UTF-8.
    parser = html.HTMLParser(encoding='utf-8')
    tree = html.fromstring(page.content, parser=parser)
    chtext = ['\n{} {}\n'.format(book, chapter)]
    chtext.extend(p.text_content() for p in tree.iter('p'))
    return chtext
# Write every chapter of every book to septuagint.txt, printing the book name
# and then one progress dot per chapter on stdout.
# The file is opened as UTF-8 explicitly so the result does not depend on the
# platform's default encoding, and `with` closes it even if a download raises.
with open('septuagint.txt', 'w', encoding='utf-8') as sf:
    for book in books:
        sys.stdout.write('writing {} '.format(book))
        sys.stdout.flush()
        # Chapters are written in the order their links were recorded.
        for chapter in chapters[book]:
            sys.stdout.write('.')
            sys.stdout.flush()
            sf.write('\n'.join(getchapter(book, chapter)))
        sys.stdout.write('\n')
        sys.stdout.flush()
writing Genesis .................................................. writing Exodus ........................................ writing Leviticus ........................... writing Numbers .................................... writing Deuteronomy .................................. writing Joshua B ........................ writing Joshua A ... writing Judges B ..................... writing Judges A ..................... writing Ruth .... writing 1 Samuel ............................... writing 2 Samuel ........................ writing 1 Kings ...................... writing 2 Kings ......................... writing 1 Chronicles ............................. writing 2 Chronicles .................................... writing 1 Esdras ......... writing 2 Esdras ....................... writing Esther .......... writing Judith ................ writing Tobit BA .............. writing Tobit S .............. writing 1 Macabees ................ writing 2 Macabees ............... writing 3 Macabees ....... writing 4 Macabees .................. writing Psalms ....................................................................................................................................................... writing Odes .............. writing Proverbs ............................... writing Ecclesiastes ............ writing Song of Solomon ........ writing Job .......................................... writing Wisdom ................... writing Sirach .................................................... writing Psalms of Solomon .................. writing Hosea .............. writing Micah ....... writing Amos ......... writing Joel .... writing Jonah .... writing Obadiah . writing Nahum ... writing Habakkuk ... writing Zephaniah ... writing Haggai .. writing Zechariah .............. writing Malachi ... writing Isaiah .................................................................. writing Jeremiah .................................................... writing Baruch ..... 
writing Epistle of Jeremiah . writing Lamentations ...... writing Ezekiel ................................................ writing Bel and the Dragon . writing Bel and the Dragon Th . writing Daniel ............ writing Daniel Th ............ writing Susanna . writing Susanna Th .