%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas as pd
from matplotlib import pyplot as plt
import xml.etree.ElementTree as ET
# Trop (cantillation mark) names mapped to the single Unicode character that
# marks each one in the text.
#
# Per Wikipedia, Unicode mis-named two of these marks: zarka/tsinnor is
# U+05AE "Hebrew accent zinor" ("zinor" being a misspelled form of tsinnor),
# while tsinnorit is U+0598 "Hebrew accent zarqa". The keys below use the
# traditional names rather than the Unicode ones.
tropnames = {
    'etnakhta': u'\u0591',
    'segol': u'\u0592',
    'shalshelet': u'\u0593',
    'katan': u'\u0594',
    'gadol': u'\u0595',
    'tipkha': u'\u0596',
    'revii': u'\u0597',
    'tsinnorit': u'\u0598',
    'pashta': u'\u0599',
    'yetiv': u'\u059a',
    'tevir': u'\u059b',
    'geresh': u'\u059c',
    'gereshmukdam': u'\u059d',
    'gershayim': u'\u059e',
    'karnepara': u'\u059f',
    'telishagedola': u'\u05a0',
    'pazer': u'\u05a1',
    'munakh': u'\u05a3',
    'mapakh': u'\u05a4',
    'merkha': u'\u05a5',
    'merkhakfula': u'\u05a6',
    'darga': u'\u05a7',
    'kadma': u'\u05a8',
    'telishaketana': u'\u05a9',
    'yerakhbenyomo': u'\u05aa',
    'sofpasuk': u'\u05c3',
    'zarka': u'\u05ae',
}
# Count, for every pasuk of every sefer, how many words carry each trop.
#
# Each sefer is read from '<sefer>.xml'; chapters are the `c` elements, verses
# the `v` children of a chapter and words the `w` children of a verse. The
# result is counts[sefer][perek][pasuk] -> one row dict holding a counter per
# trop name plus 'sefer', 'perek', 'pasuk' and 'wordcount'.
sfarim = ['bereshit', 'shmot', 'vayikra', 'bmidbar', 'dvarim']
counts = {}
for sefer in sfarim:
    counts[sefer] = {}
    root = ET.parse(sefer + '.xml').getroot()
    for perek in root.findall('.//c'):
        pereknum = int(perek.attrib['n'])
        perekcounts = counts[sefer].setdefault(pereknum, {})
        for pasuk in perek.findall('v'):
            pasuknum = int(pasuk.attrib['n'])
            row = perekcounts.get(pasuknum)
            if row is None:
                # First time this pasuk is seen: start every trop at zero.
                row = dict.fromkeys(tropnames, 0)
                row['sefer'] = sefer
                row['pasuk'] = pasuknum
                row['perek'] = pereknum
                perekcounts[pasuknum] = row
            words = pasuk.findall('w')
            # Accumulate, so a pasuk number that shows up again adds to its row.
            row['wordcount'] = row.get('wordcount', 0) + len(words)
            for wordobj in words:
                word = wordobj.text
                for trop, mark in tropnames.items():
                    # Presence test only: the same trop appearing more than
                    # once on one word is counted a single time.
                    if mark in word:
                        row[trop] += 1
# Flatten the nested sefer -> perek -> pasuk dict into one row per pasuk and
# index the frame by (sefer, perek, pasuk), with the sfarim in canonical order.
flatcounts = [
    row
    for bysefer in counts.values()
    for byperek in bysefer.values()
    for row in byperek.values()
]
df = pd.DataFrame(flatcounts).set_index(['sefer', 'perek', 'pasuk'])
df = df.reindex(sfarim, level=0)
# For pretty plot labels: the row position at which each sefer starts (the
# first sefer is pinned to 0), and the display name to print there.
seferticklocs = [0] + [df.index.get_loc((sefer, 1, 1)) for sefer in sfarim[1:]]
seferlabels = ['Bereshit', 'Shemot', 'Vayikra', 'B\'midbar', 'D\'varim']
# Per-sefer aggregates. The bar charts label their bars positionally with
# seferlabels, so the rows are explicitly put in canonical sfarim order rather
# than relying on whatever order groupby happens to return.
sefergroup = df.groupby(level=['sefer'])
sefermeans = sefergroup.mean().reindex(sfarim)
sefersums = sefergroup.sum().reindex(sfarim)

# Per-perek aggregates (only used for a few exploratory histograms).
perekgroup = df.groupby(level=['sefer','perek'], sort=False)
# sort=False isn't working, so reindex into sfarim order; the dropna() is a
# workaround for https://github.com/pydata/pandas/issues/9344
perekmeans = perekgroup.mean().reindex(sfarim, level=0).dropna()
pereksums = perekgroup.sum().reindex(sfarim, level=0).dropna()
# Bar chart: average number of each telisha per pasuk, grouped by sefer.
telishameans = sefermeans[['telishaketana','telishagedola']]  # other candidates: 'shalshelet', 'merkhakfula'
ax = telishameans.plot(by='sefer', kind='bar', figsize=(9,6))
plt.xticks(range(5), seferlabels, fontsize='large', rotation=0)
ax.set_title('Telishas per pasuk by sefer', fontsize='x-large')
ax.set_xlabel('')
ax.set_ylabel('Average count per pasuk', fontsize='large')
# Reuse the plotted handles but give the legend readable names.
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, ['Telisha Ketana','Telisha Gedola'], loc='upper left', framealpha=0.9)
for outfile in ('telisha_bar.svg', 'telisha_bar.png'):
    plt.savefig(outfile, transparent=True)
# Bar chart: per-sefer ratio of telisha ketana to telisha gedola.
telisharatio = sefermeans['telishaketana'].div(sefermeans['telishagedola'])
telisharatio.plot(kind='bar', figsize=(9,6))
plt.xticks(range(5), seferlabels, fontsize='large', rotation=0)
plt.title('Ratio of telisha ketana to telisha gedola by sefer', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Telisha Ketana:Telisha Gedola', fontsize='large')
for outfile in ('telisha_ratios.svg', 'telisha_ratios.png'):
    plt.savefig(outfile, transparent=True)
# Smooth every per-pasuk column with a centered 500-pasuk moving average.
# DataFrame.rolling(...).mean() is used because the module-level
# pd.rolling_mean helper is no longer available in current pandas.
rolling = df.rolling(500, center=True).mean()

# Line plot of the two telishas across the whole Torah, with one x tick at
# the start of each sefer.
rolling.plot(y=['telishaketana', 'telishagedola'], figsize=(9,6))
plt.xticks(seferticklocs, seferlabels, fontsize='large')
plt.title('Trop occurrence across the Torah', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Average count per pasuk', fontsize='large')
plt.legend(['Telisha Ketana', 'Telisha Gedola'], loc='upper left', framealpha=0.9)
plt.savefig('telisha.svg', transparent=True)
plt.savefig('telisha.png', transparent=True)
# Line plot of the rolling averages of five common trop across the Torah,
# with one x tick at the start of each sefer.
rolling.plot(y=['munakh', 'katan', 'pashta', 'mapakh', 'revii'], figsize=(9,6))
plt.xticks(seferticklocs, seferlabels, fontsize='large')
plt.title('Trop occurrence across the Torah', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Average count per pasuk', fontsize='large')
plt.legend(['Munakh', 'Katan', 'Pashta', 'Mapakh', 'Revi\'i'], loc='upper left', framealpha=0.9)
plt.savefig('common_trop.svg', transparent=True)
plt.savefig('common_trop.png', transparent=True)
# Trying to make an area plot to see how all these compare to the total
# number of munakhs.
# NOTE: this experiment is unfinished: the munakh line is not overlaid, the
# default legend is kept, and the figure is deliberately not saved (saving
# as 'common_trop.svg' would overwrite the line-plot version).
rolling.plot(y=['katan', 'pashta', 'mapakh', 'revii'], kind='area', figsize=(9,6))
plt.xticks(seferticklocs, seferlabels, fontsize='large')
plt.title('Trop occurrence across the Torah', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Average count per pasuk', fontsize='large')
<matplotlib.text.Text at 0x108b07d10>
# Line plot of the rolling averages of zarka and segol across the Torah.
rolling.plot(y=['zarka', 'segol'], figsize=(9,6))
plt.xticks(seferticklocs, seferlabels, fontsize='large')
plt.title('Trop occurrence across the Torah', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Average count per pasuk', fontsize='large')
# The second series is segol; the legend previously mislabelled it 'Zegol'.
plt.legend(['Zarka', 'Segol'], loc='upper left', framealpha=0.9)
# Only the PNG is written for this figure (the SVG export was left disabled).
plt.savefig('zarkasegol.png', transparent=True)
# Line plot of the rolling averages of tevir and darga across the Torah.
rolling.plot(y=['tevir', 'darga'], figsize=(9,6))
plt.xticks(seferticklocs, seferlabels, fontsize='large')
plt.title('Trop occurrence across the Torah', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Average count per pasuk', fontsize='large')
plt.legend(['Tevir', 'Darga'], loc='upper left', framealpha=0.9)
plt.savefig('dargatevir.svg', transparent=True)
plt.savefig('dargatevir.png', transparent=True)
# Line plot of the rolling average of etnakhta alone; with a single series
# the legend is switched off.
rolling.plot(y=['etnakhta'], figsize=(9,6), legend=False)
plt.xticks(seferticklocs, seferlabels, fontsize='large')
plt.title('Etnakhta occurrence across the Torah', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Average count per pasuk', fontsize='large')
plt.savefig('etnakhta.svg', transparent=True)
plt.savefig('etnakhta.png', transparent=True)
# Rolling etnakhta count (left axis) against rolling word count (right axis).
ax = rolling[['etnakhta', 'wordcount']].plot(secondary_y=['wordcount'], figsize=(9,6), legend=False)
# Set the ticks on the primary axes object itself. plt.xticks acts on the
# current axes, and here it did not apply the font size to the visible tick
# labels.
# NOTE(review): presumably the current axes is the secondary (right) one
# after a secondary_y plot -- confirm visually.
ax.set_xticks(seferticklocs)
ax.set_xticklabels(seferlabels, fontsize='large')
ax.grid(axis='x')
ax.set_title('Etnakhta occurrence and word count across the Torah', fontsize='x-large')
ax.set_xlabel('')
ax.set_ylabel('Average trop count per pasuk', fontsize='large')
ax.right_ax.set_ylabel('Average word count per pasuk', fontsize='large')
# One legend covering the first line drawn on each of the two y axes.
plt.legend((ax.get_lines()[0], ax.right_ax.get_lines()[0]), ['Etnakhta', 'Word count (right)'], loc='lower left', framealpha=0.9)
plt.savefig('etnakhtawordcount.svg', transparent=True)
plt.savefig('etnakhtawordcount.png', transparent=True)
# Flag each pasuk: 1.0 if it has more than five words, 0.0 otherwise. The
# comparison is vectorised over the whole column; this replaces the two
# masked assignments through .ix, an indexer that current pandas no longer
# has. The column stays float, as before.
df['fivewords'] = (df['wordcount'] > 5).astype(float)

# Recompute the moving average so that it includes the new column
# (DataFrame.rolling replaces the removed pd.rolling_mean).
rolling = df.rolling(500, center=True).mean()
rolling.plot(y=['etnakhta', 'fivewords'], figsize=(9,6))
plt.xticks(seferticklocs, seferlabels, fontsize='large')
plt.title('Etnakhta occurrence and word count across the Torah', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Average count per pasuk', fontsize='large')
plt.legend(['Etnakhta', 'Psukim with > 5 words'], loc='lower left', framealpha=0.9)
plt.savefig('etnakhtafivewords.svg', transparent=True)
plt.savefig('etnakhtafivewords.png', transparent=True)
# Average trop counts for psukim of each length (index: words per pasuk).
# The helper 'fivewords' flag is dropped; axis is passed by keyword because
# recent pandas rejects it as a positional argument.
bywordcount = df.groupby('wordcount').mean().drop('fivewords', axis=1)
# Histogram of pasuk lengths (in words), in 20 bins.
histax = df['wordcount'].hist(bins=20)
histax.set_title('Wordcount distribution', fontsize='x-large')
histax.set_xlabel('Number of words', fontsize='large')
histax.set_ylabel('Number of psukim', fontsize='large')
for outfile in ('wordcountdist.svg', 'wordcountdist.png'):
    plt.savefig(outfile, transparent=True)
# export a csv to bring into a (still-in-progress) D3 graph to turn trop on and off
# NOTE(review): [1:32] is a plain row slice; presumably it is positional (i.e.
# it skips the first row and keeps the next 31) rather than selecting word
# counts 1..31 by label -- confirm which was intended.
# NOTE(review): the 'bywordcount/' directory presumably has to exist already.
bywordcount[1:32].to_csv('bywordcount/bywordcount.csv')
# Quick look at the average munakh count as a function of pasuk length.
bywordcount.plot(y='munakh', figsize=(9,6), xlim=(0,32))
<matplotlib.axes._subplots.AxesSubplot at 0x10ffa7590>
# Correlations between trop counts, both on the raw per-pasuk counts and on
# their 500-pasuk moving averages. The two word-count helper columns are
# excluded first.
# axis is passed by keyword and DataFrame.rolling is used because current
# pandas rejects a positional axis in drop() and no longer has pd.rolling_mean.
dfforcorrs = df.drop(['wordcount', 'fivewords'], axis=1)
dfforcorrs.corr().to_csv('rawcorr.csv')
rollingforcorrs = dfforcorrs.rolling(500, center=True).mean()
rollingforcorrs.corr().to_csv('rollingcorr.csv')
# Sanity check: list any psukim in Bmidbar with more than one etnakhta.
bmidbar = df.loc['bmidbar']
bmidbar.loc[bmidbar['etnakhta'] > 1, ['etnakhta']]
etnakhta | ||
---|---|---|
perek | pasuk |
# List the psukim in which no sof pasuk was counted.
# This seems to occur when there's a k'tiv/kri on the last word of a pasuk.
df.loc[df['sofpasuk'] == 0, ['sofpasuk']]
sofpasuk | |||
---|---|---|---|
sefer | perek | pasuk | |
bereshit | 27 | 3 | 0 |
43 | 28 | 0 | |
49 | 11 | 0 | |
shmot | 2 | 5 | 0 |
14 | 25 | 0 | |
29 | 0 | ||
20 | 3 | 0 | |
4 | 0 | ||
8 | 0 | ||
9 | 0 | ||
10 | 0 | ||
34 | 6 | 0 | |
37 | 8 | 0 | |
vayikra | 18 | 17 | 0 |
19 | 1 | 0 | |
26 | 7 | 0 | |
bmidbar | 7 | 32 | 0 |
40 | 0 | ||
55 | 0 | ||
68 | 0 | ||
25 | 19 | 0 | |
dvarim | 5 | 10 | 0 |
12 | 0 | ||
6 | 4 | 0 | |
9 | 20 | 0 | |
22 | 20 | 0 | |
25 | 9 | 0 |
# List every pasuk in which at least one tsinnorit was counted.
df.loc[df['tsinnorit'] > 0, ['tsinnorit']]
tsinnorit | |||
---|---|---|---|
sefer | perek | pasuk | |
bereshit | 17 | 20 | 1 |
47 | 29 | 1 | |
shmot | 6 | 6 | 1 |
30 | 12 | 1 | |
36 | 2 | 1 | |
vayikra | 4 | 2 | 1 |
20 | 2 | 1 | |
bmidbar | 20 | 19 | 1 |
dvarim | 14 | 24 | 1 |
31 | 7 | 1 |
# Quick look: distribution of words per pasuk with the default binning.
df['wordcount'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1097b2e10>
# Quick look: distribution of the per-perek mean munakh and pashta counts.
perekmeans[['munakh','pashta']].hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1089e6890>, <matplotlib.axes._subplots.AxesSubplot object at 0x1087e7e90>]], dtype=object)
# Quick look: distribution of the per-perek mean counts of the two telishas.
perekmeans[['telishaketana','telishagedola']].hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x108a6c7d0>, <matplotlib.axes._subplots.AxesSubplot object at 0x10978b1d0>]], dtype=object)
# Total number of words carrying a munakh in the whole Torah.
# This is a little off from https://en.wikipedia.org/wiki/Munach#Total_occurrences
df['munakh'].sum()
9451
# Dump the full per-pasuk table (including the index columns) to trop.csv.
df.to_csv('trop.csv')