Notebook

Comparison of the differences between the plain text of ETCBC versions 3 and 4¶

In [1]:

import sys
import collections
import re
import unicodedata

from IPython.display import clear_output, display, HTML

from laf.fabric import LafFabric
fabric3 = LafFabric()
fabric4 = LafFabric()

from etcbc.lib import Transcription
from etcbc.preprocess import prepare
tr = Transcription()

  0.00s This is LAF-Fabric 4.4.1
http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
  0.00s This is LAF-Fabric 4.4.1
http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html

In [2]:

API3 = fabric3.load('etcbc3', '--', 'monads', {
    "xmlids": {"node": False, "edge": False},
    "features": ('''
        otype monads text suffix
        graphical_word
        verse_label
    ''',''),
    "primary": False,
    "prepare": prepare,
})
API4 = fabric4.load('etcbc4', '--', 'monads', {
    "xmlids": {"node": False, "edge": False},
    "features": ('''
        otype monads g_word_utf8 trailer_utf8
        g_word
        label
    ''',''),
    "primary": False,
    "prepare": prepare,
})

  0.00s LOADING API: please wait ... 
  0.01s INFO: USING DATA COMPILED AT: 2014-06-27T12-21-04
  2.66s LOGFILE=/Users/dirk/laf-fabric-output/etcbc3/monads/__log__monads.txt
  3.18s INFO: DATA LOADED FROM SOURCE etcbc3 AND ANNOX -- FOR TASK monads AT 2014-07-16T09-47-51
  0.00s LOADING API: please wait ... 
  0.00s INFO: USING DATA COMPILED AT: 2014-07-14T16-45-08
  3.17s LOGFILE=/Users/dirk/laf-fabric-output/etcbc4/monads/__log__monads.txt
  3.88s INFO: DATA LOADED FROM SOURCE etcbc4 AND ANNOX -- FOR TASK monads AT 2014-07-16T09-47-55

In [3]:

splits = {
    (3, ' JES 19,18', 21): 2,
    (3, ' DAN 01,02', 11): 2,
    (3, ' DAN 01,05', 25): 3,
    (3, ' DAN 01,15', 2): 2,
    (3, ' DAN 01,18', 3): 2,
    (3, ' NEH 07,69', 2): 2,
    (3, ' ICHR27,12', 15): 0,
}

def monad_passage_index(API, vr, vlabel_name, graphical_name, text_name, trailer_name):
    mp_index = collections.OrderedDict()
    np_index = collections.OrderedDict()
    
    cur_label = None
    cur_wn = None
    cur_nodes = None
    NN = API['NN']
    F = API['F']
    msg = API['msg']
    chunk = 50000
    i = 0
    ci = 0
    for n in NN():
        otype = F.otype.v(n)
        if otype == 'verse':
            if cur_nodes != None: np_index[cur_label] = tuple(cur_nodes)
            cur_label = F.item[vlabel_name].v(n)
            cur_nodes = []
            cur_wn = 0
        elif otype == 'word':
            i += 1
            ci += 1
            if ci == chunk:
                ci = 0
                msg("{} words".format(i))
            m = F.monads.v(n)
            cur_nodes.append(n)
                        
            translit = F.item[graphical_name].v(n)
            text = (F.item[text_name].v(n) + F.item[trailer_name].v(n).replace('\n',''))
        
            if len(text) == 0:
                cur_wn += 1
                mp_index[(cur_label, cur_wn)] = (n, m, translit, '')
                continue
            start = 0
            while start < len(text):
                s_maqef = text.find('\u05BE', start)
                s_space = text.find(' ', start)
                s_min = min(s_maqef if s_maqef >= 0 else len(text) - 1, s_space if s_space >= 0 else len(text) - 1) + 1
                comp = text[start:s_min]
                start = s_min
                comps = comp
                while comps != None:
                    cur_wn += 1
                    spos = splits.get((vr, cur_label, cur_wn), None)
                    if spos != None:
                        do_comp = comp[0:spos]
                        comps = comp[spos:]
                    else:
                        do_comp = comp
                        comps = None
                    mp_index[(cur_label, cur_wn)] = (n, m, translit, do_comp)
    if cur_nodes != None: np_index[cur_label] = tuple(cur_nodes)
    msg("{} words".format(i))
    return (mp_index, np_index)

In [4]:

API3['msg']("Making index for ETCBC3")
(etcbc3, etcbc3n) = monad_passage_index(API3, 3, 'verse_label', 'graphical_word', 'text', 'suffix')
API3['msg']("Done")
API4['msg']("Making index for ETCBC4")
(etcbc4, etcbc4n) = monad_passage_index(API4, 4, 'label', 'g_word', 'g_word_utf8', 'trailer_utf8')
API4['msg']("Done")

    17s Making index for ETCBC3
    18s 50000 words
    19s 100000 words
    20s 150000 words
    21s 200000 words
    22s 250000 words
    23s 300000 words
    25s 350000 words
    26s 400000 words
    27s 426499 words
    27s Done
    23s Making index for ETCBC4
    24s 50000 words
    25s 100000 words
    26s 150000 words
    27s 200000 words
    28s 250000 words
    29s 300000 words
    30s 350000 words
    31s 400000 words
    32s 426555 words
    32s Done

In [6]:

show_case_init = '''<html><head><style type="text/css">
.casehd {border-top: 4pt solid black; text-align: center; font-size: 24pt; height: 28pt; font-weight: bold; color: #00B060;}
.heb {padding-right: 12pt; text-align: right; font-family: SBL Hebrew; font-size: 32pt; height: 40pt;}
.heb1 {padding-right: 12pt;}
.heb2 {text-align: right; font-family: SBL Hebrew; font-size: 32pt; line-height: 40pt}
.gw {padding-right: 12pt; text-align: right; font-weight: bold; font-size: 18pt; height: 20pt;}
.hdiv {background-color: yellow;}
body {margin-left: 2em; margin-right: 2em; margin-top: 2em; margin-bottom: 2em;}
td {padding: 2pt;}
th {padding: 6pt;}
    </style></head><body>
'''
show_case_table_init = '''<table rules="all" border="all">
'''
show_case_table_final = '''</table>
'''
show_case_final = '''</body></html>
'''

heb_css = '''
    text-align: right; 
    font-family: SBL Hebrew; 
    font-size: 32pt; 
    height: 40pt;}
'''
def hebuni(heb):
    display(HTML('''<p style="{}">&nbsp;{}</p>'''.format(heb_css, heb)))

def check_dagesh_accent_vowel_pos(c, a):
    d = '\u05BC'
    row_tpl = '<tr>' + ('<td class="heb">&nbsp;{}</td><td class="heb hdiv">&nbsp;{}</td>' * 6) + '</tr>'
    vowels = [chr(n) for n in range(0x5B0, 0x5BC)]
    def perms(v): return (
                c + d + v + a, 
                c + d + a + v,
                c + a + d + v, 
                c + v + d + a, 
                c + v + a + d, 
                c + a + v + d,
            )
    def row(v):
        r = []
        for x in perms(v):
            r.append(x)
            r.append(unicodedata.normalize('NFKC', x))
        return tuple(r)
    
    rows = [row_tpl.format(*row(v)) for v in vowels]
    t = show_case_init + show_case_table_init + '\n'.join(rows) + show_case_table_final + show_case_final
    display(HTML(t))

def check_accent_vowel_pos(c, a):
    row_tpl = '<tr>' + ('<td class="heb">&nbsp;{}</td><td class="heb hdiv">&nbsp;{}</td>' * 2) + '</tr>'
    vowels = [chr(n) for n in range(0x5B0, 0x5BC)]
    def perms(v): return (
                c + v + a, 
                c + a + v,
            )
    def row(v):
        r = []
        for x in perms(v):
            r.append(x)
            r.append(unicodedata.normalize('NFKC', x))
        return tuple(r)
    
    rows = [row_tpl.format(*row(v)) for v in vowels]
    t = show_case_init + show_case_table_init + '\n'.join(rows) + show_case_table_final + show_case_final
    display(HTML(t))


def show_range(passage, wnums):
    gw = ''
    heb = ''
    ms = []
    ns = []
    good = True
    for wnum in wnums:
        if (passage, wnum) not in etcbc4:
            print("No word {} in passage {} in etcbc4".format(wnum, passage))
            good = False
            break
        info = etcbc4[(passage, wnum)]
        ms.append(info[0])
        ns.append(info[1])
        gw += info[2]
        heb += info[3]
    if not good: return 'ERROR'
    lus = show_heb(heb)
    wnumsrep = ', '.join(str(wnum) for wnum in wnums)
    msrep = ', '.join(str(x) for x in ms)
    nsrep = ', '.join(str(x) for x in ns)

    t = ''
    t += '<tr><th colspan="3" class="casehd"><a name="case.{}.{}">{} words {}</a></th></tr>\n'.format(
        passage, wnumsrep, passage, wnumsrep,
    )
    t += '<tr><th colspan="3">ETCBC4</th></tr>\n'
    t += '<tr><td colspan="3"><p>monad {}, node {}</p></td></tr>\n'.format(msrep, nsrep)
    t += '<tr><td colspan="3" class="heb"><p>{}</p></td></tr>\n'.format(heb)
    t += '<tr><td colspan="3" class="gw"><p>{}</p></td></tr>\n'.format(
        gw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;'), 
    )
    for lu in lus:
        t += '<tr>{}</tr>\n'.format(lu)
    ht = '{}{}{}{}{}'.format(
        show_case_init, 
        show_case_table_init, 
        t,
        show_case_table_final,
        show_case_final,
    )
    display(HTML(ht))
    oht = API4['outfile']('{}-{}.html'.format(passage.strip(), wnumsrep))
    oht.write(ht)
    oht.close()
        
def show_heb(heb):
    return ['<td>{}</td><td>{:04X}</td><td>{}</td>'.format(
            tr.hebrew_mappingi[c] if c in tr.hebrew_mappingi else c, 
            ord(c), 
            unicodedata.name(c).replace('HEBREW ',''),
        ) for c in Transcription._decomp(unicodedata.normalize('NFKD', heb))]

def show_hstring(heb):
    t = '''<p class="heb2">{}</p><p>{}</p>'''.format(
        heb,
        '<br/>'.join('{:04X}={}'.format(ord(c), unicodedata.name(c).replace('HEBREW ','')) for c in heb),
    )
    display(HTML(t))
   
def _show_case(passage, wnum):
    g_n = {}
    g_m = {}
    g_w = {}
    g_h = {}
    l_u = {}
    good = True
    for v in (('3', etcbc3), ('4', etcbc4)):
        (vn, vindex) = v
        if (passage, wnum) not in vindex:
            print("No word {} in passage {} in etcbc{}".format(wnum, passage, vn))
            good = False
            continue
        (g_n[vn], g_m[vn], g_w[vn], g_h[vn]) = vindex[(passage, wnum)]
        l_u[vn] = show_heb(g_h[vn])
    if not good: return 'ERROR'
    lx = max(len(l_u['3']), len(l_u['4']))
    ln = min(len(l_u['3']), len(l_u['4']))

    t = ''
    t += '<tr><th colspan="6" class="casehd"><a name="case.{}.{}">{} word {}</a></th></tr>\n'.format(passage, wnum, passage, wnum)
    t += '<tr><th colspan="3">ETCBC3</th><th colspan="3">ETCBC4</th></tr>\n'
    t += '<tr><td colspan="3"><p>monad {}, node {}</p></td><td colspan="3"><p>monad {}, node {}</p></td></tr>\n'.format(
        g_m['3'], g_n['3'], g_m['4'], g_n['4'],
    )
    t += '<tr><td colspan="3" class="heb"><p>{}</p></td><td colspan="3" class="heb"><p>{}</p></td></tr>\n'.format(
        g_h['3'], g_h['4'],
    )
    t += '<tr><td colspan="3" class="gw"><p>{}</p></td><td colspan="3" class="gw"><p>{}</p></td></tr>\n'.format(
        g_w['3'].replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;'), 
        g_w['4'].replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;'),
    )
    for r in range(lx):
        t += '<tr{}>{}{}</tr>\n'.format(' class="hdiv"' if r >= ln or l_u['3'][r] != l_u['4'][r] else '',
            l_u['3'][r] if r < len(l_u['3']) else '<td colspan="3">&nbsp;</td>',
            l_u['4'][r] if r < len(l_u['4']) else '<td colspan="3">&nbsp;</td>',
        )
    return t

def show_case(passage, wnum):
    t = '{}{}{}{}{}'.format(
        show_case_init, 
        show_case_table_init, 
        _show_case(passage, wnum),
        show_case_table_final,
        show_case_final)
    display(HTML(t))

def write_cases(elist, oh):
    oh.write(show_case_table_init)
    for e in elist: oh.write(_show_case(*e))
    oh.write(show_case_table_final)
    

In [10]:

check_dagesh_accent_vowel_pos('\u05DE', '\u0596')
check_accent_vowel_pos('\u05DE', '\u0596')

מְּ֖	מְּ֖	מְּ֖	מְּ֖	מְּ֖	מְּ֖	מְּ֖	מְּ֖	מְּ֖	מְּ֖	מְּ֖	מְּ֖
מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖	מֱּ֖
מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖	מֲּ֖
מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖	מֳּ֖
מִּ֖	מִּ֖	מִּ֖	מִּ֖	מִּ֖	מִּ֖	מִּ֖	מִּ֖	מִּ֖	מִּ֖	מִּ֖	מִּ֖
מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖	מֵּ֖
מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖	מֶּ֖
מַּ֖	מַּ֖	מַּ֖	מַּ֖	מַּ֖	מַּ֖	מַּ֖	מַּ֖	מַּ֖	מַּ֖	מַּ֖	מַּ֖
מָּ֖	מָּ֖	מָּ֖	מָּ֖	מָּ֖	מָּ֖	מָּ֖	מָּ֖	מָּ֖	מָּ֖	מָּ֖	מָּ֖
מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖	מֹּ֖
מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖	מֺּ֖
מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖	מֻּ֖

מְ֖	מְ֖	מְ֖	מְ֖
מֱ֖	מֱ֖	מֱ֖	מֱ֖
מֲ֖	מֲ֖	מֲ֖	מֲ֖
מֳ֖	מֳ֖	מֳ֖	מֳ֖
מִ֖	מִ֖	מִ֖	מִ֖
מֵ֖	מֵ֖	מֵ֖	מֵ֖
מֶ֖	מֶ֖	מֶ֖	מֶ֖
מַ֖	מַ֖	מַ֖	מַ֖
מָ֖	מָ֖	מָ֖	מָ֖
מֹ֖	מֹ֖	מֹ֖	מֹ֖
מֺ֖	מֺ֖	מֺ֖	מֺ֖
מֻ֖	מֻ֖	מֻ֖	מֻ֖

In [11]:

hebuni('\u05C6\u0307')

׆̇

In [12]:

F = API4['F']
show_hstring('אַ֚ף')
show_hstring(F.g_word_utf8.v(1186))
show_hstring('ה֠וּא')
n = 76786
show_hstring(F.g_word_utf8.v(n) + F.trailer_utf8.v(n).replace('\n', ''))

אַ֚ף

05D0=LETTER ALEF
05B7=POINT PATAH
059A=ACCENT YETIV
05E3=LETTER FINAL PE

אַ֚ף

05D0=LETTER ALEF
05B7=POINT PATAH
059A=ACCENT YETIV
05E3=LETTER FINAL PE

ה֠וּא

05D4=LETTER HE
05A0=ACCENT TELISHA GEDOLA
05D5=LETTER VAV
05BC=POINT DAGESH OR MAPIQ
05D0=LETTER ALEF

יִשְׂרָאֵֽל׃ ׆̇ פ

05D9=LETTER YOD
05B4=POINT HIRIQ
FB2B=LETTER SHIN WITH SIN DOT
05B0=POINT SHEVA
05E8=LETTER RESH
05B8=POINT QAMATS
05D0=LETTER ALEF
05B5=POINT TSERE
05BD=POINT METEG
05DC=LETTER LAMED
05C3=PUNCTUATION SOF PASUQ
0020=SPACE
05C6=PUNCTUATION NUN HAFUKHA
0307=COMBINING DOT ABOVE
0020=SPACE
05E4=LETTER PE

In [13]:

show_case(' JES 19,18', 23)

JES 19,18 word 23
ETCBC3			ETCBC4
monad 219372, node 218383			monad 218425, node 218424
יֵאָמֵ֖ר			יֵאָמֵ֖ר
J;>@M;73R			J;>@M;73R
J	05D9	LETTER YOD	J	05D9	LETTER YOD
;	05B5	POINT TSERE	;	05B5	POINT TSERE
>	05D0	LETTER ALEF	>	05D0	LETTER ALEF
@	05B8	POINT QAMATS	@	05B8	POINT QAMATS
M	05DE	LETTER MEM	M	05DE	LETTER MEM
;	05B5	POINT TSERE	;	05B5	POINT TSERE
73	0596	ACCENT TIPEHA	73	0596	ACCENT TIPEHA
R	05E8	LETTER RESH	R	05E8	LETTER RESH
_	0020	SPACE	_	0020	SPACE

In [35]:

show_range(' GEN 01,03', range(6,9))

GEN 01,03 words 6, 7, 8
ETCBC4
monad 36, 37, 38, node 37, 38, 39
וַֽיְהִי־אֹֽור׃
WA75-J:HIJ&>O75WR00
W	05D5	LETTER VAV
A	05B7	POINT PATAH
35	05BD	POINT METEG
J	05D9	LETTER YOD
:	05B0	POINT SHEVA
H	05D4	LETTER HE
I	05B4	POINT HIRIQ
J	05D9	LETTER YOD
&	05BE	PUNCTUATION MAQAF
>	05D0	LETTER ALEF
O	05B9	POINT HOLAM
35	05BD	POINT METEG
W	05D5	LETTER VAV
R	05E8	LETTER RESH
00	05C3	PUNCTUATION SOF PASUQ

In [27]:

for i in range(6,9):
    show_case(' GEN 01,03', i)

GEN 01,03 word 6
ETCBC3			ETCBC4
monad 37, node 36			monad 37, node 36
וַֽ			וַֽ
WA75-			WA75-
W	05D5	LETTER VAV	W	05D5	LETTER VAV
A	05B7	POINT PATAH	A	05B7	POINT PATAH
35	05BD	POINT METEG	35	05BD	POINT METEG

GEN 01,03 word 7
ETCBC3			ETCBC4
monad 38, node 37			monad 38, node 37
יְהִי־			יְהִי־
J:HIJ&			J:HIJ&
J	05D9	LETTER YOD	J	05D9	LETTER YOD
:	05B0	POINT SHEVA	:	05B0	POINT SHEVA
H	05D4	LETTER HE	H	05D4	LETTER HE
I	05B4	POINT HIRIQ	I	05B4	POINT HIRIQ
J	05D9	LETTER YOD	J	05D9	LETTER YOD
&	05BE	PUNCTUATION MAQAF	&	05BE	PUNCTUATION MAQAF

GEN 01,03 word 8
ETCBC3			ETCBC4
monad 39, node 38			monad 39, node 38
אֹֽור׃			אֹֽור׃
>O75WR00			>O75WR00
>	05D0	LETTER ALEF	>	05D0	LETTER ALEF
O	05B9	POINT HOLAM	O	05B9	POINT HOLAM
35	05BD	POINT METEG	35	05BD	POINT METEG
W	05D5	LETTER VAV	W	05D5	LETTER VAV
R	05E8	LETTER RESH	R	05E8	LETTER RESH
00	05C3	PUNCTUATION SOF PASUQ	00	05C3	PUNCTUATION SOF PASUQ

In [14]:

missing_in_3 = collections.OrderedDict()
missing_in_4 = collections.OrderedDict()
different = collections.OrderedDict()

codes = collections.OrderedDict((
    ('q', 'qetiv/qere'),
    ('m', 'merecha -silluq-tifcha-mehuppach-nothing'),
    ('n', 'nun hafukha'),
    ('d', 'dagesh'),
    (None, 'NOT ANALYSED'),
))

for e in etcbc3:
    if e not in etcbc4: missing_in_4[e] = None
for e in etcbc4:
    if e not in etcbc3: missing_in_3[e] = None

for e in etcbc4:
    if e in etcbc3:
        un3 = Transcription._decomp(unicodedata.normalize('NFKD', etcbc3[e][3]))
        un4 = Transcription._decomp(unicodedata.normalize('NFKD', etcbc4[e][3]))
        if  un3 != un4:
            code = None
            if '\u05AF' in un4:
                code = 'q'
            elif un3.replace('\u05A5', '').replace('\u05BD', '').replace('\u0596', '').replace('\u05A4', '').replace('\u05BE', '') == un4.replace('\u05A5', '').replace('\u05BD', '').replace('\u0596', '').replace('\u05A4', '').replace('\u05BE', ''):
                code = 'm'
            elif '\u05C6' in un3 + un4:
                code = 'n'
            elif un3.replace('\u05BC', '') == un4.replace('\u05BC', ''):
                code = 'd'
            different[e] = code

print("Missing in etcbc3: {} words".format(len(missing_in_3)))
print("Missing in etcbc4: {} words".format(len(missing_in_4)))
print("Different in etcbc3 and etcbc4: {} words".format(len(different)))

for code in codes:
    print("{}x : {}".format(len([e for e in different if different[e] == code]), codes[code]))

Missing in etcbc3: 2 words
Missing in etcbc4: 1 words
Different in etcbc3 and etcbc4: 2020 words
1892x : qetiv/qere
52x : merecha -silluq-tifcha-mehuppach-nothing
9x : nun hafukha
11x : dagesh
56x : NOT ANALYSED

In [15]:

ohtest = API4['outfile']('test.html')
testcases = [(' EXO 32,17', i) for i in range(1,11)]
write_cases(testcases, ohtest)
ohtest.close()

In [16]:

print(missing_in_4)

OrderedDict([((' EZE 18,14', 19), None)])

In [17]:

ohf = API4['outfile']('comp.txt')
ohe = API4['outfile']('cases.html')
toc = ["<h1>Table of Contents</h1>\n"]
for code in reversed(codes):
    if code == 'q': continue
    desc = codes[code]
    toc.append('<p><a href="#cases.{}"><b>Cases of {}</b></a></p>\n'.format(code, desc))
    cases = list(e for e in different if different[e] == code)
    for (p,w) in cases:
        toc.append('<a href="#case.{}.{}">{}w{}</a>\n'.format(p, w, p.replace(' ',''), w))
ohe.write(show_case_init)
ohe.write(''.join(toc))
for code in reversed(codes):
    if code == 'q': continue
    desc = codes[code]
    cases = list(e for e in different if different[e] == code)
    ohe.write('<h1><a name="cases.{}">Cases of {}</a></h1>\n'.format(code, desc))
    write_cases(cases, ohe)
    ohf.write('# {}\n'.format(desc))
    ohf.write('\n'.join('{}\t{}\t{}'.format(e[0], e[1], 'x') for e in cases))
ohe.write(show_case_final)
ohf.close()
ohe.close()

In [18]:

def show_swap(p, wn, ws):
    F3 = API3['F']
    F4 = API4['F']
    v3n = etcbc3n[p]
    v4n = etcbc4n[p]
    v3text = ''.join('{}{}'.format(F3.text.v(n), F3.suffix.v(n)) for n in v3n)
    v4text = ''.join('{}{}'.format(F4.g_word_utf8.v(n), F4.trailer_utf8.v(n)) for n in v4n)
    d_tpl = '''
<tr>
    <td>{} word {} m={} n={}</td>
    <td class="gw">{}</td>
    <td class="heb">&nbsp;{}</td>
    <td class="gw">{}</td>
    <td class="heb">&nbsp;{}</td>
    <td class="heb">&nbsp;{}</td>
</tr>'''
    v_tpl = '<tr><td>{}</td><td class="heb1" colspan="5"><p class="heb2">&nbsp;{}</p></td></tr>'
    t = '{}{}{}{}{}'.format(
        show_case_init,
        show_case_table_init,
        d_tpl.format(p, wn, *ws),
        v_tpl.format('ETCBC3', v3text),
        v_tpl.format('ETCBC4', v4text),
        show_case_table_final,
        show_case_final,
    )
    display(HTML(t))

In [20]:

NN = API4['NN']
F = API4['F']
msg = API4['msg']

def find_swaps():
    diffs = collections.OrderedDict()
    cur_label = None
    cur_w = 0
    for n in NN():
        otype = F.otype.v(n)
        if otype == 'verse':
            cur_label = F.label.v(n)
            cur_w = 0
            continue
        elif otype == 'word':
            cur_w += 1
            translit = F.g_word.v(n)
            translit_sw = Transcription.swap_accent_pat.sub(Transcription._swap_accent, translit)
            if translit != translit_sw:
                diffs[(cur_label, cur_w)] = (
                    F.monads.v(n),
                    n,
                    translit, 
                    Transcription.to_hebrew_x(translit), 
                    translit_sw, 
                    Transcription.to_hebrew_x(translit_sw), 
                    F.g_word_utf8.v(n),
                )
    print(len(diffs))
    return diffs

swaps = find_swaps()

In [21]:

oh = API4['outfile']('swaps.txt')
i = 0
limit = 10
for (p, wn) in swaps:
    i += 1
    if limit != None and i > limit: break
    show_swap(p, wn, swaps[(p, wn)])
    info = swaps[(p, wn)]
    oh.write("={}\n#{}\n\n".format(info[0], info[2]))
oh.close()

GEN 01,11 word 8 m=186 n=185	10<;FEB	֚עֵשֶׂב	<;10FEB	עֵ֚שֶׂב	עֵ֚שֶׂב
ETCBC3	וַיֹּ֣אמֶר אֱלֹהִ֗ים תַּֽדְשֵׁ֤א הָאָ֨רֶץ֙ דֶּ֔שֶׁא עֵ֚שֶׂב מַזְרִ֣יעַ זֶ֔רַע עֵ֣ץ פְּרִ֞י עֹ֤שֶׂה פְּרִי֙ לְמִינֹ֔ו אֲשֶׁ֥ר זַרְעֹו־בֹ֖ו עַל־הָאָ֑רֶץ וַֽיְהִי־כֵֽן׃
ETCBC4	וַיֹּ֣אמֶר אֱלֹהִ֗ים תַּֽדְשֵׁ֤א הָאָ֨רֶץ֙ דֶּ֔שֶׁא עֵ֚שֶׂב מַזְרִ֣יעַ זֶ֔רַע עֵ֣ץ פְּרִ֞י עֹ֤שֶׂה פְּרִי֙ לְמִינֹ֔ו אֲשֶׁ֥ר זַרְעֹו־בֹ֖ו עַל־הָאָ֑רֶץ וַֽיְהִי־כֵֽן׃

GEN 01,12 word 5 m=208 n=207	14D.ECE>	֠דֶּשֶׁא	D.E14CE>	דֶּ֠שֶׁא	דֶּ֠שֶׁא
ETCBC3	וַתֹּוצֵ֨א הָאָ֜רֶץ דֶּ֠שֶׁא עֵ֣שֶׂב מַזְרִ֤יעַ זֶ֨רַע֙ לְמִינֵ֔הוּ וְעֵ֧ץ עֹֽשֶׂה־פְּרִ֛י אֲשֶׁ֥ר זַרְעֹו־בֹ֖ו לְמִינֵ֑הוּ וַיַּ֥רְא אֱלֹהִ֖ים כִּי־טֹֽוב׃
ETCBC4	וַתֹּוצֵ֨א הָאָ֜רֶץ דֶּ֠שֶׁא עֵ֣שֶׂב מַזְרִ֤יעַ זֶ֨רַע֙ לְמִינֵ֔הוּ וְעֵ֧ץ עֹ֥שֶׂה פְּרִ֛י אֲשֶׁ֥ר זַרְעֹו־בֹ֖ו לְמִינֵ֑הוּ וַיַּ֥רְא אֱלֹהִ֖ים כִּי־טֹֽוב׃

GEN 01,30 word 5 m=626 n=625	14H@-	֠הָ-	H@14-	הָ֠-	הָ֠
ETCBC3	וּֽלְכָל־חַיַּ֣ת הָ֠אָרֶץ וּלְכָל־עֹ֨וף הַשָּׁמַ֜יִם וּלְכֹ֣ל ׀ רֹומֵ֣שׂ עַל־הָאָ֗רֶץ אֲשֶׁר־בֹּו֙ נֶ֣פֶשׁ חַיָּ֔ה אֶת־כָּל־יֶ֥רֶק עֵ֖שֶׂב לְאָכְלָ֑ה וַֽיְהִי־כֵֽן׃
ETCBC4	וּֽלְכָל־חַיַּ֣ת הָ֠אָרֶץ וּלְכָל־עֹ֨וף הַשָּׁמַ֜יִם וּלְכֹ֣ל ׀ רֹומֵ֣שׂ עַל־הָאָ֗רֶץ אֲשֶׁר־בֹּו֙ נֶ֣פֶשׁ חַיָּ֔ה אֶת־כָּל־יֶ֥רֶק עֵ֖שֶׂב לְאָכְלָ֑ה וַֽיְהִי־כֵֽן׃

GEN 02,05 word 6 m=750 n=749	10VEREM	֚טֶרֶמ	VE10REM	טֶ֚רֶמ	טֶ֚רֶם
ETCBC3	וְכֹ֣ל ׀ שִׂ֣יחַ הַשָּׂדֶ֗ה טֶ֚רֶם יִֽהְיֶ֣ה בָאָ֔רֶץ וְכָל־עֵ֥שֶׂב הַשָּׂדֶ֖ה טֶ֣רֶם יִצְמָ֑ח כִּי֩ לֹ֨א הִמְטִ֜יר יְהוָ֤ה אֱלֹהִים֙ עַל־הָאָ֔רֶץ וְאָדָ֣ם אַ֔יִן לַֽעֲבֹ֖ד אֶת־הָֽאֲדָמָֽה׃
ETCBC4	וְכֹ֣ל ׀ שִׂ֣יחַ הַשָּׂדֶ֗ה טֶ֚רֶם יִֽהְיֶ֣ה בָאָ֔רֶץ וְכָל־עֵ֥שֶׂב הַשָּׂדֶ֖ה טֶ֣רֶם יִצְמָ֑ח כִּי֩ לֹ֨א הִמְטִ֜יר יְהוָ֤ה אֱלֹהִים֙ עַל־הָאָ֔רֶץ וְאָדָ֣ם אַ֔יִן לַֽעֲבֹ֖ד אֶת־הָֽאֲדָמָֽה׃

GEN 02,11 word 8 m=889 n=888	10>;T	֚אֵת	>;10T	אֵ֚ת	אֵ֚ת
ETCBC3	שֵׁ֥ם הָֽאֶחָ֖ד פִּישֹׁ֑ון ה֣וּא הַסֹּבֵ֗ב אֵ֚ת כָּל־אֶ֣רֶץ הַֽחֲוִילָ֔ה אֲשֶׁר־שָׁ֖ם הַזָּהָֽב׃
ETCBC4	שֵׁ֥ם הָֽאֶחָ֖ד פִּישֹׁ֑ון ה֣וּא הַסֹּבֵ֗ב אֵ֚ת כָּל־אֶ֣רֶץ הַֽחֲוִילָ֔ה אֲשֶׁר־שָׁ֖ם הַזָּהָֽב׃

GEN 02,23 word 8 m=1123 n=1122	10	֚עֶצֶמ	עֶ֚צֶמ	עֶ֚צֶם
ETCBC3	וַיֹּאמֶר֮ הָֽאָדָם֒ זֹ֣את הַפַּ֗עַם עֶ֚צֶם מֵֽעֲצָמַ֔י וּבָשָׂ֖ר מִבְּשָׂרִ֑י לְזֹאת֙ יִקָּרֵ֣א אִשָּׁ֔ה כִּ֥י מֵאִ֖ישׁ לֻֽקֳחָה־זֹּֽאת׃
ETCBC4	וַיֹּאמֶר֮ הָֽאָדָם֒ זֹ֣את הַפַּ֗עַם עֶ֚צֶם מֵֽעֲצָמַ֔י וּבָשָׂ֖ר מִבְּשָׂרִ֑י לְזֹאת֙ יִקָּרֵ֣א אִשָּׁ֔ה כִּ֥י מֵאִ֖ישׁ לֻֽקֳחָה־זֹּֽאת׃

GEN 03,01 word 20 m=1187 n=1186	10>AP	֚אַפ	>A10P	אַ֚פ	אַ֚ף
ETCBC3	וְהַנָּחָשׁ֙ הָיָ֣ה עָר֔וּם מִכֹּל֙ חַיַּ֣ת הַשָּׂדֶ֔ה אֲשֶׁ֥ר עָשָׂ֖ה יְהוָ֣ה אֱלֹהִ֑ים וַיֹּ֨אמֶר֙ אֶל־הָ֣אִשָּׁ֔ה אַ֚ף כִּֽי־אָמַ֣ר אֱלֹהִ֔ים לֹ֣א תֹֽאכְל֔וּ מִכֹּ֖ל עֵ֥ץ הַגָּֽן׃
ETCBC4	וְהַנָּחָשׁ֙ הָיָ֣ה עָר֔וּם מִכֹּל֙ חַיַּ֣ת הַשָּׂדֶ֔ה אֲשֶׁ֥ר עָשָׂ֖ה יְהוָ֣ה אֱלֹהִ֑ים וַיֹּ֨אמֶר֙ אֶל־הָ֣אִשָּׁ֔ה אַ֚ף כִּֽי־אָמַ֣ר אֱלֹהִ֔ים לֹ֣א תֹֽאכְל֔וּ מִכֹּ֖ל עֵ֥ץ הַגָּֽן׃

GEN 03,05 word 1 m=1242 n=1241	10K.IJ	֚כִּי	K.I10J	כִּ֚י	כִּ֚י
ETCBC3	כִּ֚י יֹדֵ֣עַ אֱלֹהִ֔ים כִּ֗י בְּיֹום֙ אֲכָלְכֶ֣ם מִמֶּ֔נּוּ וְנִפְקְח֖וּ עֵֽינֵיכֶ֑ם וִהְיִיתֶם֙ כֵּֽאלֹהִ֔ים יֹדְעֵ֖י טֹ֥וב וָרָֽע׃
ETCBC4	כִּ֚י יֹדֵ֣עַ אֱלֹהִ֔ים כִּ֗י בְּיֹום֙ אֲכָלְכֶ֣ם מִמֶּ֔נּוּ וְנִפְקְח֖וּ עֵֽינֵיכֶ֑ם וִהְיִיתֶם֙ כֵּֽאלֹהִ֔ים יֹדְעֵ֖י טֹ֥וב וָרָֽע׃

GEN 03,11 word 3 m=1372 n=1371	10MIJ	֚מִי	MI10J	מִ֚י	מִ֚י
ETCBC3	וַיֹּ֕אמֶר מִ֚י הִגִּ֣יד לְךָ֔ כִּ֥י עֵירֹ֖ם אָ֑תָּה הֲמִן־הָעֵ֗ץ אֲשֶׁ֧ר צִוִּיתִ֛יךָ לְבִלְתִּ֥י אֲכָל־מִמֶּ֖נּוּ אָכָֽלְתָּ׃
ETCBC4	וַיֹּ֕אמֶר מִ֚י הִגִּ֣יד לְךָ֔ כִּ֥י עֵירֹ֖ם אָ֑תָּה הֲמִן־הָעֵ֗ץ אֲשֶׁ֧ר צִוִּיתִ֛יךָ לְבִלְתִּ֥י אֲכָל־מִמֶּ֖נּוּ אָכָֽלְתָּ׃

In [36]:

print(Transcription.to_hebrew_x('M.A33JIm03'))

מַּ֨יִם֙

In [138]:

import collections
from IPython.display import clear_output, display, HTML
from etcbc.lib import Transcription

def htrans(tr):
    return Transcription.to_hebrew_x(tr)

def outfile(name):
    h = open('/Users/dirk/Downloads/{}'.format(name), 'w')
    h.write('''<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<table rules="all" border="all">
''')
    return h

font = dict(
sil='''
font-family: Ezra SIL;
font-size: 20pt;
line-height:28pt;
margin-right:0.5em;
direction:rtl;
unicode-bidi:bidi-override;
text-align: right;
''',
sbl='''
font-family: SBL Hebrew;
font-size: 24pt;
line-height:28pt;
margin-right:0.5em;
direction:rtl;
unicode-bidi:bidi-override;
text-align: right;
''')

hfile = outfile('hebtest.html')

def hebnormal(heb):
    #print("{}\n".format(''.join(trans[c] for c in heb)))
    comps = heb.split(' ')
    plain = ''
    spanned = ''
    first = True
    sep = ''
    for comp in comps:
        plain += sep + ''.join(htrans(word) for word in comp.split('-'))
        spanned += sep + ''.join('<span tr="{}">'.format(word) + htrans(word) + '</span>' for word in comp.split('-'))
        if sep == '': sep = ' '
                
    for f in sorted(font):
        for text in (spanned, plain):
            para = '''<p style="{}">{}</p>'''.format(font[f], text)
            display(HTML(para))
            hfile.write(para + '\n')

klegenda = ('adapted', 'spanned', 'plain')
kcolor = (('#ffddbb','#ffeecc'), ('#ffbbbb','#ffcccc'), ('#bbffbb','#ccffcc'))
plegenda = ('x y', 'x-y', 'xy')

not_to_be_adapted = {
    '&', '.', '.c', '.f', '00', '01', '05', 'O',
}
to_be_adapted = {
    '*', ',', '02', '03', '04', '10', '11', '13', '14',
    '24', '33', '35', '44', '52', '53', '60', '61', '62',
    '63', '64', '65', '70', '71', '72', '73', '74', '75',
    '80', '81', '82', '83', '84', '85', '91', '92', '93',
    '94', '95', 
    ':', ':@', ':A', ':E', ';', '@', 'A', 'E', 'I', 'U',
}

# 02, 03, 04, 10, 13, 24, 84: 
# sbl goes wrong in firefox: eats space in x y plain and adapted

# 14, 44:
# even after adaptation still very tight

# @, A:
# In SBL: heth discards more after-space than he

cnotadapt = 0
cadapt = 0
cremaining = 0
cskip = 0
cdone = ''
first = True
for x in sorted(Transcription.hebrew_mapping):
    if (x.isalpha() and x not in {'A', 'E', 'I', 'O', 'U'}) or x in {'<', '>', '#'}: 
        cskip +=1
        continue
    if x in {'55', '56', '57', '_'}:
        cskip += 1
        continue
    if x in not_to_be_adapted: 
        cnotadapt +=1
        continue
    if x in to_be_adapted:
        cadapt +=1
        continue
    if not first:
        cremaining += 1
        continue
    data = collections.defaultdict(lambda: collections.defaultdict(lambda: []))
    for cons in ('>', 'H', 'X', '<', 'W', '#'):
        for (p, pat) in enumerate(('{} {}', '{}-{}', '{}{}')):
            heb = pat.format(cons + x, 'B')
            comps = heb.split(' ')
            plain = ''
            spanned = ''
            aspanned = ''
            first = True
            sep = ''
            for comp in comps:
                plain += sep + ''.join(htrans(word) for word in comp.split('-'))
                spanned += sep + ''.join('<span>'.format(word) + htrans(word) + '</span>' for word in comp.split('-'))
                aspanned += sep + ''.join('<span>'.format(word) + htrans(word) + '&nbsp;</span>' for word in comp.split('-'))
                if sep == '': sep = ' '
            if p != 1:
                data[2][p].append((heb, plain))
            data[1][p].append((heb, spanned))
            data[0][p].append((heb, aspanned))
    for k in sorted(data):
        for p in sorted(data[k]):
            for (heb, text) in data[k][p]:
                for (f, fnt) in enumerate(sorted(font)):
                    para = '''
<tr style="font-family: Menlo; font-size: 12pt; background-color: {};">
<td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td>
<td style="{}">{}</td>
</tr>
        '''.format(
                        kcolor[k][f], 
                        x.replace('&', '&amp;'), klegenda[k], plegenda[p], 
                        heb.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;'), 
                        fnt,
                        font[fnt],
                        text,
        )
                    hfile.write(para)
    cdone = x
    first = False
print('''
Skipped           = {:>3}
To be adapted     = {:>3}
Not to be adapted = {:>3}
Done              = '{}'
Remaining         = {:>3}
'''.format(cskip, cadapt, cnotadapt, cdone, cremaining)
)    


examples = (
    'HA-M.A33JIm03 >:ACER03',
    'XA M.A33JIm03 >:ACER03',
    'XA-M.A33JIm03 >:ACER03',
    'XAM.A33JIm03 >:ACER03',
    'XA B',
    '<A B',
    'HA B.:',
    'W. B',
    'B.:-R',
)

        
'''
for x in sorted(Transcription.hebrew_mapping):
    if (x.isalpha() and x not in {'A', 'E', 'I', 'O', 'U'}) or x in {'<', '>', '#'}: continue
    hebadapted(x, 'HA-M.A33JIm{} >:ACER03'.format(x))
        
for e in examples:
    hebnormal(e)
'''

hfile.write('''
</table>
</body>
</html>
''')
hfile.close()

Skipped           =  34
To be adapted     =  48
Not to be adapted =   8
Done              = ''
Remaining         =   0

In [ ]:

GEN 03,15 word 15 m=1470 n=1469	10HW.>	֚הוּא	H10W.>	ה֚וּא	ה֚וּא
ETCBC3	וְאֵיבָ֣ה ׀ אָשִׁ֗ית בֵּֽינְךָ֙ וּבֵ֣ין הָֽאִשָּׁ֔ה וּבֵ֥ין זַרְעֲךָ֖ וּבֵ֣ין זַרְעָ֑הּ ה֚וּא יְשׁוּפְךָ֣ רֹ֔אשׁ וְאַתָּ֖ה תְּשׁוּפֶ֥נּוּ עָקֵֽב׃ ס
ETCBC4	וְאֵיבָ֣ה ׀ אָשִׁ֗ית בֵּֽינְךָ֙ וּבֵ֣ין הָֽאִשָּׁ֔ה וּבֵ֥ין זַרְעֲךָ֖ וּבֵ֣ין זַרְעָ֑הּ ה֚וּא יְשׁוּפְךָ֣ רֹ֔אשׁ וְאַתָּ֖ה תְּשׁוּפֶ֥נּוּ עָקֵֽב׃ ס