After all these checks we can proceed to print out the tree structures as plain, bracketed text strings.
For each tree we also print a string of the slot numbers that you get when you walk the tree in pre-order, and we produce node numbers from Text-Fabric.
Then we construct a more information-rich file with all trees.
# Write one plain, bracketed tree per root node to a text file.
# Every record consists of a header line (passage, root node, the bSlot value
# returned by tree.writeTree, and the words representation), followed by the
# bracketed tree itself.
TF.info("Writing {} trees".format(rootType))
treeFile = "{}/trees-BHSA.txt".format(OUTPUTDIR)
# The trees contain Hebrew text, so the encoding is fixed explicitly instead of
# relying on the platform default, which cannot represent Hebrew everywhere.
with open(treeFile, "w", encoding="utf8") as trees:
    s = 0  # number of trees written so far
    chunk = 10000  # report progress after every `chunk` trees
    for node in F.otype.s(rootType):
        # nodes in `skip` get no tree
        if node in skip:
            continue
        (treeRep, wordsRep, bSlot) = tree.writeTree(
            node, "r", getTag, rev=False, leafNumbers=False
        )
        trees.write(
            "\n#{}\tnode={}\tbSlot={}\t{}\n{}\n".format(
                "{} {}:{}".format(*T.sectionFromNode(node)),
                node,
                bSlot,
                wordsRep,
                treeRep,
            )
        )
        s += 1
        if s % chunk == 0:
            TF.info("{} trees written".format(s))
# Report the total only after the file has been closed.
TF.info("{} trees written to {}".format(s, treeFile))
22m 46s Writing sentence trees 22m 48s 10000 trees written 22m 49s 20000 trees written 22m 50s 30000 trees written 22m 51s 40000 trees written 22m 52s 50000 trees written 22m 53s 60000 trees written 22m 54s 63711 trees written to _temp/2017/trees-BHSA.txt
!head -n 25 {treeFile}
#Genesis 1:1 node=1172209 bSlot=1 0 1 2 3 4 5 6 7 8 9 10 (S(C(PP(pp "בְּ")(n "רֵאשִׁ֖ית"))(VP(vb "בָּרָ֣א"))(NP(n "אֱלֹהִ֑ים"))(PP(U(pp "אֵ֥ת")(dt "הַ")(n "שָּׁמַ֖יִם"))(cj "וְ")(U(pp "אֵ֥ת")(dt "הָ")(n "אָֽרֶץ"))))) #Genesis 1:2 node=1172210 bSlot=12 0 1 2 3 4 5 6 (S(C(CP(cj "וְ"))(NP(dt "הָ")(n "אָ֗רֶץ"))(VP(vb "הָיְתָ֥ה"))(NP(U(n "תֹ֨הוּ֙"))(cj "וָ")(U(n "בֹ֔הוּ"))))) #Genesis 1:2 node=1172211 bSlot=19 0 1 2 3 4 (S(C(CP(cj "וְ"))(NP(n "חֹ֖שֶׁךְ"))(PP(pp "עַל")(U(n "פְּנֵ֣י"))(U(n "תְהֹ֑ום"))))) #Genesis 1:2 node=1172212 bSlot=24 0 1 2 3 4 5 6 7 (S(C(CP(cj "וְ"))(NP(U(n "ר֣וּחַ"))(U(n "אֱלֹהִ֔ים")))(VP(vb "מְרַחֶ֖פֶת"))(PP(pp "עַל")(U(n "פְּנֵ֥י"))(U(dt "הַ")(n "מָּֽיִם"))))) #Genesis 1:3 node=1172213 bSlot=32 0 1 2 (S(C(CP(cj "וַ"))(VP(vb "יֹּ֥אמֶר"))(NP(n "אֱלֹהִ֖ים")))) #Genesis 1:3 node=1172214 bSlot=35 0 1 (S(C(VP(vb "יְהִ֣י"))(NP(n "אֹ֑ור")))) #Genesis 1:3 node=1172215 bSlot=37 0 1 2 (S(C(CP(cj "וַֽ"))(VP(vb "יְהִי"))(NP(n "אֹֽור")))) #Genesis 1:4 node=1172216 bSlot=40 0 1 2 3 4 5 6 7 (S(C(CP(cj "וַ"))(VP(vb "יַּ֧רְא"))(NP(n "אֱלֹהִ֛ים"))(PP(pp "אֶת")(dt "הָ")(n "אֹ֖ור"))(Cobjc(CP(cj "כִּי"))(VP(vb "טֹ֑וב")))))
# Write the same trees again to a second file, this time with getTagN as the
# tag function. Every record consists of a header line (passage, root node, the
# bSlot value returned by tree.writeTree, and the words representation),
# followed by the bracketed tree itself.
TF.info("Writing {} trees".format(rootType))
treeFile = "{}/trees-BHSA-nodes.txt".format(OUTPUTDIR)
# The trees contain Hebrew text, so the encoding is fixed explicitly instead of
# relying on the platform default, which cannot represent Hebrew everywhere.
with open(treeFile, "w", encoding="utf8") as trees:
    s = 0  # number of trees written so far
    chunk = 10000  # report progress after every `chunk` trees
    for node in F.otype.s(rootType):
        # nodes in `skip` get no tree
        if node in skip:
            continue
        (treeRep, wordsRep, bSlot) = tree.writeTree(
            node, "r", getTagN, rev=False, leafNumbers=False
        )
        trees.write(
            "\n#{}\tnode={}\tbSlot={}\t{}\n{}\n".format(
                "{} {}:{}".format(*T.sectionFromNode(node)),
                node,
                bSlot,
                wordsRep,
                treeRep,
            )
        )
        s += 1
        if s % chunk == 0:
            TF.info("{} trees written".format(s))
# Report the total only after the file has been closed.
TF.info("{} trees written to {}".format(s, treeFile))
23m 52s Writing sentence trees 23m 53s 10000 trees written 23m 55s 20000 trees written 23m 56s 30000 trees written 23m 57s 40000 trees written 23m 58s 50000 trees written 23m 59s 60000 trees written 24m 00s 63711 trees written to _temp/2017/trees-BHSA-nodes.txt
Here are the first lines of the output.
!head -n 25 {treeFile}
#Genesis 1:1 node=1172209 bSlot=1 0 1 2 3 4 5 6 7 8 9 10 (S{1172209}(C{427553}(PP{651503}(pp "בְּ")(n "רֵאשִׁ֖ית"))(VP{651504}(vb "בָּרָ֣א"))(NP{651505}(n "אֱלֹהִ֑ים"))(PP{651506}(U{1300406}(pp "אֵ֥ת")(dt "הַ")(n "שָּׁמַ֖יִם"))(cj "וְ")(U{1300407}(pp "אֵ֥ת")(dt "הָ")(n "אָֽרֶץ"))))) #Genesis 1:2 node=1172210 bSlot=12 0 1 2 3 4 5 6 (S{1172210}(C{427554}(CP{651507}(cj "וְ"))(NP{651508}(dt "הָ")(n "אָ֗רֶץ"))(VP{651509}(vb "הָיְתָ֥ה"))(NP{651510}(U{1300408}(n "תֹ֨הוּ֙"))(cj "וָ")(U{1300409}(n "בֹ֔הוּ"))))) #Genesis 1:2 node=1172211 bSlot=19 0 1 2 3 4 (S{1172211}(C{427555}(CP{651511}(cj "וְ"))(NP{651512}(n "חֹ֖שֶׁךְ"))(PP{651513}(pp "עַל")(U{1300410}(n "פְּנֵ֣י"))(U{1300411}(n "תְהֹ֑ום"))))) #Genesis 1:2 node=1172212 bSlot=24 0 1 2 3 4 5 6 7 (S{1172212}(C{427556}(CP{651514}(cj "וְ"))(NP{651515}(U{1300412}(n "ר֣וּחַ"))(U{1300413}(n "אֱלֹהִ֔ים")))(VP{651516}(vb "מְרַחֶ֖פֶת"))(PP{651517}(pp "עַל")(U{1300414}(n "פְּנֵ֥י"))(U{1300415}(dt "הַ")(n "מָּֽיִם"))))) #Genesis 1:3 node=1172213 bSlot=32 0 1 2 (S{1172213}(C{427557}(CP{651518}(cj "וַ"))(VP{651519}(vb "יֹּ֥אמֶר"))(NP{651520}(n "אֱלֹהִ֖ים")))) #Genesis 1:3 node=1172214 bSlot=35 0 1 (S{1172214}(C{427558}(VP{651521}(vb "יְהִ֣י"))(NP{651522}(n "אֹ֑ור")))) #Genesis 1:3 node=1172215 bSlot=37 0 1 2 (S{1172215}(C{427559}(CP{651523}(cj "וַֽ"))(VP{651524}(vb "יְהִי"))(NP{651525}(n "אֹֽור")))) #Genesis 1:4 node=1172216 bSlot=40 0 1 2 3 4 5 6 7 (S{1172216}(C{427560}(CP{651526}(cj "וַ"))(VP{651527}(vb "יַּ֧רְא"))(NP{651528}(n "אֱלֹהִ֛ים"))(PP{651529}(pp "אֶת")(dt "הָ")(n "אֹ֖ור"))(Cobjc{427561}(CP{651530}(cj "כִּי"))(VP{651531}(vb "טֹ֑וב")))))
Here ends the tree generation. What follows is only important if you test and debug the tree generation.
We can apply our algorithms to limited sets of interesting trees and random samples.
For those cases we also apply the `debugWriteTree()`
method, which outputs considerably more information.
def passageRoots(passage):
    """Return the nodes of type ``rootType`` that belong to a passage.

    ``passage`` is a section tuple such as ``("Genesis", 1, 16)``. It is
    resolved to a node with ``T.nodeFromSection``, and the result is whatever
    ``L.d`` yields for that node when restricted to ``otype=rootType``.
    """
    return L.d(T.nodeFromSection(passage), otype=rootType)
def showcases(cases, oFile):
    """Write a debugging report for a collection of trees to ``oFile``.

    ``cases`` maps a root node to a descriptive text. For every case a header
    is written (passage, description, ``rootType`` and the node), followed by
    two renderings of its tree: kind ``"e"`` (slot embedding only) and kind
    ``"r"`` (slot embedding and mother+clause_constituent relation). Each
    rendering consists of the words representation, the bracketed tree, the
    tree depth and the output of ``tree.debugWriteTree``; the legend of the
    latter is requested for kind ``"r"`` only.

    The file is overwritten if it exists. It is written as UTF-8 because the
    trees contain Hebrew text, which the platform default encoding cannot
    represent on every system.
    """
    with open(oFile, "w", encoding="utf8") as out:
        for (sNode, caseText) in cases.items():
            out.write(
                "\n====================\n{}\n{}\n{} TF-node={}:\n".format(
                    "{} {}:{}".format(*T.sectionFromNode(sNode)),
                    caseText,
                    rootType,
                    sNode,
                )
            )
            for kind in ("e", "r"):
                kindLabel = (
                    "only"
                    if kind == "e"
                    else " and mother+clause_constituent relation"
                )
                out.write(
                    "\nTree based on slot embedding {}\n\n".format(kindLabel)
                )
                (treeRep, wordsRep, bSlot) = tree.writeTree(
                    sNode, kind, getTag, rev=False, leafNumbers=False
                )
                out.write("{}\n\n{}\n".format(wordsRep, treeRep))
                out.write("\nDepth={}\n".format(tree.depth(sNode, kind)))
                out.write(tree.debugWriteTree(sNode, kind, legenda=kind == "r"))
This output (when done for version 4
of the BHSA)
has been visually checked by Constantijn Sikkel and Dirk Roorda.
# Hand-picked cases for inspection: node -> reason why its tree is of interest.
# The node numbers below hold for etcbc3; in etcbc4 we have fewer problem cases.
problem_desc = collections.OrderedDict(
    (
        (1131739, "debug reorder"),
        (1131712, "interesting"),
        (1131701, "interesting"),
        (1140469, "subject clause order"),
        # first root found in Genesis 1:16
        (passageRoots(("Genesis", 1, 16))[0], "interesting"),
        (1164864, "interesting"),
        (1143081, "cyclic mothers"),
        (1153973, "cyclic mothers"),
        (1158971, "cyclic mothers"),
        (1160416, "cyclic mothers"),
        (1160464, "cyclic mothers"),
        (1161141, "nested cyclic mothers: C.coor => C.attr => P below first C.coor"),
        (1163666, "cyclic mothers"),
        (1164830, "cyclic mothers"),
        (1167680, "cyclic mothers"),
        (1170057, "cyclic mothers"),
        (1193065, "cyclic mothers"),
        (1199681, "cyclic mothers"),
        (1199682, "mother points outside sentence"),
    )
)
# A fixed set of nodes, so that the very same trees can be inspected on
# every run.
fixedSample = (
    1167680,
    1167152,
    1145250,
    1154339,
    1136677,
    1166385,
    1198984,
    1152969,
    1153930,
    1150648,
    1168396,
    1151917,
    1164750,
    1156719,
    1148048,
    1138673,
    1134184,
    1156789,
    1156600,
    1140469,
)
# Number of random draws; also used in the name of the random sample file.
sampleSize = 20
# The random and the fixed sample carry the same description.
sampleDesc = "random sample in {}s with {}s with mothers".format(
    rootType, clauseType
)

# Random sample: draw keys of tree.mother at random and take, for each draw,
# the first component of the "e" root of the corresponding mother.
# Draws that lead to the same root collapse into one entry, so the sample may
# contain fewer than sampleSize trees.
motherKeys = sorted(tree.mother)
sample = {}
for _ in range(sampleSize):
    sNode = tree.getRoot(tree.mother[random.choice(motherKeys)], "e")[0]
    sample[sNode] = sampleDesc

# Fixed sample, in the order in which the nodes are listed above.
fSample = collections.OrderedDict()
for sNode in fixedSample:
    fSample[sNode] = sampleDesc

# Uncomment to write the debugging reports:
# showcases(problem_desc, 'tree-notabene.txt')
# showcases(sample, '{}/trees-{}-random-{}.txt'.format(OUTPUTDIR, VERSION, sampleSize))
# showcases(fSample, 'trees-fixed-{}.txt'.format(len(fSample)))