#!/usr/bin/env python
# coding: utf-8

# # Heads2TF
#
# In this NB, we produce three text-fabric features on the BHSA data using the `get_heads` method developed in [getting_heads.ipynb](getting_heads.ipynb). See that notebook for a detailed description of the motivation, method, and shortcomings of this data.
#
# N.B. this data is experimental and a work in progress!
#
# ## Production
#
# Three features are produced herein:
# * heads.tf - an edge feature from a phrase(atom) node to its phrase head + its coordinated head words.
# * prep_obj.tf - an edge feature from the head preposition of a prepositional phrase (typ=PP) to its noun object.
# * noun_heads.tf - an edge feature from a phrase(atom) node to its noun heads, regardless of whether the phrase is a prepositional phrase or not. "noun" is meant loosely and includes adjectives and other parts of speech.

# ## Export
#
# ### Updates
#
# #### 2021-12-13 (Dirk Roorda)
# Generated the features for version 2021, removed version 2016,
# adapted some code to modern Text-Fabric.
#
# #### 06.11.18 (Cody Kingham)
# Added a new feature, `noun_heads`, to pluck noun heads from both noun phrases and prepositional phrases.
#
# #### 23.10.18 (Cody Kingham)
# New export for the updated C version of BHSA data.
#
# #### 21.04.18 (Cody Kingham)
# A new function has been added to double check phrase heads. Prepositional phrases whose objects are also prepositions have resulted in some false heads being assigned. This is because prepositional objects receive no subphrase relations in BHSA and appeared to the algorithm as independent. An additional check is required to make sure that a given preposition does not serve as the head of its phrase. The new function, `check_preposition`, looks one word behind a candidate head noun (within the phrase boundaries) and validates only those cases that are not immediately preceded by another preposition.
#
# #### 20.04.18 (Cody Kingham)
# In discussion with Stephen Ku, I've decided to apply the `quantifier` algorithm to prepositional objects so that we retrieve the head of the prepositional object noun phrase rather than a quantifier. For good measure, I will also apply the `attributed` function (see [getting_heads.ipynb](getting_heads.ipynb) for a description of both functions).

# In[1]:


import collections
import random

from tf.fabric import Fabric

from heads import get_heads, find_quantified, find_attributed


# In[2]:


# Export heads.tf, prep_obj.tf and noun_heads.tf for all TF versions.
for version in ["2021", "c", "2017"]:
    print("processing version ", version, "\n")

    # load Text-Fabric and data
    TF = Fabric(locations="~/github/etcbc/bhsa/tf", modules=version)
    api = TF.load(
        """
        book chapter verse
        typ pdp rela mother
        function lex sp ls
        """
    )
    F, E, T, L = api.F, api.E, api.T, api.L  # TF data methods

    # Edge data to export: feature name -> {source node: set of target nodes}.
    heads_features = collections.defaultdict(dict)

    print("\nprocessing heads...")

    for phrase in list(F.otype.s("phrase")) + list(F.otype.s("phrase_atom")):
        heads = get_heads(phrase, api)
        if not heads:
            # nothing to record for a phrase(atom) without heads
            continue

        heads_features["heads"][phrase] = set(heads)

        # noun heads, part 1: outside prepositional phrases the heads
        # themselves are the noun heads
        if F.typ.v(phrase) != "PP":
            heads_features["noun_heads"][phrase] = set(heads)
            continue

        # prep objects and noun heads, part 2: in a PP every head is a
        # preposition and the noun head is that preposition's object.
        # The word set is the same for every head, so build it once.
        phrase_words = set(L.d(phrase, "word"))
        for head in heads:
            # candidate object: the word right after the preposition,
            # or the one after that when the next word is an article
            obj = head + 1 if F.pdp.v(head + 1) != "art" else head + 2
            if obj not in phrase_words:
                # candidate lies outside this phrase(atom): no object recorded
                continue
            # prefer the quantified / attributed noun over the candidate
            obj = find_quantified(obj, api) or find_attributed(obj, api) or obj
            heads_features["prep_obj"][head] = {obj}
            # Accumulate rather than overwrite: a PP with several coordinated
            # prepositions has one object per preposition, and all of them
            # are noun heads of the phrase. (Assigning a fresh one-element
            # set here would keep only the last preposition's object.)
            heads_features["noun_heads"].setdefault(phrase, set()).add(obj)

    # export TF data
    print("\nexporting TF...")

    # The three edge features share the same metadata; each feature gets its
    # own copy so that they stay independent dict objects.
    edge_meta = {
        "source": "see the notebook at https://github.com/etcbc/lingo/heads",
        "valueType": "int",
        "edgeValues": False,
    }
    meta = {
        "": {"created_by": "Cody Kingham", "coreData": "BHSA", "coreVersion": version},
        "heads": dict(edge_meta),
        "prep_obj": dict(edge_meta),
        "noun_heads": dict(edge_meta),
    }

    save_tf = Fabric(
        locations="~/github/etcbc/lingo/heads/tf", modules=version, silent=True
    )
    # NOTE(review): the empty load is kept from the original; presumably it
    # initialises the Fabric instance before saving - confirm against TF docs.
    save_tf.load("", silent=True)
    save_tf.save(nodeFeatures={}, edgeFeatures=heads_features, metaData=meta)

    print(f"\ndone with {version}")


# # Tests

# In[3]:


from tf.app import use

# hoist=globals() rebinds F, E, T, L (among others) to the 2021 dataset
# with the freshly exported heads features loaded as a module.
A = use("ETCBC/bhsa", mod="etcbc/lingo/heads/tf:clone", version="2021", hoist=globals())


# ## noun_heads

# In[4]:


# first ten (PP phrase, noun head word) pairs
A.show(
    A.search(
        """
phrase typ=PP
-noun_heads> word
"""
    )[:10]
)


# ## prep_obj.tf

# In[5]:


# For every PP node, collect the first prepositional object of each of its
# head prepositions (prepositions without a recorded object are skipped).
test_prep = []

for ph in F.typ.s("PP"):
    heads = E.heads.f(ph)
    objs = [E.prep_obj.f(prep)[0] for prep in heads if E.prep_obj.f(prep)]
    test_prep.append(tuple(objs))

random.shuffle(test_prep)


# In[6]:


A.show(test_prep[:50])  # random sample of 50 object tuples


# See what the prepositional object looks like for Genesis 1:21:

# In[7]:


gen_121_case = L.d(T.nodeFromSection(("Genesis", 1, 21)), "phrase")[13]

print("example phrase", gen_121_case, "phrase number 14 in verse")
print(T.text(L.d(gen_121_case, "word")))

print("\nGen 1:21 phrase 14's heads, a preposition:")
heads = E.heads.f(gen_121_case)
print(T.text(heads))

print("\nGen 1:21 phrase 14's prepositional object:")
print(T.text(E.prep_obj.f(heads[0])))


# ## heads.tf

# In[8]:


# heads of every NP phrase, in random order
heads = [E.heads.f(ph) for ph in F.otype.s("phrase") if F.typ.v(ph) == "NP"]
random.shuffle(heads)


# In[9]:


A.show(heads[:50])  # random sample of 50 head tuples


# In[ ]: