This notebook combines the participant references and semantic roles computed in other phases of this research project. The two datatypes are combined to create a social network model of the data and to explore this model by social network analytical tools. The first SNA-measures are given in this notebook, while more detailed studies of participant roles are reserved for other notebooks in this repo.
Content
#Dataset path
PATH = 'datasets/'
import csv, collections, html
from operator import itemgetter
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
import networkx as nx
import forceatlas2
import random
Warning: uncompiled fa2util module. Compile with cython for a 10-100x speed boost.
#Importing the Hebrew data and Text-Fabric
from tf.app import use
A = use('bhsa', hoist=globals(), mod='etcbc/heads/tf')
df = pd.read_csv(f'{PATH}participants_FINAL.csv')
df.columns = ['participant','refs']
df.head()
participant | refs | |
---|---|---|
0 | JHWH | 944128 946176 946179 946182 946184 944142 9441... |
1 | MCH= | 945152 945537 945155 945540 945547 945555 9449... |
2 | >HRN | 944640 944641 65555 944662 65561 944666 944667... |
3 | BN JFR>L | 67584 944132 944133 944139 946216 946217 94417... |
4 | >JC >JC | 945664 64514 945666 945668 944135 944136 94567... |
The references are transformed to lists and their respective frequencies in the corpus are counted
#Parse each participant's whitespace-separated reference string into a list of
#integer node ids, and record how often each participant occurs in the corpus.
ref_list = [[int(node) for node in participant.refs.split()]
            for _, participant in df.iterrows()]
participant_freq = [len(nodes) for nodes in ref_list]
df.insert(2, 'ref_list', ref_list)
df.insert(3, 'freq', participant_freq)
df.head()
participant | refs | ref_list | freq | |
---|---|---|---|---|
0 | JHWH | 944128 946176 946179 946182 946184 944142 9441... | [944128, 946176, 946179, 946182, 946184, 94414... | 476 |
1 | MCH= | 945152 945537 945155 945540 945547 945555 9449... | [945152, 945537, 945155, 945540, 945547, 94555... | 60 |
2 | >HRN | 944640 944641 65555 944662 65561 944666 944667... | [944640, 944641, 65555, 944662, 65561, 944666,... | 164 |
3 | BN JFR>L | 67584 944132 944133 944139 946216 946217 94417... | [67584, 944132, 944133, 944139, 946216, 946217... | 579 |
4 | >JC >JC | 945664 64514 945666 945668 944135 944136 94567... | [945664, 64514, 945666, 945668, 944135, 944136... | 277 |
print(f'Number of participants: {len(df)}')
Number of participants: 75
Two functions fetch the participant label from any given word or phrase in the text.
def getLabel(ref, df=df):
    '''
    Look up which participant(s) a given node reference belongs to.
    Returns a (possibly empty) list of participant labels whose reference
    lists contain `ref`.
    '''
    return [entry.participant
            for _, entry in df.iterrows()
            if ref in entry.ref_list]
def Actor(ref, df=df):
    '''
    This function takes a reference as input and returns the participant label
    (a list, via getLabel). Phrases are treated differently, because non-verbal
    phrases require additional measures to find the nominal head of the phrase
    and return the label for that particular constituent. Returns the string
    "error" for phrase types not handled below.
    '''
    nom_head = E.nhead.t(ref) #Finding the nominal head(s) of the phrase
    if F.otype.v(ref) == 'word': #Identifying object suffixes
        return getLabel(ref, df=df)
    elif F.typ.v(ref) == 'VP':
        #Verbal phrase: look the phrase atom itself up in the participant data
        return getLabel(L.d(ref, 'phrase_atom')[0], df=df)
    elif F.typ.v(ref) == 'PP':
        #Prepositional phrase: resolution depends on how many nominal heads it has
        if len(nom_head) > 1:
            return getLabel(L.d(ref, 'phrase_atom')[0], df=df)
        if nom_head != E.head.t(ref): #If equal, the reference is a simple preposition with a suffix
            return getLabel(L.u(nom_head[0], 'phrase_atom')[0], df=df)
        else:
            #Try the head word first; fall back to the enclosing phrase atom
            if getLabel(E.head.t(ref)[0], df=df):
                return getLabel(E.head.t(ref)[0], df=df)
            else:
                return getLabel(L.u(nom_head[0], 'phrase_atom')[0], df=df)
    elif F.typ.v(ref) in {'NP','PrNP','PPrP','DPrP','CP'}:
        #Nominal/pronominal phrase types: resolve via the nominal head's phrase atom
        return getLabel(L.u(nom_head[0], 'phrase_atom')[0], df=df)
    else:
        return "error"
#Actor(65418)
ranks_df = pd.read_csv(f'{PATH}role_ranks.csv', index_col=0)
ranks_df.head()
Vol | Inst | Aff | neg | role | new_role | new_rank | rank | |
---|---|---|---|---|---|---|---|---|
688348 | y | y | n | NaN | Agent | Agent | 5 | 5 |
688349 | y | n | y | NaN | Volitional Undergoer | Volitional Undergoer | -1 | -1 |
688350 | y | y | n | NaN | Agent | Agent | 5 | 5 |
688351 | y | y | n | NaN | Agent | Agent | 5 | 5 |
688352 | y | n | y | NaN | Volitional Undergoer | Volitional Undergoer | -1 | -1 |
A function is defined to return the agency of any given reference
def Agency(ref, colname, df=None):
    '''
    Return the agency rank stored in column `colname` for node `ref`,
    or None if the reference is not annotated in the ranks dataframe.

    `df` defaults to the module-level `ranks_df`, resolved at call time so the
    default does not freeze an early-bound reference at definition time.
    '''
    if df is None:
        df = ranks_df
    #Membership test on the Index is hash-based; the original built a fresh
    #list(df.index) on every call, which is O(n) per lookup.
    if ref in df.index:
        return df.loc[df.index == ref, colname].item()
    return None
#Agency(68032, 'new_rank')
This section cross-tabulates the participant and role data to calculate the mean agency of each participant.
#Resolve the participant label(s) for every annotated reference.
#NOTE(review): Actor() scans the whole participant dataframe per call, so this
#cell is O(refs * participants) — acceptable here, but slow on larger corpora.
actor_list = [Actor(ph) for ph in list(ranks_df.index)]
ranks_df.insert(8, 'Actor', actor_list) #The actor is inserted as a new column
ranks_df.head()
Vol | Inst | Aff | neg | role | new_role | new_rank | rank | Actor | |
---|---|---|---|---|---|---|---|---|---|
688348 | y | y | n | NaN | Agent | Agent | 5 | 5 | [JHWH] |
688349 | y | n | y | NaN | Volitional Undergoer | Volitional Undergoer | -1 | -1 | [MCH=] |
688350 | y | y | n | NaN | Agent | Agent | 5 | 5 | [JHWH] |
688351 | y | y | n | NaN | Agent | Agent | 5 | 5 | [MCH=] |
688352 | y | n | y | NaN | Volitional Undergoer | Volitional Undergoer | -1 | -1 | [>HRN, BN JFR>L, BN >HRN] |
Cross-tabulation of the data to count how often each participant obtains a certain agency level:
#Tally, per participant, how often each agency level occurs.
dic = collections.defaultdict(lambda: collections.defaultdict(int))
for _, clause_row in ranks_df.iterrows():
    for participant in clause_row.Actor:
        dic[participant][clause_row.new_rank] += 1
#Participants as rows, agency levels as columns, ordered from high to low agency.
agency_df = pd.DataFrame(dic).fillna(0).astype('Int64').T
agency_df = agency_df[[5, 4, 3, 1, 0, -1, -2]]
agency_df.head()
5 | 4 | 3 | 1 | 0 | -1 | -2 | |
---|---|---|---|---|---|---|---|
JHWH | 118 | 0 | 1 | 8 | 29 | 30 | 17 |
MCH= | 36 | 0 | 1 | 0 | 1 | 19 | 0 |
>HRN | 16 | 0 | 11 | 31 | 1 | 19 | 10 |
BN JFR>L | 99 | 0 | 44 | 72 | 28 | 83 | 31 |
BN >HRN | 16 | 0 | 6 | 22 | 5 | 17 | 5 |
The mean agency is calculated
#Compute each participant's mean agency: sum(level * count) / total count,
#where the column label itself is the agency level.
agency_mean = []
for row in agency_df.iterrows():
    n=0
    total = 0
    for v in row[1]:
        total += (v * agency_df.columns[n]) #column label = agency level
        n+=1
    agency_mean.append(round(total/row[1].sum(), 3))
agency_df.insert(7, 'mean', agency_mean)
#Inserting labels
#NOTE(review): label_gloss is defined further down in this notebook; this cell
#only runs cleanly if that cell was executed first — confirm execution order.
labels = [label_gloss[l] if l in label_gloss else l for l in list(agency_df.index)]
agency_df.insert(0, 'label', labels)
#Keep only participants with a row sum above 20.
#NOTE(review): the row sum also includes the freshly inserted 'mean' column,
#so the effective threshold is slightly shifted — TODO confirm intended.
agency_df = agency_df[agency_df.sum(axis=1) > 20]
agency_df.sort_values(by='mean', ascending=False)
label | 5 | 4 | 3 | 1 | 0 | -1 | -2 | mean | |
---|---|---|---|---|---|---|---|---|---|
MCH= | Moses | 36 | 0 | 1 | 0 | 1 | 19 | 0 | 2.877 |
JHWH | YHWH | 118 | 0 | 1 | 8 | 29 | 30 | 17 | 2.645 |
>JC >JC | an_Israelite | 60 | 0 | 22 | 7 | 4 | 6 | 38 | 2.124 |
2ms | 2msg | 21 | 0 | 10 | 57 | 8 | 8 | 2 | 1.698 |
BN JFR>L | Israelites | 99 | 0 | 44 | 72 | 28 | 83 | 31 | 1.552 |
GR | sojourner | 45 | 0 | 16 | 5 | 13 | 9 | 38 | 1.532 |
BN >HRN | Aaron's_sons | 16 | 0 | 6 | 22 | 5 | 17 | 5 | 1.310 |
>HRN | Aaron | 16 | 0 | 11 | 31 | 1 | 19 | 10 | 1.193 |
>X -2ms | brother | 11 | 0 | 3 | 1 | 16 | 10 | 13 | 0.537 |
HM | remnants | 3 | 2 | 4 | 0 | 2 | 5 | 13 | 0.138 |
<M | foreign_nations | 3 | 0 | 1 | 0 | 5 | 3 | 10 | -0.227 |
The network model combines participant data and semantic roles. The primary principle is to isolate those clauses where at least two participants occur (they can be identical) which means that isolated participants are ignored. Secondly, the edges are made from the participant with the highest agency level toward the participant with the lowest agency level within the same clause. We can assume that the participant with the highest agency level is also most active in the event and therefore the source of the event.
def createEdges(colname, df=df, ranks_df=ranks_df, verb_list = [], relation='function', label_text='gloss', mode=str()):
    '''
    Build network edges from clauses in which at least two ranked participants occur.
    Edges run from the participant with the highest agency (the source of the event)
    to each lower-ranked participant in the same clause.

    Input: dictionary of actors + nodes (references), plus preferred text type, that is, English gloss (default)
    or transcription of the Hebrew lexeme (= trans)
    colname is name of the rank column (usually "rank" or "new_rank")
    mode: 'one-mode' (participant -> participant edges) or 'two-mode'
    (participant -> verb and verb -> participant edges)
    Output: tuple (edges, error_list)

    NOTE(review): verb_list, relation and label_text are never used in the body;
    verb_list is also a mutable default argument — confirm whether they can be removed.
    '''
    error_list = []
    #Finding intersection between nodes
    clause_node_list = []
    for i, row in df.iterrows():
        refs = [int(r) for r in row.refs.split()]
        #One entry per clause per participant (set() deduplicates within a participant)
        clause_node_list += list(set([L.u(n, 'clause')[0] for n in refs]))
    #Intersections are calculated by counting the frequency of unique clauses. If a clause appears more than once, there is
    #an intersection
    counter = collections.Counter(clause_node_list)
    intersection = [n for n in counter if counter[n] > 1]
    edges = []
    if intersection:
        for cl in intersection: #Looping over clauses with intersecting actors
            clause_inventory = []
            pred = False
            for ph in L.d(cl, 'phrase'):
                ph_info = {}
                sfx_info = {} #Directory for object suffixes
                rank = Agency(ph, colname, ranks_df)
                #Get verb gloss if Predicate
                if F.function.v(ph) in {'Pred','PreS','PreO','PtcO','PreC'}:
                    pred = True
                    #Finding verb gloss:
                    for w in L.d(ph, 'word'):
                        if F.sp.v(w) == 'verb':
                            pred_gloss, pred_lex = F.gloss.v(L.u(w, 'lex')[0]), F.lex.v(w)
                #If the phrase is annotated with a rank (agency), it is fetched.
                if rank or rank == 0:
                    ph_info['ref'] = ph
                    ph_info['function'] = F.function.v(ph)
                    ph_info['rank'] = rank
                    clause_inventory.append(ph_info)
                #If object suffix, the suffix info is stored separately and added to the clause inventory
                if F.function.v(ph) in {'PreO','PtcO'}:
                    for w in L.d(ph, 'word'):
                        if F.sp.v(w) == 'verb' and (Agency(w, colname, ranks_df) or Agency(w, colname, ranks_df) == 0):
                            sfx_info['ref'] = w
                            sfx_info['function'] = F.function.v(ph)
                            sfx_info['rank'] = Agency(w, colname, ranks_df)
                            clause_inventory.append(sfx_info)
            #Skip clauses whose predicate is HJH[ ('to be') and clauses with fewer than two ranked constituents.
            #NOTE(review): if a predicate phrase contains no word tagged sp=='verb',
            #pred_gloss/pred_lex stay unbound and this line raises — confirm that
            #cannot happen in this corpus.
            if pred == True and pred_lex!= 'HJH[' and len(clause_inventory) > 1:
                #Sort constituents from highest to lowest agency
                ranked = sorted(clause_inventory, key=itemgetter('rank'), reverse = True)
                #Getting Actor and labels
                Actor_ref = ranked[0]['ref']
                Actor_rank = ranked[0]['rank']
                Actors = Actor(Actor_ref, df=df) #A list of Actors
                if Actors == 'error':
                    error_list.append((cl, Actor_ref))
                #Creating edges from Actor to Undergoer(s)
                for Undergoer in ranked[1:]:
                    Undergoer_ref = Undergoer['ref']
                    Undergoer_rank = Undergoer['rank']
                    Undergoers = Actor(Undergoer_ref, df=df)
                    if Undergoers == 'error':
                        error_list.append((cl, Undergoer_ref))
                    if (Actors and Undergoers) and (Undergoers != 'error') and (Actors != 'error'):
                        for A in Actors: #local name, shadows the Text-Fabric app object only inside this loop
                            for U in Undergoers:
                                if mode == 'one-mode':
                                    edge = (A, Actor_ref, Actor_rank, U, Undergoer_ref, Undergoer_rank, pred_gloss, cl)
                                    edges.append(edge)
                                elif mode == 'two-mode':
                                    Actor_edge = (A, Actor_ref, Actor_rank, pred_gloss, cl)
                                    Undergoer_edge = (pred_gloss, U, Undergoer_ref, Undergoer_rank, cl)
                                    edges.append(Actor_edge), edges.append(Undergoer_edge)
                                else:
                                    print("You need to specify mode")
    return edges, error_list
Two models are created to account for two versions of the agency data. The 'old' data does not account for negations in the clause, while the 'new' data involves a recalculation of the agency (NB: the recalculation is done in another notebook)
old_edges = createEdges(colname='rank',df=df, mode='one-mode')
print(len(old_edges[0]))
#With new ranks because of negatives (e.g. Agent -> Frustrative)
new_edges = createEdges(colname='new_rank',df=df,mode='one-mode')
482
errors = old_edges[1]
for e in errors:
A.pretty(e[0], highlights={e[1]:'gold'})
Both errors concern adverbial phrases, both referring to a location, so they are not important.
We will remove edges for which both the Actor and Undergoer are 0 (Neutral) in Agency. In these cases, there is no interaction, so those relations are not important:
def removeNeutral(edge_list):
    '''
    Drop edges in which both the Actor (tuple index 2) and the Undergoer
    (tuple index 5) have Agency 0 (Neutral): no interaction takes place in
    those clauses. Input is the (edges, errors) tuple returned by createEdges().
    '''
    return [edge for edge in edge_list[0]
            if not (edge[2] == 0 and edge[5] == 0)]
old_edges = removeNeutral(old_edges)
new_edges = removeNeutral(new_edges)
Before the final export the edges need review. Several issues need validation:
The review is carried out manually but assisted by an interface and colorcoding. 'Green' signals that the clause is included in the network, 'salmon' signals absence.
#Collect the clauses of Lev 17-26 that contain a predicate with a verb other
#than HJH[ ('to be'); these are the clauses to review manually.
first_verse = T.nodeFromSection(('Leviticus',17,1))
last_verse = T.nodeFromSection(('Leviticus',26,46))
clauses = range(L.d(first_verse, 'clause')[0], L.d(last_verse, 'clause')[0]+2) #+2 presumably to include the last verse's clauses — TODO confirm
verbal_clauses = []
for cl in clauses:
    pred = False #NOTE(review): set but never read afterwards
    for ph in L.d(cl, 'phrase'):
        if F.function.v(ph) in {'Pred','PreS','PreO','PtcO','PreC'}:
            pred = True
            for w in L.d(ph, 'word'):
                if F.sp.v(w) == 'verb' and F.lex.v(w) != 'HJH[':
                    verbal_clauses.append(cl) #NOTE(review): a clause with several qualifying verbs is appended more than once — confirm intended
print(f'Number of clauses to review: {len(verbal_clauses)}')
def validate(clauses, edges, n):
    '''
    Display clause nr. n for manual review. Clauses that contribute edges to
    the network are highlighted green (with their Actor/Undergoer pairs
    printed); clauses absent from the network are highlighted salmon.
    '''
    cl = clauses[n]
    print(f'Nr {n}: {cl}')
    edge_df = pd.DataFrame(edges)
    if cl not in list(edge_df[7]):
        A.pretty(cl, highlights={cl:'salmon'})
        return
    for _, edge in edge_df[edge_df[7] == cl].iterrows():
        print(f'Actor: {edge[0]} - Agency: {edge[2]}')
        print(f'Undergoer: {edge[3]} - Agency: {edge[5]}\n')
    A.pretty(cl, highlights={cl:'lightgreen'})
#Manual review driver: re-run this cell repeatedly; n advances by one each run,
#stepping through the clauses one at a time.
n=0
validate(verbal_clauses, old_edges, n)
n+=1
Lev 17
Lev 18
Lev 19
Lev 20
Lev 21
Lev 22
Lev 23
Lev 24
Lev 25
Lev 26
#Place the new ranks (columns 2 and 5 of the edge tuples) next to the old ones.
#NOTE(review): this assumes old_edges and new_edges are row-aligned, i.e. that
#createEdges produced the edges in the same order for both rank columns — TODO confirm.
new_df = pd.DataFrame(new_edges)
old_df = pd.DataFrame(old_edges)
old_df.insert(3, 'new_rank_Actor', new_df.iloc[:,2])
old_df.insert(7, 'new_rank_Undergoer', new_df.iloc[:,5])
old_df.head()
0 | 1 | 2 | new_rank_Actor | 3 | 4 | 5 | new_rank_Undergoer | 6 | 7 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | BN >HRN | 690343 | 5 | 5 | JHWH | 690347 | 0 | 0 | swing | 440323 |
1 | JHWH | 690383 | 5 | 5 | MCH= | 690384 | -1 | -1 | speak | 440335 |
2 | BN JFR>L | 690397 | 5 | 5 | JHWH | 690399 | -1 | -1 | approach | 440341 |
3 | JHWH | 690402 | 5 | 5 | MCH= | 690403 | -1 | -1 | speak | 440342 |
4 | BN JFR>L | 690415 | 5 | 5 | JHWH | 690417 | -1 | -1 | approach | 440347 |
The labels (generated from the ETCBC-transliteration) will be replaced with more readable ones:
#Mapping from ETCBC participant labels (transliterated Hebrew) to readable
#English glosses used in the exported network files and plots.
label_gloss = {'>CH BN -2ms': 'daughter-in-law',
               '>DM': 'human_being',
               'GR': 'sojourner',
               '>CH#2': 'woman_in_menstruation',
               '>X -2ms': 'brother',
               'BFR/ BN/ -<M': 'children',
               '>T BT/ BN/ ->CH W >T BT/ BT/ ->CH': 'granddaughter_of_woman',
               'MLK=': 'idols',
               'NPC#3': 'slave',
               '<M': 'foreign_nations',
               '>T >CH/ >JC/': "fellow's_wife",
               'ZR': 'lay-person',
               '>B -2ms': 'father',
               'JHWH': 'YHWH',
               '2mp_sfx': '2mpl',
               '>HRN': 'Aaron',
               'MN >JC/ ->CH': 'husband',
               'DWDH -2ms': 'aunt-in-law',
               '>CH >M ->CH': 'woman_and_her_mother',
               'RDP': 'no-one',
               'BN >CH': 'blasphemer',
               'XRC=/': 'deaf',
               'BN JFR>L': 'Israelites',
               'C>R >B -2ms': 'aunt',
               'KL': 'group_of_people',
               '<RWH/ -<RWH -2ms': 'granddaughter',
               'PNH/ ZQN/': 'elderly',
               'BTWLH/': 'virgin',
               '>JC >JC': 'an_Israelite',
               'BN ->X -2ms': 'son_of_brother',
               'QNH': 'purchaser',
               '>JC >CH': 'man/woman',
               '<RWH/ >CH/ W BT/ ->CH': 'woman_and_her_daughter',
               '3mp': 'witnesses',
               '>L MCPXT/ ->JC': 'clan',
               'BT >B -2msBT >M -2ms': 'sister',
               'PNH/ GDWL/': 'rich',
               '>XD': "brother's_brother",
               '>T== ZKR=/': 'male',
               '2ms': '2msg',
               '>XWT ->CH': 'sister_of_woman',
               'BN TWCB': 'sons_of_sojourners',
               '>M -2ms': 'mother',
               'L >JC/': 'man',
               'ZR< ->JC': 'offspring',
               'PNH/ DL/': 'poor',
               'L PNH/ <WR/': 'blind',
               '>CH': 'woman',
               '>CH >B -2ms': "father's_wife",
               'MCH=': 'Moses',
               'BN >HRN': "Aaron's_sons",
               'BT -2ms': 'daughter',
               'CPXH': 'handmaid',
               'C>R -HW>': 'relative',
               '>LMNH GRC XLL': 'widowed/expelled/defiled_woman',
               'HM': 'remnants',
               '>T PGR/ -<M': 'corpse',
               'DWD ->X -2ms': "brother's_uncle",
               'B <M/ -2ms': 'kinsmen'
              }
#Attach readable Source/Target labels to the edge list.
edges_df = old_df
Source = []
Target = []
for n, row in edges_df.iterrows():
    source = row[0]
    target = row[3]
    #Fall back to the raw ETCBC label when no gloss is defined, consistent
    #with the label handling for agency_df above; the original raised a
    #KeyError on any label missing from label_gloss.
    Source.append(label_gloss.get(source, source))
    Target.append(label_gloss.get(target, target))
edges_df.insert(1, 'Source', Source)
edges_df.insert(6, 'Target', Target)
edges_df
0 | Source | 1 | 2 | new_rank_Actor | Target | 3 | 4 | 5 | new_rank_Undergoer | 6 | 7 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BN >HRN | Aaron's_sons | 690343 | 5 | 5 | YHWH | JHWH | 690347 | 0 | 0 | swing | 440323 |
1 | JHWH | YHWH | 690383 | 5 | 5 | Moses | MCH= | 690384 | -1 | -1 | speak | 440335 |
2 | BN JFR>L | Israelites | 690397 | 5 | 5 | YHWH | JHWH | 690399 | -1 | -1 | approach | 440341 |
3 | JHWH | YHWH | 690402 | 5 | 5 | Moses | MCH= | 690403 | -1 | -1 | speak | 440342 |
4 | BN JFR>L | Israelites | 690415 | 5 | 5 | YHWH | JHWH | 690417 | -1 | -1 | approach | 440347 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
472 | DWD ->X -2ms | brother's_uncle | 691326 | 5 | 5 | brother | >X -2ms | 68032 | -1 | -1 | redeem | 440637 |
473 | L >JC/ | man | 689041 | 0 | 0 | handmaid | CPXH | 689040 | -2 | -2 | spend autumn | 439885 |
474 | MN >JC/ ->CH | husband | 689652 | 5 | 5 | widowed/expelled/defiled_woman | >LMNH GRC XLL | 689651 | -2 | -2 | drive out | 440088 |
475 | 3mp | witnesses | 690660 | 5 | 5 | blasphemer | BN >CH | 66980 | -2 | -2 | settle | 440424 |
476 | 3mp | witnesses | 690675 | 5 | 5 | blasphemer | BN >CH | 690677 | 0 | 0 | support | 440429 |
477 rows × 12 columns
The weight of the ties between the participants is defined as the squared difference between Actor and Undergoer rank. We keep both the original rank and the new rank (the new rank takes negations into account):
#Weight = squared difference between Actor and Undergoer agency; squaring keeps
#the weight positive and accentuates large agency gaps.
old_weight = (edges_df[2]-edges_df[5])**2
new_weight = (edges_df['new_rank_Actor']-edges_df['new_rank_Undergoer'])**2
#Insert Weight: calculated as the difference between the Actor rank and the Undergoer rank
edges_df.insert(12, 'old_weight', old_weight)
edges_df.insert(13, 'new_weight', new_weight)
edges_df.head()
0 | Source | 1 | 2 | new_rank_Actor | Target | 3 | 4 | 5 | new_rank_Undergoer | 6 | 7 | old_weight | new_weight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BN >HRN | Aaron's_sons | 690343 | 5 | 5 | YHWH | JHWH | 690347 | 0 | 0 | swing | 440323 | 25 | 25 |
1 | JHWH | YHWH | 690383 | 5 | 5 | Moses | MCH= | 690384 | -1 | -1 | speak | 440335 | 36 | 36 |
2 | BN JFR>L | Israelites | 690397 | 5 | 5 | YHWH | JHWH | 690399 | -1 | -1 | approach | 440341 | 36 | 36 |
3 | JHWH | YHWH | 690402 | 5 | 5 | Moses | MCH= | 690403 | -1 | -1 | speak | 440342 | 36 | 36 |
4 | BN JFR>L | Israelites | 690415 | 5 | 5 | YHWH | JHWH | 690417 | -1 | -1 | approach | 440347 | 36 | 36 |
We produce two files, one for dynamic networks and one for static networks:
#Static network export: one row per edge with agency values, verb label,
#weight (new ranks), and the clause id for traceability.
static = edges_df[['Source','new_rank_Actor','Target','new_rank_Undergoer',6,'new_weight',7]]
static.columns = ['Source','Source_agency','Target','Target_agency','Label','Weight','Clause']
#Export
static.to_excel('Lev17-26.edges.Static.xlsx', index=None)
For the sake of consistency, it is possible to easily compare the changes that are made in new models in comparison to old ones. This helps to update the data without going through a manual validation.
#Load the previously validated edge list and the freshly exported one for comparison.
data_old = pd.read_excel('Lev17-26.edges.Static_Old.xlsx')
data_new = pd.read_excel('Lev17-26.edges.Static.xlsx')
data_new.head()
len(data_new)-len(data_old) #net change in number of edges
#Compare the new edge list against the validated old one, collecting clauses
#whose edges need manual review.
#NOTE(review): the comparison relies on 'Source_label'/'Target_label' columns,
#while the static export above writes 'Source'/'Target' — verify the headers
#of the Excel files actually being compared.
def _unmatched_clauses(source_df, reference_df):
    '''
    Return the clauses of source_df whose edges (Source_label, Target_label,
    Label) have no exact counterpart in reference_df. Clauses missing entirely
    from reference_df are returned as well (added/removed clauses).
    '''
    unmatched = []
    reference_clauses = set(reference_df.Clause) #hashed lookup instead of a list scan per row
    for _, row in source_df.iterrows():
        if row.Clause in reference_clauses:
            subset = reference_df[reference_df.Clause == row.Clause]
            match = any(
                ref_row.Source_label == row.Source_label
                and ref_row.Target_label == row.Target_label
                and ref_row.Label == row.Label
                for _, ref_row in subset.iterrows()
            )
            if not match:
                unmatched.append(row.Clause)
        else:
            unmatched.append(row.Clause)
    return unmatched

#Edges present in the new dataset but not matched in the old one
review_edges1 = _unmatched_clauses(data_new, data_old)
#Edges present in the old dataset but not matched in the new one.
#BUG FIX: the original looked row.Clause up in data_old (itself) instead of
#data_new, so clauses dropped from the new dataset were never flagged.
review_edges2 = _unmatched_clauses(data_old, data_new)
#review_edges1
#review_edges2
#Collect clauses whose edges match on labels but differ in Weight between the
#old and new dataset; clauses already flagged above are filtered out afterwards.
review_edges3 = []
for _, new_row in data_new.iterrows():
    if new_row.Clause not in list(data_old.Clause):
        review_edges3.append(new_row.Clause) #Clause is added in new dataset
        continue
    old_subset = data_old[data_old.Clause == new_row.Clause]
    weight_match = any(
        old_row.Source_label == new_row.Source_label
        and old_row.Target_label == new_row.Target_label
        and old_row.Label == new_row.Label
        and old_row.Weight == new_row.Weight
        for _, old_row in old_subset.iterrows()
    )
    if not weight_match:
        review_edges3.append(new_row.Clause)
review_edges3 = [e for e in review_edges3 if e not in review_edges1 and e not in review_edges2]
review_edges3
#Build an undirected multigraph from the exported edge list and draw it with
#a ForceAtlas2 layout. Node size scales with degree, edge width with the
#number of parallel edges between a pair.
#NOTE(review): row.Source_label/row.Target_label presuppose those column names
#in the Excel file; the export cell above writes 'Source'/'Target' — verify.
data = pd.read_excel('Lev17-26.edges.Static.xlsx')
data.head()
G = nx.MultiGraph()
for n, row in data.iterrows():
    G.add_edge(row.Source_label, row.Target_label)
#Random initial positions; ForceAtlas2 iterates from there
pos = { i : (random.random(), random.random()) for i in G.nodes()}
l = forceatlas2.forceatlas2_networkx_layout(G, pos, niter=2000, gravity=30, scalingRatio=2.0)
#Weight each edge by how often the same pair is connected
weight = collections.Counter(G.edges())
for u, v, d in G.edges(data=True):
    d['weight'] = weight[u, v]
plt.figure(figsize = (15,15))
nx.draw_networkx(G, l, node_color='violet', node_size=[n[1]*10 for n in G.degree()],
                 edge_color='grey', width=[d['weight']/3 for _, _, d in G.edges(data=True)])
plt.axis('off')
plt.margins(x=0.1, y=0.1)
plt.savefig('screenshots/Leviticus_SNA.png', dpi=500)
plt.show()
Number of nodes and edges:
print(f'Nodes: {len(G.nodes())}\nEdges: {len(G.edges())}')
Having created the edges and computed a multigraph (an undirected MultiGraph; a directed MultiDiGraph follows below), we can now explore the resulting network. We will begin with a general inspection:
One of the simplest measures of cohesion ("knittedness") is probably density. Density is simply the number of ties in the network proportional to the possible number of ties.
nx.density(G)
Density is sensitive to the size of the network, and large networks tend to have lower density than small networks, simply because it is more realistic for a member of a small network to be connected with most of the remaining participants than in a large network.
Therefore, another approach is average degree:
#Average degree = total degree / number of nodes (less size-sensitive than density).
degree = G.degree()
sum_degree = sum(dict(degree).values())
print(f'Average degree: {sum_degree/len(G.nodes())}')
#Rebuild the network as a directed multigraph to distinguish in- and outdegree.
G = nx.MultiDiGraph()
for n, row in data.iterrows():
    G.add_edge(row.Source_label, row.Target_label)
#Frequency of each in-/outdegree value across all nodes
outdegree_sequence = collections.Counter(sorted([d for n, d in G.out_degree()], reverse=True))
indegree_sequence = collections.Counter(sorted([d for n, d in G.in_degree()], reverse=True))
#Both construction styles produce a one-column frame of counts indexed by degree value
outdegree_df = pd.DataFrame(outdegree_sequence, index=[0]).T
indegree_df = pd.DataFrame([indegree_sequence]).T
degree_df = pd.concat([indegree_df, outdegree_df], axis=1, sort=False)
degree_df.columns = ['indegree','outdegree']
degree_df
#Side-by-side bars of the in- and outdegree distributions (offset by bar width).
fig, ax = plt.subplots(figsize=(15,7))
plt.bar(degree_df.index, degree_df.indegree, width=0.33)
plt.bar(degree_df.index+0.33, degree_df.outdegree, color='tomato', width=0.33)
ax.legend(labels=['indegree', 'outdegree'], fontsize=14)
plt.ylabel("Count", size=14)
plt.xlabel("Degree", size=14)
plt.xticks(size=12)
plt.yticks(size=12)
plt.show()
Cumulative:
len(G.nodes())
#Cumulative percentage of nodes covered up to each degree value.
indegree_cum = [n/len(G.nodes())*100 for n in np.cumsum(degree_df.fillna(0).indegree)]
outdegree_cum = [n/len(G.nodes())*100 for n in np.cumsum(degree_df.fillna(0).outdegree)]
degree_df.insert(2, "indegree_cum (%)", indegree_cum)
degree_df.insert(3, "outdegree_cum (%)", outdegree_cum)
degree_df
Most connected participants:
top_degree = sorted(dict(degree).items(), key=itemgetter(1), reverse=True)
A cumulative view:
#Cumulative share of all edge endpoints (sum of degrees = 2 * number of edges)
#held by the most connected participants.
#NOTE(review): top_degree stems from the earlier undirected graph while G is
#now directed — confirm the normalisation matches.
cum_degree = pd.DataFrame(top_degree)
cum_degree.columns = ['participant','degree']
degree_cum = [n/(len((G.edges()))*2)*100 for n in np.cumsum(cum_degree.degree)]
cum_degree.insert(2, "degree_cum (%)", degree_cum)
cum_degree.head(10)
Updated graph:
#Degree distribution with cumulative percentages on a secondary y-axis.
fig, ax1 = plt.subplots(figsize=(15,7))
ax2 = ax1.twinx()
ax1.bar(degree_df.index, degree_df.indegree, width=0.33)
ax1.bar(degree_df.index+0.33, degree_df.outdegree, color='tomato', width=0.33)
ax2.plot(degree_df.index, degree_df['indegree_cum (%)'], linestyle='--', alpha=0.5)
ax2.plot(degree_df.index, degree_df['outdegree_cum (%)'], linestyle='--', alpha=0.5)
ax1.legend(frameon=1, labels=['indegree', 'outdegree'], fontsize=14, facecolor='white', framealpha=1)
ax1.set_ylabel("Count", size=14)
ax2.set_ylabel("Cumulative %", size=14)
ax1.set_xlabel("Degree", size=14)
plt.xticks(size=12)
plt.yticks(size=12)
plt.show()
Inspect values:
G.degree()
G.out_degree()
Degree proportion of selected participants:
#Share of the total degree mass held by six selected prominent participants.
sel_part = sum(dict(G.degree(['YHWH', 'Moses','Israelites','sojourner','2ms','an_Israelite'])).values())
print(f'{round(sel_part/sum(dict(G.degree()).values())*100, 2)}%')
Reciprocity concerns whether an interaction from one actor to another is returned, or whether the relation is one-sided. A simple measure of reciprocity is to count the number of reciprocal ties and divide these by the total number of ties. For this analysis, we are not interested in the weights of the edges but simply the binary value (connected or not).
#Reciprocity: fraction of directed ties that are returned. Built as a simple
#DiGraph so parallel edges collapse into a single binary tie.
digraph = nx.DiGraph()
for n, row in data.iterrows():
    digraph.add_edge(row.Source_label, row.Target_label)
nx.reciprocity(digraph)
#Per-node reciprocity, sorted from most to least reciprocal
reci_df = pd.DataFrame([nx.reciprocity(digraph, digraph.nodes())]).T.sort_values(by=0, ascending=False)
fig, ax = plt.subplots(figsize=(15,5))
plt.bar(reci_df.index, reci_df[0], width=0.33)
plt.ylabel("fraction", size=14)
plt.xticks(size=11, rotation=45, ha='right')
plt.yticks(size=12)
plt.show()
We use four measures of node centrality to sketch the core and periphery of the network: indegree, outdegree, betweenness, and PageRank.
#Four centrality measures per node: in-/outdegree, betweenness, and PageRank.
indegree = nx.in_degree_centrality(digraph)
outdegree = nx.out_degree_centrality(digraph)
betweenness = nx.betweenness_centrality(digraph)
pagerank = nx.pagerank(digraph)
centrality = pd.DataFrame([indegree, outdegree, betweenness, pagerank]).T
centrality.columns = ['indegree','outdegree','betweeness','pagerank'] #NOTE: 'betweeness' spelling kept — downstream cells use this key
centrality
def top(measure, df=centrality):
    '''
    Return the ten highest-scoring participants for the given centrality
    measure as a Series indexed by participant.
    '''
    ranked = df.sort_values(by=measure, ascending=False)
    return ranked[measure].head(10)
#Bar charts of the ten most central participants per measure, sharing one y-scale.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15,5), sharey=True)
ax1.bar(top('outdegree').index, top('outdegree'))
ax1.set_title("Outdegree", size=16)
ax2.bar(top('indegree').index, top('indegree'))
ax2.set_title("Indegree", size=16)
ax3.bar(top('betweeness').index, top('betweeness'))
ax3.set_title("Betweenness", size=16)
ax4.bar(top('pagerank').index, top('pagerank'))
ax4.set_title("PageRank", size=16)
for ax in fig.axes:
    plt.sca(ax)
    plt.xticks(rotation=45, ha='right', size=12)
plt.show()