The data was exported from an Excel file in OpenOffice with "Save As" -> "Text CSV", using "UTF-8" as the character set, "{Tabulator}" as the field separator, and double quotes (") as the text delimiter.
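For illustration only, such an export can be read directly with Python's csv module (a minimal sketch, assuming Python 3; the actual loading below goes through helpers.diana.from_excel, which also knows the tier layout):
import csv
# Minimal sketch: read the tab-separated, double-quoted UTF-8 export directly.
# This only illustrates the export settings described above; the analysis uses
# helpers.diana.from_excel below.
with open("data/Hinuq3.csv", encoding="utf-8", newline="") as f:
    reader = csv.reader(f, delimiter="\t", quotechar='"')
    first_row = next(reader)
print(first_row)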
%load_ext autoreload
%autoreload 2
import helpers.diana
tier_numbers = {
    "clause_id": 2,
    "clause_type": 3,
    "grammatical_relation": 4,
    "pos_agreement": 5,
    "last_line": 7
}
ag = helpers.diana.from_excel("data/Hinuq3.csv", tier_numbers=tier_numbers)
import collections
verbs = [ 'COP', 'SAY', 'v.tr', 'v.intr', 'v.aff' ]
verb_map = { v: "V" for v in verbs}
others = [ 'A', 'S', 'P', 'EXP', 'STIM', 'zero-A', 'zero-S', 'zero-P', 'zero-EXP', 'zero-STIM' ]
search_terms = verbs + others
word_orders = collections.defaultdict(int)
word_orders_ids = collections.defaultdict(list)
for wo in helpers.diana.word_orders(ag, search_terms):
    word_orders[tuple(wo.word_order)] += 1
    word_orders_ids[tuple(wo.word_order)].append(wo.clause_id)
for word_order, count in word_orders.items():
    print("{0} => {1}".format(word_order, count))
    if count < 5:
        print(" {0}".format(word_orders_ids[word_order]))
('zero-EXP', 'v.aff', 'STIM') => 8
('zero-A', 'v.tr', 'P') => 42
('COP', 'S') => 33
('zero-EXP', 'STIM', 'v.aff') => 26
('EXP', 'v.aff', 'zero-STIM') => 1
 ['clause_id..n#1024']
('S', 'COP') => 94
('v.tr', 'P', 'A') => 3
 ['clause_id..n#747', 'clause_id..n#798', 'clause_id..n#772']
('zero-S', 'v.intr') => 273
('STIM', 'v.aff', 'EXP') => 7
('v.intr', 'S') => 142
('A', 'SAY') => 88
('A', 'zero-P', 'v.tr') => 24
('STIM', 'EXP', 'v.aff') => 6
('v.aff', 'EXP', 'STIM') => 10
('zero-S', 'COP') => 2
 ['clause_id..n#786', 'clause_id..n#1087']
('A', 'v.tr', 'P') => 38
('COP', 'zero-S') => 1
 ['clause_id..n#187']
('v.aff', 'STIM', 'EXP') => 1
 ['clause_id..n#1603']
('EXP', 'STIM', 'v.aff') => 28
('zero-STIM', 'v.aff', 'EXP') => 4
 ['clause_id..n#41', 'clause_id..n#872', 'clause_id..n#1225', 'clause_id..n#1663']
('P', 'A', 'v.tr') => 25
('v.tr', 'A', 'P') => 10
('zero-A', 'P', 'v.tr') => 253
('A', 'v.tr', 'zero-P') => 1
 ['clause_id..n#1278']
('S', 'v.intr') => 406
('zero-A', 'zero-P', 'v.tr') => 94
('zero-P', 'A', 'v.tr') => 4
 ['clause_id..n#957', 'clause_id..n#1472', 'clause_id..n#1526', 'clause_id..n#1518']
('COP',) => 1
 ['clause_id..n#478']
('zero-EXP', 'zero-STIM', 'v.aff') => 15
('P', 'v.tr', 'A') => 34
('A', 'P', 'v.tr') => 112
('EXP', 'v.aff', 'STIM') => 19
('zero-A', 'SAY') => 22
('zero-P', 'v.tr', 'A') => 4
 ['clause_id..n#1465', 'clause_id..n#265', 'clause_id..n#948', 'clause_id..n#1375']
('EXP', 'zero-STIM', 'v.aff') => 9
('SAY', 'A') => 31
word_orders_main = []
word_orders_main_count = collections.defaultdict(int)
word_orders_sub = []
word_orders_sub_count = collections.defaultdict(int)
main_clause_types = [ "m", "m.rs" ]
sub_clause_types = [ "sub", "sub.rs" ]
clause_types = main_clause_types + sub_clause_types
search_terms = verbs + ['A', 'S', 'P', 'EXP', 'STIM']
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    if "V" in wo.word_order and wo.clause_type in clause_types and len(wo.word_order) > 1:
        if wo.clause_type in sub_clause_types:
            word_orders_sub.append(wo.word_order)
            word_orders_sub_count[tuple(wo.word_order)] += 1
        else:
            word_orders_main.append(wo.word_order)
            word_orders_main_count[tuple(wo.word_order)] += 1
Hypothesis H0: Whether a clause unit is verb-final does not depend on the clause type (main vs. sub).
main_v_fin = 0; main_v_nonfin = 0; sub_v_fin = 0; sub_v_nonfin = 0;
for wo, c in word_orders_main_count.items():
    if wo[-1] == "V":
        main_v_fin += c
    else:
        main_v_nonfin += c
for wo, c in word_orders_sub_count.items():
    if wo[-1] == "V":
        sub_v_fin += c
    else:
        sub_v_nonfin += c
cont_table = [ [main_v_fin, main_v_nonfin], [sub_v_fin, sub_v_nonfin] ]
cont_table
[[734, 346], [343, 38]]
import scipy.stats
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
3.7177758031965255e-19
We reject the null hypothesis, as the probability of obtaining a distribution like the observed one under H0 is p < 0.05. The clause type affects verb-finality: subordinate clauses show a significantly higher proportion of verb-final word orders.
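As a quick check of that claim, the verb-final proportions per clause type can be read off the contingency table above:
# Share of verb-final clause units per clause type (rows of cont_table above).
for label, (v_fin, v_nonfin) in zip(["main", "sub"], cont_table):
    print("{0}: {1}/{2} = {3:.2f} verb-final".format(
        label, v_fin, v_fin + v_nonfin, float(v_fin) / (v_fin + v_nonfin)))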
Here are the basic counts:
print("Counts for main clauses:")
for wo, c in word_orders_main_count.items():
    print("{0} => {1}".format(wo, c))
print("\nCounts for sub clauses:")
for wo, c in word_orders_sub_count.items():
    print("{0} => {1}".format(wo, c))
Counts for main clauses:
('S', 'V') => 377
('V', 'STIM', 'EXP') => 1
('V', 'S') => 163
('A', 'P', 'V') => 88
('STIM', 'V', 'EXP') => 4
('V', 'STIM') => 8
('EXP', 'STIM', 'V') => 26
('P', 'A', 'V') => 21
('EXP', 'V', 'STIM') => 19
('V', 'EXP') => 4
('A', 'V') => 106
('STIM', 'V') => 16
('V', 'P') => 24
('A', 'V', 'P') => 35
('V', 'EXP', 'STIM') => 10
('V', 'A') => 35
('EXP', 'V') => 7
('P', 'V') => 87
('V', 'P', 'A') => 3
('P', 'V', 'A') => 30
('V', 'A', 'P') => 10
('STIM', 'EXP', 'V') => 6

Counts for sub clauses:
('S', 'V') => 123
('STIM', 'V') => 10
('V', 'S') => 10
('P', 'V', 'A') => 4
('STIM', 'V', 'EXP') => 3
('EXP', 'STIM', 'V') => 2
('P', 'A', 'V') => 4
('EXP', 'V') => 3
('A', 'V', 'P') => 3
('P', 'V') => 166
('A', 'V') => 11
('A', 'P', 'V') => 24
('V', 'P') => 18
particles = [ 'G', 'BEN', 'TIME', 'LOC', 'ADD' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1
for i, p in enumerate(particles):
    print(p)
    print(" Count after verb: {0}".format(pos_counts[i][0]))
    print(" Count before verb: {0}".format(pos_counts[i][1]))
G
 Count after verb: 111
 Count before verb: 252
BEN
 Count after verb: 17
 Count before verb: 68
TIME
 Count after verb: 9
 Count before verb: 99
LOC
 Count after verb: 28
 Count before verb: 122
ADD
 Count after verb: 29
 Count before verb: 33
particles = [ 'BEN', 'G', 'ADD' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles
before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if wo.clause_type in main_clause_types and "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1
for i, p in enumerate(particles):
    print(p)
    print(" Count after verb: {0}".format(pos_counts[i][0]))
    print(" Count before verb: {0}".format(pos_counts[i][1]))
BEN
 Count after verb: 14
 Count before verb: 54
G
 Count after verb: 103
 Count before verb: 151
ADD
 Count after verb: 28
 Count before verb: 29
I am using a binomial test here: http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom_test.html#scipy.stats.binom_test
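Note: in newer SciPy releases scipy.stats.binom_test has been deprecated in favour of scipy.stats.binomtest, which takes the number of "successes" and the total number of trials. A minimal sketch of the equivalent call (assuming a recent SciPy), using the BEN counts from above:
import scipy.stats
# Equivalent of scipy.stats.binom_test([14, 54]) on newer SciPy versions:
# 14 successes out of 14 + 54 trials, default p = 0.5, two-sided.
result = scipy.stats.binomtest(14, 14 + 54, p=0.5)
print(result.pvalue)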
part_sum = [ 0, 0 ]
for i, p in enumerate(particles):
    part_sum[0] += pos_counts[i][0]
    part_sum[1] += pos_counts[i][1]
    print("Test for '{0}'".format(p))
    print(scipy.stats.binom_test(pos_counts[i]))
print("Test for 'BEN+G+ADD'")
print(scipy.stats.binom_test(part_sum))
Test for 'BEN'
1.10972567279e-06
Test for 'G'
0.00311070663735
Test for 'ADD'
1.0
Test for 'BEN+G+ADD'
5.62656137825e-06
Except for "ADD", it is very unlikely that these counts arose by chance, so the before/after difference is significant for "BEN", "G", and "BEN+G+ADD".
particles = [ 'BEN', 'G', 'ADD', 'A', 'P', 'LOC' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles
before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1
Hypothesis H0: Whether a participant appears before or after the verb does not depend on its grammatical relation type.
For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).
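For orientation, here is a small self-contained illustration of the quantity the test is built on: the hypergeometric probability of a single 2x2 table with fixed margins. The counts are toy numbers, not from the data; fisher_exact then sums such probabilities over all tables at least as extreme as the observed one.
from math import factorial
import scipy.stats

def n_choose_k(n, k):
    # binomial coefficient, written out to avoid assuming math.comb (Python 3.8+)
    return factorial(n) // (factorial(k) * factorial(n - k))

# P(table) = C(a+b, a) * C(c+d, c) / C(a+b+c+d, a+c) for a 2x2 table [[a, b], [c, d]]
a, b, c, d = 8, 2, 1, 5  # toy counts, for illustration only
p_table = float(n_choose_k(a + b, a) * n_choose_k(c + d, c)) / n_choose_k(a + b + c + d, a + c)
print(p_table)

# fisher_exact sums such probabilities over all tables that are at least as
# extreme (two-sided by default) and also returns the sample odds ratio.
oddsratio, pvalue = scipy.stats.fisher_exact([[a, b], [c, d]])
print(oddsratio, pvalue)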
BEN_G_ADD = [ 0, 0 ]
BEN_G_ADD[0] = pos_counts[0][0] + pos_counts[1][0] + pos_counts[2][0]
BEN_G_ADD[1] = pos_counts[0][1] + pos_counts[1][1] + pos_counts[2][1]
cont_table = [ BEN_G_ADD, pos_counts[3] ]
cont_table
[[157, 353], [82, 292]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.0035761096863860167
We reject the null hypothesis H0 because p < 0.05. The grammatical relation type does influence whether a participant appears before or after the verb. In this case, A appears significantly more often before the verb than BEN + G + ADD.
cont_table = [ BEN_G_ADD, pos_counts[4] ]
cont_table
[[157, 353], [93, 424]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
2.0804748208486202e-06
Again, we reject the null hypothesis because p < 0.05. P occurs more often before the verb than BEN + G + ADD.
cont_table = [ BEN_G_ADD, pos_counts[5] ]
cont_table
[[157, 353], [28, 122]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.0036929308655807127
Again, we reject the null hypothesis because p < 0.05. LOC occurs more often before the verb than BEN + G + ADD.
A_P = [ 0, 0 ]
A_P[0] = pos_counts[3][0] + pos_counts[4][0]
A_P[1] = pos_counts[3][1] + pos_counts[4][1]
cont_table = [ BEN_G_ADD, A_P ]
cont_table
[[157, 353], [175, 716]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
3.3374996181763293e-06
Again, we reject the null hypothesis because p < 0.05. A + P occurs more often before the verb than BEN + G + ADD.
A_P_LOC = [ 0, 0 ]
A_P_LOC[0] = pos_counts[3][0] + pos_counts[4][0] + pos_counts[5][0]
A_P_LOC[1] = pos_counts[3][1] + pos_counts[4][1] + pos_counts[5][1]
cont_table = [ BEN_G_ADD, A_P_LOC ]
cont_table
[[157, 353], [203, 838]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
1.4327318331957229e-06
Again, we reject the null hypothesis because p < 0.05. A + P + LOC occurs more often before the verb than BEN + G + ADD.
particles = [ 'BEN', 'G', 'ADD', 'A', 'P', 'LOC', 'S' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles
before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if wo.clause_type in main_clause_types and "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1
cont_table = [ pos_counts[0], pos_counts[4] ]
cont_table
[[14, 54], [72, 226]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.63498783480640697
We cannot reject H0. There is no significant difference between BEN and P in whether they appear before or after the verb.
cont_table = [ pos_counts[2], pos_counts[4] ]
cont_table
[[28, 29], [72, 226]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.00031708160303060872
We can reject H0. P occurs more often before the verb than ADD.
BEN_ADD = [ 0, 0 ]
BEN_ADD[0] = pos_counts[0][0] + pos_counts[2][0]
BEN_ADD[1] = pos_counts[0][1] + pos_counts[2][1]
cont_table = [ BEN_ADD, pos_counts[4] ]
cont_table
[[42, 83], [72, 226]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.054556656373823037
We cannot reject H0. There is no significant difference between BEN + ADD and P in whether they appear before or after the verb, although the result is borderline, so with more data it might become significant.
cont_table = [ pos_counts[3], pos_counts[4] ]
cont_table
[[78, 250], [72, 226]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.92553990155932386
We cannot reject H0. There is no significant difference between A and P in whether they appear before or after the verb.
cont_table = [ pos_counts[3], pos_counts[5] ]
cont_table
[[78, 250], [27, 97]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.70898128553715467
We cannot reject H0. There is no significant difference between A and LOC in whether they appear before or after the verb.
Hypothesis 1 (H0): It does not depend on the type of the verb (SAY vs. others) whether A is expressed overtly.
For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).
other_verbs = [ 'COP', 'v.tr', 'v.intr', 'v.aff' ]
search_terms = other_verbs + [ 'SAY', 'A', 'zero-A' ]
verb_map = { v: "V" for v in other_verbs }
SAY_counts = [ 0, 0 ]
others_counts = [ 0, 0 ]
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    if 'SAY' in wo.word_order:
        if 'A' in wo.word_order:
            SAY_counts[0] += 1
        elif 'zero-A' in wo.word_order:
            SAY_counts[1] += 1
    if 'V' in wo.word_order:
        if 'A' in wo.word_order:
            others_counts[0] += 1
        elif 'zero-A' in wo.word_order:
            others_counts[1] += 1
cont_table = [ SAY_counts, others_counts ]
cont_table
[[119, 22], [255, 389]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
3.5525785729571192e-23
We reject the null hypothesis because p < 0.05. A is more often overt in SAY clauses than in clauses with other verb types.
H0: It does not depend on the verb type (SAY vs. others) whether A appears before or after the verb.
search_terms = other_verbs + [ 'SAY', 'A' ]
SAY_counts = [ 0, 0 ]
others_counts = [ 0, 0 ]
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    if 'SAY' in wo.word_order and 'A' in wo.word_order:
        if wo.word_order.index("SAY") < wo.word_order.index("A"):
            SAY_counts[0] += 1
        else:
            SAY_counts[1] += 1
    if "V" in wo.word_order and "A" in wo.word_order:
        if wo.word_order.index("V") < wo.word_order.index("A"):
            others_counts[0] += 1
        else:
            others_counts[1] += 1
cont_table = [ SAY_counts, others_counts ]
cont_table
[[31, 88], [51, 204]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.22697216348643534
We cannot reject the null hypothesis as p > 0.05. There is no significant difference between SAY clauses and clauses with other verbs in whether A appears before or after the verb.
print("A after SAY: {0}".format(SAY_counts[0]))
print("A before SAY: {0}".format(SAY_counts[1]))
A after SAY: 31
A before SAY: 88
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
agreements = [0, 0]; noagreements = [0, 0];
for wo in helpers.diana.word_orders(ag, verbs, with_agreement = True):
    for agr in wo.agreement:
        if wo.clause_type in sub_clause_types:
            if agr == "noagr":
                noagreements[0] += 1
            else:
                agreements[0] += 1
        else:
            if agr == "noagr":
                noagreements[1] += 1
            else:
                agreements[1] += 1
print("I found {0} verbs with and {1} verbs without agreement.".format(agreements[0]+agreements[1], noagreements[0]+noagreements[1]))
print("In main clauses: I found {0} verbs with and {1} verbs without agreement.".format(agreements[1], noagreements[1]))
print("In sub clauses: I found {0} verbs with and {1} verbs without agreement.".format(agreements[0], noagreements[0]))
I found 1253 verbs with and 346 verbs without agreement.
In main clauses: I found 838 verbs with and 222 verbs without agreement.
In sub clauses: I found 415 verbs with and 124 verbs without agreement.
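As a quick check, the share of agreeing verbs per clause type follows directly from these counts:
# Proportion of verbs that show agreement, per clause type
# (index 0 = sub clauses, index 1 = main clauses, as in the loop above).
print("main: {0:.2f}".format(float(agreements[1]) / (agreements[1] + noagreements[1])))
print("sub: {0:.2f}".format(float(agreements[0]) / (agreements[0] + noagreements[0])))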
Hypothesis H0: It does not depend on the verbal agreement whether S (or P or STIM) arguments are expressed overtly.
search_terms = verbs + [ 'S', 'P', 'STIM', 'zero-S', 'zero-P', 'zero-STIM' ]
cont_table_S = [ [0, 0], [0, 0] ]
cont_table_P = [ [0, 0], [0, 0] ]
cont_table_STIM = [ [0, 0], [0, 0] ]
cont_table_S_P_STIM = [ [0, 0], [0, 0] ]
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    agreeing = 0
    if len(wo.word_order) != len(wo.agreement):
        continue
    for i, w in enumerate(wo.word_order):
        if w in verbs:
            if wo.agreement[i] == "noagr":
                agreeing = 1
    if "zero-S" in wo.word_order:
        cont_table_S[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "S" in wo.word_order:
        cont_table_S[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1
    if "zero-P" in wo.word_order:
        cont_table_P[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "P" in wo.word_order:
        cont_table_P[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1
    if "zero-STIM" in wo.word_order:
        cont_table_STIM[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "STIM" in wo.word_order:
        cont_table_STIM[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1
cont_table_S
[[237, 603], [39, 72]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_S)
pvalue
0.14782310141174806
We cannot reject the null hypothesis, because p > 0.05. It does not depend on the verbal agreement whether S is expressed overtly.
cont_table_P
[[70, 344], [57, 173]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_P)
pvalue
0.017638070366621489
We can reject the null hypothesis as p < 0.05. It does depend on verbal agreement whether P is expressed overtly: P is expressed overtly more often when there is agreement on the verb.
cont_table_STIM
[[28, 101], [1, 4]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_STIM)
pvalue
1.0
Here the test has little power, as the number of non-agreeing cases is very small. Practically all verbs that take a STIM argument show agreement, whether or not the argument is expressed overtly.
cont_table_S_P_STIM
[[335, 1048], [97, 249]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_S_P_STIM)
pvalue
0.14533478714180809
We cannot reject the null hypothesis as p > 0.05. Whether S, P, and STIM (pooled) are expressed overtly does not depend on verbal agreement.
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
agreement_sum = collections.defaultdict(int)
for wo in helpers.diana.word_orders(ag, verbs, with_agreement = True):
    for agr in wo.agreement:
        agreement_sum[agr] += 1
for agr, count in agreement_sum.items():
    print("{} => {}".format(agr, count))
r-5 => 200
o-1 => 354
noagr => 346
y-2 => 161
r-nhpl => 51
r-hpl => 2
b-hpl => 116
b-3 => 316
y-4 => 53
We first check whether the agreement makes sense, i.e. whether the class marker on P, S, or STIM is the same as on the verb. All other cases are printed with their clause ID, to be checked manually.
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
search_terms = verbs + [ 'S', 'P', 'STIM', 'zero-S', 'zero-P', 'zero-STIM' ]
agreements = collections.defaultdict(int)
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    v_class = None; n_class = None; v_marker = None; n_type = None;
    agreement = False
    zero = False
    if len(wo.word_order) != len(wo.agreement):
        print("length on blue and yellow line different in ID {}".format(wo.clause_id))
        continue
    for i, w in enumerate(wo.word_order):
        if w in verbs:
            if wo.agreement[i] != "noagr":
                agreement = True
                if "-" in wo.agreement[i]:
                    v_marker, v_class = wo.agreement[i].split("-")
                else:
                    print("no dash in v agr in ID {}".format(wo.clause_id))
        else:
            if "-" in wo.agreement[i]:
                n_split = wo.agreement[i].split("-")
                if len(n_split) > 2:
                    print("more than one dash in n agr in ID {}".format(wo.clause_id))
                n_class = n_split[1]
                #n_type = n_split[1]
                if "." in n_class:
                    n_class, _ = n_class.split(".")
            else:
                print("no dash in n agr in ID {}".format(wo.clause_id))
            if w.startswith("zero-"):
                zero = True
    if v_class != n_class and agreement:
        print("n class does not equal v class in ID {} (n_class: {} vs. v_class: {})".format(wo.clause_id, n_class, v_class))
    elif v_class is not None and n_class is not None and not zero:
        agreements["{}-{}".format(v_marker, v_class)] += 1
n class does not equal v class in ID clause_id..n#1037 (n_class: imp vs. v_class: 5)
n class does not equal v class in ID clause_id..n#1552 (n_class: 2 vs. v_class: hpl)
n class does not equal v class in ID clause_id..n#1139 (n_class: 3 vs. v_class: 1)
Here are the counts for all overt arguments where class markers were equal:
for agr, count in agreements.items():
    print("{} => {}".format(agr, count))
r-5 => 167
o-1 => 222
y-2 => 119
r-nhpl => 46
r-hpl => 2
b-hpl => 64
b-3 => 257
y-4 => 42
H0: It does not depend on the prefix (o-1 vs. b) whether the argument is expressed overtly.
cont_table = [ [ agreements["o-1"], agreement_sum["o-1"]-agreements["o-1"] ],
               [ agreements["b-3"]+agreements["b-hpl"],
                 agreement_sum["b-3"]+agreement_sum["b-hpl"]-agreements["b-3"]-agreements["b-hpl"] ] ]
cont_table
[[222, 132], [321, 111]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.00062729277493344911
We reject H0 because p < 0.05. The "b" prefix occurs significantly more often with overt arguments than the "o-1" prefix.
H0: It does not depend on the prefix (o-1 vs. r) whether the argument is expressed overtly.
cont_table = [ [ agreements["o-1"], agreement_sum["o-1"]-agreements["o-1"] ],
               [ agreements["r-5"]+agreements["r-hpl"]+agreements["r-nhpl"],
                 agreement_sum["r-5"]+agreement_sum["r-hpl"]+agreement_sum["r-nhpl"]-agreements["r-5"]-agreements["r-hpl"]-agreements["r-nhpl"] ] ]
cont_table
[[222, 132], [215, 38]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
9.856708011476732e-10
We reject H0 because p < 0.05. The "r" prefix occurs significantly more often with overt arguments than the "o-1" prefix.
H0: It does not depend on the prefix (o-1 vs. y) whether the argument is expressed overtly.
cont_table = [ [ agreements["o-1"], agreement_sum["o-1"]-agreements["o-1"] ],
               [ agreements["y-2"]+agreements["y-4"],
                 agreement_sum["y-2"]+agreement_sum["y-4"]-agreements["y-2"]-agreements["y-4"] ] ]
cont_table
[[222, 132], [161, 53]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.0022627293600340314
We reject H0 because p < 0.05. The "y" prefix occurs significantly more often with overt arguments than the "o-1" prefix.
search_terms = [ "COP", "S" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "COP" in wo.word_order:
        possible += 1
        if "S" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
127 / 131 = 0.9694656488549618
search_terms = [ "v.intr", "S" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.intr" in wo.word_order:
        possible += 1
        if "S" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
548 / 821 = 0.6674786845310596
search_terms = [ "v.tr", "A", "P" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.tr" in wo.word_order:
        possible += 2
        if "A" in wo.word_order:
            overt += 1
        if "P" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
772 / 1288 = 0.5993788819875776
search_terms = [ "v.aff", "EXP", "STIM" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.aff" in wo.word_order:
        possible += 2
        if "EXP" in wo.word_order:
            overt += 1
        if "STIM" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
190 / 268 = 0.7089552238805971
search_terms = [ "v.intr", "S" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    if len(wo.word_order) != len(wo.agreement):
        continue
    if "v.intr" in wo.word_order:
        v_index = wo.word_order.index("v.intr")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 1
            if "S" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 1
            if "S" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
with agreement: 476 / 710 = 0.6704225352112676
without agreement: 72 / 111 = 0.6486486486486487
Hypothesis H0: It does not depend on agreement whether S in v.intr clauses is expressed overtly.
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
0.665525212907
We cannot reject H0 as p > 0.05. It does not depend on agreement whether the argument is expressed overtly.
search_terms = [ "v.tr", "A", "P" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    if len(wo.word_order) != len(wo.agreement):
        continue
    if "v.tr" in wo.word_order:
        v_index = wo.word_order.index("v.tr")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 2
            if "A" in wo.word_order:
                noagr_overt += 1
            if "P" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 2
            if "A" in wo.word_order:
                agr_overt += 1
            if "P" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
with agreement: 506 / 828 = 0.6111111111111112
without agreement: 266 / 460 = 0.5782608695652174
Hypothesis H0: It does not depend on agreement whether A and P in v.tr clauses are expressed overtly.
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
0.259811072564
We cannot reject H0 as p > 0.05. It does not depend on agreement whether the arguments are expressed overtly.
search_terms = [ "v.aff", "EXP", "STIM" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    if len(wo.word_order) != len(wo.agreement):
        continue
    if "v.aff" in wo.word_order:
        v_index = wo.word_order.index("v.aff")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 2
            if "EXP" in wo.word_order:
                noagr_overt += 1
            if "STIM" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 2
            if "EXP" in wo.word_order:
                agr_overt += 1
            if "STIM" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
with agreement: 185 / 258 = 0.7170542635658915
without agreement: 5 / 10 = 0.5
Hypothesis H0: It does not depend on agreement whether EXP and STIM in v.aff clauses are expressed overtly.
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
0.160953027422
We cannot reject H0 as p > 0.05. It does not depend on agreement whether the arguments are expressed overtly.
v_tree = { "v.aff": collections.defaultdict(int), "v.tr": collections.defaultdict(int) }
for wo in word_orders:
    v = None
    if "v.tr" in wo:
        v = "v.tr"
    if "v.aff" in wo:
        v = "v.aff"
    if v is not None:
        wo2 = tuple([e for e in sorted(wo) if e != "v.aff" and e != "v.tr"])
        v_tree[v][wo2] += word_orders[wo]
for v in ["v.aff", "v.tr"]:
    print(v)
    for e in v_tree[v]:
        print("{0} => {1}".format(e, v_tree[v][e]))
v.aff
('zero-EXP', 'zero-STIM') => 15
('STIM', 'zero-EXP') => 34
('EXP', 'zero-STIM') => 14
('EXP', 'STIM') => 71
v.tr
('A', 'zero-P') => 33
('A', 'P') => 222
('P', 'zero-A') => 295
('zero-A', 'zero-P') => 94
Hypothesis 1 (H0): It does not depend on the type of the verb ("v.tr" vs. "v.aff") whether A/EXP is expressed overtly.
For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).
import scipy.stats
cont_table = [
    [ v_tree["v.aff"][('EXP', 'zero-STIM')] + v_tree["v.aff"][('EXP', 'STIM')],
      v_tree["v.aff"][('zero-EXP', 'zero-STIM')] + v_tree["v.aff"][('STIM', 'zero-EXP')] ],
    [ v_tree["v.tr"][('A', 'P')] + v_tree["v.tr"][('A', 'zero-P')],
      v_tree["v.tr"][('P', 'zero-A')] + v_tree["v.tr"][('zero-A', 'zero-P')] ]
]
cont_table
[[85, 49], [255, 389]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
5.1977253135270883e-07
We reject the null hypothesis, as the probability of obtaining a distribution like the observed one under H0 is p < 0.05. The verb type affects the overtness of the A/EXP argument: "v.aff" clauses have significantly more overt EXP arguments than "v.tr" clauses have overt A arguments.
Hypothesis 2 (H0): It does not depend on the type of the verb ("v.tr" vs. "v.aff") whether P/STIM is expressed overtly.
import scipy.stats
cont_table = [
    [ v_tree["v.aff"][('STIM', 'zero-EXP')] + v_tree["v.aff"][('EXP', 'STIM')],
      v_tree["v.aff"][('zero-EXP', 'zero-STIM')] + v_tree["v.aff"][('EXP', 'zero-STIM')] ],
    [ v_tree["v.tr"][('A', 'P')] + v_tree["v.tr"][('P', 'zero-A')],
      v_tree["v.tr"][('A', 'zero-P')] + v_tree["v.tr"][('zero-A', 'zero-P')] ]
]
cont_table
[[105, 29], [517, 127]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.63554718109871811
In this case we cannot reject the null hypothesis, as p > 0.05. There is no statistical evidence that the verb type affects the overtness of P/STIM.
other_verbs = [ 'COP', 'v.tr', 'v.intr', 'v.aff' ]
verb_map = { v: "V" for v in other_verbs }
A_values = []
P_values = []
S_values = []
for wo in helpers.diana.word_orders(ag, annotation_map = verb_map):
    word_order = [w for w in wo.word_order if not w.startswith("zero-")]
    if "V" in word_order:
        v_index = word_order.index("V")
        if "A" in word_order:
            A_values.append(word_order.index("A") - v_index)
        if "P" in word_order:
            P_values.append(word_order.index("P") - v_index)
        if "S" in word_order:
            S_values.append(word_order.index("S") - v_index)
%matplotlib inline
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 3, figsize=(14,4))
axs[0].hist(S_values, range(min(S_values), max(S_values)+2))
axs[0].set_title("Positions of S")
axs[1].hist(A_values, range(min(A_values), max(A_values)+2))
axs[1].set_title("Positions of A")
axs[2].hist(P_values, range(min(P_values), max(P_values)+2))
ret = axs[2].set_title("Positions of P")
plt.figure(figsize=(10,6))
plt.boxplot([S_values, A_values, P_values])
plt.title("Positions of S, A and P")
ret = plt.xticks([1, 2, 3], ["S", "A", "P"])
A_values = [[], []]
P_values = [[], []]
S_values = [[], []]
clause_types = ["m", "m.rs", "sub", "sub.rs"]
for wo in helpers.diana.word_orders(ag, annotation_map = verb_map):
    word_order = [w for w in wo.word_order if not w.startswith("zero-")]
    if "V" in word_order and wo.clause_type in clause_types:
        ind = 0
        if wo.clause_type == "sub" or wo.clause_type == "sub.rs":
            ind = 1
        v_index = word_order.index("V")
        if "A" in word_order:
            A_values[ind].append(word_order.index("A") - v_index)
        if "P" in word_order:
            P_values[ind].append(word_order.index("P") - v_index)
        if "S" in word_order:
            S_values[ind].append(word_order.index("S") - v_index)
fig, axs = plt.subplots(2, 3, figsize=(14,10))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"
    axs[ind][0].hist(S_values[ind], range(min(S_values[ind]), max(S_values[ind])+2))
    axs[ind][0].set_title("Positions of S in {0} clauses".format(type_text))
    axs[ind][1].hist(A_values[ind], range(min(A_values[ind]), max(A_values[ind])+2))
    axs[ind][1].set_title("Positions of A in {0} clauses".format(type_text))
    axs[ind][2].hist(P_values[ind], range(min(P_values[ind]), max(P_values[ind])+2))
    ret = axs[ind][2].set_title("Positions of P in {0} clauses".format(type_text))
fig, axs = plt.subplots(1, 2, figsize=(14,6))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"
    axs[ind].boxplot([S_values[ind], A_values[ind], P_values[ind]])
    axs[ind].set_title("Positions of S, A and P in {0} clauses".format(type_text))
ret = plt.xticks([1, 2, 3], ["S", "A", "P"])